# Merge submissions

In [4]:
import os
import pandas as pd
import numpy as np
import hashlib

In [5]:
def _settle_pair(scores, winner, loser, winner_top2, loser_top2, missing, delta, tag, verbose):
    """Resolve a two-way tie in favour of *winner* and move *loser* to its runner-up.

    Nothing happens unless the loser's second-best column is in *missing*
    (i.e. that column is currently nobody's top choice).  Otherwise the
    winner's top-1/top-2 gap is widened by ``delta`` on each side, and the
    loser's top-1 and top-2 scores are exchanged, likewise widened by
    ``delta``, so its runner-up becomes its new top choice.
    """
    win_first, win_second = winner_top2.index
    lose_first, lose_second = loser_top2.index
    if lose_second not in missing:
        return
    if verbose:
        print(f"\n{tag}")
        print(winner_top2)
        print(loser_top2)
        print(f"{lose_second} in missing values {lose_second in missing}")
    # All values are taken from the pre-computed top-2 series, so the order of
    # the four writes below cannot influence one another.
    scores.at[winner, win_first] = winner_top2.iloc[0] + delta
    scores.at[winner, win_second] = winner_top2.iloc[1] - delta
    scores.at[loser, lose_first] = loser_top2.iloc[1] - delta
    scores.at[loser, lose_second] = loser_top2.iloc[0] + delta
    if verbose:
        print(f"{win_first}:{scores.at[winner, win_first]:.4f}", f"{win_second}:{scores.at[winner, win_second]:.4f}")
        print(f"{lose_first}:{scores.at[loser, lose_first]:.4f}", f"{lose_second}:{scores.at[loser, lose_second]:.4f}\n")


def enhance_submission(df, verbose = False, enhancement_2=True):
    """Reassign rows that share the same arg-max column to unclaimed columns.

    Per the verbose messages, rows are treated as descriptions and columns as
    molecules.  Each row's top choice is its highest-scoring column.  When
    several rows pick the same column, the row with the largest gap between
    its best and second-best score keeps it; the other rows may be redirected
    to their second-best column if no row currently picks that column.

    Args:
        df: Score matrix.  Column labels must be convertible to ``int``.
            The input frame is left untouched.
        verbose: Print the column index, summary counts and each two-way
            reassignment.
        enhancement_2: Also handle columns picked by three or more rows.

    Returns:
        A new DataFrame with integer column labels, the adjusted scores and a
        leading ``'ID'`` column holding the row index.
    """
    if verbose:
        print(df.columns)

    # Work on a private copy so neither the caller's labels nor values change.
    scores = df.copy()
    scores.columns = scores.columns.astype(int)
    top_choice = scores.idxmax(axis=1).astype(int)

    # column -> rows whose best score is in that column (rows in index order).
    claimants = {}
    for desc, molecule in top_choice.items():
        claimants.setdefault(int(molecule), []).append(desc)
    contested = {m: claimants[m] for m in sorted(claimants) if len(claimants[m]) > 1}

    # Columns that are no row's top choice; a set gives O(1) membership tests.
    missing = {int(col) for col in scores.columns} - set(claimants)
    if verbose:
        print(f"{len(contested)} molecules have multiple descriptions.")
        print(f"{len(missing)} molecules have no description.")

    if scores.shape[1] < 2:
        # There is no runner-up column to fall back on.
        contested = {}

    delta = 1e-4  # amount by which a resolved pair's top-1/top-2 gap is widened
    tol = 1e-5    # margins closer than this are treated as a draw (no change)
    for descriptions in contested.values():
        if enhancement_2 and len(descriptions) >= 3:
            # Read every row's top-2 before writing anything.
            top2_by_desc = {d: scores.loc[d].nlargest(2) for d in descriptions}
            margins = {d: t.iloc[0] - t.iloc[1] for d, t in top2_by_desc.items()}
            # The first row with the largest margin keeps the column; this is
            # well defined even when every margin is zero.
            best_desc = max(descriptions, key=margins.__getitem__)
            for desc in descriptions:
                if desc == best_desc:
                    continue
                top2 = top2_by_desc[desc]
                first, second = top2.index
                if second in missing:
                    # Exchange the two best scores so the runner-up wins.
                    scores.at[desc, first] = top2.iloc[1]
                    scores.at[desc, second] = top2.iloc[0]
                else:
                    # NOTE: only lowers the top score to the runner-up's
                    # value, which leaves the two columns tied.
                    scores.at[desc, first] = top2.iloc[1]
        elif len(descriptions) == 2:
            desc_a, desc_b = descriptions
            top2_a = scores.loc[desc_a].nlargest(2)
            top2_b = scores.loc[desc_b].nlargest(2)
            margin_a = top2_a.iloc[0] - top2_a.iloc[1]
            margin_b = top2_b.iloc[0] - top2_b.iloc[1]
            if margin_a > margin_b + tol:
                _settle_pair(scores, desc_a, desc_b, top2_a, top2_b, missing, delta, 0, verbose)
            elif tol + margin_a < margin_b:
                _settle_pair(scores, desc_b, desc_a, top2_b, top2_a, missing, delta, 1, verbose)

    scores['ID'] = scores.index
    return scores[['ID'] + [col for col in scores.columns if col != 'ID']]

In [6]:
# Directory holding every candidate submission CSV.
submissions_path = '/Data/altegradsubmissions/'

# Alphabetically sorted file names. Later cells select files by their position
# in this list, so adding or removing files in the directory shifts the indices.
l = sorted(os.listdir(submissions_path))
l

['submission_ 0.8544.csv',
 'submission_0.8499.csv',
 'submission_0.8672.csv',
 'submission_CLIP_Chembert_GraphTrans4blocks_temp007_700epoch_plus400fineclip.csv',
 'submission_CLIP_TLearn_GraphPNA2_Chembert.csv',
 'submission_CLIP_TLearn_GraphPNA2_Chembert_fineclip.csv',
 'submission_CLIP_TLearn_GraphResGIN_6blocks.csv',
 'submission_CLIP_TLearn_GraphResGIN_7blocks.csv',
 'submission_CLIP_TLearn_GraphResGIN_8blocks.csv',
 'submission_CLIP_TLearn_GraphResGIN_8blocks_v2.csv',
 'submission_CLIP_TLearn_GraphUnet_Chembert_fineclip.csv',
 'submission_CLIP_Tlearn_Chembert_GraphTrans2_4blocks.csv',
 'submission_CLIP_Tlearn_Chembert_GraphTrans2_4blocks_fineclip.csv',
 'submission_CLIP_Tlearn_Chembert_GraphTrans_2blocks.csv',
 'submission_CLIP_Tlearn_Chembert_GraphTrans_3blocks.csv',
 'submission_CLIP_Tlearn_Chembert_GraphTrans_5blocks.csv',
 'submission_CLIP_Tlearn_Chembert_GraphTrans_6blocks.csv',
 'submission_FineCLIP_T7e-2_Chembert_GUnet2_finecliped_twice.csv',
 'submission_Graph_Transformer

In [9]:
# Positions, within the sorted listing `l`, of the submissions to blend.
submissions_to_keep = [
    0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 19, 20, 21, 23, 24, 25,
]
# Echo the selected file names, one per line.
print(*(l[i] for i in submissions_to_keep), sep="\n")

submission_ 0.8544.csv
submission_0.8499.csv
submission_0.8672.csv
submission_CLIP_Chembert_GraphTrans4blocks_temp007_700epoch_plus400fineclip.csv
submission_CLIP_TLearn_GraphPNA2_Chembert_fineclip.csv
submission_CLIP_TLearn_GraphResGIN_6blocks.csv
submission_CLIP_TLearn_GraphResGIN_7blocks.csv
submission_CLIP_TLearn_GraphResGIN_8blocks.csv
submission_CLIP_TLearn_GraphResGIN_8blocks_v2.csv
submission_CLIP_TLearn_GraphUnet_Chembert_fineclip.csv
submission_CLIP_Tlearn_Chembert_GraphTrans2_4blocks.csv
submission_CLIP_Tlearn_Chembert_GraphTrans2_4blocks_fineclip.csv
submission_CLIP_Tlearn_Chembert_GraphTrans_2blocks.csv
submission_CLIP_Tlearn_Chembert_GraphTrans_3blocks.csv
submission_CLIP_Tlearn_Chembert_GraphTrans_5blocks.csv
submission_FineCLIP_T7e-2_Chembert_GUnet2_finecliped_twice.csv
submission_Graph_Transformer4blocks_temp7e-2_700epochs.csv
submission_model_cbert_gatres_4blocks_2heads.pt.csv
submission_model_cbert_hyb_gatv2_gin_2.pt.csv
submission_model_cbert_hyb_trans_sage3_09208_f

In [11]:
# Blend weight of each selected submission, in submissions_to_keep order.
weights = [
    1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2,
]
# The two lengths must match: one weight per selected submission.
(len(submissions_to_keep), len(weights))

(22, 22)

In [12]:
# Switches: run enhance_submission() on every input before blending and/or on
# the blended matrix afterwards.
pre_enhance = False
post_enhance = True

# One score matrix per selected submission, in submissions_to_keep order.
np_sim_list = []
for i in submissions_to_keep:
    df = pd.read_csv(submissions_path + l[i], index_col=0)
    if pre_enhance:
        # enhance_submission() returns its result with an extra leading 'ID'
        # column. Drop it so the array holds scores only; otherwise the IDs
        # would be normalised and averaged as if they were scores.
        df = enhance_submission(df).drop(columns='ID')
    np_sim_list.append(df.values)

In [13]:
## Min-max rescale every similarity matrix to [0, 1], replacing it in the list
for idx, sim in enumerate(np_sim_list):
    lowest = sim.min()
    highest = sim.max()
    np_sim_list[idx] = (sim - lowest) / (highest - lowest)

## Weighted average of the rescaled matrices, accumulated one at a time
np_sim_mean = np.zeros_like(np_sim_list[0])
for idx, sim in enumerate(np_sim_list):
    np_sim_mean += weights[idx] * sim
np_sim_mean /= np.sum(weights)

In [14]:
# Display the dimensions of the blended matrix as a sanity check.
np_sim_mean.shape

(3301, 3301)

In [15]:
# Derive the output file name from the selected submissions: the MD5 hex digest
# of their names (last 4 characters, the ".csv" extension, removed), each
# followed by '_'.
# NOTE: the weights are not part of the digest, so two blends of the same files
# with different weights get the same file name.
all_names = "".join(l[i][:-4] + '_' for i in submissions_to_keep)
# Named `names_digest` rather than `hash` so the builtin hash() stays usable.
names_digest = hashlib.md5(all_names.encode('utf-8')).hexdigest()

# The suffix records which enhancement passes were enabled.
if pre_enhance and post_enhance:
    enhance_suffix = '_pre_post_enhanced'
elif pre_enhance:
    enhance_suffix = '_pre_enhanced'
elif post_enhance:
    enhance_suffix = '_post_enhanced'
else:
    enhance_suffix = ''
submission_name = 'submission_moe_{}{}.csv'.format(names_digest, enhance_suffix)
submission_name

'submission_moe_fd46186e15924352d462f174d3fb8c99_post_enhanced.csv'

In [20]:
# Write the plain (un-enhanced) blend only when no post-enhancement is
# requested. With post_enhance set, submission_name carries the
# "_post_enhanced" suffix and belongs to the enhanced result; writing the raw
# blend there would overwrite it if this cell is run after the enhanced write.
if not post_enhance:
    solution = pd.DataFrame(np_sim_mean)
    # Put the row index first as an explicit 'ID' column.
    solution['ID'] = solution.index
    solution = solution[['ID'] + [col for col in solution.columns if col!='ID']]
    solution.to_csv(submissions_path + submission_name, index=False, float_format='%.5f')

In [19]:
if post_enhance:
    # copy=True keeps the frame from sharing memory with np_sim_mean, so
    # nothing enhance_submission() does to its input can alter the blend.
    solution = pd.DataFrame(np_sim_mean, copy=True)
    # enhance_submission() already returns the frame with 'ID' as its first
    # column, so no further reshaping is needed before writing.
    solution = enhance_submission(solution, verbose=True)
    # Nine decimals so the small score adjustments made by the enhancement are
    # not rounded away.
    solution.to_csv('/Data/altegradsubmissions/' + submission_name, index=False, float_format='%.9f')

RangeIndex(start=0, stop=3301, step=1)
218 molecules have multiple descriptions.
236 molecules have no description.

0
35      0.900528
2401    0.893349
Name: 1409, dtype: float64
35      0.892518
2401    0.891597
Name: 1903, dtype: float64
2401 in missing values True
35:0.9006 2401:0.8932
35:0.8915 2401:0.8926


1
62      0.913119
2549    0.847416
Name: 2498, dtype: float64
62      0.881748
2549    0.879328
Name: 1098, dtype: float64
2549 in missing values True
62:0.9132 2549:0.8473
62:0.8792 2549:0.8818


0
69     0.919852
628    0.919810
Name: 274, dtype: float64
69     0.918515
628    0.918485
Name: 1863, dtype: float64
628 in missing values True
69:0.9200 628:0.9197
69:0.9184 628:0.9186


0
79    0.917798
3     0.908093
Name: 790, dtype: float64
79    0.909982
3     0.905761
Name: 2963, dtype: float64
3 in missing values True
79:0.9179 3:0.9080
79:0.9057 3:0.9101


1
100     0.913039
2369    0.913009
Name: 2115, dtype: float64
100     0.913009
2369    0.912996
Name: 952, dtype: fl

In [20]:
# Collect the short model names (file name minus the "submission_" prefix and
# the ".csv" extension), space-terminated, while echoing each one.
strg = ""
for i in submissions_to_keep:
    short_name = l[i][len('submission_'):-len('.csv')]
    print(short_name)
    strg = f"{strg}{short_name} "

 0.8544
0.8499
0.8672
CLIP_Chembert_GraphTrans4blocks_temp007_700epoch_plus400fineclip
CLIP_TLearn_GraphPNA2_Chembert_fineclip
CLIP_TLearn_GraphResGIN_6blocks
CLIP_TLearn_GraphResGIN_7blocks
CLIP_TLearn_GraphResGIN_8blocks
CLIP_TLearn_GraphResGIN_8blocks_v2
CLIP_TLearn_GraphUnet_Chembert_fineclip
CLIP_Tlearn_Chembert_GraphTrans2_4blocks
CLIP_Tlearn_Chembert_GraphTrans2_4blocks_fineclip
CLIP_Tlearn_Chembert_GraphTrans_2blocks
CLIP_Tlearn_Chembert_GraphTrans_3blocks
CLIP_Tlearn_Chembert_GraphTrans_5blocks
FineCLIP_T7e-2_Chembert_GUnet2_finecliped_twice
Graph_Transformer4blocks_temp7e-2_700epochs
model_cbert_gatres_4blocks_2heads.pt
model_cbert_hyb_gatv2_gin_2.pt
model_cbert_hyb_trans_sage3_09208_full.pt
model_cbert_sageres_09084.pt
model_cbert_transf_4blocks_2heads_2_09175.pt


In [21]:
# One-line description of the blend: the model names followed by their weights.
f"{strg}{weights}"

' 0.8544 0.8499 0.8672 CLIP_Chembert_GraphTrans4blocks_temp007_700epoch_plus400fineclip CLIP_TLearn_GraphPNA2_Chembert_fineclip CLIP_TLearn_GraphResGIN_6blocks CLIP_TLearn_GraphResGIN_7blocks CLIP_TLearn_GraphResGIN_8blocks CLIP_TLearn_GraphResGIN_8blocks_v2 CLIP_TLearn_GraphUnet_Chembert_fineclip CLIP_Tlearn_Chembert_GraphTrans2_4blocks CLIP_Tlearn_Chembert_GraphTrans2_4blocks_fineclip CLIP_Tlearn_Chembert_GraphTrans_2blocks CLIP_Tlearn_Chembert_GraphTrans_3blocks CLIP_Tlearn_Chembert_GraphTrans_5blocks FineCLIP_T7e-2_Chembert_GUnet2_finecliped_twice Graph_Transformer4blocks_temp7e-2_700epochs model_cbert_gatres_4blocks_2heads.pt model_cbert_hyb_gatv2_gin_2.pt model_cbert_hyb_trans_sage3_09208_full.pt model_cbert_sageres_09084.pt model_cbert_transf_4blocks_2heads_2_09175.pt [1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2]'