In [1]:
import warnings
warnings.simplefilter('ignore')
from utils import *
import pickle
import pandas as pd
from collections import defaultdict
from scipy.stats import pearsonr

from sklearn.model_selection import KFold
from sklearn import ensemble
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, make_scorer

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit import DataStructs

import umap

import matplotlib.pyplot as plt
import seaborn as sns
from statannotations.Annotator import Annotator

In [2]:
# Global plot styling: seaborn 'ticks' theme with Arial and black text / tick labels.
style_overrides = {
    'font.sans-serif': ['Arial'],
    'text.color': 'black',
    'xtick.color': 'black',
    'ytick.color': 'black',
}
sns.set_style('ticks', style_overrides)
plt.rcParams['font.size'] = 20
# Shared colour palette for the figures in this notebook.
colors = sns.color_palette('Paired')

In [3]:
# Cell lines evaluated throughout the notebook.
cell_ls = ['A375', 'HA1E', 'HELA', 'HT29', 'MCF7', 'PC3', 'YAPC']

# Lookup tables stored as pickles next to the LINCS2020 data.
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
lincs_pickles = {}
for pickle_name in ('KPGT_emb2304', 'idx2smi', 'smi2idx'):
    with open('../data/LINCS2020/' + pickle_name + '.pickle', 'rb') as f:
        lincs_pickles[pickle_name] = pickle.load(f)

smi2emb = lincs_pickles['KPGT_emb2304']   # SMILES -> KPGT embedding
idx2smi = lincs_pickles['idx2smi']        # compound index -> SMILES
smi2idx = lincs_pickles['smi2idx']        # SMILES -> compound index
print(len(smi2emb.keys()))

8316


### TranSiGen ΔX' (test_external)

In [4]:
# TranSiGen predictions on the external test set (loaded with the project helper load_from_HDF).
pred_dir = '../results/baseline/trained_models_7_cell_smiles_split/895834/feature_KPGT_init_pretrain_shRNA/predict/'
data_LINCS = load_from_HDF(pred_dir + 'test_external_prediction_profile.h5')

# Translate every profile's compound index back into its SMILES string.
n_profiles = data_LINCS['cid'].shape[0]
smi_ls = [idx2smi[data_LINCS['cp_id'][row]] for row in range(n_profiles)]
data_LINCS['canonical_smiles'] = np.array(smi_ls)

# Quick overview of every stored field and its shape.
for field_name in data_LINCS.keys():
    print(field_name, data_LINCS[field_name].shape)

cid (25126,)
cp_id (25126,)
sig (25126,)
x1 (25126, 978)
x2 (25126, 978)
x2_pred (25126, 978)
x2_rec (25126, 978)
canonical_smiles (25126,)


In [5]:
# ECFP4 (Morgan radius 2, 2048 bits) for every profile row.
# A fingerprint depends only on the SMILES, so it is computed once per distinct
# molecule and then reused for all profiles of that molecule.
smi2ecfp = {}
for smi in data_LINCS['canonical_smiles']:
    if smi in smi2ecfp:
        continue
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None for unparsable input; fail with a readable message.
        raise ValueError('RDKit could not parse SMILES: {}'.format(smi))
    smi2ecfp[smi] = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
ECFP_array = [smi2ecfp[smi] for smi in data_LINCS['canonical_smiles']]
data_LINCS['ECFP4'] = np.array(ECFP_array)

# Pre-computed KPGT embedding for every profile row.
KPGT_array = [smi2emb[smi] for smi in data_LINCS['canonical_smiles']]
data_LINCS['KPGT'] = np.array(KPGT_array)

# Differential profiles relative to x1: measured (x2), reconstructed (x2_rec), predicted (x2_pred).
DEG_array = data_LINCS['x2'] - data_LINCS['x1']
DEG_rec_array = data_LINCS['x2_rec'] - data_LINCS['x1']
DEG_pred_array = data_LINCS['x2_pred'] - data_LINCS['x1']
# From here on the *_array names refer to the numpy arrays, not the Python lists above.
ECFP_array = data_LINCS['ECFP4']
KPGT_array = data_LINCS['KPGT']

In [6]:
## one row per profile: cell line (cid), SMILES and compound index (cp_id)
df_LINCS = pd.DataFrame({
    'cid': data_LINCS['cid'],
    'canonical_smiles': data_LINCS['canonical_smiles'],
})
# Look the compound index up from the SMILES (raises KeyError for unknown SMILES).
df_LINCS['cp_id'] = [smi2idx[smi] for smi in df_LINCS['canonical_smiles']]
df_LINCS

Unnamed: 0,cid,canonical_smiles,cp_id
0,PC3,CN(C)C(=O)[C@H]1[C@H](CO)[C@H]2Cn3c(ccc(C4=CCC...,2472
1,PC3,COc1cc(ccc1Nc2ncc3N(C)C(=O)CCN(C4CCC(O)CC4)c3n...,3693
2,MCF7,COc1ccc2C(=O)/C(=Cc3ccc(OC)c(OC)c3)/COc2c1,4168
3,MCF7,OC(=O)CCC(NS(=O)(=O)c1ccc(cc1)c2ccccc2)C(=O)O,7413
4,MCF7,Oc1ccc(-c2nn[nH]n2)c2cccnc12,8063
...,...,...,...
25121,PC3,C[C@H](CO)N1C[C@@H](C)[C@@H](CN(C)S(=O)(=O)c2c...,5038
25122,MCF7,COc1ccc(cc1)C(=O)NC(c2ccc(OC)c(OC)c2)c3cc(Cl)c...,3992
25123,PC3,CCN(C)C(=O)Oc1cccc(c1)[C@H](C)N(C)C,1834
25124,A375,CC[C@@H]1C=C(C)C[C@H](C)C[C@H](OC)[C@@H]2O[C@]...,2230


### Annotate target information

In [10]:
# Compound-target annotation table (one row per SMILES/target pair).
target_info_csv = '../data/LINCS2020_PubChem_target_info.csv'
target_df = pd.read_csv(target_info_csv)
target_df

Unnamed: 0,canonical_smiles,target,target_id,label,source
0,CC(Cc1ccc(O)c(O)c1)C(C)Cc1ccc(O)c(O)c1,ALOX15,P16050,1,pubchem
1,CC1(C)[C@H](CC[C@@]2(C)[C@H]1CC[C@]3(C)[C@@H]2...,HSD11B1,P28845,1,pubchem
2,CC(Cc1ccc(O)c(O)c1)C(C)Cc1ccc(O)c(O)c1,ALOX12,P18054,1,pubchem
3,COc1ccc(NC(=O)c2ccc(cc2)-c2ccc(cc2C)-c2noc(C)n...,HTR1D,P28221,1,pubchem
4,NCCc1c[nH]c2ccc(O)cc12,HTR1A,P08908,1,pubchem
...,...,...,...,...,...
18495,COc1ccc(cc1COc1c(C)cccc1C)C1Nc2ccccc2C(=O)N1Cc...,TSHR,P16473,1,LINCS2020
18496,COc1ccc(cc1)S(=O)(=O)N(CCO)c2ccccc2CN(C)C/C=C/...,CAMK2A,Q9UQM7,1,LINCS2020
18497,COc1ccc(cc1)C(=C2C(=O)Nc3ccccc23)c4ccc(OC)cc4,CAMK2A,Q9UQM7,1,LINCS2020
18498,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,GNRHR,P30968,1,LINCS2020


In [11]:
# For each cell line: count annotated compounds per target, keep the 20 most
# frequent targets, and outer-join the counts into one wide table (one column per cell line).
df_target_for_cid = pd.DataFrame(columns=['target'])
for selected_cid in cell_ls:
    is_selected_cell = df_LINCS['cid'] == selected_cid
    LINCS_smiles_ls_for_cid = sorted(set(df_LINCS.loc[is_selected_cell, 'canonical_smiles']))
    has_profile = target_df['canonical_smiles'].isin(LINCS_smiles_ls_for_cid)
    target_df_for_cid = target_df[has_profile]
    df_tmp = target_df_for_cid.value_counts('target').reset_index().head(20)
    df_tmp.columns = ['target', selected_cid]
    # The only shared column is 'target', so it is the implicit join key.
    df_target_for_cid = df_target_for_cid.merge(df_tmp, how='outer')
df_target_for_cid

Unnamed: 0,target,A375,HA1E,HELA,HT29,MCF7,PC3,YAPC
0,HTR2A,88.0,137.0,64.0,76.0,158.0,162.0,60.0
1,DRD2,78.0,130.0,55.0,66.0,142.0,144.0,49.0
2,ADRA2A,76.0,106.0,51.0,70.0,128.0,131.0,47.0
3,KCNH2,72.0,89.0,58.0,53.0,104.0,113.0,44.0
4,DRD3,72.0,115.0,49.0,63.0,125.0,126.0,45.0
5,HTR2B,71.0,111.0,56.0,61.0,120.0,127.0,47.0
6,HTR2C,69.0,119.0,52.0,60.0,133.0,136.0,45.0
7,ADRA2B,64.0,87.0,45.0,56.0,103.0,106.0,40.0
8,SLC6A4,62.0,87.0,51.0,53.0,110.0,114.0,41.0
9,CYP2D6,60.0,68.0,44.0,46.0,82.0,89.0,35.0


### DLEPS

In [12]:
# DLEPS baseline predictions on the external test set.
dleps_csv = '../results/baseline/DLEPS/smiles_split/result_test_external_seed343.csv'
df_DLEPS_external = pd.read_csv(dleps_csv)
# Everything after the first column is used as the predicted profile
# (first column presumably an identifier — TODO confirm).
dleps_profile_columns = df_DLEPS_external.iloc[:, 1:]
DLEPS_pred_array = np.array(dleps_profile_columns)
print(DLEPS_pred_array.shape, type(DLEPS_pred_array))
DLEPS_pred_array

(25126, 978) <class 'numpy.ndarray'>


array([[-0.49215972,  0.54963338,  0.29133952, ..., -0.84925705,
        -0.00142619,  0.4664081 ],
       [-0.84851676,  0.55724311,  0.34684628, ..., -1.13050663,
         0.01257842,  0.44271934],
       [ 1.24347758,  0.93602711, -0.12212732, ...,  1.16335428,
        -0.12323196,  0.72062916],
       ...,
       [-0.11247773,  0.64319599,  0.21654259, ..., -0.27496064,
        -0.04910432,  0.52765197],
       [-0.75984836,  0.33571479,  0.32759017, ..., -1.07286894,
         0.01936504,  0.28974941],
       [ 0.434286  ,  0.51672924, -0.01642013, ...,  0.38941333,
        -0.04644619,  0.39385477]])

### DeepCE

In [13]:
# DeepCE baseline predictions on the external test set.
deepce_csv = '../results/baseline/DeepCE/smiles_split/result_test_external_seed343.csv'
df_DeepCE_external = pd.read_csv(deepce_csv)
# Everything after the first five columns is used as the predicted profile
# (leading columns presumably metadata — TODO confirm).
deepce_profile_columns = df_DeepCE_external.iloc[:, 5:]
DeepCE_pred_array = np.array(deepce_profile_columns)
print(DeepCE_pred_array.shape, type(DeepCE_pred_array))
DeepCE_pred_array

(25126, 978) <class 'numpy.ndarray'>


array([[-0.97417456,  0.94665819, -0.09210257, ..., -0.98393347,
        -0.0360845 ,  0.32205338],
       [ 0.82055187,  0.93450588,  0.58601191, ..., -0.347447  ,
         0.1559524 ,  1.15640839],
       [ 0.94290865,  1.22360912, -0.06593236, ...,  0.29021719,
         0.01664902,  1.11445657],
       ...,
       [ 0.93245108,  0.92178554,  0.21250249, ...,  2.34908056,
        -0.33434472,  0.53529965],
       [-1.32865398, -0.10063478,  0.14201809, ..., -1.04868236,
        -0.20480477, -0.23018641],
       [ 1.06971797,  1.37444122, -0.40474831, ...,  2.15661409,
        -0.39273902,  0.52712886]])

### CIGER

In [14]:
# CIGER baseline predictions on the external test set.
ciger_csv = '../results/baseline/CIGER/smiles_split/result_test_external_seed343_real.csv'
df_CIGER_external = pd.read_csv(ciger_csv)
# Everything after the first five columns is used as the predicted profile
# (leading columns presumably metadata — TODO confirm).
ciger_profile_columns = df_CIGER_external.iloc[:, 5:]
CIGER_pred_array = np.array(ciger_profile_columns)
print(CIGER_pred_array.shape, type(CIGER_pred_array))
CIGER_pred_array

(25126, 978) <class 'numpy.ndarray'>


array([[-2.19572544,  1.5308708 ,  0.03873972, ...,  0.38758346,
         0.76600462, -1.19404328],
       [ 5.49974155,  3.44760108,  1.09678066, ...,  6.2509346 ,
         0.36355799,  2.71384668],
       [ 1.54190314,  2.68659329,  0.48239705, ...,  0.05228177,
         0.44299695,  2.21594024],
       ...,
       [ 3.55539727,  1.89573467,  0.69547939, ...,  5.47067928,
         0.85276121,  3.90251541],
       [-0.86126018,  1.45067167, -0.09999803, ...,  0.05743099,
        -0.59741366,  0.84147489],
       [ 2.24885273,  5.6389184 , -1.67175472, ...,  6.15553093,
         1.96859515,  1.8116442 ]])

## perturbational representations-based model 

In [13]:
# Target under study and its pre-computed train/test split.
target = 'HTR2A'
screening_dir = '../results/4.Ligand_based_virtual_screening/{}/'.format(target)
df_data = pd.read_csv(screening_dir + 'data.csv')

# Separate compounds and labels using the 'split' column of data.csv.
train_rows = df_data[df_data['split'] == 'train']
test_rows = df_data[df_data['split'] == 'test']
smiles_train = list(train_rows['canonical_smiles'])
smiles_test = list(test_rows['canonical_smiles'])
label_train = np.array(list(train_rows['label']))
label_test = np.array(list(test_rows['label']))
smi_idx_train = [smi2idx[smi] for smi in smiles_train]
smi_idx_test = [smi2idx[smi] for smi in smiles_test]

# Tuned random-forest hyper-parameters, looked up later by (data, target, cid).
df_grid_search_results = pd.read_csv(screening_dir + 'grid_search_param.csv')

# Accumulators filled by the model-fitting code below.
df_metrics = pd.DataFrame(columns=['AUROC', 'AUPR', 'data', 'target', 'cid', 'random_seed'])
predict_result = defaultdict(list)
# Fixed seeds, kept literal so repeated runs are reproducible
# (originally drawn as five values of random.randint(0, 1000000)).
random_seed_ls = [808431, 510395, 584403, 630680, 532783]

# Per-cell-line random-forest screening on every perturbational representation.
#
# The TranSiGen feature block of each cell line is also collected and, after the
# loop, concatenated column-wise into the early-fusion feature matrices
# (data_train_concat / data_test_concat). They are built from fresh lists so the
# result never depends on variables left over from an earlier run of this cell.
transigen_train_blocks, transigen_test_blocks = [], []
metrics_frames = []
n_test = len(label_test)

for selected_cid in cell_ls:
    print(selected_cid)
    # Row of each compound within this cell line; the first match is kept,
    # which mirrors taking `.index[0]` of the filtered frame.
    rows_in_cell = df_LINCS[df_LINCS['cid'] == selected_cid]
    row_of_smi = {}
    for row, smi in zip(rows_in_cell.index, rows_in_cell['canonical_smiles']):
        row_of_smi.setdefault(smi, row)
    index_train = [row_of_smi[smi] for smi in smiles_train]   # KeyError if a compound has no profile here
    index_test = [row_of_smi[smi] for smi in smiles_test]
    print('TRAIN:', len(index_train), 'TEST:', len(index_test))

    for data_name, data_sub in zip([r'$\mathit{\Delta}X$', r'$\mathit{\Delta}\widehat{X}$', 'TranSiGen', 'CIGER', 'DeepCE', 'DLEPS'],
                                   [DEG_array, DEG_rec_array, DEG_pred_array, CIGER_pred_array, DeepCE_pred_array, DLEPS_pred_array]):

        # Tuned hyper-parameters for this (representation, target, cell line); first matching row.
        best_params = df_grid_search_results[(df_grid_search_results['data'] == data_name) & (df_grid_search_results['target'] == target)
                                             & (df_grid_search_results['cid'] == selected_cid)].reset_index(drop=True).to_dict(orient='index')[0]
        metrics_dict = defaultdict(list)
        data_train, data_test = data_sub[index_train], data_sub[index_test]

        ## early fusion feature
        if data_name == 'TranSiGen':
            transigen_train_blocks.append(data_train)
            transigen_test_blocks.append(data_test)

        for run_idx, run_random_seed in enumerate(random_seed_ls):
            # 'sqrt' is what the removed alias max_features='auto' meant for classifiers.
            clf = ensemble.RandomForestClassifier(max_depth=best_params['max_depth'],
                                                  n_estimators=best_params['n_estimators'],
                                                  max_features='sqrt',
                                                  criterion=best_params['criterion'],
                                                  oob_score=best_params['oob_score'],
                                                  random_state=run_random_seed)
            clf = clf.fit(data_train, label_train)
            data_test_pred = clf.predict_proba(data_test)
            if data_name == 'TranSiGen':
                # Bookkeeping shared by all representations is stored once per
                # (cell line, seed) run, alongside the TranSiGen predictions.
                predict_result['label'] += [list(label_test)]
                predict_result['cp_id'] += [list(smi_idx_test)]
                predict_result['cid'] += [[selected_cid] * n_test]
                predict_result['target'] += [[target] * n_test]
                predict_result['random_seed'] += [[run_random_seed] * n_test]
                predict_result['idx'] += [[run_idx] * n_test]

            # Column 1 of predict_proba is the probability of the second class in clf.classes_.
            predict_result[data_name] += [list(data_test_pred[:, 1])]
            metrics_dict['AUROC'] += [roc_auc_score(label_test, data_test_pred[:, 1])]
            precision, recall, _thresholds = precision_recall_curve(label_test, data_test_pred[:, 1])
            metrics_dict['AUPR'] += [auc(recall, precision)]
            metrics_dict['random_seed'] += [run_random_seed]

        df_metrics_tmp = pd.DataFrame.from_dict(metrics_dict)
        df_metrics_tmp['data'] = data_name
        df_metrics_tmp['cid'] = selected_cid
        df_metrics_tmp['target'] = target
        metrics_frames.append(df_metrics_tmp)

# One block per cell line, joined along the feature axis.
data_train_concat = np.concatenate(transigen_train_blocks, axis=1)
data_test_concat = np.concatenate(transigen_test_blocks, axis=1)
# Append all per-run metric tables in a single concat instead of once per iteration.
df_metrics = pd.concat([df_metrics] + metrics_frames)

print('=========late fusion========')
# Late fusion: for every seed, average the per-cell-line predicted probabilities
# of the same representation and score the averaged prediction.
# predict_result[data_name] is laid out cell line by cell line, with one entry
# per seed inside each cell line, hence the index cid_idx * n_seeds + idx.
n_seeds = len(random_seed_ls)
for data_name in [r'$\mathit{\Delta}X$', r'$\mathit{\Delta}\widehat{X}$', 'TranSiGen']:
    metrics_dict = defaultdict(list)
    for idx in range(n_seeds):
        per_cell_preds = [predict_result[data_name][cid_idx * n_seeds + idx]
                          for cid_idx in range(len(cell_ls))]
        pred_ensemble = np.mean(np.array(per_cell_preds), axis=0)
        # Every run stores the same test labels, so any entry for this seed works.
        label_test = predict_result['label'][idx]

        predict_result[data_name + ' (late fusion)'] += [list(pred_ensemble)]
        metrics_dict['AUROC'] += [roc_auc_score(label_test, pred_ensemble)]
        precision, recall, _thresholds = precision_recall_curve(label_test, pred_ensemble)
        metrics_dict['AUPR'] += [auc(recall, precision)]

    df_metrics_tmp = pd.DataFrame.from_dict(metrics_dict)
    df_metrics_tmp['data'] = data_name + ' (late fusion)'
    df_metrics_tmp['cid'] = 'all'
    df_metrics_tmp['target'] = target
    df_metrics = pd.concat([df_metrics, df_metrics_tmp])

print('=========early fusion========')
# Early fusion: a single random forest trained on the TranSiGen features of all
# cell lines concatenated column-wise (data_train_concat / data_test_concat).
data_name = 'TranSiGen (early fusion)'
selected_cid = 'all'
metrics_dict = defaultdict(list)
best_params = df_grid_search_results[(df_grid_search_results['data'] == data_name) & (df_grid_search_results['target'] == target)
                                     & (df_grid_search_results['cid'] == selected_cid)].reset_index(drop=True).to_dict(orient='index')[0]
for run_random_seed in random_seed_ls:
    # 'sqrt' is what the removed alias max_features='auto' meant for classifiers.
    clf = ensemble.RandomForestClassifier(max_depth=best_params['max_depth'],
                                          n_estimators=best_params['n_estimators'],
                                          max_features='sqrt',
                                          criterion=best_params['criterion'],
                                          oob_score=best_params['oob_score'],
                                          random_state=run_random_seed)

    clf = clf.fit(data_train_concat, label_train)
    data_test_pred = clf.predict_proba(data_test_concat)
    predict_result[data_name] += [list(data_test_pred[:, 1])]
    metrics_dict['AUROC'] += [roc_auc_score(label_test, data_test_pred[:, 1])]
    precision, recall, _thresholds = precision_recall_curve(label_test, data_test_pred[:, 1])
    metrics_dict['AUPR'] += [auc(recall, precision)]
    metrics_dict['random_seed'] += [run_random_seed]

df_metrics_tmp = pd.DataFrame.from_dict(metrics_dict)
df_metrics_tmp['data'] = data_name
df_metrics_tmp['cid'] = selected_cid
df_metrics_tmp['target'] = target
df_metrics = pd.concat([df_metrics, df_metrics_tmp])

# Drop the fused matrices so a later re-run cannot pick up stale features.
del data_train_concat, data_test_concat

A375
TRAIN: 197 TEST: 49
HA1E
TRAIN: 197 TEST: 49
HELA
TRAIN: 197 TEST: 49
HT29
TRAIN: 197 TEST: 49
MCF7
TRAIN: 197 TEST: 49
PC3
TRAIN: 197 TEST: 49
YAPC
TRAIN: 197 TEST: 49


Unnamed: 0_level_0,Unnamed: 1_level_0,AUROC,AUROC,AUPR,AUPR
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
target,data,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
HTR2A,$\mathit{\Delta}X$,0.678,0.103,0.384,0.092
HTR2A,$\mathit{\Delta}X$ (late fusion),0.807,,0.499,
HTR2A,$\mathit{\Delta}\widehat{X}$,0.706,0.077,0.368,0.073
HTR2A,$\mathit{\Delta}\widehat{X}$ (late fusion),0.812,,0.541,
HTR2A,CIGER,0.614,0.086,0.397,0.105
HTR2A,DLEPS,0.566,0.0,0.265,0.0
HTR2A,DeepCE,0.527,0.041,0.225,0.03
HTR2A,TranSiGen,0.889,0.041,0.731,0.092
HTR2A,TranSiGen (early fusion),0.927,,0.768,
HTR2A,TranSiGen (late fusion),0.938,,0.785,


## structure representation-based model (ECFP4, KPGT)

In [14]:
# Structure-based baselines: random forests on ECFP4 bits and KPGT embeddings.
selected_cid = 'all'
structure_data_names = ['ECFP4', 'KPGT']

# Structure features depend only on the molecule, not on the cell line, so the
# first profile row of each compound is used. Computing the rows here removes
# the dependence on index_train/index_test left over from the per-cell-line loop.
row_of_smi = {}
for row, smi in enumerate(df_LINCS['canonical_smiles']):
    row_of_smi.setdefault(smi, row)
struct_index_train = [row_of_smi[smi] for smi in smiles_train]
struct_index_test = [row_of_smi[smi] for smi in smiles_test]

# Make the cell re-runnable: discard results a previous execution of it added.
df_metrics = df_metrics[~df_metrics['data'].isin(structure_data_names)]
for data_name in structure_data_names:
    predict_result[data_name] = []

for data_name, data_sub in zip(structure_data_names, [ECFP_array, KPGT_array]):
    best_params = df_grid_search_results[(df_grid_search_results['data'] == data_name) & (df_grid_search_results['target'] == target)
                                         & (df_grid_search_results['cid'] == selected_cid)].reset_index(drop=True).to_dict(orient='index')[0]
    metrics_dict = defaultdict(list)
    data_train, data_test = data_sub[struct_index_train], data_sub[struct_index_test]
    for run_random_seed in random_seed_ls:
        # 'sqrt' is what the removed alias max_features='auto' meant for classifiers.
        clf = ensemble.RandomForestClassifier(max_depth=best_params['max_depth'],
                                              n_estimators=best_params['n_estimators'],
                                              max_features='sqrt',
                                              criterion=best_params['criterion'],
                                              oob_score=best_params['oob_score'],
                                              random_state=run_random_seed)
        clf = clf.fit(data_train, label_train)
        data_test_pred = clf.predict_proba(data_test)

        predict_result[data_name] += [list(data_test_pred[:, 1])]
        metrics_dict['AUROC'] += [roc_auc_score(label_test, data_test_pred[:, 1])]
        precision, recall, _thresholds = precision_recall_curve(label_test, data_test_pred[:, 1])
        metrics_dict['AUPR'] += [auc(recall, precision)]
        metrics_dict['random_seed'] += [run_random_seed]

    df_metrics_tmp = pd.DataFrame.from_dict(metrics_dict)
    df_metrics_tmp['data'] = data_name
    df_metrics_tmp['cid'] = selected_cid
    df_metrics_tmp['target'] = target
    df_metrics = pd.concat([df_metrics, df_metrics_tmp])

In [17]:
# Summary table: average the runs within each (data, target, cid), then report
# mean and std across cell lines for each (target, data).
# The metric columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns (here the string column 'cid') during aggregation and raises instead.
metric_cols = ['AUROC', 'AUPR']
df_metrics_kfold_mean = (df_metrics.astype({col: float for col in metric_cols})
                         .groupby(['data', 'target', 'cid'])[metric_cols].mean().reset_index())
df_summary = df_metrics_kfold_mean.groupby(['target', 'data'])[metric_cols].agg(func=['mean', 'std'])

In [16]:
# Persist the raw per-run predictions (pickle) and the per-run metric table (CSV) for this target.
prediction_path = '../results/4.Ligand_based_virtual_screening/{}/prediction.pkl'.format(target)
results_path = '../results/4.Ligand_based_virtual_screening/{}/results.csv'.format(target)

with open(prediction_path, 'wb') as f:
    pickle.dump(predict_result, f)
df_metrics.to_csv(results_path, index=False)

In [18]:
# Compare fusion strategies against the plain per-cell-line average ("7 cells").
# Labels must match the 'data' values written when the metrics were computed,
# i.e. '<name> (late fusion)' with a space before the parenthesis.
model_order = [r'$\mathit{\Delta}X$ (early fusion)', r'$\mathit{\Delta}X$ (late fusion)',
               r'$\mathit{\Delta}\widehat{X}$ (early fusion)', r'$\mathit{\Delta}\widehat{X}$ (late fusion)',
               'TranSiGen (early fusion)', 'TranSiGen (late fusion)',
              ]
df_metrics_tmp = df_metrics[(df_metrics['data'].isin(model_order))]

# Per-seed mean over the cell lines for the single-cell-line models.
# Metric columns are selected explicitly because pandas >= 2.0 raises when asked
# to average the string column 'cid'.
single_cell_names = [r'$\mathit{\Delta}X$', r'$\mathit{\Delta}\widehat{X}$', 'TranSiGen']
df_metrics_funsion = (df_metrics[df_metrics['data'].isin(single_cell_names)]
                      .astype({'AUROC': float, 'AUPR': float})
                      .groupby(['random_seed', 'data', 'target'])[['AUROC', 'AUPR']].mean().reset_index())
df_metrics_funsion['data'] = df_metrics_funsion['data'] + ' (7 cells)'
df_metrics_funsion['cid'] = 'all'
# Same column order as the fusion rows before stacking the two tables.
df_metrics_funsion = df_metrics_funsion[df_metrics_tmp.columns]
df_metrics_funsion = pd.concat([df_metrics_funsion, df_metrics_tmp])
df_metrics_funsion

Unnamed: 0,AUROC,AUPR,data,target,cid,random_seed
0,0.663919,0.385169,$\mathit{\Delta}X$ (7 cells),HTR2A,all,510395.0
1,0.694872,0.352182,$\mathit{\Delta}\widehat{X}$ (7 cells),HTR2A,all,510395.0
2,0.887546,0.734065,TranSiGen (7 cells),HTR2A,all,510395.0
3,0.678571,0.3676,$\mathit{\Delta}X$ (7 cells),HTR2A,all,532783.0
4,0.706593,0.356286,$\mathit{\Delta}\widehat{X}$ (7 cells),HTR2A,all,532783.0
5,0.882967,0.722346,TranSiGen (7 cells),HTR2A,all,532783.0
6,0.681319,0.379109,$\mathit{\Delta}X$ (7 cells),HTR2A,all,584403.0
7,0.71978,0.383251,$\mathit{\Delta}\widehat{X}$ (7 cells),HTR2A,all,584403.0
8,0.887546,0.724046,TranSiGen (7 cells),HTR2A,all,584403.0
9,0.695055,0.411114,$\mathit{\Delta}X$ (7 cells),HTR2A,all,630680.0


In [20]:
# For every test compound: its maximum ECFP4 Tanimoto similarity to the training
# set, next to the predictions of each model, stacked over the repeated runs.
sim_result_columns = ['label', 'cp_id', 'ECFP_max_similarity', 'idx',
                      'ECFP4', 'KPGT', 'TranSiGen (early fusion)', 'TranSiGen (late fusion)']

# The train/test split is the same for every run, so fingerprints and
# similarities are computed once instead of once per run.
train_ECFP_array = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), radius=2, nBits=2048)
                    for smi in smiles_train]
test_ECFP_array = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), radius=2, nBits=2048)
                   for smi in smiles_test]
max_ECFP_sims_in_train = [max(DataStructs.BulkTanimotoSimilarity(test_fp, train_ECFP_array))
                          for test_fp in test_ECFP_array]

# One frame per seed; the first len(random_seed_ls) entries of predict_result
# are the runs of the first cell line, whose 'idx' field holds the run number.
sim_and_result_frames = []
for te_fold_nums in range(len(random_seed_ls)):
    df_sim_and_result = pd.DataFrame(predict_result['label'][te_fold_nums], columns=['label'])
    df_sim_and_result['cp_id'] = predict_result['cp_id'][te_fold_nums]
    df_sim_and_result['ECFP_max_similarity'] = max_ECFP_sims_in_train
    df_sim_and_result['idx'] = predict_result['idx'][te_fold_nums]

    for data_name in ['ECFP4', 'KPGT', 'TranSiGen (early fusion)', 'TranSiGen (late fusion)']:
        df_sim_and_result[data_name] = predict_result[data_name][te_fold_nums]
    sim_and_result_frames.append(df_sim_and_result)

# Single concat (row indices of the per-run frames are kept, as before).
df_sim_and_result_all = pd.concat(sim_and_result_frames)[sim_result_columns]
df_sim_and_result_all['label'] = df_sim_and_result_all['label'].astype('float64')

# Bin test compounds by maximum similarity to the training set: (0, 0.3] and (0.3, 1].
# NOTE: bins are left-open, so a similarity of exactly 0 falls outside both bins.
threshold_range = [0, 0.3, 1]
cuts = pd.cut(df_sim_and_result_all['ECFP_max_similarity'], bins=threshold_range)
df_sim_and_result_all['ECFP_max_similarity_threshold'] = cuts

# Short labels used in the result table for the two fusion models.
short_model_names = {'TranSiGen (early fusion)': 'TranSiGen_EF',
                     'TranSiGen (late fusion)': 'TranSiGen_LF'}

df_result_by_sim_threshold = pd.DataFrame(columns=['ECFP_max_similarity_threshold', 'model', 'fold', 'AUROC', 'AUPR', 'active', 'inactive', 'count'])
# AUROC / AUPR of every model for each (run, similarity bin) group.
for (idx, threshold), group in df_sim_and_result_all.groupby(by=['idx', 'ECFP_max_similarity_threshold']):
    n_active = group[group['label'] == 1].shape[0]
    n_inactive = group[group['label'] == 0].shape[0]
    print(threshold, 'active:', n_active, 'inactive:', n_inactive,)
    for data_name in ['TranSiGen (early fusion)', 'TranSiGen (late fusion)', 'ECFP4', 'KPGT']:
        AUROC = roc_auc_score(group['label'], group[data_name])
        precision, recall, _thresholds = precision_recall_curve(group['label'], group[data_name])
        AUPR = auc(recall, precision)
        model_label = short_model_names.get(data_name, data_name)
        next_row = df_result_by_sim_threshold.shape[0]
        df_result_by_sim_threshold.loc[next_row, :] = [threshold, model_label, idx, AUROC, AUPR,
                                                       n_active, n_inactive, group.shape[0]]
df_result_by_sim_threshold

(0.0, 0.3] active: 4 inactive: 28
(0.3, 1.0] active: 6 inactive: 11
(0.0, 0.3] active: 4 inactive: 28
(0.3, 1.0] active: 6 inactive: 11
(0.0, 0.3] active: 4 inactive: 28
(0.3, 1.0] active: 6 inactive: 11
(0.0, 0.3] active: 4 inactive: 28
(0.3, 1.0] active: 6 inactive: 11
(0.0, 0.3] active: 4 inactive: 28
(0.3, 1.0] active: 6 inactive: 11


Unnamed: 0,ECFP_max_similarity_threshold,model,fold,AUROC,AUPR,active,inactive,count
0,"(0.0, 0.3]",TranSiGen_EF,0,0.964286,0.795833,4,28,32
1,"(0.0, 0.3]",TranSiGen_LF,0,0.991071,0.94375,4,28,32
2,"(0.0, 0.3]",ECFP4,0,0.915179,0.56875,4,28,32
3,"(0.0, 0.3]",KPGT,0,0.866071,0.639583,4,28,32
4,"(0.3, 1.0]",TranSiGen_EF,0,0.909091,0.878042,6,11,17
5,"(0.3, 1.0]",TranSiGen_LF,0,0.954545,0.906151,6,11,17
6,"(0.3, 1.0]",ECFP4,0,0.924242,0.788095,6,11,17
7,"(0.3, 1.0]",KPGT,0,0.984848,0.974206,6,11,17
8,"(0.0, 0.3]",TranSiGen_EF,1,0.982143,0.908333,4,28,32
9,"(0.0, 0.3]",TranSiGen_LF,1,0.982143,0.870833,4,28,32
