In [1]:
from dataset import TranSiGenDataset_screening
from utils import *

from cmapPy.pandasGEXpress.parse import parse
import os
import pickle
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import roc_curve, auc


from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

import matplotlib.pyplot as plt
import seaborn as sns
from statannotations.Annotator import Annotator

In [2]:
# Global plotting style: seaborn "ticks" theme, Arial font, black text and tick labels.
sns.set_style(
    style='ticks',
    rc={
        'font.sans-serif': ['Arial'],
        'text.color': 'black',
        'xtick.color': 'black',
        'ytick.color': 'black',
    },
)
# Base font size for every matplotlib figure drawn after this cell.
plt.rcParams['font.size'] = 24
# The "Paired" palette is kept in `colors` for reuse.
colors = sns.color_palette('Paired')

### Gene information and inference weights

In [3]:
# Gene annotation table; its `pr_gene_symbol` column labels the expression
# matrices built in the scoring cells below.
df_gene = pd.read_csv('../data/LINCS2020/geneinfo_processed.csv')

# `predict_profile` reads the global `infer_weight_df`. Its definition used to be
# commented out, so a fresh-kernel run without cached prediction files failed
# with a NameError. The weights are loaded whenever the file is available; when
# it is not, the name is still defined (as None) so the notebook can run from
# cached predictions alone.
weight_path = '../data/LINCS2020/infer_weight.gctx'
if os.path.exists(weight_path):
    df_landmark_gene = df_gene[(df_gene['pr_is_bing'] == 1) & (df_gene['pr_is_lm'] == 1)]
    df_best_infer_gene = df_gene[(df_gene['pr_is_bing'] == 1) & (df_gene['pr_is_lm'] == 0)]
    landmark_ids = df_landmark_gene['pr_id'].tolist()
    best_infer_ids = df_best_infer_gene['pr_id'].tolist()

    infer_weight = parse(weight_path, cid=['OFFSET'] + landmark_ids, rid=best_infer_ids)
    infer_weight_df_tmp = infer_weight.data_df
    # Fix the column order (OFFSET first, then landmark genes) and the row order
    # (best-inferred genes) so they line up with the arrays used in predict_profile.
    infer_weight_df = infer_weight_df_tmp[['OFFSET'] + landmark_ids]
    infer_weight_df = infer_weight_df.loc[best_infer_ids]
else:
    infer_weight_df = None
    print('infer_weight.gctx not found: only cached prediction profiles can be used')

In [17]:
# Pickled dictionary that predict_profile indexes by cell id (`selected_cid`).
with open('../data/LINCS2020/modz_x1.pickle', mode='rb') as f:
    dict_modz_x1_all_cid = pickle.load(f)

# NOTE(review): `torch` is never imported explicitly in this notebook; it is
# presumably re-exported by `from utils import *` -- confirm.
filename = '../results/trained_models_164_cell_smiles_split/364039/feature_KPGT_init_pretrain_shRNA/best_model.pt'
if torch.cuda.is_available():
    dev = torch.device('cuda:0')
else:
    dev = torch.device('cpu')

# Deserialize onto the CPU first, then move the whole model to the chosen device.
model = torch.load(filename, map_location='cpu')
model.dev = torch.device(dev)
model.to(dev)
print(model)

def predict_profile(df_screening, selected_cid, pred_seed):
    """Predict post-treatment profiles for every compound in ``df_screening``.

    Relies on notebook globals: ``model``, ``smi2emb``, ``dict_modz_x1_all_cid``
    and ``infer_weight_df``.

    Parameters
    ----------
    df_screening : pandas.DataFrame
        Must provide the columns ``canonical_smiles`` (looked up in ``smi2emb``)
        and ``cp_id``.
    selected_cid : str
        Key into ``dict_modz_x1_all_cid``; also used as the suffix of each ``sig``.
    pred_seed : int
        Seed passed to ``setup_seed`` right before the model is run.

    Returns
    -------
    dict
        Keys ``x1``, ``x2_pred``, ``cp_id``, ``cid``, ``sig`` plus ``x1_inferred``
        and ``x2_pred_inferred`` (the input array concatenated column-wise with
        the linearly inferred genes).

    Raises
    ------
    RuntimeError
        If ``infer_weight_df`` has not been loaded in this kernel.
    KeyError
        If a SMILES is missing from ``smi2emb`` or ``selected_cid`` is missing
        from ``dict_modz_x1_all_cid``.
    """
    # Fail early with an actionable message instead of an opaque error after the
    # (expensive) model forward pass.
    weights = globals().get('infer_weight_df')
    if weights is None:
        raise RuntimeError(
            "infer_weight_df is not loaded; run the cell that parses "
            "'../data/LINCS2020/infer_weight.gctx' before calling predict_profile"
        )

    emb_array = np.array([smi2emb[smi] for smi in df_screening['canonical_smiles']])
    smi_idx_array = df_screening['cp_id'].to_numpy()
    n_compounds = emb_array.shape[0]
    cid_array = np.array([selected_cid] * n_compounds)
    # The same x1 entry is repeated once per compound (repeat along axis 0).
    x1_array = np.repeat(dict_modz_x1_all_cid[selected_cid], n_compounds, axis=0).astype(np.float32)

    test = TranSiGenDataset_screening(x1=x1_array, mol_feature=emb_array, mol_id=smi_idx_array, cid=cid_array)
    test_loader = torch.utils.data.DataLoader(dataset=test, batch_size=64, shuffle=False, drop_last=False, num_workers=4, worker_init_fn=seed_worker)

    setup_seed(pred_seed)
    x2_pred_array, cp_id_array, cid_array = model.predict_profile_for_x1(test_loader)
    ddict_data = dict()
    ddict_data['x1'] = x1_array
    ddict_data['x2_pred'] = x2_pred_array
    ddict_data['cp_id'] = cp_id_array
    ddict_data['cid'] = cid_array

    # Signature id = "<cp_id>_<cid>", built from the arrays returned by the model.
    ddict_data['sig'] = np.array([
        str(int(cp_id)) + '_' + cid
        for cp_id, cid in zip(ddict_data['cp_id'], ddict_data['cid'])
    ])

    ## infer gene
    # Loop-invariant pieces of the linear inference: inferred = x @ W.T + offset.
    # OFFSET is selected by label rather than by assuming it is the first column.
    weight_matrix = np.array(weights.drop(columns='OFFSET').T)
    offset = np.array(weights['OFFSET'])
    for data_type in ['x1', 'x2_pred']:
        x = ddict_data[data_type]
        inferred = np.dot(x, weight_matrix) + offset
        ddict_data['{}_inferred'.format(data_type)] = np.concatenate((x, inferred), axis=1)

    # The scoring cells look for this cache file before calling predict_profile;
    # uncomment to write it.
    # save_to_HDF('../results/6.Phenotype_based_drug_repurposing/prediction_profile_{}_{}.h5'.format(selected_cid, pred_seed), ddict_data)

    return ddict_data

TranSiGen(
  (encoder_x2): Sequential(
    (0): Linear(in_features=978, out_features=1200, bias=True)
    (1): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.1, inplace=False)
  )
  (encoder_x1): Sequential(
    (0): Linear(in_features=978, out_features=1200, bias=True)
    (1): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.1, inplace=False)
  )
  (mu_z2): Sequential(
    (0): Linear(in_features=1200, out_features=100, bias=True)
  )
  (logvar_z2): Sequential(
    (0): Linear(in_features=1200, out_features=100, bias=True)
  )
  (mu_z1): Sequential(
    (0): Linear(in_features=1200, out_features=100, bias=True)
  )
  (logvar_z1): Sequential(
    (0): Linear(in_features=1200, out_features=100, bias=True)
  )
  (decoder_x2): Sequential(
    (0): Linear(in_features=100, out_features=800, bias=True)
    (1): BatchNorm1d(800, eps=1e-05, momentum=0.1



## Screening data

In [4]:
# Screening library: one row per compound (SMILES, cp_id, PRISM AUC, name, MoA, target).
df_PRISM_external = pd.read_csv(filepath_or_buffer='../data/PRISM/screening_compound.csv')
df_PRISM_external

Unnamed: 0,canonical_smiles,cp_id,PRISM_auc,name,moa,target
0,CC(NC(=O)C1CSCN1C(=O)c1ccccc1)c1ccccc1,0,-1.000000,RS-0481,immunostimulant,
1,C/C=C1\C(OC2OC(CO)C(O)C(O)C2O)OC=C(C(=O)OC)C1C...,1,-1.000000,oleuropein,estrogen receptor agonist,GPER1
2,CCC(C)C(N)C(=O)O,2,-1.000000,isoleucine,,"ACADSB, BCAT1, BCAT2, IARS, IARS2"
3,CC(N)Cc1cccc(O)c1,3,-1.000000,gepefrine,adrenergic receptor agonist,
4,CC(C)(C)NCC(O)COc1cc(Cl)ccc1Cl,4,-1.000000,cloranolol,adrenergic receptor antagonist,"ADRB1, ADRB2, ADRB3"
...,...,...,...,...,...,...
1620,O=[Si]=O,1620,-1.000000,talc,,
1621,CC1=C(C(=O)OCCN(Cc2ccccc2)c2ccccc2)C(c2cccc([N...,1621,0.941037,efonidipine-monoethanolate,"L-type calcium channel blocker, T-type calcium...","CACNA1C, CACNA1G"
1622,CC(C)(C)CC(C)(C)c1ccc(O)cc1,1622,-1.000000,tyloxapol,NFkB pathway inhibitor,"LPL, NFKB2"
1623,C=CCN,1623,1.097299,sevelamer,phosphate antagonist,


In [5]:
# SMILES -> molecular embedding, looked up per compound in predict_profile.
with open('../data/PRISM/KPGT_emb2304.pickle', mode='rb') as f:
    smi2emb = pickle.load(f)

# SMILES <-> compound index lookup tables.
with open('../data/PRISM/smi2idx.pickle', mode='rb') as f:
    smi2idx = pickle.load(f)
with open('../data/PRISM/idx2smi.pickle', mode='rb') as f:
    idx2smi = pickle.load(f)

# Both mappings should report the same number of entries.
print(len(smi2idx), len(idx2smi))

1625 1625


## TranSiGen(drug)

In [16]:
# Approved drugs (their `canonical_smiles` are used for the ECFP4 baseline) and
# their differential-expression table (one column per drug, indexed by the first CSV column).
df_approved = pd.read_csv(
    filepath_or_buffer='../results/6.Phenotype_based_drug_repurposing/approved_drugs.csv',
)
df_DEG_approved = pd.read_csv(
    filepath_or_buffer='../results/6.Phenotype_based_drug_repurposing/DEGs_approved_drugs.csv',
    index_col=0,
)

In [29]:
selected_cid = 'YAPC'
pred_seed_ls = [100, 1000, 10000]

# Predicted expression changes do not depend on the reference drug, so load (or
# predict) them once per seed instead of once per drug x seed.
expr_t_by_seed = {}
for pred_seed in pred_seed_ls:
    save_dir = '../results/6.Phenotype_based_drug_repurposing/prediction_profile_{}_{}.h5'.format(selected_cid, pred_seed)
    if os.path.exists(save_dir):
        print('loading.....')
        ddict_data = load_from_HDF(save_dir)
    else:
        ddict_data = predict_profile(df_PRISM_external, selected_cid, pred_seed)

    # Predicted change = predicted post-treatment profile minus the x1 profile.
    expr = ddict_data['x2_pred_inferred'] - ddict_data['x1_inferred']
    expr_df = pd.DataFrame(expr, columns=df_gene['pr_gene_symbol'], index=ddict_data['sig'])
    expr_t_by_seed[pred_seed] = expr_df.T

cs_df_approved_summary = pd.DataFrame(columns=['sig'])
for drug_name in df_DEG_approved.columns:
    ## GSEA
    # Sort a copy: the shared DEG table is no longer reordered as a side effect.
    deg_sorted = df_DEG_approved.sort_values(by=[drug_name], ascending=False)
    ups = deg_sorted[deg_sorted[drug_name] > 0].index.tolist()
    downs = deg_sorted[deg_sorted[drug_name] < 0].index.tolist()

    cs_frames = []
    for pred_seed in pred_seed_ls:
        # computecs receives its own copy so the cached matrix cannot be altered
        # between drugs (computecs is defined outside this notebook).
        cs_df_pred = computecs(ups, downs, expr_t_by_seed[pred_seed].copy()).reset_index()
        cs_df_pred.columns = ['sig', 'cs_pred']
        cs_frames.append(cs_df_pred)

    # Concatenate once per drug, then average the score of each signature over seeds.
    cs_df_approved = pd.concat(cs_frames, axis=0)
    cs_df_approved_summary_tmp = (
        cs_df_approved.groupby('sig').mean().reset_index()
        .sort_values('cs_pred', ascending=False, ignore_index=True)
        .rename(columns={'cs_pred': drug_name})
    )
    # how='right' keeps the row order of the most recently processed drug.
    cs_df_approved_summary = cs_df_approved_summary.merge(cs_df_approved_summary_tmp[['sig', drug_name]], on='sig', how='right')
cs_df_approved_summary

loading.....
loading.....
loading.....
loading.....
loading.....
loading.....
loading.....
loading.....
loading.....


Unnamed: 0,sig,Erlotinib,Olaparib,Gemcitabine
0,1507_YAPC,0.106045,0.038869,0.225969
1,1184_YAPC,0.097100,0.038508,0.212709
2,1580_YAPC,0.103580,0.097317,0.202382
3,1356_YAPC,0.039694,0.062592,0.202369
4,168_YAPC,0.054987,0.071948,0.200777
...,...,...,...,...
1620,1336_YAPC,0.190908,0.188762,-0.193232
1621,307_YAPC,0.318351,0.253276,-0.196776
1622,56_YAPC,0.418324,0.277315,-0.197615
1623,1096_YAPC,0.240011,0.197803,-0.198387


In [30]:
# Collapse the per-drug scores of each signature into one value:
#   - all drugs positive -> the maximum score
#   - all drugs negative -> the minimum score
#   - mixed signs        -> the mean score
# The number of reference drugs is taken from the DEG table instead of being
# hard-coded, so adding or removing a drug does not silently change the rule.
drug_columns = list(df_DEG_approved.columns)
n_drugs = len(drug_columns)
drug_scores = cs_df_approved_summary[drug_columns]

all_positive = (drug_scores > 0).sum(axis=1) == n_drugs
all_negative = (drug_scores < 0).sum(axis=1) == n_drugs
cs_pred_approved_ls = np.where(
    all_positive,
    drug_scores.max(axis=1),
    np.where(all_negative, drug_scores.min(axis=1), drug_scores.mean(axis=1)),
)

# NOTE: this overwrites cs_df_approved_summary (the per-drug columns are dropped),
# so the cell cannot be re-run without first re-running the scoring cell above.
cs_df_approved_summary = cs_df_approved_summary[['sig']].assign(cs_pred_approved_drug=cs_pred_approved_ls)
cs_df_approved_summary

Unnamed: 0,sig,cs_pred_approved_drug
0,1507_YAPC,0.225969
1,1184_YAPC,0.212709
2,1580_YAPC,0.202382
3,1356_YAPC,0.202369
4,168_YAPC,0.200777
...,...,...
1620,1336_YAPC,0.062146
1621,307_YAPC,0.124951
1622,56_YAPC,0.166008
1623,1096_YAPC,0.079809


## TranSiGen(disease)

In [31]:
# Disease signature: genes with |log2FoldChange| above 1.5 form the up / down sets.
df_disease = pd.read_csv(filepath_or_buffer='../results/6.Phenotype_based_drug_repurposing/DEGs_disease.csv')
ups = df_disease.loc[df_disease['log2FoldChange'] > 1.5, 'pr_gene_symbol'].tolist()
downs = df_disease.loc[df_disease['log2FoldChange'] < -1.5, 'pr_gene_symbol'].tolist()
print('ups gene:', len(ups), 'down gene:', len(downs))

ups gene: 293 down gene: 168


In [32]:
pred_seed_ls = [100, 1000, 10000]
selected_cid = 'YAPC'

# Collect one score table per seed and concatenate once at the end, rather than
# growing a frame by concatenating onto an initially empty DataFrame.
cs_frames = []
for pred_seed in pred_seed_ls:
    save_dir = '../results/6.Phenotype_based_drug_repurposing/prediction_profile_{}_{}.h5'.format(selected_cid, pred_seed)
    if os.path.exists(save_dir):
        print('loading.....')
        ddict_data = load_from_HDF(save_dir)
    else:
        ddict_data = predict_profile(df_PRISM_external, selected_cid, pred_seed)

    ## GSEA
    # Predicted change = predicted post-treatment profile minus the x1 profile.
    expr = ddict_data['x2_pred_inferred'] - ddict_data['x1_inferred']
    expr_df = pd.DataFrame(expr, columns=df_gene['pr_gene_symbol'], index=ddict_data['sig'])
    cs_df_pred = computecs(ups, downs, expr_df.T).reset_index()
    cs_df_pred.columns = ['sig', 'cs_pred']
    cs_frames.append(cs_df_pred)

# Average each signature's score over the seeds and rank from highest to lowest.
cs_df_disease_summary = (
    pd.concat(cs_frames, axis=0)
    .groupby('sig').mean().reset_index()
    .sort_values('cs_pred', ascending=False, ignore_index=True)
)
cs_df_disease_summary

loading.....
loading.....
loading.....


Unnamed: 0,sig,cs_pred
0,1089_YAPC,0.382168
1,892_YAPC,0.377509
2,39_YAPC,0.368387
3,1529_YAPC,0.360639
4,1085_YAPC,0.359700
...,...,...
1620,1172_YAPC,-0.288337
1621,775_YAPC,-0.318648
1622,637_YAPC,-0.323425
1623,1184_YAPC,-0.324841


In [33]:
# Join the disease-based and approved-drug-based scores on the signature id.
df_screening_result = pd.merge(cs_df_disease_summary, cs_df_approved_summary, on='sig', how='left')

# A signature id has the form "<compound index>_<cell id>".
sig_parts = [sig.split('_') for sig in df_screening_result['sig']]
df_screening_result['cid'] = [parts[1] for parts in sig_parts]
df_screening_result['canonical_smiles'] = [idx2smi[int(parts[0])] for parts in sig_parts]

# Attach the compound annotations (PRISM AUC, name, MoA, target) via the SMILES string.
df_screening_result = pd.merge(df_screening_result, df_PRISM_external, on='canonical_smiles')
df_screening_result

Unnamed: 0,sig,cs_pred,cs_pred_approved_drug,cid,canonical_smiles,cp_id,PRISM_auc,name,moa,target
0,1089_YAPC,0.382168,-0.063690,YAPC,CCCCCCCC/C=C\CCCCCCCC(=O)N[C@@H](CO)Cc1ccc(O)cc1,1089,-1.000000,OMDM-2,FAAH inhibitor,GPR119
1,892_YAPC,0.377509,0.083774,YAPC,C[C@H]1[C@H]2[C@H](C[C@H]3[C@@H]4CC=C5C[C@@H](...,892,-1.000000,diosgenin,steroid,
2,39_YAPC,0.368387,-0.083731,YAPC,CC(C)=CCCC(C)(O)[C@H]1CC[C@]2(C)[C@@H]1[C@H](O...,39,-1.000000,ginsenoside-rg3,"angiogenesis inhibitor, apoptosis stimulant",KCNH2
3,1529_YAPC,0.360639,-0.083443,YAPC,C(=C/c1ccccc1)\CN1CCN(CCOC(c2ccccc2)c2ccccc2)CC1,1529,-1.000000,GBR-12783,dopamine uptake inhibitor,
4,1085_YAPC,0.359700,-0.010767,YAPC,CN1[C@@H]2CC[C@@H]1CC(OC(c1ccccc1)c1ccccc1)C2,1085,-1.000000,benztropine-mesylate,acetylcholine receptor antagonist,"CHRM1, HRH1, SLC6A3"
...,...,...,...,...,...,...,...,...,...,...
1620,1172_YAPC,-0.288337,0.062811,YAPC,O=C(/C=C/c1ccc(CN(CCO)CCc2c[nH]c3ccccc23)cc1)NO,1172,0.833664,dacinostat,HDAC inhibitor,"HDAC1, HDAC2, HDAC3, HDAC4, HDAC5, HDAC6, HDAC..."
1621,775_YAPC,-0.318648,0.053236,YAPC,O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO,775,0.874131,belinostat,HDAC inhibitor,"HDAC1, HDAC10, HDAC11, HDAC2, HDAC3, HDAC4, HD..."
1622,637_YAPC,-0.323425,0.055213,YAPC,Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1,637,0.728402,panobinostat,HDAC inhibitor,"HDAC1, HDAC2, HDAC3, HDAC4, HDAC6, HDAC7, HDAC..."
1623,1184_YAPC,-0.324841,0.212709,YAPC,C=C(NC(=O)C(=C)NC(=O)c1csc(C2=N[C@@H]3c4csc(n4...,1184,-1.000000,thiostrepton,"FOXM1 inhibitor, protein synthesis inhibitor",FOXM1


## ECFP4(drug)

In [34]:
def _ecfp4_bitvect(smi):
    """Return the 2048-bit Morgan (radius 2) fingerprint of a SMILES string.

    Raises ValueError when RDKit cannot parse the SMILES, instead of failing
    later inside the fingerprint call with a less informative error.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        raise ValueError('RDKit could not parse SMILES: {!r}'.format(smi))
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)


train_smiles = list(df_approved['canonical_smiles'])
test_smiles = list(df_screening_result['canonical_smiles'])

train_ECFP_array = [_ecfp4_bitvect(smi) for smi in train_smiles]
if not train_ECFP_array:
    raise ValueError('df_approved contains no SMILES to compare against')

# For each screening compound keep its highest Tanimoto similarity to any approved drug.
max_ECFP_sims_in_train = [
    max(DataStructs.BulkTanimotoSimilarity(_ecfp4_bitvect(smi), train_ECFP_array))
    for smi in test_smiles
]

df_screening_result['ECFP_max_similarity'] = max_ECFP_sims_in_train
df_screening_result.head(5)

Unnamed: 0,sig,cs_pred,cs_pred_approved_drug,cid,canonical_smiles,cp_id,PRISM_auc,name,moa,target,ECFP_max_similarity
0,1089_YAPC,0.382168,-0.06369,YAPC,CCCCCCCC/C=C\CCCCCCCC(=O)N[C@@H](CO)Cc1ccc(O)cc1,1089,-1.0,OMDM-2,FAAH inhibitor,GPR119,0.106667
1,892_YAPC,0.377509,0.083774,YAPC,C[C@H]1[C@H]2[C@H](C[C@H]3[C@@H]4CC=C5C[C@@H](...,892,-1.0,diosgenin,steroid,,0.079545
2,39_YAPC,0.368387,-0.083731,YAPC,CC(C)=CCCC(C)(O)[C@H]1CC[C@]2(C)[C@@H]1[C@H](O...,39,-1.0,ginsenoside-rg3,"angiogenesis inhibitor, apoptosis stimulant",KCNH2,0.132653
3,1529_YAPC,0.360639,-0.083443,YAPC,C(=C/c1ccccc1)\CN1CCN(CCOC(c2ccccc2)c2ccccc2)CC1,1529,-1.0,GBR-12783,dopamine uptake inhibitor,,0.139241
4,1085_YAPC,0.3597,-0.010767,YAPC,CN1[C@@H]2CC[C@@H]1CC(OC(c1ccccc1)c1ccccc1)C2,1085,-1.0,benztropine-mesylate,acetylcholine receptor antagonist,"CHRM1, HRH1, SLC6A3",0.121622


In [35]:
# Compounds with a measured PRISM AUC (-1 marks "no measurement"), sorted ascending.
# Sorting returns a new frame; the previous in-place sort on a filtered slice
# triggered pandas' SettingWithCopyWarning.
df_PRISM_filter_with_AUC = (
    df_PRISM_external[df_PRISM_external['PRISM_auc'] != -1]
    .sort_values(by=['PRISM_auc'], ascending=True)
)
# Threshold = AUC value found 20% of the way into the ascending list.
AUC_threshold = df_PRISM_filter_with_AUC['PRISM_auc'].iloc[int(df_PRISM_filter_with_AUC.shape[0] * 0.2)]
print('PRISM_auc_threshold:', AUC_threshold)

# Label 1 = measured AUC strictly below the threshold; everything else is 0
# (unmeasured, at/above the threshold, or NaN). The vectorised form always yields
# exactly one label per row, whereas the former if/elif chain skipped NaN rows.
prism_auc = df_screening_result['PRISM_auc']
df_screening_result['PRISM_auc_label'] = ((prism_auc != -1) & (prism_auc < AUC_threshold)).astype(int)
df_screening_result

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


PRISM_auc_threshold: 0.770811061


Unnamed: 0,sig,cs_pred,cs_pred_approved_drug,cid,canonical_smiles,cp_id,PRISM_auc,name,moa,target,ECFP_max_similarity,PRISM_auc_label
0,1089_YAPC,0.382168,-0.063690,YAPC,CCCCCCCC/C=C\CCCCCCCC(=O)N[C@@H](CO)Cc1ccc(O)cc1,1089,-1.000000,OMDM-2,FAAH inhibitor,GPR119,0.106667,0
1,892_YAPC,0.377509,0.083774,YAPC,C[C@H]1[C@H]2[C@H](C[C@H]3[C@@H]4CC=C5C[C@@H](...,892,-1.000000,diosgenin,steroid,,0.079545,0
2,39_YAPC,0.368387,-0.083731,YAPC,CC(C)=CCCC(C)(O)[C@H]1CC[C@]2(C)[C@@H]1[C@H](O...,39,-1.000000,ginsenoside-rg3,"angiogenesis inhibitor, apoptosis stimulant",KCNH2,0.132653,0
3,1529_YAPC,0.360639,-0.083443,YAPC,C(=C/c1ccccc1)\CN1CCN(CCOC(c2ccccc2)c2ccccc2)CC1,1529,-1.000000,GBR-12783,dopamine uptake inhibitor,,0.139241,0
4,1085_YAPC,0.359700,-0.010767,YAPC,CN1[C@@H]2CC[C@@H]1CC(OC(c1ccccc1)c1ccccc1)C2,1085,-1.000000,benztropine-mesylate,acetylcholine receptor antagonist,"CHRM1, HRH1, SLC6A3",0.121622,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1620,1172_YAPC,-0.288337,0.062811,YAPC,O=C(/C=C/c1ccc(CN(CCO)CCc2c[nH]c3ccccc23)cc1)NO,1172,0.833664,dacinostat,HDAC inhibitor,"HDAC1, HDAC2, HDAC3, HDAC4, HDAC5, HDAC6, HDAC...",0.153846,0
1621,775_YAPC,-0.318648,0.053236,YAPC,O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO,775,0.874131,belinostat,HDAC inhibitor,"HDAC1, HDAC10, HDAC11, HDAC2, HDAC3, HDAC4, HD...",0.118421,0
1622,637_YAPC,-0.323425,0.055213,YAPC,Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1,637,0.728402,panobinostat,HDAC inhibitor,"HDAC1, HDAC2, HDAC3, HDAC4, HDAC6, HDAC7, HDAC...",0.159091,1
1623,1184_YAPC,-0.324841,0.212709,YAPC,C=C(NC(=O)C(=C)NC(=O)c1csc(C2=N[C@@H]3c4csc(n4...,1184,-1.000000,thiostrepton,"FOXM1 inhibitor, protein synthesis inhibitor",FOXM1,0.086294,0


In [None]:
# Persist the combined screening table (row index is not written).
df_screening_result.to_csv(
    path_or_buf='../results/6.Phenotype_based_drug_repurposing/screening_result.csv',
    index=False,
)

# To resume from the saved table instead of recomputing it, uncomment:
# df_screening_result = pd.read_csv('../results/6.Phenotype_based_drug_repurposing/screening_result.csv')
# df_screening_result

In [37]:
def _enrichment_at_thresholds(df_ranked, data_label, n_total, hits_total, thresholds=(1, 2, 5, 10, 20)):
    """Compute enrichment factors for the top-k% of an already ranked table.

    Parameters
    ----------
    df_ranked : pandas.DataFrame
        Table sorted best-first; needs ``PRISM_auc_label`` and ``ECFP_max_similarity``.
    data_label : str
        Name of the ranking method, stored in the ``data`` column.
    n_total, hits_total : int
        Size of the whole library and number of hits (label == 1) in it.
    thresholds : iterable of int
        Percentages of the library to sample from the top of the ranking.

    Returns
    -------
    (list of dict, list of pandas.DataFrame)
        One record per threshold for the EF table, and one frame per threshold
        holding the ECFP similarity of the hits found in that top-k%.
    """
    records = []
    hit_frames = []
    for threshold in thresholds:
        n_sampled = int(df_ranked.shape[0] * threshold / 100)
        df_sampled = df_ranked.head(n_sampled)
        df_hits = df_sampled[df_sampled['PRISM_auc_label'] == 1]
        hits_sampled = df_hits.shape[0]
        # EF = hit rate in the sample / hit rate in the whole library; undefined
        # (NaN) when the sample is empty or the library contains no hit.
        if n_sampled > 0 and n_total > 0 and hits_total > 0:
            ef = (hits_sampled / n_sampled) / (hits_total / n_total)
        else:
            ef = float('nan')
        records.append({
            'type': 'EF{}%'.format(str(threshold)),
            'N_total': n_total,
            'Hits_total': hits_total,
            'N_sampled': n_sampled,
            'Hits_sampled': hits_sampled,
            'EF': ef,
            'data': data_label,
        })
        # .assign builds a new frame, so no column is ever set on a slice.
        hit_frames.append(
            df_hits[['ECFP_max_similarity']].assign(type='Hit{}%'.format(str(threshold)), data=data_label)
        )
    return records, hit_frames


N_total = df_screening_result.shape[0]
Hit_stotal = df_screening_result[(df_screening_result['PRISM_auc_label'] == 1)].shape[0]

# (ranking column, ascending?, method label). Disease scores are ranked ascending
# and the other two descending.
rankings = [
    ('cs_pred_approved_drug', False, 'TranSiGen_DRUG'),
    ('cs_pred', True, 'TranSiGen_DISEASE'),
    ('ECFP_max_similarity', False, 'ECFP4_DRUG'),
]

result_records = []
similarity_frames = []
for sort_column, ascending, data_label in rankings:
    # Each ranking re-sorts the frame left by the previous one, and the frame ends
    # up ordered by the last ranking, exactly as the original in-place sorts did.
    df_screening_result = (
        df_screening_result.sort_values(by=[sort_column], ascending=ascending).reset_index(drop=True)
    )
    records, hit_frames = _enrichment_at_thresholds(df_screening_result, data_label, N_total, Hit_stotal)
    result_records.extend(records)
    similarity_frames.extend(hit_frames)

df_result = pd.DataFrame(
    result_records,
    columns=['type', 'N_total', 'Hits_total', 'N_sampled', 'Hits_sampled', 'EF', 'data'],
)
df_result_similarity = pd.concat(similarity_frames)
round(df_result, 3)

Unnamed: 0,type,N_total,Hits_total,N_sampled,Hits_sampled,EF,data
0,EF1%,1625,86,16,3,3.543,TranSiGen_DRUG
1,EF2%,1625,86,32,7,4.133,TranSiGen_DRUG
2,EF5%,1625,86,81,16,3.732,TranSiGen_DRUG
3,EF10%,1625,86,162,24,2.799,TranSiGen_DRUG
4,EF20%,1625,86,325,41,2.384,TranSiGen_DRUG
5,EF1%,1625,86,16,5,5.905,TranSiGen_DISEASE
6,EF2%,1625,86,32,11,6.495,TranSiGen_DISEASE
7,EF5%,1625,86,81,20,4.666,TranSiGen_DISEASE
8,EF10%,1625,86,162,25,2.916,TranSiGen_DISEASE
9,EF20%,1625,86,325,34,1.977,TranSiGen_DISEASE
