# Import packages

## General

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## Personal

In [2]:
from rbm.utils import Predictions, Compiler

# Define vars

In [4]:
metrics = ['precision', 'recall', 'f1', 'auc']
modelType = 'fsCDR'
basePath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/cdr/v2'

# Define Funcs

## precision@k

In [7]:
def countDrugsK(df, k=1):
    drugCount = {}
    wrong = []
    for cell, subdf in df.groupby(by='cell_line'):
        sortDF = subdf.sort_values(by='pred', ascending=False).reset_index(drop=True)
        drugs = sortDF.loc[:k, 'drug']
        for drug in drugs:
            if drug in drugCount.keys():
                drugCount[drug] += 1
            else:
                drugCount[drug] = 1
        drug = drugs[0]

        if sortDF.iloc[:k, :].true.sum() == 0:
            wrong.append(cell)
            print(f"\nNo true effective drugs identified in top {k} for {cell}")
            print(f"Cell line: {sortDF.loc[0, 'cell_line']}; Top drug: {drug}\n")
        else:
            print(f"Cell line: {sortDF.loc[0, 'cell_line']}; Top drug: {drug}")
    return drugCount, wrong

In [6]:
def clPrecision(df, modelName=None, verbose=True, getResults=False, thresh=0.5):
    p1 = []
    p2 = []
    p3 = []
    p4 = []
    p5 = []
    p0 = []
    cellLines = []
    for cell, subdf in df.groupby(by='cell_line'):
        nEff = subdf.true.sum()
        if nEff < 5:
            continue
        cellLines.append(cell)
        sortDF = subdf.sort_values(by='pred', ascending=False)
        p1.append(sortDF.iloc[:1, :].true.sum() / 1)
        p2.append(sortDF.iloc[:2, :].true.sum() / 2)
        p3.append(sortDF.iloc[:3, :].true.sum() / 3)
        p4.append(sortDF.iloc[:4, :].true.sum() / 4)
        p5.append(sortDF.iloc[:5, :].true.sum() / 5)
        if sortDF.true.sum() >= 10:
            p0.append(sortDF.iloc[:9, :].true.sum() / 10)

    if np.mean(p1) >= thresh:
        thresh = np.mean(p1)
        if verbose:
            print(f"Model: {modelName}")
            print(f"\tPrecision@1: {round(thresh, 4)}")
            print(f"\tPrecision@2: {round(np.mean(p2), 4)}")
            print(f"\tPrecision@3: {round(np.mean(p3), 4)}")
            print(f"\tPrecision@4: {round(np.mean(p4), 4)}")
            print(f"\tPrecision@5: {round(np.mean(p5), 4)}")
            print(f"\tPrecision@10: {round(np.mean(p0), 4)}\n\n")
            
    if getResults:
        return [thresh, np.mean(p2), np.mean(p3), np.mean(p4), np.mean(p5)]
    if verbose:
        return thresh

def precision(predPath, modelName, thresh, by='cellLine'):
    predDF = pd.read_csv(predPath, index_col=0) 
    if by == 'cellLine':
        return clPrecision(predDF, modelName, thresh=thresh)
    else:
        cancers = {}
        for ct, subdf in predDF.groupby(by = 'cancer_type'):
            cancers[ct] = clPrecision(subdf, verbose=False, getResults=True)
        return pd.DataFrame(cancers, index=['p1', 'p2', 'p3', 'p4', 'p5']).T
        
    
def iteratePrecision(basePath, file=None, thresh=0.7, k=1, by='cellLine'):
    if file != None:
        if by == 'cellLine':
            print('Average Cell Line precision @ k')
            precision(os.path.join(basePath, file), file, thresh, by)
            df = pd.read_csv(os.path.join(basePath, file), index_col=0).sort_values(by='pred', ascending = False).reset_index(drop=True)
            print("Top ranked drug for each cell line:")
            counts, wrong = countDrugsK(df, k)
            print(f"\n# of times each drug recommended in top-{k}:")
            counts = sorted(counts.items(), key=lambda x:x[1], reverse=True)
            for drug, cnt in counts:
                print(f"{drug}: {cnt}")
            return df, wrong
        else:
            df = precision(os.path.join(basePath, file), file, thresh, by)
            df.sort_values(by=['p1','p1','p3','p4','p5'], ascending=False, inplace=True)
            return df
        
    else:
        files = os.listdir(basePath)
        print('Average Cell Line precision @ k')
        for f in files:
            thresh = precision(os.path.join(basePath, f), f, thresh, by='cellLine')
            
        print(thresh)
        

# FS-CDR

Applied on embedded drug and embedded rna pairs combined via contrastive fusion

## Test data

### Precision@k for all models

In [8]:
# Average of 52 cell lines with cancer types seen during training
iteratePrecision(os.path.join(basePath, 'test_preds'), thresh=0.7)

Average Cell Line precision @ k
Model: Fused-FewShotCDR_NL32_8_DO0-0_AFsigmoid_LR0-01_DR0-99_DS500_preds.csv
	Precision@1: 0.7059
	Precision@2: 0.5882
	Precision@3: 0.6078
	Precision@4: 0.5784
	Precision@5: 0.5373
	Precision@10: 0.4256


Model: Fused-FewShotCDR_NL64_8_DO0-1_AFrelu_LR0-01_DR0-99_DS50_preds.csv
	Precision@1: 0.7059
	Precision@2: 0.6078
	Precision@3: 0.549
	Precision@4: 0.5245
	Precision@5: 0.4627
	Precision@10: 0.4077


Model: Fused-FewShotCDR_NL64_32_DO0-1_AFrelu_LR0-01_DR0-99_DS50_preds.csv
	Precision@1: 0.7843
	Precision@2: 0.6373
	Precision@3: 0.5752
	Precision@4: 0.5343
	Precision@5: 0.4902
	Precision@10: 0.4231


Model: Fused-FewShotCDR_NL64_16_DO0-1_AFsigmoid_LR0-01_DR0-99_DS500_preds.csv
	Precision@1: 0.8039
	Precision@2: 0.6471
	Precision@3: 0.6078
	Precision@4: 0.5735
	Precision@5: 0.5294
	Precision@10: 0.4333


0.803921568627451


### Define best model via above

In [9]:
best = 'Fused-FewShotCDR_NL64_16_DO0-1_AFsigmoid_LR0-01_DR0-99_DS500_preds.csv'

### Get precision@k for cell lines

In [10]:
testPred, wrongTest = iteratePrecision(os.path.join(basePath, 'test_preds'), file=best, k=3)

Average Cell Line precision @ k
Model: Fused-FewShotCDR_NL64_16_DO0-1_AFsigmoid_LR0-01_DR0-99_DS500_preds.csv
	Precision@1: 0.8039
	Precision@2: 0.6471
	Precision@3: 0.6078
	Precision@4: 0.5735
	Precision@5: 0.5294
	Precision@10: 0.4333


Top ranked drug for each cell line:
Cell line: ACH-000012; Top drug: epothilone-b
Cell line: ACH-000062; Top drug: echinomycin
Cell line: ACH-000086; Top drug: YM-155
Cell line: ACH-000161; Top drug: YM-155

No true effective drugs identified in top 3 for ACH-000164
Cell line: ACH-000164; Top drug: epothilone-b

Cell line: ACH-000222; Top drug: genz-644282
Cell line: ACH-000280; Top drug: YM-155
Cell line: ACH-000305; Top drug: YM-155
Cell line: ACH-000316; Top drug: YM-155
Cell line: ACH-000320; Top drug: pardoprunox

No true effective drugs identified in top 3 for ACH-000329
Cell line: ACH-000329; Top drug: 10-hydroxycamptothecin

Cell line: ACH-000347; Top drug: YM-155
Cell line: ACH-000368; Top drug: echinomycin
Cell line: ACH-000376; Top drug: ne

In [11]:
wrongTest

['ACH-000164', 'ACH-000329', 'ACH-000824']

In [14]:
testPred[testPred.cell_line == 'ACH-000164'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
104,ACH-000164,Pancreatic Cancer,epothilone-b,0,0.465003
115,ACH-000164,Pancreatic Cancer,cabazitaxel,0,0.45754
188,ACH-000164,Pancreatic Cancer,JNJ-26481585,0,0.414392
340,ACH-000164,Pancreatic Cancer,TAS-103,0,0.341208
348,ACH-000164,Pancreatic Cancer,genz-644282,1,0.337309
485,ACH-000164,Pancreatic Cancer,topotecan,0,0.30443
517,ACH-000164,Pancreatic Cancer,emetine,0,0.297959
607,ACH-000164,Pancreatic Cancer,beta-lapachone,0,0.283552
614,ACH-000164,Pancreatic Cancer,10-hydroxycamptothecin,0,0.282083
671,ACH-000164,Pancreatic Cancer,rubitecan,0,0.27256


In [16]:
testPred[testPred.cell_line == 'ACH-000329'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
177,ACH-000329,Brain Cancer,10-hydroxycamptothecin,0,0.420269
242,ACH-000329,Brain Cancer,genz-644282,0,0.394343
317,ACH-000329,Brain Cancer,YM-155,0,0.348318
431,ACH-000329,Brain Cancer,parbendazole,0,0.31604
449,ACH-000329,Brain Cancer,nemorubicin,1,0.312611
472,ACH-000329,Brain Cancer,paclitaxel,0,0.306957
654,ACH-000329,Brain Cancer,JNJ-26481585,1,0.276106
754,ACH-000329,Brain Cancer,ABT-751,0,0.256016
772,ACH-000329,Brain Cancer,CUDC-907,0,0.253964
784,ACH-000329,Brain Cancer,beta-lapachone,0,0.251462


In [17]:
testPred[testPred.cell_line == 'ACH-000824'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
105,ACH-000824,Esophageal Cancer,valnemulin,0,0.46364
219,ACH-000824,Esophageal Cancer,givinostat,0,0.405322
221,ACH-000824,Esophageal Cancer,JNJ-26481585,0,0.404414
232,ACH-000824,Esophageal Cancer,AR-42,0,0.398628
233,ACH-000824,Esophageal Cancer,pyroxamide,0,0.398442
244,ACH-000824,Esophageal Cancer,resminostat,0,0.393733
247,ACH-000824,Esophageal Cancer,CYT-997,0,0.390642
250,ACH-000824,Esophageal Cancer,belinostat,0,0.386602
259,ACH-000824,Esophageal Cancer,nemorubicin,1,0.384454
280,ACH-000824,Esophageal Cancer,epothilone-b,1,0.371479


### precision @ k by cancer type

In [18]:
cancerTest = iteratePrecision(os.path.join(basePath, 'test_preds'), file=best, by='cancer')

In [19]:
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Colon/Colorectal Cancer,1.0,0.875,0.833333,0.75,0.6
Endometrial/Uterine Cancer,1.0,0.833333,0.777778,0.75,0.666667
Liver Cancer,1.0,0.75,0.666667,0.625,0.7
Bladder Cancer,1.0,0.666667,0.555556,0.5,0.466667
Breast Cancer,1.0,0.5,0.444444,0.333333,0.333333
Lung Cancer,0.923077,0.769231,0.666667,0.653846,0.615385
Skin Cancer,0.8,0.6,0.666667,0.65,0.52
Ovarian Cancer,0.75,0.625,0.666667,0.625,0.55
Brain Cancer,0.75,0.625,0.5,0.5625,0.5
Head and Neck Cancer,0.5,0.666667,0.666667,0.583333,0.533333


In [20]:
testPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Lung Cancer                   13
Brain Cancer                   5
Skin Cancer                    5
Ovarian Cancer                 4
Pancreatic Cancer              4
Colon/Colorectal Cancer        4
Bladder Cancer                 3
Breast Cancer                  3
Endometrial/Uterine Cancer     3
Esophageal Cancer              3
Head and Neck Cancer           3
Liver Cancer                   2
Name: cancer_type, dtype: int64

## New Cancer data

### Precision@k for all models

### Get precision@k for cell lines

In [21]:
newPred, wrongNew = iteratePrecision(os.path.join(basePath, 'newcancer_preds'), file=best, k=3)

Average Cell Line precision @ k
Model: Fused-FewShotCDR_NL64_16_DO0-1_AFsigmoid_LR0-01_DR0-99_DS500_preds.csv
	Precision@1: 0.7077
	Precision@2: 0.6308
	Precision@3: 0.6
	Precision@4: 0.5846
	Precision@5: 0.5262
	Precision@10: 0.4588


Top ranked drug for each cell line:
Cell line: ACH-000037; Top drug: YM-155
Cell line: ACH-000046; Top drug: genz-644282
Cell line: ACH-000052; Top drug: vindesine
Cell line: ACH-000054; Top drug: echinomycin
Cell line: ACH-000087; Top drug: echinomycin
Cell line: ACH-000090; Top drug: epothilone-b
Cell line: ACH-000096; Top drug: echinomycin
Cell line: ACH-000099; Top drug: 10-hydroxycamptothecin
Cell line: ACH-000141; Top drug: 10-hydroxycamptothecin
Cell line: ACH-000159; Top drug: genz-644282
Cell line: ACH-000169; Top drug: echinomycin

No true effective drugs identified in top 3 for ACH-000171
Cell line: ACH-000171; Top drug: 10-hydroxycamptothecin

Cell line: ACH-000172; Top drug: YM-155
Cell line: ACH-000174; Top drug: genz-644282
Cell line: ACH-

In [22]:
wrongNew

['ACH-000171',
 'ACH-000189',
 'ACH-000209',
 'ACH-000268',
 'ACH-000433',
 'ACH-000484',
 'ACH-000649']

In [23]:
newPred[newPred.cell_line == 'ACH-000171'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
91,ACH-000171,Kidney Cancer,10-hydroxycamptothecin,0,0.502167
112,ACH-000171,Kidney Cancer,YM-155,0,0.489094
139,ACH-000171,Kidney Cancer,topotecan,0,0.471652
295,ACH-000171,Kidney Cancer,echinomycin,1,0.398089
328,ACH-000171,Kidney Cancer,irinotecan,0,0.380412
370,ACH-000171,Kidney Cancer,CUDC-907,1,0.367319
375,ACH-000171,Kidney Cancer,cobimetinib,0,0.365631
475,ACH-000171,Kidney Cancer,beta-lapachone,0,0.335327
588,ACH-000171,Kidney Cancer,nemorubicin,1,0.312191
611,ACH-000171,Kidney Cancer,vindesine,0,0.309082


In [24]:
newPred[newPred.cell_line == 'ACH-000189'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
220,ACH-000189,Kidney Cancer,genz-644282,0,0.430754
279,ACH-000189,Kidney Cancer,AR-42,0,0.403467
282,ACH-000189,Kidney Cancer,TAS-103,0,0.402783
359,ACH-000189,Kidney Cancer,topotecan,0,0.372674
421,ACH-000189,Kidney Cancer,JNJ-26481585,0,0.348409
470,ACH-000189,Kidney Cancer,AT13387,0,0.336559
485,ACH-000189,Kidney Cancer,10-hydroxycamptothecin,1,0.332878
504,ACH-000189,Kidney Cancer,cabazitaxel,0,0.327176
602,ACH-000189,Kidney Cancer,alvespimycin,0,0.310395
604,ACH-000189,Kidney Cancer,epothilone-b,1,0.310181


In [25]:
newPred[newPred.cell_line == 'ACH-000209'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
222,ACH-000209,Bile Duct Cancer,topotecan,0,0.429182
248,ACH-000209,Bile Duct Cancer,beta-lapachone,0,0.417774
260,ACH-000209,Bile Duct Cancer,10-hydroxycamptothecin,0,0.412902
366,ACH-000209,Bile Duct Cancer,JNJ-26481585,1,0.368954
696,ACH-000209,Bile Duct Cancer,epothilone-d,0,0.29378
713,ACH-000209,Bile Duct Cancer,fenbendazole,0,0.29121
744,ACH-000209,Bile Duct Cancer,rubitecan,0,0.28533
805,ACH-000209,Bile Duct Cancer,vindesine,0,0.275921
1101,ACH-000209,Bile Duct Cancer,sangivamycin,1,0.232069
1149,ACH-000209,Bile Duct Cancer,verubulin,0,0.225541


In [26]:
newPred[newPred.cell_line == 'ACH-000433'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
145,ACH-000433,Kidney Cancer,YM-155,0,0.468284
288,ACH-000433,Kidney Cancer,topotecan,0,0.401538
312,ACH-000433,Kidney Cancer,pardoprunox,0,0.390599
352,ACH-000433,Kidney Cancer,10-hydroxycamptothecin,0,0.37495
373,ACH-000433,Kidney Cancer,beta-lapachone,0,0.366052
467,ACH-000433,Kidney Cancer,atiprimod,1,0.337056
500,ACH-000433,Kidney Cancer,JNJ-26481585,1,0.328182
773,ACH-000433,Kidney Cancer,tanespimycin,0,0.280908
994,ACH-000433,Kidney Cancer,alvespimycin,1,0.246784
1051,ACH-000433,Kidney Cancer,verubulin,0,0.238414


In [27]:
newPred[newPred.cell_line == 'ACH-000484'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
17,ACH-000484,Kidney Cancer,epothilone-b,0,0.586921
92,ACH-000484,Kidney Cancer,topotecan,0,0.499619
131,ACH-000484,Kidney Cancer,genz-644282,0,0.477618
331,ACH-000484,Kidney Cancer,YM-155,0,0.380101
351,ACH-000484,Kidney Cancer,OTS167,0,0.375022
409,ACH-000484,Kidney Cancer,nemorubicin,0,0.354022
444,ACH-000484,Kidney Cancer,beta-lapachone,0,0.341964
993,ACH-000484,Kidney Cancer,verubulin,0,0.246931
1433,ACH-000484,Kidney Cancer,litronesib,0,0.188649
1475,ACH-000484,Kidney Cancer,PF-03758309,0,0.185116


In [29]:
newPred[newPred.cell_line == 'ACH-000649'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
98,ACH-000649,Kidney Cancer,genz-644282,0,0.496779
232,ACH-000649,Kidney Cancer,topotecan,0,0.423336
285,ACH-000649,Kidney Cancer,TAS-103,0,0.402016
540,ACH-000649,Kidney Cancer,beta-lapachone,0,0.320277
550,ACH-000649,Kidney Cancer,JNJ-26481585,0,0.318919
557,ACH-000649,Kidney Cancer,alvespimycin,1,0.317728
577,ACH-000649,Kidney Cancer,nanchangmycin,0,0.313543
632,ACH-000649,Kidney Cancer,NSC-319726,0,0.305778
655,ACH-000649,Kidney Cancer,YM-155,0,0.300382
677,ACH-000649,Kidney Cancer,AT13387,0,0.297024


### precision @ k by cancer type

In [30]:
cancerNew = iteratePrecision(os.path.join(basePath, 'newcancer_preds'), file=best, by='cancer')

In [31]:
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Sarcoma,1.0,0.833333,0.833333,0.708333,0.666667
Neuroblastoma,1.0,1.0,0.777778,0.666667,0.6
Rhabdoid,1.0,0.75,0.583333,0.5,0.55
Gallbladder Cancer,1.0,0.5,0.333333,0.25,0.2
Gastric Cancer,0.928571,0.714286,0.690476,0.678571,0.557143
Bone Cancer,0.666667,0.666667,0.592593,0.611111,0.555556
Thyroid Cancer,0.625,0.625,0.666667,0.625,0.525
Prostate Cancer,0.5,0.5,0.666667,0.75,0.6
Bile Duct Cancer,0.5,0.5,0.466667,0.6,0.6
Kidney Cancer,0.5,0.384615,0.384615,0.384615,0.369231


In [32]:
newPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Gastric Cancer        14
Kidney Cancer         14
Bone Cancer            9
Thyroid Cancer         8
Sarcoma                6
Bile Duct Cancer       6
Rhabdoid               4
Neuroblastoma          3
Prostate Cancer        2
Gallbladder Cancer     1
Name: cancer_type, dtype: int64