# Load packages

In [1]:
import os
import numpy as np
import pandas as pd

import warnings
# NOTE(review): this installs a catch-all "ignore" filter, so every warning category
# (deprecations, numerical/convergence warnings, ...) is hidden for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
from scripts.evalModel import evalLogisticModels, getPredDist

# Data

## Load cell lines

In [3]:
# Read the three RNA tables (train / test / unseen-cancer splits); the first CSV column
# becomes the index, which holds the cell-line identifiers used below.
trainRNA = pd.read_csv('../../data/processed/RNA_train_cancergenes.csv', index_col=0)
testRNA = pd.read_csv('../../data/processed/RNA_test_cancergenes.csv', index_col=0)
newRNA = pd.read_csv('../../data/processed/RNA_newcancer_cancergenes.csv', index_col=0)

# Cell-line identifiers of each split, in file order.
trainCellLines = trainRNA.index.tolist()
testCellLines = testRNA.index.tolist()
newCellLines = newRNA.index.tolist()

## CDR

In [4]:
# Drug / cell-line pair table, indexed by cell-line ID so each split can be selected by label.
cdr = pd.read_csv('../../data/processed/drugCellLinePairsData.csv', index_col='DepMap_ID')

# One pair table per split; reset_index() turns DepMap_ID back into a regular column.
trainCDR, testCDR, newCDR = (
    cdr.loc[cellLines].reset_index()
    for cellLines in (trainCellLines, testCellLines, newCellLines)
)

In [5]:
# Evaluation templates for the test and unseen-cancer splits: keep only the identifying
# columns plus the label, renamed to cell_line / drug / true.
testTemp, newTemp = (
    pairs[['DepMap_ID', 'cancer_type', 'name', 'effective']].rename(
        columns={'DepMap_ID': 'cell_line', 'name': 'drug', 'effective': 'true'}
    )
    for pairs in (testCDR, newCDR)
)

## Load drugs

In [6]:
# Drug fingerprint table; the first CSV column becomes the index, which the next cell
# looks up using the drug names stored in the `name` column of the pair tables.
drugs = pd.read_csv('../../data/processed/drug_fingerprints.csv', index_col=0)

In [7]:
# Expand the feature tables to one row per (cell line, drug) pair, in the row order of the
# matching pair table: fingerprints are looked up by drug name, expression by cell-line ID.
trainDrugs, testDrugs, newDrugs = (
    drugs.loc[pairs['name'].tolist()].to_numpy()
    for pairs in (trainCDR, testCDR, newCDR)
)

# NOTE: trainRNA / testRNA / newRNA are rebound from DataFrames to NumPy arrays here, so
# this cell cannot be re-run on its own (arrays have no .loc) without reloading the tables.
trainRNA, testRNA, newRNA = (
    rna.loc[pairs['DepMap_ID'].tolist()].to_numpy()
    for rna, pairs in ((trainRNA, trainCDR), (testRNA, testCDR), (newRNA, newCDR))
)

In [8]:
# Inputs handed to the evaluator: each split is a [drug features, RNA features] pair.
trainData, testData, newData = (
    [trainDrugs, trainRNA],
    [testDrugs, testRNA],
    [newDrugs, newRNA],
)
# Training labels, taken from the `effective` column of the training pair table.
trainEff = trainCDR['effective'].to_numpy()

In [9]:
# Remove the intermediate names from the namespace. The feature arrays themselves remain
# reachable through trainData / testData / newData; only the source tables (cdr, drugs)
# lose their last reference created in this notebook.
del cdr, drugs, trainDrugs, testDrugs, newDrugs, trainRNA, testRNA, newRNA

# Model performance

In [10]:
# Root folder of the siamese drug-encoder experiment. The original cluster location stays
# the default; set the DRUG_ENCODER_PATH environment variable to run the notebook against
# a copy stored elsewhere without editing this cell.
drugEncoderPath = os.environ.get(
    'DRUG_ENCODER_PATH',
    '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/drugs/siameseV1/',
)
# Folder whose entries are the candidate models iterated over by the search cells below.
modelDir = os.path.join(drugEncoderPath, 'models')

In [11]:
# Model search using at=1.
# `thresh` starts as the minimum score worth keeping and is raised every time a candidate
# beats it; after the loop `bestModel` names the best candidate ('' if none beat 0.55).
thresh = 0.55
bestModel = ''
# sorted(): os.listdir returns entries in arbitrary order, which would make the visiting
# order (and the winner among equal scores, since only a strictly better score replaces
# the current best) depend on the filesystem.
for m in sorted(os.listdir(modelDir)):
    evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                 fusionPath=None, drugPath=os.path.join(modelDir, m), rnaPath=None)

    # Skip entries for which no drug encoder is available (presumably the load failed —
    # confirm in scripts/evalModel). Identity test rather than `== None`, because `==` is
    # delegated to the object's own __eq__ and need not return a plain bool.
    if evalLog.drugEncoder is None:
        continue

    # With returnThresh=True the call returns a value comparable to `thresh`
    # (presumably this model's precision at `at` — confirm in scripts/evalModel).
    currentThresh = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy(),
                                     modelName=m, at=1, thresh=thresh, returnThresh=True)
    if currentThresh > thresh:
        thresh = currentThresh
        bestModel = m

KeyboardInterrupt: 

In [12]:
# Model search using at=5.
# `thresh` starts as the minimum score worth keeping and is raised every time a candidate
# beats it; after the loop `bestModel` names the best candidate ('' if none beat 0.5).
thresh = 0.5
bestModel = ''
# sorted(): os.listdir returns entries in arbitrary order, which would make the visiting
# order (and the winner among equal scores, since only a strictly better score replaces
# the current best) depend on the filesystem.
for m in sorted(os.listdir(modelDir)):
    evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                 fusionPath=None, drugPath=os.path.join(modelDir, m), rnaPath=None)

    # Skip entries for which no drug encoder is available (presumably the load failed —
    # confirm in scripts/evalModel). Identity test rather than `== None`, because `==` is
    # delegated to the object's own __eq__ and need not return a plain bool.
    if evalLog.drugEncoder is None:
        continue

    # With returnThresh=True the call returns a value comparable to `thresh`
    # (presumably this model's precision at `at` — confirm in scripts/evalModel).
    currentThresh = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy(),
                                     modelName=m, at=5, thresh=thresh, returnThresh=True)
    if currentThresh > thresh:
        thresh = currentThresh
        bestModel = m

Model: DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000
	Precision@1: 0.5882
	Precision@2: 0.6176
	Precision@3: 0.6078
	Precision@4: 0.6029
	Precision@5: 0.5647
	Precision@10: 0.5231



In [13]:
# Model analysed in the rest of the notebook. The name is hard-coded instead of being taken
# from `bestModel`.
# NOTE(review): the search output recorded above shows a Hidden64 model while this is the
# Hidden16 variant — presumably chosen from a different (at=1) run; confirm.
bestModel1 = 'DrugFewShot_Layers2_Hidden16_DO0-1_AFrelu_LR0-01_DR0-99_DS1000'

In [14]:
# Rebuild the evaluator for the selected model only. As in the search loops, only a drug
# encoder path is supplied (fusionPath and rnaPath are None) and the data lists are passed
# as shallow copies.
drugPath = os.path.join(modelDir, bestModel1)
evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                fusionPath=None, drugPath=drugPath, rnaPath=None)

In [15]:
# Full evaluation of the selected model. Without returnThresh the call returns six objects:
# the prediction tables for both splits (testDF / newDF, inspected below), testWrong /
# newWrong, and the per-drug count tables testCounts / newCounts.
# NOTE(review): at=1 is passed here, yet the printed report talks about the "top 3"
# recommendations — confirm how `at` and `thresh` are used inside evaluate.
testDF, newDF, testWrong, newWrong, testCounts, newCounts = evalLog.evaluate(testDF=testTemp.copy(), 
                                                                             newDF=newTemp.copy(),
                                                                             thresh=0.2,
                                                                             at=1)


Average Cell Line precision @ k on test set
	Precision@1: 0.7451
	Precision@2: 0.6471
	Precision@3: 0.5621
	Precision@4: 0.4902
	Precision@5: 0.4588
	Precision@10: 0.3667

Average Cell Line precision @ k on newcancer set
	Precision@1: 0.7385
	Precision@2: 0.6615
	Precision@3: 0.6
	Precision@4: 0.5231
	Precision@5: 0.4954
	Precision@10: 0.4294


Test set:
No true effective drugs identified in top 3 for ACH-000305 (top drug: YM-155)
No true effective drugs identified in top 3 for ACH-000329 (top drug: YM-155)
No true effective drugs identified in top 3 for ACH-000368 (top drug: TAS-103)
No true effective drugs identified in top 3 for ACH-000510 (top drug: YM-155)
No true effective drugs identified in top 3 for ACH-000650 (top drug: YM-155)

	# of cell lines without effective drug among top-3 recs: 5
	# of unique drugs among top-3 predictions: 18

New cancer set
No true effective drugs identified in top 3 for ACH-000171 (top drug: YM-155)
No true effective drugs identified in top 3 for AC

In [16]:
# Spread of the predictions on the test split: the helper prints the average per-drug
# variance of the predictions (see output) and its return value is kept in testDist.
testDist = getPredDist(testDF)

Avg varaince of predictions for each drug: 0.0014


In [17]:
# Same summary for the unseen-cancer split: prints the average per-drug variance of the
# predictions (see output); the return value is kept in newDist.
newDist = getPredDist(newDF)

Avg varaince of predictions for each drug: 0.0025


In [16]:
# Per-drug count table returned by evalLog.evaluate for the test split
# (columns 1, 2, 3 and total — presumably how often a drug was ranked 1st/2nd/3rd; confirm).
testCounts

Unnamed: 0,1,2,3,total
YM-155,38,3,0,41
rubitecan,4,22,3,29
genz-644282,3,15,19,37
bitopertin,3,0,0,3
camptothecin,2,2,2,6
TAS-103,1,2,13,16
nemorubicin,1,0,0,1
topotecan,0,2,6,8
emetine,0,1,2,3
echinomycin,0,1,1,2


In [20]:
# First ten prediction rows for ACH-000305, one of the test cell lines reported above as
# having no truly effective drug among its top-3 recommendations.
testDF.loc[testDF['cell_line'] == 'ACH-000305'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
1622,ACH-000305,Esophageal Cancer,YM-155,0,0.51613
1541,ACH-000305,Esophageal Cancer,genz-644282,0,0.394091
1707,ACH-000305,Esophageal Cancer,TAS-103,0,0.388987
1676,ACH-000305,Esophageal Cancer,10-hydroxycamptothecin,1,0.374119
1557,ACH-000305,Esophageal Cancer,emetine,0,0.353927
1724,ACH-000305,Esophageal Cancer,beta-lapachone,0,0.341567
1712,ACH-000305,Esophageal Cancer,nemorubicin,0,0.319647
1632,ACH-000305,Esophageal Cancer,bruceantin,0,0.305143
1582,ACH-000305,Esophageal Cancer,epirubicin,0,0.28429
1614,ACH-000305,Esophageal Cancer,voreloxin,0,0.283325


In [21]:
# First ten prediction rows for ACH-000368, another test cell line reported above as
# having no truly effective drug among its top-3 recommendations.
testDF.loc[testDF['cell_line'] == 'ACH-000368'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
3391,ACH-000368,Brain Cancer,TAS-103,0,0.517309
3404,ACH-000368,Brain Cancer,topotecan,0,0.512401
3353,ACH-000368,Brain Cancer,emetine,0,0.479764
3393,ACH-000368,Brain Cancer,nemorubicin,1,0.441628
3343,ACH-000368,Brain Cancer,doxorubicin,1,0.400725
3356,ACH-000368,Brain Cancer,daunorubicin,1,0.384589
3390,ACH-000368,Brain Cancer,echinomycin,1,0.377547
3345,ACH-000368,Brain Cancer,teniposide,0,0.355464
3397,ACH-000368,Brain Cancer,brivaracetam,0,0.348722
3368,ACH-000368,Brain Cancer,mitomycin-c,0,0.339354


In [17]:
# All prediction rows for cell line ACH-000268 from the unseen-cancer split.
newDF.loc[newDF['cell_line'] == 'ACH-000268']

Unnamed: 0,cell_line,cancer_type,drug,true,pred
9844,ACH-000268,Bile Duct Cancer,rubitecan,0,0.505358
9834,ACH-000268,Bile Duct Cancer,epothilone-d,0,0.290892
9846,ACH-000268,Bile Duct Cancer,verubulin,0,0.28676
9847,ACH-000268,Bile Duct Cancer,VE-822,0,0.221877
9833,ACH-000268,Bile Duct Cancer,GDC-0980,0,0.221771
9839,ACH-000268,Bile Duct Cancer,BGT226,0,0.221261
9838,ACH-000268,Bile Duct Cancer,GSK2126458,0,0.218839
9841,ACH-000268,Bile Duct Cancer,LY3023414,1,0.21829
9848,ACH-000268,Bile Duct Cancer,panobinostat,0,0.193249
9836,ACH-000268,Bile Duct Cancer,SB-939,0,0.191913


## Cancer precision

In [31]:
# Per-cancer-type breakdown of the predictions: one table for the test split and one for
# the unseen-cancer split, each with columns p1..p5 (see the displays below —
# presumably precision@1..5 averaged per cancer type; confirm in scripts/evalModel).
cancerTest, cancerNew = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())

In [32]:
# Per-cancer-type table for the test split, as returned by getCancerPerformance.
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Endometrial/Uterine Cancer,1.0,0.833333,0.777778,0.666667,0.6
Bladder Cancer,1.0,0.833333,0.777778,0.666667,0.533333
Breast Cancer,1.0,0.666667,0.444444,0.333333,0.333333
Liver Cancer,1.0,0.5,0.5,0.5,0.5
Lung Cancer,0.923077,0.692308,0.615385,0.5,0.476923
Ovarian Cancer,0.75,0.75,0.666667,0.5,0.55
Esophageal Cancer,0.666667,0.666667,0.555556,0.5,0.4
Skin Cancer,0.6,0.6,0.466667,0.5,0.4
Colon/Colorectal Cancer,0.5,0.625,0.666667,0.625,0.65
Pancreatic Cancer,0.5,0.625,0.5,0.4375,0.4


In [43]:
# Column-wise mean over cancer types (every cancer type weighted equally), to 4 decimals.
cancerTest.mean().round(4)

p1    0.7311
p2    0.6389
p3    0.5554
p4    0.4913
p5    0.4564
dtype: float64

In [33]:
# Per-cancer-type table for the unseen-cancer split, as returned by getCancerPerformance.
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Rhabdoid,1.0,0.875,0.666667,0.6875,0.65
Neuroblastoma,1.0,0.666667,0.666667,0.5,0.466667
Gastric Cancer,0.928571,0.785714,0.714286,0.589286,0.585714
Bone Cancer,0.777778,0.666667,0.62963,0.583333,0.577778
Thyroid Cancer,0.75,0.75,0.75,0.59375,0.525
Sarcoma,0.666667,0.75,0.611111,0.583333,0.566667
Kidney Cancer,0.615385,0.5,0.435897,0.365385,0.338462
Prostate Cancer,0.5,0.75,0.666667,0.5,0.5
Bile Duct Cancer,0.4,0.3,0.333333,0.4,0.32
Gallbladder Cancer,0.0,0.5,0.333333,0.25,0.2


In [42]:
# Column-wise mean over cancer types (every cancer type weighted equally), to 4 decimals.
cancerNew.mean().round(4)

p1    0.6638
p2    0.6544
p3    0.5808
p4    0.5053
p5    0.4730
dtype: float64

# Check pred range by drug

In [34]:
# Per-drug spread of the predicted scores on the test split.
#   predCount - number of prediction rows for the drug
#   predRange - gap between the drug's highest and lowest predicted score
# A single groupby aggregation replaces the manual group loop; the result is ordered from
# the widest to the narrowest range and re-indexed 0..n-1 so the index doubles as the rank.
testPredRange = (
    testDF.groupby(by='drug')['pred']
    .agg(predCount='size', predMin='min', predMax='max')
    .assign(predRange=lambda stats: stats['predMax'] - stats['predMin'])
    .loc[:, ['predCount', 'predRange']]
    .reset_index()
    .sort_values(by='predRange', ascending=False)
    .reset_index(drop=True)
)
# A range is only informative for drugs that were scored more than once.
testPredRange[testPredRange.predCount > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,YM-155,41,0.445689
1,rubitecan,31,0.421916
2,genz-644282,41,0.414715
3,TAS-103,29,0.411773
4,10-hydroxycamptothecin,38,0.402858
5,emetine,34,0.389928
6,topotecan,32,0.388482
7,nemorubicin,32,0.365816
8,beta-lapachone,36,0.358876
9,camptothecin,6,0.352469


In [35]:
# The ten most frequently scored drugs in the test split; the index shown is each drug's
# rank by predRange from the previous cell.
testPredRange.sort_values(by='predCount', ascending=False).head(10)

Unnamed: 0,drug,predCount,predRange
231,alvocidib,46,0.098036
365,bortezomib,44,0.061509
573,selinexor,43,0.024193
117,FK-866,42,0.162204
332,napabucasin,42,0.071022
11,echinomycin,41,0.321616
71,LY3023414,41,0.214623
297,ganetespib,41,0.079222
26,epothilone-b,41,0.277473
0,YM-155,41,0.445689


In [36]:
# Average prediction range over the test-split drugs that were scored more than once.
testPredRange.loc[testPredRange['predCount'] > 1, 'predRange'].mean()

0.09124295879455577

In [37]:
# Per-drug spread of the predicted scores on the unseen-cancer split.
#   predCount - number of prediction rows for the drug
#   predRange - gap between the drug's highest and lowest predicted score
# A single groupby aggregation replaces the manual group loop; the result is ordered from
# the widest to the narrowest range and re-indexed 0..n-1 so the index doubles as the rank.
newPredRange = (
    newDF.groupby(by='drug')['pred']
    .agg(predCount='size', predMin='min', predMax='max')
    .assign(predRange=lambda stats: stats['predMax'] - stats['predMin'])
    .loc[:, ['predCount', 'predRange']]
    .reset_index()
    .sort_values(by='predRange', ascending=False)
    .reset_index(drop=True)
)
# A range is only informative for drugs that were scored more than once.
newPredRange[newPredRange.predCount > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,YM-155,45,0.565515
1,rubitecan,49,0.525881
2,genz-644282,57,0.518892
3,TAS-103,34,0.487045
4,topotecan,45,0.480699
5,10-hydroxycamptothecin,48,0.477824
6,irinotecan,28,0.476655
7,nemorubicin,44,0.46908
8,beta-lapachone,39,0.457231
9,epirubicin,36,0.439311


In [38]:
# The ten most frequently scored drugs in the unseen-cancer split; the index shown is each
# drug's rank by predRange from the previous cell.
newPredRange.sort_values(by='predCount', ascending=False).head(10)

Unnamed: 0,drug,predCount,predRange
316,ganetespib,60,0.11229
12,echinomycin,58,0.421159
367,bortezomib,57,0.090874
2,genz-644282,57,0.518892
56,verubulin,52,0.305564
95,XL888,52,0.265461
68,LY3023414,51,0.295303
234,alvocidib,51,0.142375
290,NVP-AUY922,51,0.119909
28,paclitaxel,50,0.352208


In [39]:
# Average prediction range over the unseen-cancer drugs that were scored more than once.
newPredRange.loc[newPredRange['predCount'] > 1, 'predRange'].mean()

0.12425895235275122