# Load packages

In [1]:
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
from scripts.evalModel import evalLogisticModels, getPredDist

# Define

## vars

In [3]:
# Default encoder locations. drugPath and rnaPath are rebound by later cells
# before they are passed to evalLogisticModels; the evaluator calls in this
# notebook pass fusionPath=None literally.
rnaPath = '../../models/cellEncoders/CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000'
fusionPath = drugPath = None


# Data

## Load cell lines

In [4]:
# Read the three RNA tables (train / test / new-cancer); the first CSV column
# becomes the row index.
trainRNA = pd.read_csv('../../data/processed/RNA_train_cancergenes.csv', index_col=0)
testRNA = pd.read_csv('../../data/processed/RNA_test_cancergenes.csv', index_col=0)
newRNA = pd.read_csv('../../data/processed/RNA_newcancer_cancergenes.csv', index_col=0)

# Row labels of each table, kept as plain lists for the .loc lookups on the
# CDR table below.
trainCellLines = trainRNA.index.tolist()
testCellLines = testRNA.index.tolist()
newCellLines = newRNA.index.tolist()

## CDR

In [5]:
# Drug/cell-line pair table, indexed by DepMap_ID so each split can be pulled
# out by its list of cell-line labels.
cdr = pd.read_csv('../../data/processed/drugCellLinePairsData.csv', index_col='DepMap_ID')

# reset_index() turns DepMap_ID back into a regular column for later use.
trainCDR = cdr.loc[trainCellLines].reset_index()
testCDR = cdr.loc[testCellLines].reset_index()
newCDR = cdr.loc[newCellLines].reset_index()

In [6]:
# Evaluation templates: keep only the identifying columns plus the label, and
# rename them to cell_line / drug / true. Built the same way for both the
# test and the new-cancer split.
testTemp, newTemp = (
    pairs[['DepMap_ID', 'cancer_type', 'name', 'effective']].rename(
        columns={'DepMap_ID': 'cell_line', 'name': 'drug', 'effective': 'true'}
    )
    for pairs in (testCDR, newCDR)
)

## Load drugs

In [7]:
# Drug fingerprint table; the first CSV column becomes the index, which the
# next cell looks up using the `name` column of the CDR frames.
drugs = pd.read_csv('../../data/processed/drug_fingerprints.csv', index_col=0)

In [8]:
# Look up a fingerprint row for every row of each CDR frame, in CDR row order,
# and convert to a NumPy array.
trainDrugs = drugs.loc[trainCDR['name'].tolist()].to_numpy()
testDrugs = drugs.loc[testCDR['name'].tolist()].to_numpy()
newDrugs = drugs.loc[newCDR['name'].tolist()].to_numpy()

# Same expansion for the RNA tables, keyed by DepMap_ID.
# NOTE: this rebinds trainRNA/testRNA/newRNA from DataFrame to ndarray, so the
# cell cannot be re-run without first re-running the RNA loading cell.
trainRNA = trainRNA.loc[trainCDR['DepMap_ID'].tolist()].to_numpy()
testRNA = testRNA.loc[testCDR['DepMap_ID'].tolist()].to_numpy()
newRNA = newRNA.loc[newCDR['DepMap_ID'].tolist()].to_numpy()

In [9]:
# Training labels, taken from the `effective` column of the training pairs.
trainEff = trainCDR['effective'].to_numpy()

# Each split is packaged as a two-element list: [drug features, RNA features].
trainData, testData, newData = (
    [trainDrugs, trainRNA],
    [testDrugs, testRNA],
    [newDrugs, newRNA],
)

In [10]:
# Drop the names of the raw tables and per-split arrays. The feature arrays
# remain reachable through the trainData / testData / newData lists built in
# the previous cell.
del cdr, drugs, trainDrugs, testDrugs, newDrugs, trainRNA, testRNA, newRNA

# Model performance

In [11]:
# Root of the cell-line encoder experiment; encoder names are joined onto
# modelDir when building rnaPath in later cells.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere without editing it; consider making it configurable.
rnaEncoderPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/cellLines/siameseV1/'
modelDir = os.path.join(rnaEncoderPath, 'models')

In [12]:
# Directory holding the drug encoders; the sweep cell below lists its contents
# and the final evaluation joins bestDrug1 onto it.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
drugEncoderPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/drugs/siameseV1/models'
# Short list of drug encoder names with the author's notes.
# NOTE(review): the "1" / "5" in the trailing notes presumably refer to
# precision@1 / precision@5 - confirm. drugOpts is not referenced by any other
# cell visible in this notebook.
drugOpts = ['DrugFewShot_Layers2_Hidden16_DO0-1_AFrelu_LR0-01_DR0-99_DS1000', # best combined 1/5
            'DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000', # best at 5
            'DrugFewShot_Layers2_Hidden64_DO0-3_AFsigmoid_LR0-0001_DR0-99_DS1000'] # best at 1

In [24]:
# Sweep every entry in drugEncoderPath, pairing each with one fixed cell-line
# encoder, and remember the drug encoder whose evaluate() result is highest.
# `thresh` starts at 0.5 and is raised whenever a model beats it.
thresh = 0.5
bestModel = ''
for c in ['CellLineFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-0001_DR0-99_DS1000']:
    # os.listdir() returns entries in arbitrary order. Sorting makes the printed
    # report order reproducible and, because the comparison below is a strict
    # '>', makes the winner among tied models deterministic (first by name).
    for d in sorted(os.listdir(drugEncoderPath)):
        drugPath = os.path.join(drugEncoderPath, d)
        # .copy() on the data lists is shallow: a new list object per call, while
        # the underlying arrays are shared between iterations.
        evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                     fusionPath=None, drugPath=drugPath, rnaPath=os.path.join(modelDir, c))

        # NOTE(review): with returnThresh=True, evaluate() is assumed to return a
        # single comparable score (presumably precision at `at`) - confirm
        # against scripts.evalModel.
        currentThresh = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy(),
                                        modelName=d, at=1, thresh=thresh, returnThresh=True)
        if currentThresh > thresh:
            thresh = currentThresh
            bestModel = d

Model: DrugFewShot_Layers2_Hidden32_DO0-3_AFsigmoid_LR0-01_DR0-99_DS1000
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8693
	Precision@4: 0.8529
	Precision@5: 0.8275
	Precision@10: 0.8077

Model: DrugFewShot_Layers2_Hidden16_DO0-1_AFsigmoid_LR0-0001_DR0-99_DS1000
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8693
	Precision@4: 0.8529
	Precision@5: 0.8275
	Precision@10: 0.8077

Model: DrugFewShot_Layers1_Hidden16_DO0-3_AFrelu_LR0-0001_DR0-99_DS1000
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8693
	Precision@4: 0.8529
	Precision@5: 0.8275
	Precision@10: 0.8077

Model: DrugFewShot_Layers1_Hidden16_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8693
	Precision@4: 0.8529
	Precision@5: 0.8275
	Precision@10: 0.8077

Model: DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8693
	Precision@4: 0.8529
	Precision@5: 0.8275
	Precision@10: 0.8

In [13]:
# Encoder pair used for the detailed evaluation in the cells below.
# NOTE(review): the active bestCell1 (Layers2, LR0-001) differs from the
# cell-line encoder used in the drug sweep cell (Layers1, LR0-0001), which is
# the commented-out alternative here - confirm which pairing is intended.
bestCell1 = 'CellLineFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'
# bestCell1 = 'CellLineFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-0001_DR0-99_DS1000'
bestDrug1 = 'DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'

In [14]:
# Resolve the selected encoder pair to paths and build the evaluator from
# shallow copies of the three data lists; no fusion model is supplied.
drugPath = os.path.join(drugEncoderPath, bestDrug1)
rnaPath = os.path.join(modelDir, bestCell1)
evalLog = evalLogisticModels(
    trainData.copy(), trainEff, testData.copy(), newData.copy(),
    fusionPath=None, drugPath=drugPath, rnaPath=rnaPath,
)

In [20]:
# Run the full evaluation on copies of the templates. testDF/newDF are the
# per-pair frames (with a `pred` column) inspected by later cells, and
# testCounts/newCounts are displayed further down.
# NOTE(review): testWrong/newWrong are not used in the visible cells; their
# contents should be confirmed against scripts.evalModel.
(testDF, newDF,
 testWrong, newWrong,
 testCounts, newCounts) = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy())


Average Cell Line precision @ k on test set
	Precision@1: 0.9412
	Precision@2: 0.902
	Precision@3: 0.8497
	Precision@4: 0.848
	Precision@5: 0.8275
	Precision@10: 0.8128

Average Cell Line precision @ k on newcancer set
	Precision@1: 0.9538
	Precision@2: 0.9077
	Precision@3: 0.9026
	Precision@4: 0.8731
	Precision@5: 0.8615
	Precision@10: 0.8078


Test set:

	# of cell lines without effective drug among top-3 recs: 0
	# of unique drugs among top-3 predictions: 15

New cancer set
No true effective drugs identified in top 3 for ACH-000268 (top drug: sangivamycin)

	# of cell lines without effective drug among top-3 recs: 1
	# of unique drugs among top-3 predictions: 14


In [None]:
newDF[newDF.cell_line == 'ACH-000268']

In [17]:
# Prediction-distribution summary for the test set; per the recorded output the
# call prints the average per-drug variance of the predictions.
# NOTE(review): the structure of the returned value is not visible here.
testDist = getPredDist(testDF)

Avg varaince of predictions for each drug: 0.001


In [18]:
# Prediction-distribution summary for the new-cancer set; per the recorded
# output the call prints the average per-drug variance of the predictions.
# NOTE(review): the structure of the returned value is not visible here.
newDist = getPredDist(newDF)

Avg varaince of predictions for each drug: 0.0025


In [29]:
testCounts

Unnamed: 0,1,2,3,total
dolastatin-10,35,0,0,35
romidepsin,10,21,1,32
echinomycin,4,16,20,40
10-hydroxycamptothecin,2,5,12,19
nemorubicin,1,0,0,1
YM-155,0,4,5,9
genz-644282,0,3,2,5
maytansinol-isobutyrate,0,1,2,3
sangivamycin,0,1,0,1
UK-383367,0,1,0,1


In [30]:
testDF[testDF.cell_line == 'ACH-000650'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
3580,ACH-000650,Skin Cancer,dolastatin-10,1,0.922163
3605,ACH-000650,Skin Cancer,10-hydroxycamptothecin,1,0.853022
3526,ACH-000650,Skin Cancer,YM-155,0,0.777788
3443,ACH-000650,Skin Cancer,genz-644282,0,0.755185
3516,ACH-000650,Skin Cancer,epothilone-b,1,0.741371
3439,ACH-000650,Skin Cancer,cabazitaxel,0,0.630667
3646,ACH-000650,Skin Cancer,nemorubicin,1,0.596284
3495,ACH-000650,Skin Cancer,sangivamycin,1,0.590728
3556,ACH-000650,Skin Cancer,gemcitabine,1,0.558798
3653,ACH-000650,Skin Cancer,bafetinib,0,0.527063


In [31]:
newCounts

Unnamed: 0,1,2,3,total
dolastatin-10,46,0,0,46
romidepsin,12,34,1,47
echinomycin,7,19,31,57
triptolide,1,0,2,3
sangivamycin,1,0,1,2
10-hydroxycamptothecin,0,7,14,21
maytansinol-isobutyrate,0,2,4,6
YM-155,0,2,4,6
genz-644282,0,1,6,7
UK-383367,0,1,0,1


In [32]:
newDF[newDF.cell_line == 'ACH-000268']

Unnamed: 0,cell_line,cancer_type,drug,true,pred
9832,ACH-000268,Bile Duct Cancer,sangivamycin,0,0.667095
9839,ACH-000268,Bile Duct Cancer,BGT226,0,0.559429
9844,ACH-000268,Bile Duct Cancer,rubitecan,0,0.515812
9834,ACH-000268,Bile Duct Cancer,epothilone-d,0,0.431504
9846,ACH-000268,Bile Duct Cancer,verubulin,0,0.427305
9835,ACH-000268,Bile Duct Cancer,delanzomib,0,0.237386
9838,ACH-000268,Bile Duct Cancer,GSK2126458,0,0.214383
9847,ACH-000268,Bile Duct Cancer,VE-822,0,0.156771
9843,ACH-000268,Bile Duct Cancer,LY2606368,0,0.136426
9840,ACH-000268,Bile Duct Cancer,CGS-15943,0,0.110569


## Cancer precision

In [33]:
# Per-cancer-type performance tables for the test and new-cancer sets (the
# displayed frames below have one row per cancer type and columns p1..p5).
# Copies are passed so the prediction frames themselves are left untouched.
cancerTest, cancerNew = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())

In [34]:
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Bladder Cancer,1.0,1.0,1.0,1.0,1.0
Liver Cancer,1.0,1.0,1.0,1.0,0.9
Head and Neck Cancer,1.0,1.0,1.0,0.833333,0.8
Endometrial/Uterine Cancer,1.0,1.0,0.888889,0.916667,0.866667
Breast Cancer,1.0,1.0,0.777778,0.833333,0.8
Skin Cancer,1.0,0.9,0.8,0.8,0.84
Colon/Colorectal Cancer,1.0,0.875,0.916667,0.9375,0.9
Ovarian Cancer,1.0,0.875,0.833333,0.8125,0.85
Brain Cancer,1.0,0.75,0.666667,0.6875,0.7
Lung Cancer,0.923077,0.884615,0.871795,0.884615,0.846154


In [42]:
round(cancerTest.mean(), 4)

p1    0.9450
p2    0.9161
p3    0.8569
p4    0.8505
p5    0.8280
dtype: float64

In [35]:
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Rhabdoid,1.0,1.0,1.0,1.0,0.95
Sarcoma,1.0,1.0,1.0,1.0,0.933333
Prostate Cancer,1.0,1.0,1.0,1.0,0.9
Gastric Cancer,1.0,1.0,1.0,0.946429,0.942857
Gallbladder Cancer,1.0,1.0,1.0,0.75,0.6
Bone Cancer,1.0,0.944444,0.888889,0.861111,0.844444
Thyroid Cancer,1.0,0.875,0.875,0.875,0.9
Neuroblastoma,1.0,0.833333,0.888889,0.916667,0.933333
Kidney Cancer,0.846154,0.769231,0.74359,0.711538,0.707692
Bile Duct Cancer,0.8,0.8,0.866667,0.8,0.84


In [43]:
round(cancerNew.mean(), 4)

p1    0.9646
p2    0.9222
p3    0.9263
p4    0.8861
p5    0.8552
dtype: float64

# Check pred range by drug

In [36]:
# For every drug in the test predictions, record how many predictions it has
# and the spread (max - min) of its predicted scores, then rank drugs by that
# spread, widest first.
testPredRange = pd.DataFrame(
    [
        (drugName, len(rows), rows['pred'].max() - rows['pred'].min())
        for drugName, rows in testDF.groupby('drug')
    ],
    columns=['drug', 'predCount', 'predRange'],
)
testPredRange = (
    testPredRange
    .sort_values(by='predRange', ascending=False)
    .reset_index(drop=True)
)

# A single prediction always has zero spread, so only show drugs with more.
testPredRange.loc[testPredRange['predCount'] > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,gemcitabine,23,0.374246
1,nemorubicin,32,0.372967
2,alvespimycin,38,0.351715
3,JNJ-26481585,26,0.351391
4,epothilone-b,41,0.327766
5,genz-644282,41,0.319586
6,epothilone-d,24,0.315383
7,YM-155,41,0.302402
8,PF-03758309,29,0.284653
9,mitoxantrone,34,0.277368


In [37]:
testPredRange.sort_values(by='predCount', ascending=False).head(10)

Unnamed: 0,drug,predCount,predRange
368,alvocidib,46,0.00429
40,bortezomib,44,0.133682
315,selinexor,43,0.007473
74,FK-866,42,0.082333
354,napabucasin,42,0.004826
7,YM-155,41,0.302402
17,echinomycin,41,0.228603
344,ganetespib,41,0.005661
251,LY3023414,41,0.015761
5,genz-644282,41,0.319586


In [38]:
testPredRange[testPredRange.predCount>1].predRange.mean()

0.02898067740605632

In [39]:
# For every drug in the new-cancer predictions, record how many predictions it
# has and the spread (max - min) of its predicted scores, then rank drugs by
# that spread, widest first.
newPredRange = pd.DataFrame(
    [
        (drugName, len(rows), rows['pred'].max() - rows['pred'].min())
        for drugName, rows in newDF.groupby('drug')
    ],
    columns=['drug', 'predCount', 'predRange'],
)
newPredRange = (
    newPredRange
    .sort_values(by='predRange', ascending=False)
    .reset_index(drop=True)
)

# A single prediction always has zero spread, so only show drugs with more.
newPredRange.loc[newPredRange['predCount'] > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,OTS167,50,0.551336
1,nemorubicin,44,0.523953
2,peruvoside,44,0.486992
3,daunorubicin,47,0.474407
4,verubulin,52,0.471637
5,epothilone-b,45,0.429051
6,epothilone-d,41,0.420397
7,genz-644282,57,0.415926
8,GSK2126458,35,0.3801
9,epothilone-a,33,0.37506


In [40]:
newPredRange.sort_values(by='predCount', ascending=False).head(10)

Unnamed: 0,drug,predCount,predRange
280,ganetespib,60,0.015785
31,echinomycin,58,0.28601
17,bortezomib,57,0.348287
7,genz-644282,57,0.415926
270,XL888,52,0.017949
4,verubulin,52,0.471637
268,LY3023414,51,0.018358
306,alvocidib,51,0.01174
159,NVP-AUY922,51,0.048845
0,OTS167,50,0.551336


In [41]:
newPredRange[newPredRange.predCount>1].predRange.mean()

0.044278942687357555