# Load packages

In [2]:
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [1]:
from scripts.evalModel import evalLogisticModels, getPredDist

# Define

## vars

In [11]:
fusionPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/fusion/embedDrug_embedRNA_V2/'
drugPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/drugs/siameseV1/models/DrugFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'
rnaPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/cellLines/siameseV1/models/CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000'



# Data

## Load cell lines

In [4]:
trainRNA = pd.read_csv('../../data/processed/RNA_train_cancergenes.csv', index_col=0)
trainCellLines = list(trainRNA.index)

testRNA = pd.read_csv('../../data/processed/RNA_test_cancergenes.csv', index_col=0)
testCellLines = list(testRNA.index)

newRNA = pd.read_csv('../../data/processed/RNA_newcancer_cancergenes.csv', index_col=0)
newCellLines = list(newRNA.index)

## CDR

In [5]:
cdr = pd.read_csv('../../data/processed/drugCellLinePairsData.csv', index_col='DepMap_ID')
trainCDR = cdr.loc[trainCellLines, :].reset_index()
testCDR = cdr.loc[testCellLines, :].reset_index()
newCDR = cdr.loc[newCellLines, :].reset_index()

In [6]:
testTemp = testCDR.loc[:, ['DepMap_ID', 'cancer_type', 'name', 'effective']].rename(columns={'DepMap_ID':'cell_line',
                                                                                             'name': 'drug',
                                                                                             'effective': 'true'})

newTemp = newCDR.loc[:, ['DepMap_ID', 'cancer_type', 'name', 'effective']].rename(columns={'DepMap_ID':'cell_line',
                                                                                          'name': 'drug',
                                                                                          'effective': 'true'})

# Load drugs

In [7]:
drugs = pd.read_csv('../../data/processed/drug_fingerprints.csv', index_col=0)

drugs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
cytarabine,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
epinastine,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
floxuridine,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
valrubicin,1,0,0,0,0,1,0,0,1,0,...,0,1,0,1,1,1,0,1,0,0
adapalene,1,1,1,0,0,1,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0


In [8]:
trainDrugs = drugs.loc[list(trainCDR.name.values), :].to_numpy()
testDrugs = drugs.loc[list(testCDR.name.values), :].to_numpy()
newDrugs = drugs.loc[list(newCDR.name.values), :].to_numpy()

trainRNA = trainRNA.loc[list(trainCDR.DepMap_ID.values), :].to_numpy()
testRNA = testRNA.loc[list(testCDR.DepMap_ID.values), :].to_numpy()
newRNA = newRNA.loc[list(newCDR.DepMap_ID.values), :].to_numpy()

In [9]:
trainData = [trainDrugs, trainRNA]
trainEff = trainCDR.effective.to_numpy()
testData = [testDrugs, testRNA]
newData = [newDrugs, newRNA]

In [10]:
del cdr, drugs, trainDrugs, testDrugs, newDrugs, trainRNA, testRNA, newRNA

# Fusion performance

In [12]:
modelPath = os.path.join(fusionPath, 'models')
models = os.listdir(modelPath)

In [28]:
thresh1 = 0.9
bestModel1 = ''
for m in models:
    currentFusion = os.path.join(modelPath, m)
    evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                fusionPath=currentFusion, drugPath=drugPath, rnaPath=rnaPath)
    
    currentThresh = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy(), 
                                     modelName=m, at=1, thresh=thresh1, returnThresh=True)
    
    if (currentThresh > thresh1):
        thresh1 = currentThresh
        bestModel1 = m

print(f"Best model in terms of precision@1:\n{bestModel1} ({round(thresh1, 4)})")

Model: FusionFewShotEmbedDrugEmbedCell_NL128_64_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna
	Precision@1: 0.9412
	Precision@2: 0.8725
	Precision@3: 0.8039
	Precision@4: 0.7941
	Precision@5: 0.7765
	Precision@10: 0.741

Best model in terms of precision@1:
FusionFewShotEmbedDrugEmbedCell_NL128_64_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna (0.9412)


In [29]:
thresh5 = 0.7
bestModel5 = ''
for m in models:
    currentFusion = os.path.join(modelPath, m)
    evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                fusionPath=currentFusion, drugPath=drugPath, rnaPath=rnaPath)
    
    currentThresh = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy(), 
                                     modelName=m, at=5, thresh=thresh5, returnThresh=True)
    
    if (currentThresh > thresh5):
        thresh5 = currentThresh
        bestModel5 = m
        
print(f"Best model in terms of precision@5:\n{bestModel5} ({round(thresh5, 4)})")

Model: FusionFewShotEmbedDrugEmbedCell_NL64_64_DO0-0_AFrelu_LR0-001_DR0-99_DS1024_BYrna
	Precision@1: 0.8431
	Precision@2: 0.7451
	Precision@3: 0.7647
	Precision@4: 0.7549
	Precision@5: 0.7333
	Precision@10: 0.6846

Model: FusionFewShotEmbedDrugEmbedCell_NL128_64_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna
	Precision@1: 0.9412
	Precision@2: 0.8725
	Precision@3: 0.8039
	Precision@4: 0.7941
	Precision@5: 0.7765
	Precision@10: 0.741

Best model in terms of precision@5:
FusionFewShotEmbedDrugEmbedCell_NL128_64_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna (0.7765)


In [30]:
print(f"Best model same @ k=1 and k=5: {bestModel1==bestModel5}")

Best model same @ k=1 and k=5: True


In [31]:
# bestModel is both bestModel1 and bestModel5
bestModel = 'FusionFewShotEmbedDrugEmbedCell_NL128_64_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna'
fusionPath = os.path.join(modelPath, bestModel)

In [32]:
evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                fusionPath=fusionPath, drugPath=drugPath, rnaPath=rnaPath)

In [33]:
testDF, newDF, testWrong, newWrong, testCounts, newCounts = evalLog.evaluate(testDF=testTemp.copy(), 
                                                                            newDF=newTemp.copy(),
                                                                            thresh=0.1)


Average Cell Line precision @ k on test set
	Precision@1: 0.9412
	Precision@2: 0.8725
	Precision@3: 0.8039
	Precision@4: 0.7941
	Precision@5: 0.7765
	Precision@10: 0.741

Average Cell Line precision @ k on newcancer set
	Precision@1: 0.9538
	Precision@2: 0.9
	Precision@3: 0.8462
	Precision@4: 0.8192
	Precision@5: 0.7846
	Precision@10: 0.7275


Test set:
No true effective drugs identified in top 3 for ACH-000823 (top drug: dolastatin-10)
No true effective drugs identified in top 3 for ACH-000899 (top drug: genz-644282)

	# of cell lines without effective drug among top-3 recs: 2
	# of unique drugs among top-3 predictions: 13

New cancer set
No true effective drugs identified in top 3 for ACH-000250 (top drug: dolastatin-10)
No true effective drugs identified in top 3 for ACH-000268 (top drug: epothilone-d)
No true effective drugs identified in top 3 for ACH-000428 (top drug: echinomycin)

	# of cell lines without effective drug among top-3 recs: 3
	# of unique drugs among top-3 predicti

In [34]:
testDist = getPredDist(testDF)

Avg varaince of predictions for each drug: 0.0


In [35]:
newDist = getPredDist(newDF)

Avg varaince of predictions for each drug: 0.0004


In [68]:
testCounts

Unnamed: 0,1,2,3,total
dolastatin-10,35,0,0,35
echinomycin,7,28,6,41
triptolide,5,8,0,13
nemorubicin,2,5,19,26
genz-644282,2,2,4,8
romidepsin,1,2,10,13
OTS167,0,4,2,6
vincristine,0,2,2,4
epothilone-d,0,1,1,2
epothilone-b,0,0,4,4


In [22]:
testDF[testDF.cell_line == 'ACH-000823'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
1463,ACH-000823,Esophageal Cancer,dolastatin-10,0,0.821118
1481,ACH-000823,Esophageal Cancer,echinomycin,0,0.75661
1499,ACH-000823,Esophageal Cancer,nemorubicin,0,0.676524
1374,ACH-000823,Esophageal Cancer,romidepsin,1,0.658472
1384,ACH-000823,Esophageal Cancer,genz-644282,1,0.602556
1466,ACH-000823,Esophageal Cancer,OTS167,1,0.588572
1424,ACH-000823,Esophageal Cancer,JNJ-26481585,0,0.550937
1436,ACH-000823,Esophageal Cancer,YM-155,1,0.506562
1454,ACH-000823,Esophageal Cancer,maytansinol-isobutyrate,1,0.452083
1501,ACH-000823,Esophageal Cancer,KPT-185,0,0.433743


In [21]:
print(testDF[testDF.cell_line == 'ACH-000823'].true.sum())
print(len(testDF[testDF.cell_line == 'ACH-000823']))

9
146


While none in top3, over half of the GT effective drugs are in top 10 recs

In [71]:
testDF[testDF.cell_line == 'ACH-000899'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
4496,ACH-000899,Skin Cancer,genz-644282,0,0.602534
4541,ACH-000899,Skin Cancer,OTS167,0,0.588493
4521,ACH-000899,Skin Cancer,epothilone-b,0,0.583782
4528,ACH-000899,Skin Cancer,alvespimycin,1,0.581571
4515,ACH-000899,Skin Cancer,JNJ-26481585,0,0.550949
4545,ACH-000899,Skin Cancer,10-hydroxycamptothecin,1,0.499723
4531,ACH-000899,Skin Cancer,CUDC-907,0,0.462212
4565,ACH-000899,Skin Cancer,dabrafenib,1,0.385834
4547,ACH-000899,Skin Cancer,trichostatin-a,0,0.379966
4514,ACH-000899,Skin Cancer,sangivamycin,1,0.376427


In [72]:
print(testDF[testDF.cell_line == 'ACH-000899'].true.sum())
print(len(testDF[testDF.cell_line == 'ACH-000899']))

10
92


## Cancer

In [23]:
cancerTest, cancerNew = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())

In [24]:
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Head and Neck Cancer,1.0,1.0,1.0,1.0,0.933333
Endometrial/Uterine Cancer,1.0,1.0,0.888889,0.833333,0.866667
Bladder Cancer,1.0,1.0,0.888889,0.833333,0.8
Liver Cancer,1.0,1.0,0.833333,0.875,0.8
Ovarian Cancer,1.0,1.0,0.833333,0.8125,0.75
Colon/Colorectal Cancer,1.0,0.875,0.833333,0.875,0.9
Brain Cancer,1.0,0.875,0.833333,0.875,0.8
Pancreatic Cancer,1.0,0.875,0.75,0.75,0.7
Breast Cancer,1.0,0.833333,0.777778,0.666667,0.666667
Lung Cancer,0.923077,0.846154,0.820513,0.788462,0.769231


In [25]:
round(cancerTest.mean(), 4)

p1    0.9491
p2    0.8837
p3    0.8068
p4    0.7994
p5    0.7810
dtype: float64

In [26]:
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Prostate Cancer,1.0,1.0,1.0,1.0,1.0
Rhabdoid,1.0,1.0,1.0,1.0,0.95
Gastric Cancer,1.0,1.0,1.0,0.964286,0.928571
Bone Cancer,1.0,1.0,0.925926,0.805556,0.8
Gallbladder Cancer,1.0,1.0,0.666667,0.5,0.4
Thyroid Cancer,1.0,0.9375,0.833333,0.84375,0.825
Sarcoma,1.0,0.916667,0.777778,0.833333,0.733333
Neuroblastoma,1.0,0.833333,0.777778,0.833333,0.8
Bile Duct Cancer,1.0,0.7,0.666667,0.6,0.68
Kidney Cancer,0.769231,0.730769,0.692308,0.673077,0.6


In [27]:
round(cancerNew.mean(), 4)

p1    0.9769
p2    0.9118
p3    0.8340
p4    0.8053
p5    0.7717
dtype: float64

In [83]:
newDF[newDF.drug == 'verubulin']

Unnamed: 0,cell_line,cancer_type,drug,true,pred
9040,ACH-000099,Neuroblastoma,verubulin,1,0.744472
4126,ACH-000312,Neuroblastoma,verubulin,0,0.744322
388,ACH-000804,Neuroblastoma,verubulin,1,0.371728
3419,ACH-000375,Kidney Cancer,verubulin,0,0.350642
4695,ACH-000313,Kidney Cancer,verubulin,0,0.349951
5785,ACH-000189,Kidney Cancer,verubulin,0,0.348735
7156,ACH-000169,Sarcoma,verubulin,1,0.34855
4836,ACH-000649,Kidney Cancer,verubulin,0,0.348521
9335,ACH-000364,Bone Cancer,verubulin,0,0.34851
972,ACH-001321,Thyroid Cancer,verubulin,0,0.348345


In [84]:
newCDR[(newCDR.cancer_type == 'Neuroblastoma') & (newCDR.name == 'PIK-75')]

Unnamed: 0,DepMap_ID,cancer_type,name,moa,target,indication,phase,r2,ic50,auc,lower_limit,effectiveCont,effective
207,ACH-000804,Neuroblastoma,PIK-75,"DNA protein kinase inhibitor, PI3K inhibitor","PIK3CA, PIK3CB, PIK3CD, PIK3CG, PRKDC",,Preclinical,0.835274,0.117902,0.542702,0.000697,8.911734,1
4037,ACH-000312,Neuroblastoma,PIK-75,"DNA protein kinase inhibitor, PI3K inhibitor","PIK3CA, PIK3CB, PIK3CD, PIK3CG, PRKDC",,Preclinical,0.735792,0.119954,0.557068,0.029572,5.186187,0


In [85]:
newDF[newDF.cell_line == 'ACH-000804'].head(15)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
244,ACH-000804,Neuroblastoma,dolastatin-10,1,0.818631
322,ACH-000804,Neuroblastoma,echinomycin,1,0.750352
340,ACH-000804,Neuroblastoma,nemorubicin,1,0.672994
233,ACH-000804,Neuroblastoma,vincristine,1,0.638624
43,ACH-000804,Neuroblastoma,genz-644282,1,0.599665
120,ACH-000804,Neuroblastoma,epothilone-b,1,0.585357
253,ACH-000804,Neuroblastoma,OTS167,1,0.578488
426,ACH-000804,Neuroblastoma,alvespimycin,0,0.577298
356,ACH-000804,Neuroblastoma,NSC-697923,1,0.540885
417,ACH-000804,Neuroblastoma,paclitaxel,1,0.52311


Seems to be a few outliers for each drug cell line pair, but largely clustered around very small range