# Load packages

In [1]:
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
from scripts.evalModel import evalLogisticModels, getPredDist

# Define

## vars

In [3]:
# Root directory of the trained fusion models; its 'models' subfolder is listed later.
# The cluster-specific default can be overridden through the FUSION_PATH environment
# variable, so the notebook can run on another machine without editing this cell.
fusionPath = os.environ.get(
    'FUSION_PATH',
    '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/fusion/rawDrug_rawRNA/',
)
# Both are passed unchanged to evalLogisticModels as drugPath= / rnaPath=.
drugPath = None
rnaPath = None


# Data

## Load cell lines

In [4]:
# RNA tables for the three splits; the first CSV column becomes the row index.
trainRNA = pd.read_csv('../../data/processed/RNA_train_cancergenes.csv', index_col=0)
testRNA = pd.read_csv('../../data/processed/RNA_test_cancergenes.csv', index_col=0)
newRNA = pd.read_csv('../../data/processed/RNA_newcancer_cancergenes.csv', index_col=0)

# The row labels of each table are kept as that split's cell-line list
# (used below to select rows of the DepMap_ID-indexed pair table).
trainCellLines = trainRNA.index.tolist()
testCellLines = testRNA.index.tolist()
newCellLines = newRNA.index.tolist()

## CDR

In [5]:
# Drug / cell-line pair table, indexed by DepMap_ID while it is being split.
cdr = pd.read_csv('../../data/processed/drugCellLinePairsData.csv', index_col='DepMap_ID')

# Select the rows belonging to each split's cell lines, then turn DepMap_ID
# back into an ordinary column.
trainCDR = cdr.loc[trainCellLines].reset_index()
testCDR = cdr.loc[testCellLines].reset_index()
newCDR = cdr.loc[newCellLines].reset_index()

In [6]:
# Frames later handed to evaluate() as testDF / newDF: the identifying columns and
# the label of every pair, renamed to cell_line / drug / true.
evalColumns = ['DepMap_ID', 'cancer_type', 'name', 'effective']
evalRenames = {'DepMap_ID': 'cell_line', 'name': 'drug', 'effective': 'true'}

testTemp = testCDR.loc[:, evalColumns].rename(columns=evalRenames)
newTemp = newCDR.loc[:, evalColumns].rename(columns=evalRenames)

## Load drugs

In [7]:
# Drug fingerprint table: one row per drug, labelled by the first CSV column.
drugs = pd.read_csv('../../data/processed/drug_fingerprints.csv', index_col=0)

# Preview the first five rows.
drugs.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
cytarabine,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0
epinastine,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
floxuridine,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
valrubicin,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,...,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,1,0,0,1,0,1,1,1,0,1,0,0
adapalene,1,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0


In [8]:
# Build one feature row per row of each CDR split: the fingerprint row of the pair's
# drug and the RNA row of the pair's cell line, both as NumPy arrays.
trainDrugs = drugs.loc[trainCDR['name'].tolist(), :].to_numpy()
testDrugs = drugs.loc[testCDR['name'].tolist(), :].to_numpy()
newDrugs = drugs.loc[newCDR['name'].tolist(), :].to_numpy()

# NOTE(review): trainRNA / testRNA / newRNA are rebound from DataFrames to arrays here,
# so this cell cannot be re-run without first re-running the RNA loading cell.
trainRNA = trainRNA.loc[trainCDR['DepMap_ID'].tolist(), :].to_numpy()
testRNA = testRNA.loc[testCDR['DepMap_ID'].tolist(), :].to_numpy()
newRNA = newRNA.loc[newCDR['DepMap_ID'].tolist(), :].to_numpy()

In [9]:
# Each split is a two-element list: [drug features, RNA features].
trainData = [trainDrugs, trainRNA]
testData = [testDrugs, testRNA]
newData = [newDrugs, newRNA]

# Labels are extracted for the training split only.
trainEff = trainCDR['effective'].to_numpy()

In [10]:
# Drop the intermediate names; the feature arrays stay reachable through
# trainData / testData / newData.
del cdr, drugs
del trainDrugs, testDrugs, newDrugs
del trainRNA, testRNA, newRNA

# Fusion performance

In [14]:
# Every entry of <fusionPath>/models is treated as one trained model to evaluate.
modelPath = os.path.join(fusionPath, 'models')
# os.listdir returns entries in arbitrary order. Sort them so the evaluation order is
# the same on every run; the selection loops use a strict '>', so the winner among
# tied models would otherwise depend on listing order.
models = sorted(os.listdir(modelPath))

In [12]:
# Scan every saved model and keep the one whose evaluate() return value at k=1 is highest.
# The running best (starting from a floor of 0.2) is also passed back in as `thresh`.
bestModel1, thresh1 = '', 0.2
for m in models:
    currentFusion = os.path.join(modelPath, m)
    evalLog = evalLogisticModels(
        trainData.copy(), trainEff, testData.copy(), newData.copy(),
        fusionPath=currentFusion, drugPath=drugPath, rnaPath=rnaPath,
    )
    currentThresh = evalLog.evaluate(
        testDF=testTemp.copy(), newDF=newTemp.copy(),
        modelName=m, at=1, thresh=thresh1, returnThresh=True,
    )

    # Strict comparison: on a tie the model seen earlier in `models` is kept.
    if currentThresh > thresh1:
        thresh1, bestModel1 = currentThresh, m

print(f"Best model in terms of precision@1:\n{bestModel1} ({round(thresh1, 4)})")

Model: FusionFewShotRawDrugRawCell_NL128_64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna
	Precision@1: 0.3137
	Precision@2: 0.3431
	Precision@3: 0.3725
	Precision@4: 0.4265
	Precision@5: 0.4353
	Precision@10: 0.5462

Model: FusionFewShotRawDrugRawCell_NL64_32_16_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna
	Precision@1: 0.3137
	Precision@2: 0.3431
	Precision@3: 0.3529
	Precision@4: 0.3676
	Precision@5: 0.3765
	Precision@10: 0.4538

Model: FusionFewShotRawDrugRawCell_NL64_64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna
	Precision@1: 0.3529
	Precision@2: 0.3431
	Precision@3: 0.3333
	Precision@4: 0.3333
	Precision@5: 0.3529
	Precision@10: 0.4949

Best model in terms of precision@1:
FusionFewShotRawDrugRawCell_NL64_64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna (0.3529)


In [13]:
# Same scan as for k=1, but ranking models by the evaluate() return value at k=5.
# The running best (starting from a floor of 0.2) is also passed back in as `thresh`.
bestModel5, thresh5 = '', 0.2
for m in models:
    currentFusion = os.path.join(modelPath, m)
    evalLog = evalLogisticModels(
        trainData.copy(), trainEff, testData.copy(), newData.copy(),
        fusionPath=currentFusion, drugPath=drugPath, rnaPath=rnaPath,
    )
    currentThresh = evalLog.evaluate(
        testDF=testTemp.copy(), newDF=newTemp.copy(),
        modelName=m, at=5, thresh=thresh5, returnThresh=True,
    )

    # Strict comparison: on a tie the model seen earlier in `models` is kept.
    if currentThresh > thresh5:
        thresh5, bestModel5 = currentThresh, m

print(f"Best model in terms of precision@5:\n{bestModel5} ({round(thresh5, 4)})")

Model: FusionFewShotRawDrugRawCell_NL128_64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna
	Precision@1: 0.3137
	Precision@2: 0.3431
	Precision@3: 0.3725
	Precision@4: 0.4265
	Precision@5: 0.4353
	Precision@10: 0.5462

Best model in terms of precision@5:
FusionFewShotRawDrugRawCell_NL128_64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna (0.4353)


In [19]:
# Do the k=1 and k=5 selections pick the same model?
sameBestModel = bestModel1 == bestModel5
print(f"Best model same @ k=1 and k=5: {sameBestModel}")

Best model same @ k=1 and k=5: False


In [16]:
# Model used for the detailed analysis below. The name is hard-coded; it matches the
# precision@5 winner printed by the selection cell above.
bestModel = 'FusionFewShotRawDrugRawCell_NL128_64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna'
# NOTE(review): this rebinds fusionPath from the fusion root directory to one model's
# directory. Re-running the cell that builds modelPath from fusionPath after this point
# would therefore yield a different modelPath.
fusionPath = os.path.join(modelPath, bestModel)

In [17]:
# Evaluator for the selected model (fusionPath points at that model's directory here).
evalLog = evalLogisticModels(
    trainData.copy(),
    trainEff,
    testData.copy(),
    newData.copy(),
    fusionPath=fusionPath,
    drugPath=drugPath,
    rnaPath=rnaPath,
)

In [18]:
# Full evaluation of the selected model on both held-out sets. Without returnThresh the
# call returns six objects: the two prediction frames, plus the "wrong" and "counts"
# results for each set.
(testDF, newDF,
 testWrong, newWrong,
 testCounts, newCounts) = evalLog.evaluate(
    testDF=testTemp.copy(),
    newDF=newTemp.copy(),
    thresh=0.1,
)


Average Cell Line precision @ k on test set
	Precision@1: 0.3137
	Precision@2: 0.3431
	Precision@3: 0.3725
	Precision@4: 0.4265
	Precision@5: 0.4353
	Precision@10: 0.5462

Average Cell Line precision @ k on newcancer set
	Precision@1: 0.4154
	Precision@2: 0.4308
	Precision@3: 0.4462
	Precision@4: 0.4731
	Precision@5: 0.4862
	Precision@10: 0.5941


Test set:
No true effective drugs identified in top 3 for ACH-000161 (top drug: docetaxel)
No true effective drugs identified in top 3 for ACH-000280 (top drug: docetaxel)
No true effective drugs identified in top 3 for ACH-000329 (top drug: tanespimycin)
No true effective drugs identified in top 3 for ACH-000493 (top drug: mitoxantrone)
No true effective drugs identified in top 3 for ACH-000510 (top drug: tanespimycin)
No true effective drugs identified in top 3 for ACH-000527 (top drug: epothilone-d)
No true effective drugs identified in top 3 for ACH-000573 (top drug: docetaxel)
No true effective drugs identified in top 3 for ACH-000650 (t

In [19]:
# getPredDist prints the average per-drug variance of the predictions (see the cell
# output); its return value for the test set is kept in testDist.
testDist = getPredDist(testDF)

Avg varaince of predictions for each drug: 0.0036


In [20]:
# Same per-drug prediction-variance summary as for the test set, on the new-cancer set.
newDist = getPredDist(newDF)

Avg varaince of predictions for each drug: 0.0045


In [21]:
# First rows of the test-set counts table returned by evaluate()
# (columns 1, 2, 3 and total, one row per drug).
testCounts.head()

Unnamed: 0,1,2,3,total
mitoxantrone,7,3,1,11
tanespimycin,6,1,4,11
PF-03758309,5,4,1,10
docetaxel,5,2,1,8
brilliant-green,3,1,0,4


In [19]:
# Poorly predicted cell line: first ten rows of its test-set predictions.
testDF.loc[testDF.cell_line == 'ACH-000161'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
6234,ACH-000161,Lung Cancer,docetaxel,0,0.596557
6272,ACH-000161,Lung Cancer,PF-03758309,0,0.596553
6289,ACH-000161,Lung Cancer,AZD8330,0,0.593314
6261,ACH-000161,Lung Cancer,epothilone-d,1,0.587971
6259,ACH-000161,Lung Cancer,sangivamycin,1,0.58127
6278,ACH-000161,Lung Cancer,brilliant-green,1,0.581036
6235,ACH-000161,Lung Cancer,cabazitaxel,0,0.580048
6262,ACH-000161,Lung Cancer,JNJ-26481585,0,0.579571
6284,ACH-000161,Lung Cancer,gemcitabine,1,0.578958
6274,ACH-000161,Lung Cancer,YM-155,1,0.57823


In [20]:
# Second poorly predicted cell line: first ten rows of its test-set predictions.
testDF.loc[testDF.cell_line == 'ACH-000899'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
4505,ACH-000899,Skin Cancer,mitoxantrone,0,0.598364
4527,ACH-000899,Skin Cancer,sirolimus,0,0.597629
4538,ACH-000899,Skin Cancer,tanespimycin,0,0.579903
4496,ACH-000899,Skin Cancer,genz-644282,0,0.523898
4545,ACH-000899,Skin Cancer,10-hydroxycamptothecin,1,0.51545
4528,ACH-000899,Skin Cancer,alvespimycin,1,0.429023
4534,ACH-000899,Skin Cancer,gemcitabine,0,0.423734
4521,ACH-000899,Skin Cancer,epothilone-b,0,0.423228
4515,ACH-000899,Skin Cancer,JNJ-26481585,0,0.410858
4541,ACH-000899,Skin Cancer,OTS167,0,0.404912


## Cancer precision

In [25]:
# Per-cancer-type precision tables (columns p1..p5) for the test and new-cancer sets;
# copies are passed so the prediction frames themselves are not handed to the helper.
cancerTest, cancerNew = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())

In [26]:
# Test set: precision p1..p5 per cancer type.
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Head and Neck Cancer,1.0,0.833333,0.888889,0.75,0.733333
Endometrial/Uterine Cancer,0.666667,0.5,0.444444,0.5,0.6
Pancreatic Cancer,0.5,0.5,0.583333,0.625,0.55
Brain Cancer,0.5,0.375,0.416667,0.4375,0.5
Bladder Cancer,0.333333,0.166667,0.444444,0.5,0.466667
Breast Cancer,0.333333,0.166667,0.111111,0.25,0.2
Colon/Colorectal Cancer,0.25,0.125,0.333333,0.4375,0.5
Lung Cancer,0.230769,0.384615,0.384615,0.442308,0.446154
Skin Cancer,0.2,0.3,0.333333,0.35,0.36
Esophageal Cancer,0.0,0.333333,0.222222,0.166667,0.133333


In [1]:
# Column-wise mean of the per-cancer precisions on the test set, to 4 decimals.
# NOTE(review): the saved output is a NameError because this cell was last executed first
# on a fresh kernel (execution count 1); re-run it after the getCancerPerformance cell.
cancerTest.mean().round(4)

NameError: name 'cancerTest' is not defined

In [28]:
# New-cancer set: precision p1..p5 per cancer type.
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Gallbladder Cancer,1.0,1.0,0.666667,0.75,0.8
Gastric Cancer,0.642857,0.5,0.52381,0.535714,0.528571
Prostate Cancer,0.5,0.75,0.666667,0.75,0.7
Sarcoma,0.5,0.5,0.5,0.5,0.5
Kidney Cancer,0.461538,0.423077,0.410256,0.403846,0.384615
Bone Cancer,0.333333,0.5,0.592593,0.583333,0.622222
Neuroblastoma,0.333333,0.333333,0.444444,0.416667,0.4
Rhabdoid,0.25,0.375,0.416667,0.5625,0.6
Bile Duct Cancer,0.2,0.2,0.266667,0.4,0.4
Thyroid Cancer,0.125,0.25,0.208333,0.25,0.35


In [29]:
# Column-wise mean of the per-cancer precisions on the new-cancer set, to 4 decimals.
cancerNew.mean().round(4)

p1    0.4346
p2    0.4831
p3    0.4696
p4    0.5152
p5    0.5285
dtype: float64

# Check pred range by drug

In [30]:
# For every drug in the test predictions: how many predictions it has and the spread
# (max - min) of its predicted scores, ordered from widest to narrowest spread.
testPredRange = (
    pd.DataFrame(
        [
            (drug, len(subdf), subdf.pred.max() - subdf.pred.min())
            for drug, subdf in testDF.groupby(by='drug')
        ],
        columns=['drug', 'predCount', 'predRange'],
    )
    .sort_values(by='predRange', ascending=False)
    .reset_index(drop=True)
)

# A drug with a single prediction always has range 0, so those rows are hidden.
testPredRange[testPredRange.predCount > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,delanzomib,28,0.56812
1,ouabain,22,0.566837
2,daunorubicin,33,0.557075
3,echinomycin,41,0.555796
4,nemorubicin,32,0.551254
5,genz-644282,41,0.550814
6,BGT226,34,0.55045
7,paclitaxel,40,0.550424
8,mitoxantrone,34,0.550331
9,bortezomib,44,0.550175


In [34]:
# The five drugs with the most test-set predictions, with their prediction ranges.
testPredRange.sort_values(by='predCount', ascending=False).head(5)

Unnamed: 0,drug,predCount,predRange
250,alvocidib,46,0.043839
9,bortezomib,44,0.550175
217,selinexor,43,0.054523
126,FK-866,42,0.155546
502,napabucasin,42,0.001936


In [35]:
# Mean prediction range over test-set drugs that have more than one prediction.
testPredRange.loc[testPredRange.predCount > 1, 'predRange'].mean()

0.08832005814637249

In [32]:
# For every drug in the new-cancer predictions: how many predictions it has and the
# spread (max - min) of its predicted scores, ordered from widest to narrowest spread.
# Built in one pass and without in-place mutation, so re-running the cell is safe.
newPredRange = (
    pd.DataFrame(
        [
            (drug, len(subdf), subdf.pred.max() - subdf.pred.min())
            for drug, subdf in newDF.groupby(by='drug')
        ],
        columns=['drug', 'predCount', 'predRange'],
    )
    .sort_values(by='predRange', ascending=False)
    .reset_index(drop=True)
)

# A drug with a single prediction always has range 0. Hide those rows, as the
# test-set table and the mean computed from this table already do.
newPredRange[newPredRange.predCount > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,nemorubicin,44,0.560565
1,rubitecan,49,0.557356
2,BGT226,39,0.556675
3,dolastatin-10,46,0.55589
4,10-hydroxycamptothecin,48,0.554633
5,peruvoside,44,0.552549
6,ixazomib-citrate,19,0.5523
7,genz-644282,57,0.550053
8,tanespimycin,49,0.549958
9,mitoxantrone,38,0.548151


In [33]:
# Mean prediction range over new-cancer drugs that have more than one prediction.
newPredRange.loc[newPredRange.predCount > 1, 'predRange'].mean()

0.08788447095602113