# Load packages

In [1]:
import os
import numpy as np
import pandas as pd

import warnings
# NOTE(review): with no category/module filter this hides every warning for the
# rest of the session (including pandas deprecation / indexing warnings).
# Consider narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
from scripts.evalModel import evalLogisticModels, getPredDist

# Define

# Data

## Load cell lines

In [3]:
# Read the train / test / new-cancer RNA tables. The first CSV column becomes the
# row index; those labels are used below as the DepMap_ID lookup keys.
trainRNA, testRNA, newRNA = (
    pd.read_csv(rnaCsv, index_col=0)
    for rnaCsv in (
        '../../data/processed/RNA_train_cancergenes.csv',
        '../../data/processed/RNA_test_cancergenes.csv',
        '../../data/processed/RNA_newcancer_cancergenes.csv',
    )
)

# Row labels of each split, in file order.
trainCellLines = list(trainRNA.index)
testCellLines = list(testRNA.index)
newCellLines = list(newRNA.index)

## CDR

In [4]:
# Drug / cell-line pair table, indexed by DepMap_ID so each split can be selected by label.
cdr = pd.read_csv(
    '../../data/processed/drugCellLinePairsData.csv',
    index_col='DepMap_ID',
)

# One frame per split. .loc with a label list raises KeyError if a cell line is
# absent from the table; reset_index() turns DepMap_ID back into a regular column.
trainCDR, testCDR, newCDR = (
    cdr.loc[cellLines, :].reset_index()
    for cellLines in (trainCellLines, testCellLines, newCellLines)
)

In [5]:
# Slim copies of the test / new-cancer pair tables that are later handed to
# evalLog.evaluate(): keep the four identifying columns and rename them to
# cell_line / drug / true (cancer_type keeps its name).
testTemp, newTemp = (
    pairs.loc[:, ['DepMap_ID', 'cancer_type', 'name', 'effective']].rename(
        columns={'DepMap_ID': 'cell_line', 'name': 'drug', 'effective': 'true'}
    )
    for pairs in (testCDR, newCDR)
)

## Load drugs

In [6]:
# Drug fingerprint table; the first CSV column becomes the row index, which is
# looked up below using the `name` column of the pair tables.
drugs = pd.read_csv(
    '../../data/processed/drug_fingerprints.csv',
    index_col=0,
)

In [7]:
# Expand the per-drug and per-cell-line tables to one row per drug/cell-line
# pair, in the same row order as the corresponding CDR frame, then drop to NumPy.
trainDrugs, testDrugs, newDrugs = (
    drugs.loc[list(pairs['name'].values), :].to_numpy()
    for pairs in (trainCDR, testCDR, newCDR)
)

# NOTE: trainRNA / testRNA / newRNA are rebound from DataFrames to ndarrays here,
# so this cell cannot be re-run without first re-running the RNA loading cell.
# The tuple below captures the original DataFrames before any name is rebound.
trainRNA, testRNA, newRNA = (
    rnaFrame.loc[list(pairs['DepMap_ID'].values), :].to_numpy()
    for rnaFrame, pairs in (
        (trainRNA, trainCDR),
        (testRNA, testCDR),
        (newRNA, newCDR),
    )
)

In [8]:
# Each split is passed around as a two-element list: [drug features, RNA features].
trainData = [trainDrugs, trainRNA]
testData = [testDrugs, testRNA]
newData = [newDrugs, newRNA]

# Training labels: the `effective` column of the training pair table.
trainEff = trainCDR['effective'].to_numpy()

In [9]:
# Drop names that are no longer needed. The feature arrays themselves stay alive
# through trainData / testData / newData; only the source tables become collectable.
del cdr, drugs
del trainDrugs, testDrugs, newDrugs
del trainRNA, testRNA, newRNA

# Model performance

In [10]:
# Build the evaluator. list.copy() is shallow: the evaluator gets its own list
# objects but shares the underlying feature arrays with this notebook.
# NOTE(review): the effect of passing None for the three *Path arguments is
# defined in scripts.evalModel and is not visible here.
evalLog = evalLogisticModels(
    trainData.copy(),
    trainEff,
    testData.copy(),
    newData.copy(),
    fusionPath=None,
    drugPath=None,
    rnaPath=None,
)

In [11]:
# Run the evaluation on copies of the template frames; the call prints the
# precision@k summary and the top-3 diagnostics shown in the cell output.
(
    testDF, newDF,
    testWrong, newWrong,
    testCounts, newCounts,
) = evalLog.evaluate(
    testDF=testTemp.copy(),
    newDF=newTemp.copy(),
)


Average Cell Line precision @ k on test set
	Precision@1: 0.8235
	Precision@2: 0.8824
	Precision@3: 0.8824
	Precision@4: 0.8382
	Precision@5: 0.7843
	Precision@10: 0.7

Average Cell Line precision @ k on newcancer set
	Precision@1: 0.8923
	Precision@2: 0.9154
	Precision@3: 0.8615
	Precision@4: 0.8423
	Precision@5: 0.8
	Precision@10: 0.7157


Test set:

	# of cell lines without effective drug among top-3 recs: 0
	# of unique drugs among top-3 predictions: 13

New cancer set
No true effective drugs identified in top 3 for ACH-000268 (top drug: rubitecan)

	# of cell lines without effective drug among top-3 recs: 1
	# of unique drugs among top-3 predictions: 14


In [12]:
# Prints the average per-drug variance of the test-set predictions (see output);
# the helper's return value is kept in testDist.
testDist = getPredDist(testDF)

Avg varaince of predictions for each drug: 0.0031


In [13]:
# Same summary for the new-cancer set: prints the average per-drug variance of
# the predictions (see output) and keeps the helper's return value in newDist.
newDist = getPredDist(newDF)

Avg varaince of predictions for each drug: 0.0032


In [13]:
# Display the per-drug table returned by evalLog.evaluate() for the test set.
# NOTE(review): columns 1/2/3 look like how often a drug was ranked 1st/2nd/3rd
# across cell lines — confirm against scripts.evalModel.
testCounts

Unnamed: 0,1,2,3,total
echinomycin,41,0,0,41
dolastatin-10,6,29,0,35
romidepsin,2,9,21,32
10-hydroxycamptothecin,2,5,14,21
nemorubicin,1,0,0,1
alvespimycin,0,4,5,9
epothilone-b,0,2,5,7
YM-155,0,1,2,3
genz-644282,0,1,2,3
epothilone-d,0,1,0,1


In [14]:
# Display the per-drug table returned by evalLog.evaluate() for the new-cancer set.
# NOTE(review): columns 1/2/3 look like how often a drug was ranked 1st/2nd/3rd
# across cell lines — confirm against scripts.evalModel.
newCounts

Unnamed: 0,1,2,3,total
echinomycin,58,0,0,58
dolastatin-10,4,42,0,46
romidepsin,3,12,32,47
alvespimycin,1,0,8,9
rubitecan,1,0,1,2
10-hydroxycamptothecin,0,8,15,23
genz-644282,0,2,2,4
epothilone-b,0,1,3,4
thiostrepton,0,1,1,2
epothilone-d,0,1,0,1


In [15]:
# Inspect the one new-cancer cell line that evaluate() reported as having no
# truly effective drug among its top-3 recommendations.
newDF.loc[newDF['cell_line'] == 'ACH-000268']

Unnamed: 0,cell_line,cancer_type,drug,true,pred
9844,ACH-000268,Bile Duct Cancer,rubitecan,0,0.459691
9834,ACH-000268,Bile Duct Cancer,epothilone-d,0,0.383715
9839,ACH-000268,Bile Duct Cancer,BGT226,0,0.360217
9838,ACH-000268,Bile Duct Cancer,GSK2126458,0,0.158222
9841,ACH-000268,Bile Duct Cancer,LY3023414,1,0.15121
9833,ACH-000268,Bile Duct Cancer,GDC-0980,0,0.122481
9846,ACH-000268,Bile Duct Cancer,verubulin,0,0.111592
9835,ACH-000268,Bile Duct Cancer,delanzomib,0,0.083737
9850,ACH-000268,Bile Duct Cancer,bardoxolone-methyl,0,0.07786
9847,ACH-000268,Bile Duct Cancer,VE-822,0,0.075815


## Cancer precision

In [16]:
# Per-cancer-type precision tables for the test and new-cancer sets; copies are
# passed so the helper cannot modify testDF / newDF.
cancerTest, cancerNew = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())

In [17]:
# Display precision@1..5 per cancer type on the test set (one row per cancer type).
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Liver Cancer,1.0,1.0,1.0,1.0,1.0
Bladder Cancer,1.0,1.0,1.0,0.916667,0.866667
Endometrial/Uterine Cancer,1.0,1.0,1.0,0.916667,0.866667
Head and Neck Cancer,1.0,1.0,1.0,0.916667,0.866667
Skin Cancer,1.0,1.0,0.933333,0.8,0.72
Brain Cancer,1.0,0.875,0.833333,0.75,0.75
Lung Cancer,0.769231,0.884615,0.871795,0.826923,0.769231
Colon/Colorectal Cancer,0.75,0.875,0.916667,0.9375,0.8
Ovarian Cancer,0.75,0.875,0.833333,0.8125,0.8
Breast Cancer,0.666667,0.833333,0.777778,0.833333,0.733333


In [27]:
# Column-wise mean over the rows, i.e. every cancer type counts equally no matter
# how many cell lines it has. This is why the values differ from the
# per-cell-line averages printed by evaluate().
cancerTest.mean()

p1    0.841880
p2    0.886218
p3    0.891168
p4    0.852591
p5    0.801880
dtype: float64

In [18]:
# Display precision@1..5 per cancer type on the new-cancer set (one row per cancer type).
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Prostate Cancer,1.0,1.0,1.0,1.0,0.9
Gastric Cancer,1.0,1.0,0.928571,0.946429,0.914286
Bone Cancer,1.0,1.0,0.925926,0.861111,0.8
Neuroblastoma,1.0,1.0,0.888889,0.833333,0.8
Rhabdoid,1.0,1.0,0.833333,0.8125,0.8
Gallbladder Cancer,1.0,1.0,0.666667,0.5,0.6
Sarcoma,1.0,0.916667,0.944444,0.833333,0.8
Thyroid Cancer,0.875,0.875,0.916667,0.90625,0.85
Bile Duct Cancer,0.8,0.8,0.8,0.8,0.76
Kidney Cancer,0.615385,0.769231,0.692308,0.711538,0.661538


In [26]:
# Column-wise mean over the rows, i.e. every cancer type counts equally no matter
# how many cell lines it has. This is why the values differ from the
# per-cell-line averages printed by evaluate().
cancerNew.mean()

p1    0.929038
p2    0.936090
p3    0.859681
p4    0.820449
p5    0.788582
dtype: float64

# Check pred range by drug

In [19]:
def getPredRange(predDF):
    """Summarise how widely each drug's predictions vary across its rows.

    Parameters
    ----------
    predDF : pd.DataFrame
        Must contain a 'drug' column (grouping key) and a 'pred' column.

    Returns
    -------
    pd.DataFrame
        Columns 'drug', 'predCount' (number of rows for that drug) and
        'predRange' (max pred - min pred), sorted by 'predRange' descending.
        The index is reset to 0..n-1, so it equals the drug's rank by range.
    """
    # One vectorised pass instead of a Python loop over groups; groups come back
    # sorted by drug, the same order the loop-based version produced.
    perDrug = predDF.groupby(by='drug')['pred'].agg(['size', 'min', 'max'])
    summary = pd.DataFrame({
        'drug': perDrug.index.to_numpy(),
        'predCount': perDrug['size'].to_numpy(),
        'predRange': (perDrug['max'] - perDrug['min']).to_numpy(),
    })
    return summary.sort_values(by='predRange', ascending=False).reset_index(drop=True)


testPredRange = getPredRange(testDF)

# A drug with a single prediction has a range of 0 by construction; hide those.
testPredRange[testPredRange.predCount > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,alvespimycin,38,0.662274
1,10-hydroxycamptothecin,38,0.657259
2,epothilone-b,41,0.653119
3,genz-644282,41,0.650048
4,YM-155,41,0.632166
5,romidepsin,32,0.620055
6,tanespimycin,39,0.619241
7,dolastatin-10,35,0.605186
8,cephalomannine,21,0.600409
9,nanchangmycin,14,0.596181


In [20]:
# Ten most frequently predicted drugs in the test set. The index shown is each
# drug's position in the predRange ranking built in the previous cell.
(
    testPredRange
    .sort_values('predCount', ascending=False)
    .head(10)
)

Unnamed: 0,drug,predCount,predRange
118,alvocidib,46,0.187772
43,bortezomib,44,0.359164
354,selinexor,43,0.083273
64,FK-866,42,0.274333
168,napabucasin,42,0.151051
207,ganetespib,41,0.131437
30,echinomycin,41,0.46042
48,LY3023414,41,0.338669
4,YM-155,41,0.632166
3,genz-644282,41,0.650048


In [23]:
# Mean prediction range over drugs that have more than one prediction in the test set.
testPredRange.loc[testPredRange['predCount'] > 1, 'predRange'].mean()

0.12167913346743009

In [21]:
# Per-drug spread of the new-cancer predictions: number of rows per drug
# (predCount) and max - min of `pred` (predRange). Computed in one vectorised
# groupby pass, then sorted by predRange descending with a fresh 0..n-1 index,
# so the index equals the drug's rank by range.
newPredRange = (
    newDF.groupby(by='drug')['pred']
    .agg(['size', 'min', 'max'])
    .assign(predRange=lambda stats: stats['max'] - stats['min'])
    .rename(columns={'size': 'predCount'})
    .loc[:, ['predCount', 'predRange']]
    .reset_index()  # 'drug' moves from the index to the first column
    .sort_values(by='predRange', ascending=False)
    .reset_index(drop=True)
)

# A drug with a single prediction has a range of 0 by construction; hide those.
newPredRange[newPredRange.predCount > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,alvespimycin,43,0.703147
1,genz-644282,57,0.699525
2,10-hydroxycamptothecin,48,0.690107
3,YM-155,45,0.684376
4,rubitecan,49,0.678532
5,tanespimycin,49,0.678014
6,nemorubicin,44,0.67639
7,epothilone-b,45,0.672247
8,OTS167,50,0.653271
9,topotecan,45,0.649348


In [24]:
# Ten most frequently predicted drugs in the new-cancer set. The index shown is
# each drug's position in the predRange ranking built in the previous cell.
(
    newPredRange
    .sort_values('predCount', ascending=False)
    .head(10)
)

Unnamed: 0,drug,predCount,predRange
170,ganetespib,60,0.171979
34,echinomycin,58,0.471247
1,genz-644282,57,0.699525
38,bortezomib,57,0.433603
99,verubulin,52,0.244728
365,XL888,52,0.080448
42,LY3023414,51,0.411
103,alvocidib,51,0.240426
412,NVP-AUY922,51,0.069016
8,OTS167,50,0.653271


In [25]:
# Mean prediction range over drugs that have more than one prediction in the new-cancer set.
newPredRange.loc[newPredRange['predCount'] > 1, 'predRange'].mean()

0.13067361214953396