# Load packages

In [1]:
import os
import numpy as np
import pandas as pd

import warnings
# Suppress every warning for the remainder of the session so library warnings
# do not clutter cell output.
# NOTE(review): this is a blanket filter, so warnings that may be relevant to
# the results are hidden too — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
from scripts.evalModel import evalLogisticModels, getPredDist

# Define

## vars

In [3]:
# Locations of saved model artefacts handed to evalLogisticModels.
# NOTE: rnaPath and fusionPath are both reassigned in the "Fusion performance"
# section before they are first used; the values here act only as defaults.

# No drug path is supplied (passed through as drugPath=None).
drugPath = None

# Cell-line encoder checkpoint (relative to this notebook).
rnaPath = '../../models/cellEncoders/CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000'

# Root directory of the raw-drug / embedded-RNA fusion experiments.
fusionPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/fusion/rawDrug_embedRNA/'


# Data

## Load cell lines

In [3]:
# Read the train, test and 'newcancer' RNA tables; the first CSV column
# becomes the row index of each frame.
trainRNA = pd.read_csv('../../data/processed/RNA_train_cancergenes.csv', index_col=0)
testRNA = pd.read_csv('../../data/processed/RNA_test_cancergenes.csv', index_col=0)
newRNA = pd.read_csv('../../data/processed/RNA_newcancer_cancergenes.csv', index_col=0)

# Row labels of each split, used below to select the matching rows of the
# drug / cell-line pair table.
trainCellLines = trainRNA.index.tolist()
testCellLines = testRNA.index.tolist()
newCellLines = newRNA.index.tolist()

## CDR

In [4]:
# Drug / cell-line pair table, indexed by DepMap_ID so that each split can be
# selected by its list of cell-line labels.
cdr = pd.read_csv('../../data/processed/drugCellLinePairsData.csv', index_col='DepMap_ID')

# Label-based selection per split; reset_index() turns DepMap_ID back into a
# regular column and gives each split a fresh 0..n-1 index.
trainCDR = cdr.loc[trainCellLines].reset_index()
testCDR = cdr.loc[testCellLines].reset_index()
newCDR = cdr.loc[newCellLines].reset_index()

In [5]:
# Columns carried into the evaluation frames and the names they are given
# there (DepMap_ID -> cell_line, name -> drug, effective -> true).
evalColumns = ['DepMap_ID', 'cancer_type', 'name', 'effective']
evalColumnNames = {'DepMap_ID': 'cell_line', 'name': 'drug', 'effective': 'true'}

# Template frames for the test and newcancer splits; copies of these are
# passed to evalLog.evaluate further down.
testTemp = testCDR[evalColumns].rename(columns=evalColumnNames)
newTemp = newCDR[evalColumns].rename(columns=evalColumnNames)

## Load drugs

In [6]:
# Drug fingerprint table; the first CSV column (drug name) becomes the index,
# so rows can be looked up by drug name below.
drugs = pd.read_csv('../../data/processed/drug_fingerprints.csv', index_col=0)

# Preview the first rows as a sanity check of the loaded table.
drugs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
cytarabine,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
epinastine,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
floxuridine,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
valrubicin,1,0,0,0,0,1,0,0,1,0,...,0,1,0,1,1,1,0,1,0,0
adapalene,1,1,1,0,0,1,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0


In [7]:
# Expand the drug table to one row per (drug, cell line) pair, in the row
# order of each CDR split, and convert to plain NumPy arrays.
trainDrugs = drugs.loc[trainCDR['name'].tolist()].to_numpy()
testDrugs = drugs.loc[testCDR['name'].tolist()].to_numpy()
newDrugs = drugs.loc[newCDR['name'].tolist()].to_numpy()

# Do the same for the RNA tables, keyed by DepMap_ID. The *RNA names are
# rebound here from DataFrames to the pair-aligned arrays.
trainRNA = trainRNA.loc[trainCDR['DepMap_ID'].tolist()].to_numpy()
testRNA = testRNA.loc[testCDR['DepMap_ID'].tolist()].to_numpy()
newRNA = newRNA.loc[newCDR['DepMap_ID'].tolist()].to_numpy()

In [8]:
# Labels for the training pairs.
trainEff = trainCDR['effective'].to_numpy()

# Each split is passed to the evaluator as a two-element list:
# [drug array, RNA array], with rows aligned pair by pair.
trainData = [trainDrugs, trainRNA]
testData = [testDrugs, testRNA]
newData = [newDrugs, newRNA]

In [9]:
# Drop names that are no longer needed. The arrays themselves stay alive
# because trainData / testData / newData still hold references to them.
del cdr, drugs
del trainDrugs, testDrugs, newDrugs
del trainRNA, testRNA, newRNA

# Fusion performance

In [15]:
# Cell-line encoder used for every fusion evaluation below. This replaces the
# rnaPath value assigned in the "vars" section.
rnaModel = 'CellLineFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'
rnaPath = os.path.join(
    '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/cellLines/siameseV1/models/',
    rnaModel,
)

In [12]:
# Root of the fusion experiments and the directory holding one entry per
# saved fusion model.
fusionPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/fusion/rawDrug_embedRNA/'
modelPath = os.path.join(fusionPath, 'models')

# os.listdir returns entries in arbitrary order. The selection loops below
# keep the first model that reaches a given score (strict '>'), so sort the
# listing to make the chosen "best" model reproducible across runs/machines.
models = sorted(os.listdir(modelPath))

In [None]:
# Scan every saved fusion model and keep the one with the highest value
# returned by evaluate(..., at=1, returnThresh=True) — presumably the
# precision@1 score (TODO confirm against scripts.evalModel).
# thresh1 starts at a 0.7 baseline and is raised whenever a model beats it;
# bestModel1 stays '' if no model exceeds the baseline.
thresh1 = 0.7
bestModel1 = ''
for m in models:
    currentFusion = os.path.join(modelPath, m)
    # Shallow copies of the input lists are handed over so the evaluator
    # cannot rebind the elements of the notebook-level lists.
    evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                 fusionPath=currentFusion, drugPath=None, rnaPath=rnaPath)

    # The running best is passed as thresh on every call.
    currentThresh = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy(),
                                     modelName=m, at=1, thresh=thresh1, returnThresh=True)

    if currentThresh > thresh1:
        thresh1 = currentThresh
        bestModel1 = m

if bestModel1:
    print(f"Best model in terms of precision@1:\n{bestModel1} ({round(thresh1, 4)})")
else:
    # Without this branch the baseline would be reported as if it were the
    # score of an (empty-named) winning model.
    print(f"No model exceeded the baseline precision@1 of {thresh1}")

In [17]:
# Scan every saved fusion model and keep the one with the highest value
# returned by evaluate(..., at=5, returnThresh=True) — presumably the
# precision@5 score (TODO confirm against scripts.evalModel).
# thresh5 starts at a 0.75 baseline and is raised whenever a model beats it;
# bestModel5 stays '' if no model exceeds the baseline.
thresh5 = 0.75
bestModel5 = ''
for m in models:
    currentFusion = os.path.join(modelPath, m)
    # Shallow copies of the input lists are handed over so the evaluator
    # cannot rebind the elements of the notebook-level lists.
    evalLog = evalLogisticModels(trainData.copy(), trainEff, testData.copy(), newData.copy(),
                                 fusionPath=currentFusion, drugPath=None, rnaPath=rnaPath)

    # The running best is passed as thresh on every call.
    currentThresh = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy(),
                                     modelName=m, at=5, thresh=thresh5, returnThresh=True)

    if currentThresh > thresh5:
        thresh5 = currentThresh
        bestModel5 = m

if bestModel5:
    print(f"Best model in terms of precision@5:\n{bestModel5} ({round(thresh5, 4)})")
else:
    # Without this branch the baseline would be reported as if it were the
    # score of an (empty-named) winning model.
    print(f"No model exceeded the baseline precision@5 of {thresh5}")

Model: FusionFewShotRawDrugEmbedCell_NL64_64_64_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna
	Precision@1: 0.8039
	Precision@2: 0.8235
	Precision@3: 0.817
	Precision@4: 0.8039
	Precision@5: 0.7647
	Precision@10: 0.7974

Best model in terms of precision@5:
FusionFewShotRawDrugEmbedCell_NL64_64_64_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna (0.7647)


In [18]:
# Check whether the precision@1 and precision@5 searches picked the same model.
sameBestModel = bestModel1 == bestModel5
print(f"Best model same @ k=1 and k=5: {sameBestModel}")

Best model same @ k=1 and k=5: False


In [12]:
# Model analysed in the rest of the notebook. The name is hard-coded rather
# than read from bestModel1, so this cell does not require the (slow) search
# cells above to have been run in the current session.
bestModel = 'FusionFewShotRawDrugEmbedCell_NL64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna' # this is bestModel1
# fusionPath is rebound from the experiment root to this model's own path.
fusionPath = os.path.join(modelPath, bestModel)

In [13]:
# Build the evaluator for the selected fusion model. Shallow copies of the
# input lists are passed so the notebook-level lists are not rebound.
evalLog = evalLogisticModels(
    trainData.copy(),
    trainEff,
    testData.copy(),
    newData.copy(),
    fusionPath=fusionPath,
    drugPath=drugPath,
    rnaPath=rnaPath,
)

In [14]:
# Evaluate on the test and newcancer splits. The call prints the
# precision@k summaries and returns six objects, unpacked below.
evalResults = evalLog.evaluate(
    testDF=testTemp.copy(),
    newDF=newTemp.copy(),
    thresh=0.1,
)
testDF, newDF, testWrong, newWrong, testCounts, newCounts = evalResults


Average Cell Line precision @ k on test set
	Precision@1: 0.8431
	Precision@2: 0.8431
	Precision@3: 0.817
	Precision@4: 0.8186
	Precision@5: 0.7608
	Precision@10: 0.7538

Average Cell Line precision @ k on newcancer set
	Precision@1: 0.8154
	Precision@2: 0.8
	Precision@3: 0.8308
	Precision@4: 0.8385
	Precision@5: 0.8123
	Precision@10: 0.751


Test set:
No true effective drugs identified in top 3 for ACH-000329 (top drug: 10-hydroxycamptothecin)

	# of cell lines without effective drug among top-3 recs: 1
	# of unique drugs among top-3 predictions: 11

New cancer set
No true effective drugs identified in top 3 for ACH-000268 (top drug: epothilone-d)
No true effective drugs identified in top 3 for ACH-000457 (top drug: genz-644282)
No true effective drugs identified in top 3 for ACH-000484 (top drug: genz-644282)

	# of cell lines without effective drug among top-3 recs: 3
	# of unique drugs among top-3 predictions: 15


In [16]:
# Per-drug prediction statistics for the test set. getPredDist also prints
# the average variance; the returned frame has predCount and variance
# columns (both are read in the next cell).
testDist = getPredDist(testDF)

Avg varaince of predictions for each drug: 0.0


In [19]:
# Mean variance restricted to drugs that have more than one prediction.
testDist.loc[testDist['predCount'] > 1, 'variance'].mean()

5.966039889199543e-07

In [17]:
# Same per-drug prediction statistics, computed on the newcancer predictions.
newDist = getPredDist(newDF)

Avg varaince of predictions for each drug: 0.0004


In [22]:
# Counts table returned by evalLog.evaluate for the test set.
# NOTE(review): columns 1/2/3/total presumably count how often each drug was
# recommended at rank 1, 2 and 3 — confirm against scripts.evalModel.
testCounts

Unnamed: 0,1,2,3,total
10-hydroxycamptothecin,34,4,0,38
genz-644282,9,32,0,41
YM-155,7,4,24,35
dolastatin-10,1,6,12,19
JNJ-26481585,1,0,3,4
echinomycin,0,5,4,9
nemorubicin,0,1,0,1
litronesib,0,0,5,5
epothilone-d,0,0,2,2
cabazitaxel,0,0,1,1


In [21]:
# First ten prediction rows for ACH-000329, the one test cell line reported
# above as having no truly effective drug among its top-3 recommendations.
testDF.loc[testDF['cell_line'] == 'ACH-000329'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
2760,ACH-000329,Brain Cancer,10-hydroxycamptothecin,0,0.643295
2708,ACH-000329,Brain Cancer,genz-644282,0,0.641072
2737,ACH-000329,Brain Cancer,YM-155,0,0.632627
2703,ACH-000329,Brain Cancer,valrubicin,1,0.555486
2728,ACH-000329,Brain Cancer,JNJ-26481585,1,0.554721
2704,ACH-000329,Brain Cancer,romidepsin,1,0.552038
2772,ACH-000329,Brain Cancer,nemorubicin,1,0.543201
2750,ACH-000329,Brain Cancer,OTS167,1,0.504252
2722,ACH-000329,Brain Cancer,mitoxantrone,0,0.474227
2773,ACH-000329,Brain Cancer,KPT-185,0,0.453169


## Cancer precision

In [23]:
# Per-cancer-type performance tables for the test and newcancer predictions;
# copies are passed so testDF / newDF are not rebound by the helper.
cancerTest, cancerNew = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())

In [24]:
# Per-cancer-type table for the test set (columns p1..p5 in the recorded output).
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Bladder Cancer,1.0,1.0,1.0,0.916667,0.866667
Colon/Colorectal Cancer,1.0,1.0,0.916667,0.9375,0.85
Endometrial/Uterine Cancer,1.0,1.0,0.888889,0.916667,0.8
Head and Neck Cancer,1.0,1.0,0.888889,0.833333,0.8
Ovarian Cancer,1.0,0.875,0.916667,0.875,0.75
Esophageal Cancer,1.0,0.833333,0.666667,0.666667,0.666667
Skin Cancer,1.0,0.8,0.733333,0.8,0.76
Lung Cancer,0.846154,0.807692,0.794872,0.807692,0.769231
Breast Cancer,0.666667,0.833333,0.888889,0.916667,0.733333
Liver Cancer,0.5,0.75,0.833333,0.875,0.9


In [25]:
# Unweighted mean of each p-column across cancer types, to 4 decimals.
cancerTest.mean().round(4)

p1    0.8344
p2    0.8562
p3    0.8287
p4    0.8267
p5    0.7663
dtype: float64

In [26]:
# Per-cancer-type table for the newcancer set (columns p1..p5 in the recorded output).
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Sarcoma,1.0,1.0,1.0,0.958333,0.9
Rhabdoid,1.0,1.0,1.0,0.9375,0.95
Prostate Cancer,1.0,1.0,0.833333,0.875,0.9
Gallbladder Cancer,1.0,0.5,0.666667,0.75,0.6
Thyroid Cancer,0.875,0.9375,0.875,0.90625,0.875
Gastric Cancer,0.857143,0.857143,0.904762,0.892857,0.871429
Bone Cancer,0.777778,0.777778,0.851852,0.861111,0.844444
Kidney Cancer,0.692308,0.538462,0.641026,0.673077,0.646154
Neuroblastoma,0.666667,0.833333,0.888889,0.916667,0.866667
Bile Duct Cancer,0.6,0.7,0.666667,0.7,0.68


In [27]:
# Unweighted mean of each p-column across cancer types, to 4 decimals.
cancerNew.mean().round(4)

p1    0.8469
p2    0.8144
p3    0.8328
p4    0.8471
p5    0.8134
dtype: float64

# Check prediction range by drug

In [28]:
# For every drug in the test predictions: how many predictions were made and
# how far apart the largest and smallest predicted scores are.
testPredByDrug = testDF.groupby(by='drug')['pred']
testPredRange = (
    pd.DataFrame({
        'predCount': testPredByDrug.size(),
        'predRange': testPredByDrug.max() - testPredByDrug.min(),
    })
    .reset_index()                                   # 'drug' becomes the first column
    .sort_values(by='predRange', ascending=False)    # widest spread first
    .reset_index(drop=True)
)

# A drug with a single prediction has a range of 0 by construction, so it is
# excluded from the preview.
testPredRange[testPredRange.predCount > 1].head(10)

Unnamed: 0,drug,predCount,predRange
0,CR8-(R),17,0.021678
1,OTS167,34,0.020566
2,genz-644282,41,0.016629
3,brefeldin-a,18,0.016334
4,Ro-106-9920,10,0.016223
5,valrubicin,21,0.015941
6,MG-132,30,0.015885
7,A-674563,11,0.015112
8,romidepsin,32,0.014987
9,verubulin,33,0.014911


In [29]:
# The five drugs with the most test-set predictions, with their prediction range.
testPredRange.sort_values(by='predCount', ascending=False).head(5)

Unnamed: 0,drug,predCount,predRange
172,alvocidib,46,0.0004281018
40,bortezomib,44,0.006299561
170,selinexor,43,0.0004373189
590,napabucasin,42,4.825956e-09
33,FK-866,42,0.007530956


In [30]:
# Average prediction range over drugs with more than one test-set prediction.
testPredRange.loc[testPredRange['predCount'] > 1, 'predRange'].mean()

0.0011064722042802289

In [31]:
# For every drug in the newcancer predictions: how many predictions were made
# and how far apart the largest and smallest predicted scores are.
newPredByDrug = newDF.groupby(by='drug')['pred']
newPredRange = (
    pd.DataFrame({
        'predCount': newPredByDrug.size(),
        'predRange': newPredByDrug.max() - newPredByDrug.min(),
    })
    .reset_index()                                   # 'drug' becomes the first column
    .sort_values(by='predRange', ascending=False)    # widest spread first
    .reset_index(drop=True)
)

# Preview the ten drugs with the widest spread.
newPredRange.head(10)

Unnamed: 0,drug,predCount,predRange
0,10-hydroxycamptothecin,48,0.60997
1,YM-155,45,0.579459
2,cabazitaxel,38,0.547853
3,AZD8330,32,0.543214
4,nanchangmycin,20,0.513079
5,SB-743921,6,0.490647
6,gemcitabine,20,0.461408
7,KPT-185,23,0.459653
8,mitoxantrone,38,0.454316
9,maytansinol-isobutyrate,15,0.450521


In [32]:
# Average prediction range over drugs with more than one newcancer prediction.
newPredRange.loc[newPredRange['predCount'] > 1, 'predRange'].mean()

0.027697561559051686