# Load packages

In [1]:
import os
import numpy as np
import pandas as pd

import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# raised by later cells are hidden as well - consider narrowing it by category.
warnings.filterwarnings('ignore')

In [2]:
from scripts.evalModel import evalLogisticModels, getPredDist

# Data

## Load cell lines

In [3]:
# Read the three processed RNA tables (train / test / new-cancer split); the
# first CSV column becomes the row index of each frame.
trainRNA, testRNA, newRNA = (
    pd.read_csv(csvPath, index_col=0)
    for csvPath in (
        '../../data/processed/RNA_train_cancergenes.csv',
        '../../data/processed/RNA_test_cancergenes.csv',
        '../../data/processed/RNA_newcancer_cancergenes.csv',
    )
)

# Keep the row labels of each split as plain lists; they are used further down
# to select the matching rows of the drug / cell-line pair table.
trainCellLines, testCellLines, newCellLines = (
    rnaFrame.index.tolist() for rnaFrame in (trainRNA, testRNA, newRNA)
)

## Cancer drug response (CDR)

In [4]:
# Drug / cell-line pair table, indexed by DepMap_ID so it can be sliced with
# the cell-line lists of each split.
cdr = pd.read_csv('../../data/processed/drugCellLinePairsData.csv', index_col='DepMap_ID')

# One pair table per split. reset_index() moves DepMap_ID back into a regular
# column, which later cells read as `<split>CDR.DepMap_ID`.
trainCDR, testCDR, newCDR = (
    cdr.loc[splitCellLines, :].reset_index()
    for splitCellLines in (trainCellLines, testCellLines, newCellLines)
)

In [5]:
# Columns kept for evaluation and the names they are given in the result
# frames passed to evalLog.evaluate().
evalColumns = ['DepMap_ID', 'cancer_type', 'name', 'effective']
evalColumnNames = {'DepMap_ID': 'cell_line', 'name': 'drug', 'effective': 'true'}

# Same projection + rename for both held-out splits.
testTemp = testCDR.loc[:, evalColumns].rename(columns=evalColumnNames)
newTemp = newCDR.loc[:, evalColumns].rename(columns=evalColumnNames)

## Load drugs

In [6]:
# Drug fingerprint table; the first CSV column becomes the row index.
# In the recorded preview the index holds drug names and the columns are
# labelled 0..255 with 0/1 entries.
drugs = pd.read_csv('../../data/processed/drug_fingerprints.csv', index_col=0)

# Preview the first five rows.
drugs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
cytarabine,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
epinastine,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
floxuridine,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
valrubicin,1,0,0,0,0,1,0,0,1,0,...,0,1,0,1,1,1,0,1,0,0
adapalene,1,1,1,0,0,1,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0


In [7]:
# Expand the drug fingerprints to one row per (cell line, drug) pair by looking
# up each pair's drug name, then drop to a raw NumPy matrix.
trainDrugs, testDrugs, newDrugs = (
    drugs.loc[pairTable['name'].tolist(), :].to_numpy()
    for pairTable in (trainCDR, testCDR, newCDR)
)

# Same expansion for the RNA features, keyed by each pair's DepMap_ID.
# NOTE: this rebinds trainRNA / testRNA / newRNA from DataFrames to ndarrays,
# so the cell cannot be re-run without first re-running the RNA load cell
# (an ndarray has no .loc).
trainRNA, testRNA, newRNA = (
    rnaFrame.loc[pairTable['DepMap_ID'].tolist(), :].to_numpy()
    for rnaFrame, pairTable in (
        (trainRNA, trainCDR),
        (testRNA, testCDR),
        (newRNA, newCDR),
    )
)

In [8]:
# Model inputs are [drug matrix, RNA matrix] pairs, one per split.
trainData, testData, newData = (
    [trainDrugs, trainRNA],
    [testDrugs, testRNA],
    [newDrugs, newRNA],
)

# Training labels: the `effective` column of the training pair table.
trainEff = trainCDR['effective'].to_numpy()

In [9]:
# Drop the names of intermediates that are no longer referenced directly.
# The feature matrices themselves stay alive through trainData / testData /
# newData, which still hold references to them.
del cdr, drugs
del trainDrugs, testDrugs, newDrugs
del trainRNA, testRNA, newRNA

# Fusion performance

In [14]:
# Root directory of the few-shot experiment outputs. Defaults to the original
# scratch location; set the FEWSHOT_ROOT environment variable to run the
# notebook against a copy stored elsewhere without editing this cell.
fewShotRoot = os.environ.get(
    'FEWSHOT_ROOT',
    '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot',
)

# Name of the saved cell-line few-shot model directory.
rnaModel = 'CellLineFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'

# Full path handed to evalLogisticModels(rnaPath=...).
rnaPath = os.path.join(fewShotRoot, 'cellLines', 'siameseV1', 'models', rnaModel)

In [11]:
# Root directory of the few-shot experiment outputs. Defaults to the original
# scratch location; set the FEWSHOT_ROOT environment variable to run the
# notebook against a copy stored elsewhere without editing this cell.
fewShotRoot = os.environ.get(
    'FEWSHOT_ROOT',
    '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot',
)

# Name of the saved fusion model directory selected for this evaluation.
bestModel = 'FusionFewShotRawDrugEmbedCell_NL64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna'

# Full path handed to evalLogisticModels(fusionPath=...).
fusionPath = os.path.join(fewShotRoot, 'fusion', 'rawDrug_embedRNA', 'models', bestModel)

In [15]:
# Build the evaluator from shallow copies of the [drug, RNA] input lists, so
# the evaluator cannot rebind the entries of the notebook-level lists (the
# arrays themselves are shared, not copied).
# NOTE(review): alt='rf' presumably selects a random-forest head - confirm in
# scripts/evalModel; no random seed is set in the visible cells.
evalLog = evalLogisticModels(
    list(trainData),
    trainEff,
    list(testData),
    list(newData),
    alt='rf',
    fusionPath=fusionPath,
    drugPath=None,
    rnaPath=rnaPath,
)

In [25]:
# Show the parameters stored on the fitted model.
# NOTE(review): presumably the hyper-parameters picked by a parameter search
# wrapped in evalLog.model (the attribute name matches scikit-learn's search
# objects) - confirm in scripts/evalModel.
evalLog.model.best_params_

{'criterion': 'gini', 'min_samples_split': 10, 'n_estimators': 100}

In [16]:
# Score both held-out splits. Copies of the label frames are passed so that
# testTemp / newTemp are left untouched by the evaluator.
# NOTE(review): the meaning of thresh=0.1 is defined in scripts/evalModel and
# is not visible from this notebook.
(testDF, newDF,
 testWrong, newWrong,
 testCounts, newCounts) = evalLog.evaluate(
    testDF=testTemp.copy(),
    newDF=newTemp.copy(),
    thresh=0.1,
)


Average Cell Line precision @ k on test set
	Precision@1: 0.9804
	Precision@2: 0.9216
	Precision@3: 0.8758
	Precision@4: 0.8725
	Precision@5: 0.8431
	Precision@10: 0.8231

Average Cell Line precision @ k on newcancer set
	Precision@1: 0.9385
	Precision@2: 0.8923
	Precision@3: 0.8564
	Precision@4: 0.8385
	Precision@5: 0.8369
	Precision@10: 0.7902


Test set:

	# of cell lines without effective drug among top-3 recs: 0
	# of unique drugs among top-3 predictions: 16

New cancer set
No true effective drugs identified in top 3 for ACH-000268 (top drug: rubitecan)

	# of cell lines without effective drug among top-3 recs: 1
	# of unique drugs among top-3 predictions: 18


In [17]:
# Summarise the spread of the predicted scores on the test split; the recorded
# run prints the average per-drug variance of the predictions. The returned
# object is kept in testDist but is not used by the cells visible here.
testDist = getPredDist(testDF)

Avg varaince of predictions for each drug: 0.0024


In [18]:
# Summarise the spread of the predicted scores on the new-cancer split; the
# recorded run prints the average per-drug variance of the predictions. The
# returned object is kept in newDist but is not used by the cells visible here.
newDist = getPredDist(newDF)

Avg varaince of predictions for each drug: 0.0024


In [19]:
# Preview the first five rows of the test-split recommendation counts. In the
# recorded output the rows are drugs and the columns are 1, 2, 3 and total.
# NOTE(review): presumably how often each drug was ranked 1st / 2nd / 3rd -
# confirm against evalLog.evaluate.
testCounts.head()

Unnamed: 0,1,2,3,total
dolastatin-10,24,6,2,32
romidepsin,12,10,2,24
10-hydroxycamptothecin,5,6,6,17
echinomycin,3,7,10,20
YM-155,2,7,5,14


In [20]:
# Cell line flagged by the author as poorly predicted: show its first ten rows
# of the test results.
isTargetLine = testDF['cell_line'] == 'ACH-000161'
testDF.loc[isTargetLine].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
6288,ACH-000161,Lung Cancer,dolastatin-10,1,0.997143
6274,ACH-000161,Lung Cancer,YM-155,1,0.935867
6301,ACH-000161,Lung Cancer,echinomycin,0,0.871101
6284,ACH-000161,Lung Cancer,gemcitabine,1,0.86566
6296,ACH-000161,Lung Cancer,10-hydroxycamptothecin,1,0.862817
6235,ACH-000161,Lung Cancer,cabazitaxel,0,0.589262
6237,ACH-000161,Lung Cancer,genz-644282,0,0.505544
6259,ACH-000161,Lung Cancer,sangivamycin,1,0.48894
6278,ACH-000161,Lung Cancer,brilliant-green,1,0.411189
6262,ACH-000161,Lung Cancer,JNJ-26481585,0,0.393412


In [21]:
# Cell line flagged by the author as poorly predicted: show its first ten rows
# of the test results.
isTargetLine = testDF['cell_line'] == 'ACH-000899'
testDF.loc[isTargetLine].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
4545,ACH-000899,Skin Cancer,10-hydroxycamptothecin,1,0.997033
4514,ACH-000899,Skin Cancer,sangivamycin,1,0.895005
4496,ACH-000899,Skin Cancer,genz-644282,0,0.879025
4528,ACH-000899,Skin Cancer,alvespimycin,1,0.800558
4534,ACH-000899,Skin Cancer,gemcitabine,0,0.764128
4521,ACH-000899,Skin Cancer,epothilone-b,0,0.694863
4560,ACH-000899,Skin Cancer,verubulin,0,0.587632
4531,ACH-000899,Skin Cancer,CUDC-907,0,0.466394
4515,ACH-000899,Skin Cancer,JNJ-26481585,0,0.462645
4527,ACH-000899,Skin Cancer,sirolimus,0,0.433997


## Cancer precision

In [22]:
# Aggregate the results per cancer type for both held-out splits. Copies are
# passed so testDF / newDF are not modified by the call. In the recorded
# outputs each returned frame has one row per cancer type and columns p1..p5.
cancerTest, cancerNew = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())

In [23]:
# Column-wise mean of the per-cancer precision table (test split), rounded to
# four decimals, followed by the full table.
cancerTestMeans = cancerTest.mean().round(4)
print(cancerTestMeans)
cancerTest

p1    0.9936
p2    0.9228
p3    0.8895
p4    0.8844
p5    0.8412
dtype: float64


Unnamed: 0,p1,p2,p3,p4,p5
Colon/Colorectal Cancer,1.0,1.0,1.0,1.0,0.95
Head and Neck Cancer,1.0,1.0,1.0,1.0,0.933333
Liver Cancer,1.0,1.0,1.0,1.0,0.9
Bladder Cancer,1.0,1.0,1.0,1.0,0.866667
Ovarian Cancer,1.0,1.0,0.916667,0.875,0.9
Skin Cancer,1.0,0.9,0.8,0.85,0.8
Pancreatic Cancer,1.0,0.875,0.833333,0.8125,0.8
Brain Cancer,1.0,0.875,0.833333,0.8125,0.75
Endometrial/Uterine Cancer,1.0,0.833333,0.888889,0.916667,0.866667
Breast Cancer,1.0,0.833333,0.777778,0.833333,0.733333


In [24]:
# Column-wise mean of the per-cancer precision table (new-cancer split),
# rounded to four decimals, followed by the full table.
cancerNewMeans = cancerNew.mean().round(4)
print(cancerNewMeans)
cancerNew

p1    0.9263
p2    0.8829
p3    0.8417
p4    0.8365
p5    0.8289
dtype: float64


Unnamed: 0,p1,p2,p3,p4,p5
Prostate Cancer,1.0,1.0,1.0,1.0,1.0
Bone Cancer,1.0,1.0,0.962963,0.916667,0.933333
Gastric Cancer,1.0,1.0,0.952381,0.928571,0.9
Bile Duct Cancer,1.0,1.0,0.866667,0.85,0.84
Gallbladder Cancer,1.0,1.0,0.666667,0.75,0.6
Sarcoma,1.0,0.833333,0.833333,0.833333,0.833333
Thyroid Cancer,1.0,0.8125,0.833333,0.8125,0.825
Kidney Cancer,0.846154,0.807692,0.717949,0.711538,0.707692
Rhabdoid,0.75,0.875,0.916667,0.8125,0.85
Neuroblastoma,0.666667,0.5,0.666667,0.75,0.8
