# Load packages

In [11]:
import os
import numpy as np
import pandas as pd

import warnings
# NOTE(review): with no category/module filter this silences every warning
# for the rest of the session, including deprecation and convergence
# warnings from any library used below.
warnings.filterwarnings('ignore')

In [12]:
from scripts.evalModel import evalLogisticModels, getPredDist

# Data

## Load cell lines

In [13]:
# Read the three RNA tables; the first CSV column is used as the row index.
trainRNA = pd.read_csv(
    '../../data/processed/RNA_train_cancergenes.csv',
    index_col=0,
)
testRNA = pd.read_csv(
    '../../data/processed/RNA_test_cancergenes.csv',
    index_col=0,
)
newRNA = pd.read_csv(
    '../../data/processed/RNA_newcancer_cancergenes.csv',
    index_col=0,
)

# Row labels of each table as plain lists (file order); they are used below
# to select the matching rows of the drug/cell-line pair table.
trainCellLines = [label for label in trainRNA.index]
testCellLines = [label for label in testRNA.index]
newCellLines = [label for label in newRNA.index]

## Cancer drug response (CDR)

In [14]:
# Drug / cell-line pair table, indexed by the DepMap_ID column.
cdr = pd.read_csv(
    '../../data/processed/drugCellLinePairsData.csv',
    index_col='DepMap_ID',
)

# Select the rows belonging to each split's cell lines, then move DepMap_ID
# back into an ordinary column so it can be referenced by name later.
trainCDR = cdr.loc[trainCellLines].reset_index()
testCDR = cdr.loc[testCellLines].reset_index()
newCDR = cdr.loc[newCellLines].reset_index()

In [15]:
# Columns kept for evaluation and the names they are given in the
# frames later handed to evalLog.evaluate.
evalColumns = ['DepMap_ID', 'cancer_type', 'name', 'effective']
evalRenames = {
    'DepMap_ID': 'cell_line',
    'name': 'drug',
    'effective': 'true',
}

testTemp = testCDR.loc[:, evalColumns].rename(columns=evalRenames)
newTemp = newCDR.loc[:, evalColumns].rename(columns=evalRenames)

## Load drugs

In [16]:
# Drug fingerprint table; the first CSV column (the drug name, per the
# displayed output) becomes the row index.
drugs = pd.read_csv(
    '../../data/processed/drug_fingerprints.csv',
    index_col=0,
)

# Preview the first five rows.
drugs.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
cytarabine,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
epinastine,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
floxuridine,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
valrubicin,1,0,0,0,0,1,0,0,1,0,...,0,1,0,1,1,1,0,1,0,0
adapalene,1,1,1,0,0,1,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0


In [17]:
# Build one feature row per drug / cell-line pair: look up each pair's drug
# fingerprint by drug name and its RNA profile by DepMap_ID, keeping the
# row order of the corresponding CDR frame.
trainDrugs = drugs.loc[trainCDR['name'].tolist()].to_numpy()
testDrugs = drugs.loc[testCDR['name'].tolist()].to_numpy()
newDrugs = drugs.loc[newCDR['name'].tolist()].to_numpy()

# NOTE: the RNA names are rebound from DataFrames to NumPy arrays here, so
# this cell cannot be executed a second time without reloading the RNA
# tables first (an ndarray has no .loc).
trainRNA = trainRNA.loc[trainCDR['DepMap_ID'].tolist()].to_numpy()
testRNA = testRNA.loc[testCDR['DepMap_ID'].tolist()].to_numpy()
newRNA = newRNA.loc[newCDR['DepMap_ID'].tolist()].to_numpy()

In [18]:
# Training labels: the `effective` column of the training pairs.
trainEff = trainCDR['effective'].to_numpy()

# Each split is passed on as a [drug features, RNA features] pair.
trainData = [trainDrugs, trainRNA]
testData, newData = [testDrugs, testRNA], [newDrugs, newRNA]

In [19]:
# Drop the intermediate names; the feature arrays stay reachable through
# trainData / testData / newData.
del cdr, drugs
del trainDrugs, testDrugs, newDrugs
del trainRNA, testRNA, newRNA

# Fusion performance

In [22]:
# Directory that holds the saved fusion models. It defaults to the original
# cluster location, but can be redirected by setting the
# CDR_FUSION_MODEL_DIR environment variable so the notebook is not tied to
# one user's absolute scratch path.
fusionRoot = os.environ.get(
    'CDR_FUSION_MODEL_DIR',
    '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/fusion/rawDrug_rawRNA/models',
)

# Sub-directory name of the fusion model evaluated below.
bestModel = 'FusionFewShotRawDrugRawCell_NL128_64_32_DO0-1_AFrelu_LR0-001_DR0-99_DS1024_BYrna'

# Full path handed to evalLogisticModels as `fusionPath`.
fusionPath = os.path.join(fusionRoot, bestModel)

In [23]:
# Build the evaluator for the fusion model. The .copy() calls are shallow
# list copies: the callee gets its own list objects, but the arrays inside
# are shared with trainData / testData / newData.
evalLog = evalLogisticModels(
    trainData.copy(),
    trainEff,
    testData.copy(),
    newData.copy(),
    alt='rf',
    fusionPath=fusionPath,
    drugPath=None,
    rnaPath=None,
)

In [36]:
# Display the parameter set stored on the evaluator's model.
# NOTE(review): `best_params_` looks like the attribute exposed by a
# scikit-learn hyperparameter search — confirm in scripts.evalModel.
evalLog.model.best_params_

{'criterion': 'entropy', 'min_samples_split': 10, 'n_estimators': 50}

In [37]:
# Evaluate on copies of the test and new-cancer frames so testTemp / newTemp
# themselves are not passed in.
# NOTE(review): the meaning of thresh=0.1 is defined in scripts.evalModel —
# confirm there.
evalOutputs = evalLog.evaluate(
    testDF=testTemp.copy(),
    newDF=newTemp.copy(),
    thresh=0.1,
)
testDF, newDF, testWrong, newWrong, testCounts, newCounts = evalOutputs


Average Cell Line precision @ k on test set
	Precision@1: 0.549
	Precision@2: 0.5294
	Precision@3: 0.5098
	Precision@4: 0.4804
	Precision@5: 0.4588
	Precision@10: 0.4641

Average Cell Line precision @ k on newcancer set
	Precision@1: 0.5846
	Precision@2: 0.5
	Precision@3: 0.4872
	Precision@4: 0.5077
	Precision@5: 0.5077
	Precision@10: 0.5137


Test set:
No true effective drugs identified in top 3 for ACH-000467 (top drug: brilliant-green)
No true effective drugs identified in top 3 for ACH-000650 (top drug: dronedarone)
No true effective drugs identified in top 3 for ACH-000651 (top drug: brilliant-green)
No true effective drugs identified in top 3 for ACH-000663 (top drug: mitoxantrone)
No true effective drugs identified in top 3 for ACH-000853 (top drug: dacarbazine)
No true effective drugs identified in top 3 for ACH-000860 (top drug: cephalomannine)
No true effective drugs identified in top 3 for ACH-000978 (top drug: idarubicin)

	# of cell lines without effective drug among top-3

In [38]:
# Prediction distribution for the test set; per the recorded output the call
# also prints the average per-drug variance of the predictions.
testDist = getPredDist(testDF)

Avg varaince of predictions for each drug: 0.0154


In [39]:
# Prediction distribution for the new-cancer set; per the recorded output the
# call also prints the average per-drug variance of the predictions.
newDist = getPredDist(newDF)

Avg varaince of predictions for each drug: 0.0173


## Cancer precision

In [43]:
# Per-cancer-type precision tables for the test and new-cancer sets,
# computed from copies of the evaluated frames.
cancerPrecision = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())
cancerTest, cancerNew = cancerPrecision

In [44]:
# Column means of the per-cancer precision table (4 decimals), followed by
# the table itself as the cell's displayed value.
print(cancerTest.mean().round(4))
cancerTest

p1    0.5110
p2    0.4885
p3    0.5099
p4    0.4706
p5    0.4401
dtype: float64


Unnamed: 0,p1,p2,p3,p4,p5
Pancreatic Cancer,1.0,0.875,0.666667,0.5625,0.45
Brain Cancer,0.75,0.75,0.75,0.625,0.6
Ovarian Cancer,0.75,0.5,0.583333,0.5625,0.55
Head and Neck Cancer,0.666667,0.5,0.444444,0.5,0.4
Endometrial/Uterine Cancer,0.666667,0.333333,0.444444,0.416667,0.4
Lung Cancer,0.615385,0.653846,0.512821,0.538462,0.538462
Skin Cancer,0.6,0.5,0.466667,0.4,0.36
Liver Cancer,0.5,0.5,0.666667,0.625,0.5
Esophageal Cancer,0.333333,0.333333,0.555556,0.416667,0.4
Colon/Colorectal Cancer,0.25,0.25,0.25,0.25,0.35


In [45]:
# Column means of the new-cancer precision table (4 decimals), followed by
# the table itself as the cell's displayed value.
print(cancerNew.mean().round(4))
cancerNew

p1    0.6542
p2    0.5304
p3    0.4970
p4    0.5190
p5    0.5338
dtype: float64


Unnamed: 0,p1,p2,p3,p4,p5
Prostate Cancer,1.0,1.0,0.833333,0.875,0.9
Gallbladder Cancer,1.0,0.5,0.333333,0.25,0.4
Bone Cancer,0.888889,0.666667,0.555556,0.5,0.511111
Neuroblastoma,0.666667,0.5,0.444444,0.5,0.533333
Thyroid Cancer,0.625,0.5,0.416667,0.5,0.525
Sarcoma,0.5,0.5,0.611111,0.625,0.6
Gastric Cancer,0.5,0.464286,0.52381,0.535714,0.528571
Rhabdoid,0.5,0.25,0.333333,0.5,0.5
Kidney Cancer,0.461538,0.423077,0.384615,0.403846,0.4
Bile Duct Cancer,0.4,0.5,0.533333,0.5,0.44
