# Load packages

In [2]:
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
from scripts.evalModel import evalLogisticModels

# Define

## Variables (model checkpoint paths)

In [4]:
# Checkpoint locations for the three encoders used in this notebook.
# Each path is split into directory + run name so the hyper-parameter tag is easy to read.
fusionPath = (
    '../../models/fusionEncoders/'
    'FusionFewShot_Layers2_Hidden32_DO0-0_AFrelu_LR0-001_DR0-99_DS4096_BYrna'
)
drugPath = (
    '../../models/drugEncoders/'
    'DrugFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'
)
rnaPath = (
    '../../models/cellEncoders/'
    'CellLineFewShot_Layers1_Hidden64_DO0-1_AFsigmoid_LR0-0001_DR0-99_DS1000'
)

# Data

## Load cell lines

In [5]:
# RNA feature tables for the three splits; the first CSV column becomes the row index.
trainRNA = pd.read_csv('../../data/processed/RNA_train_cancergenes.csv', index_col=0)
testRNA = pd.read_csv('../../data/processed/RNA_test_cancergenes.csv', index_col=0)
newRNA = pd.read_csv('../../data/processed/RNA_newcancer_cancergenes.csv', index_col=0)

# Row labels of each split, later used to select the matching rows of the CDR table.
trainCellLines = trainRNA.index.tolist()
testCellLines = testRNA.index.tolist()
newCellLines = newRNA.index.tolist()

## Cancer drug response (CDR) data

In [6]:
# Drug / cell-line pair table, indexed by DepMap_ID so it can be sliced per split.
cdr = pd.read_csv('../../data/processed/drugCellLinePairsData.csv', index_col='DepMap_ID')

# Select each split's rows by cell-line ID, then move DepMap_ID back into a regular column.
trainCDR, testCDR, newCDR = (
    cdr.loc[cellLineIds, :].reset_index()
    for cellLineIds in (trainCellLines, testCellLines, newCellLines)
)

In [7]:
# Result templates for the test and new-cancer splits: keep the identifying columns
# and rename them to the names the evaluation frames use (cell_line, drug, true).
testTemp, newTemp = (
    pairs.loc[:, ['DepMap_ID', 'cancer_type', 'name', 'effective']].rename(
        columns={'DepMap_ID': 'cell_line', 'name': 'drug', 'effective': 'true'}
    )
    for pairs in (testCDR, newCDR)
)

## Load drugs

In [8]:
# Drug fingerprint table; the first CSV column (drug name) becomes the row index.
drugs = pd.read_csv('../../data/processed/drug_fingerprints.csv', index_col=0)

# Preview the first rows to sanity-check the load.
drugs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
cytarabine,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
epinastine,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
floxuridine,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
valrubicin,1,0,0,0,0,1,0,0,1,0,...,0,1,0,1,1,1,0,1,0,0
adapalene,1,1,1,0,0,1,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0


In [9]:
# Expand the per-drug fingerprints to one row per CDR pair, in CDR row order.
trainDrugs, testDrugs, newDrugs = (
    drugs.loc[pairs.name.tolist(), :].to_numpy()
    for pairs in (trainCDR, testCDR, newCDR)
)

# Same expansion for the RNA features, keyed by each pair's cell line.
# NOTE: trainRNA/testRNA/newRNA are rebound from DataFrames to arrays here, so this
# cell cannot be re-run without first re-running the cell that loads the RNA CSVs.
trainRNA, testRNA, newRNA = (
    rna.loc[pairs.DepMap_ID.tolist(), :].to_numpy()
    for rna, pairs in ((trainRNA, trainCDR), (testRNA, testCDR), (newRNA, newCDR))
)

In [10]:
# Model inputs per split: [drug features, RNA features], row-aligned pair by pair.
trainData, testData, newData = (
    [trainDrugs, trainRNA],
    [testDrugs, testRNA],
    [newDrugs, newRNA],
)

# Training labels taken from the CDR table's `effective` column.
trainEff = trainCDR['effective'].to_numpy()

In [11]:
# Drop names that are no longer needed. The feature arrays themselves stay alive
# because trainData/testData/newData still hold references to them.
del cdr, drugs
del trainDrugs, testDrugs, newDrugs
del trainRNA, testRNA, newRNA

# Fusion performance

In [12]:
# Directory holding the candidate fusion-model checkpoints.
# NOTE(review): this is an absolute, cluster-specific path; the notebook will not run
# elsewhere unless it is changed or made configurable.
path = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/fusion/embedDrug_embedRNA_V1/models'

# Keep only the runs tagged 'BYrna'. os.listdir returns entries in arbitrary order,
# so sort them to make the search order (and any tie-breaking on equal scores) reproducible.
models = sorted(m for m in os.listdir(path) if 'BYrna' in m)

In [None]:
# Evaluate every candidate fusion model and remember the one whose returned value
# beats the running best. `thresh` starts at 0.75 and is raised whenever a model
# exceeds it, so bestModel stays '' if no candidate does better than 0.75.
thresh = 0.75
bestModel = ''
for m in models:
    print(f"\n{m}")
    currentFusion = os.path.join(path, m)

    # Copies of the input lists are passed so the evaluator cannot rebind the
    # entries of the shared lists (list.copy() is shallow; the arrays are shared).
    evalLog = evalLogisticModels(
        trainData.copy(), trainEff, testData.copy(), newData.copy(),
        fusionPath=currentFusion, drugPath=drugPath, rnaPath=rnaPath,
    )

    # With returnThresh=True the call presumably returns a single comparable score;
    # confirm in scripts.evalModel.
    currentThresh = evalLog.evaluate(
        testDF=testTemp.copy(), newDF=newTemp.copy(),
        thresh=thresh, returnThresh=True,
    )

    if currentThresh > thresh:
        thresh, bestModel = currentThresh, m


FusionFewShot_Layers2_Hidden16_DO0-0_AFrelu_LR0-1_DR0-99_DS4096_BYrna

FusionFewShot_Layers2_Hidden8_DO0-0_AFrelu_LR0-01_DR0-99_DS4096_BYrna


In [None]:
# Uncomment to pin the model by hand instead of using the search result; this
# name matches the checkpoint that fusionPath points to.
# bestModel = 'FusionFewShot_Layers2_Hidden32_DO0-0_AFrelu_LR0-001_DR0-99_DS4096_BYrna'
print(bestModel)

### Precision by cell line and top-3 predictions for best model

In [None]:
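# Uncomment to evaluate the model chosen by the search above instead of the
# fusionPath pinned near the top of the notebook:
# fusionPath = os.path.join(path, bestModel)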

In [13]:
# Build the evaluator for the pinned fusion model (fusionPath) together with the
# drug and cell-line encoders. Shallow copies keep the shared input lists intact.
evalLog = evalLogisticModels(
    trainData.copy(),
    trainEff,
    testData.copy(),
    newData.copy(),
    fusionPath=fusionPath,
    drugPath=drugPath,
    rnaPath=rnaPath,
)

In [14]:
# Run the evaluation on the test and new-cancer templates.
# Judging by how the results are used below: testDF/newDF are the templates with a
# `pred` column added, `wrong` lists cell lines with no true effective drug in the
# top 3, and `counts` tallies drugs among the top-3 predictions. Confirm the exact
# contract (including the role of thresh=0.5) in scripts.evalModel.
testDF, newDF, wrong, counts = evalLog.evaluate(testDF=testTemp.copy(), newDF=newTemp.copy(), thresh=0.5)

Average Cell Line precision @ k on test set
	Precision@1: 0.7647
	Precision@2: 0.7745
	Precision@3: 0.7778
	Precision@4: 0.7745
	Precision@5: 0.7882
	Precision@10: 0.7923


Average Cell Line precision @ k on newcancer set
	Precision@1: 0.8154
	Precision@2: 0.7769
	Precision@3: 0.7897
	Precision@4: 0.7885
	Precision@5: 0.7846
	Precision@10: 0.7941


No true effective drugs identified in top 3 for ACH-000161 (top drug: echinomycin)
No true effective drugs identified in top 3 for ACH-000750 (top drug: camptothecin)

# of unique drugs among top-3 predictions: 26


In [15]:
# Per-drug tally returned by evaluate; columns 1/2/3 presumably count how often the
# drug was ranked 1st/2nd/3rd, with `total` as their sum (verify in scripts.evalModel).
counts

Unnamed: 0,1,2,3,total
10-hydroxycamptothecin,6,4,4,14
genz-644282,5,7,8,20
YM-155,5,5,3,13
cabazitaxel,4,6,2,12
romidepsin,3,5,5,13
alvespimycin,3,4,1,8
epothilone-b,3,3,9,15
echinomycin,3,3,6,12
camptothecin,3,1,0,4
maytansinol-isobutyrate,3,1,0,4


In [16]:
# `counts` appears to have one row per drug, so its length is reported as the
# number of distinct drugs.
print(f"Number of unique drugs: {len(counts)}")

Number of unique drugs: 26


In [17]:
# Cell lines returned by evaluate as `wrong`; the evaluation log reports these as
# having no true effective drug among their top-3 predictions.
wrong

['ACH-000161', 'ACH-000750']

In [18]:
# First ten rows for one of the cell lines listed in `wrong`.
testDF.loc[testDF['cell_line'] == 'ACH-000161'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
6301,ACH-000161,Lung Cancer,echinomycin,0,0.810681
6235,ACH-000161,Lung Cancer,cabazitaxel,0,0.807362
6237,ACH-000161,Lung Cancer,genz-644282,0,0.79731
6288,ACH-000161,Lung Cancer,dolastatin-10,1,0.796758
6296,ACH-000161,Lung Cancer,10-hydroxycamptothecin,1,0.796732
6231,ACH-000161,Lung Cancer,RITA,1,0.773266
6274,ACH-000161,Lung Cancer,YM-155,1,0.766606
6262,ACH-000161,Lung Cancer,JNJ-26481585,0,0.745375
6259,ACH-000161,Lung Cancer,sangivamycin,1,0.731357
6284,ACH-000161,Lung Cancer,gemcitabine,1,0.719159


In [19]:
# First ten rows for the other cell line listed in `wrong`.
testDF.loc[testDF['cell_line'] == 'ACH-000750'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
4087,ACH-000750,Skin Cancer,camptothecin,0,0.82116
4198,ACH-000750,Skin Cancer,alvespimycin,0,0.802244
4082,ACH-000750,Skin Cancer,gemcitabine,0,0.795641
4093,ACH-000750,Skin Cancer,dolastatin-10,1,0.773673
4066,ACH-000750,Skin Cancer,YM-155,1,0.768075
4048,ACH-000750,Skin Cancer,genz-644282,1,0.758633
4130,ACH-000750,Skin Cancer,echinomycin,1,0.754948
4111,ACH-000750,Skin Cancer,10-hydroxycamptothecin,1,0.7545
4137,ACH-000750,Skin Cancer,nemorubicin,1,0.719455
4096,ACH-000750,Skin Cancer,OTS167,1,0.500505


### Precision by cancer type

In [20]:
# Per-cancer-type summaries for the test and new-cancer predictions. Copies are
# passed so the evaluator works on its own frames rather than on testDF/newDF.
cancerTest, cancerNew = evalLog.getCancerPerformance(testDF.copy(), newDF.copy())

In [21]:
# Test-set summary by cancer type; columns p1..p5 are presumably precision@1..5
# (verify in scripts.evalModel).
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Bladder Cancer,1.0,1.0,1.0,1.0,1.0
Head and Neck Cancer,1.0,1.0,1.0,0.916667,0.866667
Ovarian Cancer,1.0,1.0,1.0,0.875,0.8
Endometrial/Uterine Cancer,1.0,1.0,0.888889,0.833333,0.8
Colon/Colorectal Cancer,1.0,0.875,0.833333,0.875,0.9
Pancreatic Cancer,1.0,0.875,0.833333,0.75,0.8
Esophageal Cancer,1.0,0.666667,0.777778,0.833333,0.733333
Skin Cancer,0.8,0.8,0.666667,0.7,0.76
Lung Cancer,0.538462,0.730769,0.717949,0.711538,0.769231
Liver Cancer,0.5,0.5,0.666667,0.75,0.7


In [22]:
# Count distinct training cell lines per cancer type: reduce the pair table to
# unique (cell line, cancer type) rows, then tally the cancer types.
print('Number of cell lines per cancer type in training data')
(
    trainCDR[['DepMap_ID', 'cancer_type']]
    .drop_duplicates(keep='first')
    ['cancer_type']
    .value_counts()
)

Number of cell lines per cancer type in training data


Lung Cancer                   63
Skin Cancer                   25
Brain Cancer                  23
Pancreatic Cancer             21
Ovarian Cancer                21
Colon/Colorectal Cancer       17
Esophageal Cancer             15
Endometrial/Uterine Cancer    14
Head and Neck Cancer          14
Bladder Cancer                14
Breast Cancer                 13
Liver Cancer                  11
Name: cancer_type, dtype: int64

In [23]:
# Count distinct test cell lines per cancer type: reduce the prediction table to
# unique (cell line, cancer type) rows, then tally the cancer types.
print('Number of cell lines per cancer type in test data')
(
    testDF[['cell_line', 'cancer_type']]
    .drop_duplicates(keep='first')
    ['cancer_type']
    .value_counts()
)


Number of cell lines per cancer type in test data


Lung Cancer                   13
Skin Cancer                    5
Brain Cancer                   5
Ovarian Cancer                 4
Pancreatic Cancer              4
Colon/Colorectal Cancer        4
Breast Cancer                  3
Bladder Cancer                 3
Esophageal Cancer              3
Head and Neck Cancer           3
Endometrial/Uterine Cancer     3
Liver Cancer                   2
Name: cancer_type, dtype: int64

In [24]:
# New-cancer-set summary by cancer type; columns p1..p5 are presumably precision@1..5
# (verify in scripts.evalModel).
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Prostate Cancer,1.0,1.0,1.0,1.0,0.9
Gallbladder Cancer,1.0,1.0,1.0,0.75,0.6
Neuroblastoma,1.0,1.0,0.888889,0.916667,0.933333
Bile Duct Cancer,1.0,0.7,0.733333,0.7,0.72
Bone Cancer,0.888889,0.833333,0.851852,0.777778,0.8
Gastric Cancer,0.857143,0.821429,0.809524,0.821429,0.814286
Sarcoma,0.833333,0.916667,0.888889,0.875,0.833333
Rhabdoid,0.75,0.75,0.833333,0.875,0.85
Thyroid Cancer,0.75,0.75,0.833333,0.8125,0.825
Kidney Cancer,0.615385,0.576923,0.589744,0.653846,0.661538


In [25]:
# Count distinct new-cancer cell lines per cancer type: reduce the prediction table
# to unique (cell line, cancer type) rows, then tally the cancer types.
print('Number of cell lines per cancer type in newcancer data')
(
    newDF[['cell_line', 'cancer_type']]
    .drop_duplicates(keep='first')
    ['cancer_type']
    .value_counts()
)

Number of cell lines per cancer type in newcancer data


Kidney Cancer         14
Gastric Cancer        14
Bone Cancer            9
Thyroid Cancer         8
Bile Duct Cancer       6
Sarcoma                6
Rhabdoid               4
Neuroblastoma          3
Prostate Cancer        2
Gallbladder Cancer     1
Name: cancer_type, dtype: int64