# Load packages

In [6]:
%load_ext autoreload
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Concatenate, Input

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sn

In [8]:
import warnings
warnings.filterwarnings('ignore')

# Define

## funcs

In [9]:
def countDrugsK(df, k=1):
    drugCount = {}
    wrong = []
    for cell, subdf in df.groupby(by='cell_line'):
        sortDF = subdf.sort_values(by='pred', ascending=False).reset_index(drop=True)
        drugs = sortDF.loc[:k-1, 'drug']
        for drug in drugs:
            if drug in drugCount.keys():
                drugCount[drug] += 1
            else:
                drugCount[drug] = 1
        drug = drugs[0]

        if sortDF.iloc[:k, :].true.sum() == 0:
            wrong.append(cell)
            print(f"\nNo true effective drugs identified in top {k} for {cell}")
            print(f"Cell line: {sortDF.loc[0, 'cell_line']}; Top drug: {drug}\n")
        else:
            print(f"Cell line: {sortDF.loc[0, 'cell_line']}; Top drug: {drug}")
    return drugCount, wrong

# Data

## Load cell lines

In [10]:
trainRNA = pd.read_csv('../data/cell_lines/RNA_train_cancergenes_updatedSplit.csv', index_col=0)
trainCellLines = list(trainRNA.index)

testRNA = pd.read_csv('../data/cell_lines/RNA_test_cancergenes_updatedSplit.csv', index_col=0)
testCellLines = list(testRNA.index)

newRNA = pd.read_csv('../data/cell_lines/RNA_newcancer_cancergenes_updatedSplit.csv', index_col=0)
newCellLines = list(newRNA.index)

## CDR

In [11]:
cdr = pd.read_csv('../data/drug_response/prismProcessed3.csv', index_col='DepMap_ID')
trainCDR = cdr.loc[trainCellLines, :].reset_index()
testCDR = cdr.loc[testCellLines, :].reset_index()
newCDR = cdr.loc[newCellLines, :].reset_index()

In [12]:
testCDR.head()

Unnamed: 0,DepMap_ID,cancer_type,name,moa,target,indication,phase,r2,ic50,auc,lower_limit,effectiveCont,effective
0,ACH-000961,Endometrial/Uterine Cancer,floxuridine,DNA synthesis inhibitor,TYMS,colorectal cancer,Launched,0.822527,0.039417,0.629954,0.3914,3.999535,0
1,ACH-000961,Endometrial/Uterine Cancer,valrubicin,"DNA inhibitor, topoisomerase inhibitor",TOP2A,bladder cancer,Launched,0.89977,0.218469,0.605413,0.000926,8.12136,1
2,ACH-000961,Endometrial/Uterine Cancer,romidepsin,HDAC inhibitor,"HDAC1, HDAC2, HDAC3, HDAC4, HDAC5, HDAC6, HDAC...",cutaneous T-cell lymphoma (CTCL),Launched,0.709292,0.007056,0.252551,0.000448,12.000541,1
3,ACH-000961,Endometrial/Uterine Cancer,AZD3463,"ALK tyrosine kinase receptor inhibitor, insuli...","ALK, IGF1R",,Preclinical,0.726827,0.352112,0.653865,0.002481,6.782965,0
4,ACH-000961,Endometrial/Uterine Cancer,cycloheximide,protein synthesis inhibitor,RPL3,,Preclinical,0.823164,0.543942,0.681261,0.015422,4.687175,0


In [13]:
testTemp = testCDR.loc[:, ['DepMap_ID', 'cancer_type', 'name', 'effective']].rename(columns={'DepMap_ID':'cell_line',
                                                                                             'name': 'drug',
                                                                                             'effective': 'true'})

newTemp = newCDR.loc[:, ['DepMap_ID', 'cancer_type', 'name', 'effective']].rename(columns={'DepMap_ID':'cell_line',
                                                                                          'name': 'drug',
                                                                                          'effective': 'true'})

# Load drugs

In [14]:
drugs = pd.read_csv('../data/drug_response/drug_fingerprints.csv', index_col=0)

drugs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
cytarabine,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
epinastine,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
floxuridine,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
valrubicin,1,0,0,0,0,1,0,0,1,0,...,0,1,0,1,1,1,0,1,0,0
adapalene,1,1,1,0,0,1,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0


In [15]:
trainDrugs = drugs.loc[list(trainCDR.name.values), :].to_numpy()
testDrugs = drugs.loc[list(testCDR.name.values), :].to_numpy()
newDrugs = drugs.loc[list(newCDR.name.values), :].to_numpy()

trainRNA = trainRNA.loc[list(trainCDR.DepMap_ID.values), :].to_numpy()
testRNA = testRNA.loc[list(testCDR.DepMap_ID.values), :].to_numpy()
newRNA = newRNA.loc[list(newCDR.DepMap_ID.values), :].to_numpy()

In [16]:
trainData = [trainDrugs, trainRNA]
trainEff = trainCDR.effective.to_numpy()
testData = [testDrugs, testRNA]
newData = [newDrugs, newRNA]

In [17]:
del cdr, drugs, trainDrugs, testDrugs, newDrugs, trainRNA, testRNA, newRNA

# Encoders

In [18]:
def loadEncoder(path, which='rna'):
    snn = load_model(path)
    encoder = snn.get_layer('model')
    encoder._name = f'{which}Encoder'
    return encoder

## Load best cell line encoder

In [19]:
basePath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/cellLines/siameseV1/'
modelDir = os.path.join(basePath, 'models')
bestRNA = 'CellLineFewShot_Layers1_Hidden64_DO0-1_AFsigmoid_LR0-0001_DR0-99_DS1000'
bestRNAPath = os.path.join(modelDir, bestRNA)

rnaEncoder = loadEncoder(bestRNAPath)

In [20]:
rnaEncoder.summary()

Model: "rnaEncoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 463)]             0         
                                                                 
 dense (Dense)               (None, 64)                29696     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
Total params: 33,856
Trainable params: 33,856
Non-trainable params: 0
_________________________________________________________________


# Load drug encoder

In [21]:
drugEncoderPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/drugs/siameseV1/'
modelDir = os.path.join(drugEncoderPath, 'models')
bestDrug = 'DrugFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'
bestDrugPath = os.path.join(modelDir, bestDrug)

drugEncoder = loadEncoder(bestDrugPath, 'drug')

In [22]:
drugEncoder.summary()

Model: "drugEncoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 256)]             0         
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
Total params: 20,608
Trainable params: 20,608
Non-trainable params: 0
_________________________________________________________________


# Get fusion encoder func

In [23]:
def triplet_loss(y_true, y_pred, alpha=0.1):
    anchor, positive, negative = y_pred[:, :8],\
                                 y_pred[:, 8:2*8],\
                                 y_pred[:, 2*8:]
    posDist = tf.reduce_mean(tf.square(anchor - positive), axis=1)
    negDist = tf.reduce_mean(tf.square(anchor - negative), axis=1)
    return tf.maximum(posDist - negDist + alpha, 0.)


def getEncoder(path=None, encodeDrug='embed', encodeRNA='embed', drugInDim=256, rnaInDim=463):
    # Define encoded drug input
    drugInput = Input(drugInDim)
    if encodeDrug == 'embed':
        drugEmbed = drugEncoder(drugInput)
    else:
        drugEmbed = drugInput
    
    # Define encoded rna input
    rnaInput = Input(rnaInDim)
    if encodeRNA == 'embed':
        rnaEmbed = rnaEncoder(rnaInput)
    else:
        rnaEmbed = rnaInput
        
    
    # combine drug + rna input
    pairEmbed = Concatenate()([drugEmbed, rnaEmbed])
    
    # Add fusion model if desired

    if path != None:
        fusionEncoder = load_model(path, custom_objects={'triplet_loss':triplet_loss})
        fusionEncoder = fusionEncoder.get_layer('model')

        pairEmbed = fusionEncoder(pairEmbed)
    
    return Model(inputs=[drugInput, rnaInput], outputs=pairEmbed)

In [78]:
def clPrecision(preds, modelName=None, thresh=0.6, getResults=False, verbose=True):
    p1 = []
    p2 = []
    p3 = []
    p4 = []
    p5 = []
    p0 = []
    cellLines = []
    for cell, subdf in preds.groupby(by='cell_line'):
        nEff = subdf.true.sum()
        if nEff < 5:
            continue
            
        cellLines.append(cell)
        sortDF = subdf.sort_values(by='pred', ascending=False)
        p1.append(sortDF.iloc[:1, :].true.sum() / 1)
        p2.append(sortDF.iloc[:2, :].true.sum() / 2)
        p3.append(sortDF.iloc[:3, :].true.sum() / 3)
        p4.append(sortDF.iloc[:4, :].true.sum() / 4)
        p5.append(sortDF.iloc[:5, :].true.sum() / 5)
        if nEff >= 10:
            p0.append(sortDF.iloc[:10, :].true.sum() / 10)
            
    if np.mean(p5) >= thresh:
        if verbose:
            thresh = np.mean(p5)
            print(f"Model: {modelName}")
            print(f"\tPrecision@1: {round(np.mean(p1), 4)}")
            print(f"\tPrecision@2: {round(np.mean(p2), 4)}")
            print(f"\tPrecision@3: {round(np.mean(p3), 4)}")
            print(f"\tPrecision@4: {round(np.mean(p4), 4)}")
            print(f"\tPrecision@5: {round(np.mean(p5), 4)}")
            print(f"\tPrecision@10: {round(np.mean(p0), 4)}\n\n")
        
    if getResults:
        return [np.mean(p1), np.mean(p2), np.mean(p3), np.mean(p4), np.mean(p5)]
    
    if verbose:
        return thresh
    
    
def precision(preds, modelName, thresh, by='cellLine'):
    if by == 'cellLine':
        return clPrecision(preds, modelName, thresh=thresh)
    else:
        cancers = {}
        for ct, subdf in preds.groupby(by = 'cancer_type'):
            cancers[ct] = clPrecision(subdf, verbose=False, getResults=True)
        return pd.DataFrame(cancers, index=['p1', 'p2', 'p3', 'p4', 'p5']).T


def getPreds(trainData, trainEff, testData,
             modelPath=None, encodeDrug=True, encodeRNA=True):
    
    # Load encoder
    if (modelPath != None):
        encoder = getEncoder(path=modelPath, encodeDrug=encodeDrug, encodeRNA=encodeRNA)
    else:
        encoder = getEncoder(encodeDrug=encodeDrug, encodeRNA=encodeRNA)
        
    # Encode pairs
    trainEmbed = encoder(trainData)
    testEmbed = encoder(testData)

    # Create logisitic regression model
    lm = LogisticRegression().fit(trainEmbed, trainEff)

    # predict
    return [p[1] for p in lm.predict_proba(testEmbed)]
        
        
def iterateModels(trainData, trainEff, testData, predDF,
                  thresh=0.5, modelName=None, k=1, by='cellLine', drug='embed', rna='embed', fusion=True):
    if (modelName != None):
        # get preds
        if fusion:
            modelPath = os.path.join(fusionPath, modelName)
        else: 
            modelPath = None
        preds = getPreds(trainData, trainEff, testData, 
                         modelPath=modelPath, encodeDrug=drug, encodeRNA=rna)
        predDF['pred'] = preds
        
        predDF.sort_values(by='pred', ascending=False, inplace=True)
        if by == 'cellLine':
            print('Average Cell Line precision @ k')
            precision(predDF, modelName, thresh, by)
            
            print("Top ranked drug for each cell line:")
            counts, wrong = countDrugsK(predDF, k)
            
            print(f"\n# cell lines without highly effective drug among top-{k} predictions: {len(wrong)}")
            
            print(f"\n# of times each drug recommended in top-{k}:")
            counts = sorted(counts.items(), key=lambda x:x[1], reverse=True)
            for drug, cnt in counts:
                print(f"{drug}: {cnt}")
                
            return predDF, wrong
        
        else:
            df = precision(predDF, modelName, thresh, by)
            df.sort_values(by=['p1','p1','p3','p4','p5'], ascending=False, inplace=True)
            return df
        
    else:
        if fusion:
            files = [f for f in os.listdir(fusionPath) if 'BYrna' in f]
            for f in files:
                modelPath = os.path.join(fusionPath, f)
                preds = getPreds(trainData, trainEff, testData,
                                 modelPath=modelPath, encodeDrug=drug, encodeRNA=rna)
        else:
            preds = getPreds(trainData, trainEff, testData,
                                 encodeDrug=drug, encodeRNA=rna)
            predDF['pred'] = preds
            thresh = precision(predDF.copy(), modelName, thresh, by)
            
        return thresh

# Drug FP + RNA Raw

## Test set 

### Precision by cell line and top-3 predictions

In [56]:
rawTestPred, rawTestWrong = iterateModels(trainData, trainEff, testData, testTemp.copy(), 
                                        modelName='', k=3, by='cellLine', drug='fps', rna='raw', fusion=False)

Average Cell Line precision @ k
Model: 
	Precision@1: 0.8235
	Precision@2: 0.8824
	Precision@3: 0.8824
	Precision@4: 0.8382
	Precision@5: 0.7843
	Precision@10: 0.7


Top ranked drug for each cell line:
Cell line: ACH-000012; Top drug: echinomycin
Cell line: ACH-000062; Top drug: echinomycin
Cell line: ACH-000086; Top drug: echinomycin
Cell line: ACH-000161; Top drug: echinomycin
Cell line: ACH-000164; Top drug: 10-hydroxycamptothecin
Cell line: ACH-000222; Top drug: echinomycin
Cell line: ACH-000280; Top drug: echinomycin
Cell line: ACH-000305; Top drug: echinomycin
Cell line: ACH-000316; Top drug: echinomycin
Cell line: ACH-000320; Top drug: echinomycin
Cell line: ACH-000329; Top drug: romidepsin
Cell line: ACH-000347; Top drug: echinomycin
Cell line: ACH-000368; Top drug: echinomycin
Cell line: ACH-000376; Top drug: nemorubicin
Cell line: ACH-000421; Top drug: dolastatin-10
Cell line: ACH-000450; Top drug: echinomycin
Cell line: ACH-000467; Top drug: echinomycin
Cell line: ACH-000486

In [23]:
rawTestWrong

[]

In [25]:
rawTestPred.head(20)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
6703,ACH-000573,Breast Cancer,echinomycin,0,0.960481
1098,ACH-000280,Ovarian Cancer,echinomycin,1,0.949348
3227,ACH-000776,Brain Cancer,echinomycin,1,0.938781
1351,ACH-000704,Ovarian Cancer,echinomycin,1,0.935787
323,ACH-000978,Endometrial/Uterine Cancer,echinomycin,1,0.934691
5899,ACH-000893,Lung Cancer,echinomycin,1,0.928602
7655,ACH-000486,Bladder Cancer,echinomycin,1,0.925156
1703,ACH-000305,Esophageal Cancer,echinomycin,1,0.919233
7226,ACH-000896,Bladder Cancer,echinomycin,1,0.918362
6679,ACH-000573,Breast Cancer,dolastatin-10,1,0.916585


### precision by cancer

In [27]:
cancerRawTest = iterateModels(trainData, trainEff, testData, testTemp.copy(), 
                                modelName='', k=3, by='cancer', drug='fps', rna='raw', fusion=False)

In [28]:
cancerRawTest

Unnamed: 0,p1,p2,p3,p4,p5
Liver Cancer,1.0,1.0,1.0,1.0,1.0
Bladder Cancer,1.0,1.0,1.0,0.916667,0.866667
Endometrial/Uterine Cancer,1.0,1.0,1.0,0.916667,0.866667
Head and Neck Cancer,1.0,1.0,1.0,0.916667,0.866667
Skin Cancer,1.0,1.0,0.933333,0.8,0.72
Brain Cancer,1.0,0.875,0.833333,0.75,0.75
Lung Cancer,0.769231,0.884615,0.871795,0.826923,0.769231
Colon/Colorectal Cancer,0.75,0.875,0.916667,0.9375,0.8
Ovarian Cancer,0.75,0.875,0.833333,0.8125,0.8
Esophageal Cancer,0.666667,0.666667,0.777778,0.833333,0.8


In [29]:
rawTestPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Lung Cancer                   13
Brain Cancer                   5
Skin Cancer                    5
Ovarian Cancer                 4
Colon/Colorectal Cancer        4
Pancreatic Cancer              4
Breast Cancer                  3
Endometrial/Uterine Cancer     3
Bladder Cancer                 3
Esophageal Cancer              3
Head and Neck Cancer           3
Liver Cancer                   2
Name: cancer_type, dtype: int64

## New Cancer set 

### Precision by cell line and top-3 predictions

In [38]:
testCDR.DepMap_ID.nunique()

52

In [39]:
rawNewPred, rawNewWrong  = iterateModels(trainData, trainEff, newData, newTemp.copy(), 
                                            modelName='', k=3, by='cellLine', drug='fps', rna='raw', fusion=False)

Average Cell Line precision @ k
Model: 
	Precision@1: 0.8923
	Precision@2: 0.9154
	Precision@3: 0.8615
	Precision@4: 0.8423
	Precision@5: 0.8
	Precision@10: 0.7157


Top ranked drug for each cell line:
Cell line: ACH-000037; Top drug: echinomycin
Cell line: ACH-000046; Top drug: echinomycin
Cell line: ACH-000052; Top drug: echinomycin
Cell line: ACH-000054; Top drug: echinomycin
Cell line: ACH-000087; Top drug: echinomycin
Cell line: ACH-000090; Top drug: echinomycin
Cell line: ACH-000096; Top drug: echinomycin
Cell line: ACH-000099; Top drug: echinomycin
Cell line: ACH-000141; Top drug: dolastatin-10
Cell line: ACH-000159; Top drug: echinomycin
Cell line: ACH-000169; Top drug: echinomycin
Cell line: ACH-000171; Top drug: echinomycin
Cell line: ACH-000172; Top drug: echinomycin
Cell line: ACH-000174; Top drug: echinomycin
Cell line: ACH-000182; Top drug: echinomycin
Cell line: ACH-000189; Top drug: echinomycin
Cell line: ACH-000191; Top drug: echinomycin
Cell line: ACH-000201; Top drug

In [31]:
rawNewWrong

['ACH-000268']

In [41]:
rawNewPred[rawNewPred.drug == 'echinomycin']

Unnamed: 0,cell_line,cancer_type,drug,true,pred
4481,ACH-000090,Prostate Cancer,echinomycin,1,0.970583
8162,ACH-000903,Thyroid Cancer,echinomycin,1,0.969306
9228,ACH-000359,Bone Cancer,echinomycin,1,0.960581
7141,ACH-000169,Sarcoma,echinomycin,1,0.955664
1954,ACH-000516,Bone Cancer,echinomycin,1,0.946525
8229,ACH-000748,Bone Cancer,echinomycin,1,0.937221
7565,ACH-000037,Sarcoma,echinomycin,1,0.936282
2418,ACH-000880,Gastric Cancer,echinomycin,1,0.935379
9450,ACH-000191,Thyroid Cancer,echinomycin,1,0.931874
6694,ACH-000808,Bile Duct Cancer,echinomycin,1,0.929264


In [189]:
unfusedNewPred[unfusedNewPred.cell_line == 'ACH-000457'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
7104,ACH-000457,Kidney Cancer,echinomycin,0,0.464658
7109,ACH-000457,Kidney Cancer,rubitecan,0,0.451901
7074,ACH-000457,Kidney Cancer,genz-644282,0,0.432253
7069,ACH-000457,Kidney Cancer,romidepsin,1,0.319723
7107,ACH-000457,Kidney Cancer,nemorubicin,1,0.231143
7073,ACH-000457,Kidney Cancer,teniposide,0,0.214668
7089,ACH-000457,Kidney Cancer,epothilone-d,1,0.206817
7068,ACH-000457,Kidney Cancer,valrubicin,0,0.193748
7077,ACH-000457,Kidney Cancer,daunorubicin,0,0.149771
7106,ACH-000457,Kidney Cancer,ixazomib-citrate,0,0.144213


In [190]:
unfusedNewPred[unfusedNewPred.cell_line == 'ACH-000484'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
8279,ACH-000484,Kidney Cancer,OTS167,0,0.525903
8272,ACH-000484,Kidney Cancer,YM-155,0,0.497218
8303,ACH-000484,Kidney Cancer,rubitecan,0,0.492174
8257,ACH-000484,Kidney Cancer,genz-644282,0,0.472284
8269,ACH-000484,Kidney Cancer,epothilone-b,0,0.368853
8254,ACH-000484,Kidney Cancer,romidepsin,1,0.355866
8289,ACH-000484,Kidney Cancer,beta-lapachone,0,0.333685
8314,ACH-000484,Kidney Cancer,resminostat,0,0.262995
8286,ACH-000484,Kidney Cancer,nemorubicin,0,0.261115
8307,ACH-000484,Kidney Cancer,verubulin,0,0.233291


### precision by cancer

In [42]:
cancerRawNew = iterateModels(trainData, trainEff, newData, newTemp.copy(), 
                                modelName='', k=3, by='cancer', drug='fps', rna='raw', fusion=False)

In [43]:
cancerRawNew

Unnamed: 0,p1,p2,p3,p4,p5
Prostate Cancer,1.0,1.0,1.0,1.0,0.9
Sarcoma,1.0,0.916667,0.944444,0.833333,0.8
Gastric Cancer,1.0,1.0,0.928571,0.946429,0.914286
Bone Cancer,1.0,1.0,0.925926,0.861111,0.8
Neuroblastoma,1.0,1.0,0.888889,0.833333,0.8
Rhabdoid,1.0,1.0,0.833333,0.8125,0.8
Gallbladder Cancer,1.0,1.0,0.666667,0.5,0.6
Thyroid Cancer,0.875,0.875,0.916667,0.90625,0.85
Bile Duct Cancer,0.8,0.8,0.8,0.8,0.76
Kidney Cancer,0.615385,0.769231,0.692308,0.711538,0.661538


In [45]:
rawNewPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Gastric Cancer        14
Kidney Cancer         14
Bone Cancer            9
Thyroid Cancer         8
Sarcoma                6
Bile Duct Cancer       6
Rhabdoid               4
Neuroblastoma          3
Prostate Cancer        2
Gallbladder Cancer     1
Name: cancer_type, dtype: int64

# Drug FPS + RNA Embed

## Test set 

In [89]:
rnaEncoderPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/cellLines/siameseV1/'
modelDir = os.path.join(rnaEncoderPath, 'models')
bestRNA = ''
maxThresh = 0.0
for d in os.listdir(modelDir):
    rnaPath = os.path.join(modelDir, d)
    try:
        drugEncoder = loadEncoder(rnaPath, 'rna')
    except AttributeError:
        continue
    print(d)
    thresh = iterateModels(trainData, trainEff, testData, testTemp.copy(), thresh=maxThresh, 
                            by='cellLine', drug='raw', rna='embed', fusion=False)
    if thresh > maxThresh:
        maxThresh = thresh
        bestRNA = d
    

CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_Layers1_Hidden32_DO0-1_AFrelu_LR0-001_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_Layers2_Hidden64_DO0-3_AFrelu_LR0-01_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_Layers1_Hidden64_DO0-3_AFrelu_LR0-01_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_Layers1_Hidden16_DO0-1_AFsigmoid_LR0-01_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	P

Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_Layers1_Hidden64_DO0-1_AFsigmoid_LR0-0001_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-0001_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_Layers1_Hidden32_DO0-3_AFsigmoid_LR0-01_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_Layers2_Hidden16_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000
Model: None
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8235
	Precision@10: 0.8128


CellLineFewShot_La

In [90]:
bestRNA

'CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000'

In [91]:
rnaEncoder = loadEncoder(os.path.join(rnaEncoderPath, 'models', bestRNA), 'rna')

### Precision by cell line and top-3 predictions for best model

In [92]:
fpsEmbedTestPred, fpsEmbedTestWrong  = iterateModels(trainData, trainEff, testData, testTemp.copy(), 
                                            modelName='', k=3, by='cellLine', drug='fps', rna='embed', fusion=False)

Average Cell Line precision @ k
Model: 
	Precision@1: 0.9412
	Precision@2: 0.9118
	Precision@3: 0.8562
	Precision@4: 0.848
	Precision@5: 0.8275
	Precision@10: 0.8154


Top ranked drug for each cell line:
Cell line: ACH-000012; Top drug: dolastatin-10
Cell line: ACH-000062; Top drug: dolastatin-10
Cell line: ACH-000086; Top drug: echinomycin
Cell line: ACH-000161; Top drug: dolastatin-10
Cell line: ACH-000164; Top drug: 10-hydroxycamptothecin
Cell line: ACH-000222; Top drug: dolastatin-10
Cell line: ACH-000280; Top drug: dolastatin-10
Cell line: ACH-000305; Top drug: romidepsin
Cell line: ACH-000316; Top drug: dolastatin-10
Cell line: ACH-000320; Top drug: dolastatin-10
Cell line: ACH-000329; Top drug: romidepsin
Cell line: ACH-000347; Top drug: romidepsin
Cell line: ACH-000368; Top drug: dolastatin-10
Cell line: ACH-000376; Top drug: nemorubicin
Cell line: ACH-000421; Top drug: dolastatin-10
Cell line: ACH-000450; Top drug: dolastatin-10
Cell line: ACH-000467; Top drug: dolastatin-10
C

In [93]:
fpsEmbedTestWrong

[]

### precision by cancer

In [95]:
cancerFpsEmbedTest = iterateModels(trainData, trainEff, testData, testTemp.copy(), 
                                modelName='', k=3, by='cancer', drug='fps', rna='embed', fusion=False)

In [96]:
cancerFpsEmbedTest

Unnamed: 0,p1,p2,p3,p4,p5
Bladder Cancer,1.0,1.0,1.0,1.0,1.0
Liver Cancer,1.0,1.0,1.0,1.0,0.9
Head and Neck Cancer,1.0,1.0,1.0,0.833333,0.8
Colon/Colorectal Cancer,1.0,0.875,0.916667,0.9375,0.9
Endometrial/Uterine Cancer,1.0,1.0,0.888889,0.833333,0.866667
Ovarian Cancer,1.0,0.875,0.833333,0.8125,0.85
Skin Cancer,1.0,0.9,0.8,0.8,0.84
Breast Cancer,1.0,1.0,0.777778,0.833333,0.8
Brain Cancer,1.0,0.875,0.75,0.75,0.7
Lung Cancer,0.923077,0.884615,0.871795,0.884615,0.846154


In [38]:
print('Number of cell lines per cancer type in training data')
trainCDR.loc[:, ['DepMap_ID', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Number of cell lines per cancer type in training data


Lung Cancer                   63
Skin Cancer                   25
Brain Cancer                  23
Pancreatic Cancer             21
Ovarian Cancer                21
Colon/Colorectal Cancer       17
Esophageal Cancer             15
Endometrial/Uterine Cancer    14
Head and Neck Cancer          14
Bladder Cancer                14
Breast Cancer                 13
Liver Cancer                  11
Name: cancer_type, dtype: int64

In [39]:
print('Number of cell lines per cancer type in test data')
fpsEmbedTestPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()


Number of cell lines per cancer type in test data


Lung Cancer                   13
Brain Cancer                   5
Skin Cancer                    5
Ovarian Cancer                 4
Pancreatic Cancer              4
Colon/Colorectal Cancer        4
Breast Cancer                  3
Head and Neck Cancer           3
Bladder Cancer                 3
Esophageal Cancer              3
Endometrial/Uterine Cancer     3
Liver Cancer                   2
Name: cancer_type, dtype: int64

## New cancer set 

### Precision by cell line and top-3 predictions

In [94]:
fpsEmbedNewPred, fpsEmbedNewWrong  = iterateModels(trainData, trainEff, newData, newTemp.copy(), 
                                        modelName='', k=1, by='cellLine', drug='fps', rna='embed', fusion=False)

Average Cell Line precision @ k
Model: 
	Precision@1: 0.9538
	Precision@2: 0.9154
	Precision@3: 0.9077
	Precision@4: 0.8769
	Precision@5: 0.8677
	Precision@10: 0.802


Top ranked drug for each cell line:
Cell line: ACH-000037; Top drug: dolastatin-10
Cell line: ACH-000046; Top drug: dolastatin-10
Cell line: ACH-000052; Top drug: dolastatin-10
Cell line: ACH-000054; Top drug: romidepsin
Cell line: ACH-000087; Top drug: dolastatin-10
Cell line: ACH-000090; Top drug: dolastatin-10
Cell line: ACH-000096; Top drug: dolastatin-10
Cell line: ACH-000099; Top drug: dolastatin-10
Cell line: ACH-000141; Top drug: dolastatin-10
Cell line: ACH-000159; Top drug: romidepsin
Cell line: ACH-000169; Top drug: echinomycin
Cell line: ACH-000171; Top drug: romidepsin
Cell line: ACH-000172; Top drug: dolastatin-10
Cell line: ACH-000174; Top drug: dolastatin-10
Cell line: ACH-000182; Top drug: echinomycin
Cell line: ACH-000189; Top drug: dolastatin-10
Cell line: ACH-000191; Top drug: dolastatin-10
Cell line:

In [41]:
fpsEmbedNewWrong

['ACH-000268']

In [43]:
fpsEmbedNewPred[fpsEmbedNewPred.cell_line == 'ACH-000268'].head(15)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
9832,ACH-000268,Bile Duct Cancer,sangivamycin,0,0.595817
9839,ACH-000268,Bile Duct Cancer,BGT226,0,0.484339
9844,ACH-000268,Bile Duct Cancer,rubitecan,0,0.455686
9846,ACH-000268,Bile Duct Cancer,verubulin,0,0.368537
9834,ACH-000268,Bile Duct Cancer,epothilone-d,0,0.367347
9835,ACH-000268,Bile Duct Cancer,delanzomib,0,0.202018
9838,ACH-000268,Bile Duct Cancer,GSK2126458,0,0.172232
9847,ACH-000268,Bile Duct Cancer,VE-822,0,0.132534
9843,ACH-000268,Bile Duct Cancer,LY2606368,0,0.108609
9840,ACH-000268,Bile Duct Cancer,CGS-15943,0,0.094575


### precision by cancer

In [44]:
cancerFpsEmbedNew = iterateModels(trainData, trainEff, newData, newTemp.copy(), 
                            modelName='', k=3, by='cancer', drug='fps', rna='embed', fusion=False)

In [45]:
cancerFpsEmbedNew

Unnamed: 0,p1,p2,p3,p4,p5
Rhabdoid,1.0,1.0,1.0,1.0,0.95
Sarcoma,1.0,1.0,1.0,1.0,0.933333
Prostate Cancer,1.0,1.0,1.0,1.0,0.9
Gastric Cancer,1.0,1.0,1.0,0.946429,0.942857
Gallbladder Cancer,1.0,1.0,1.0,0.75,0.6
Thyroid Cancer,1.0,0.9375,0.916667,0.875,0.9
Neuroblastoma,1.0,0.833333,0.888889,0.833333,0.866667
Bone Cancer,1.0,0.888889,0.851852,0.861111,0.822222
Kidney Cancer,0.846154,0.769231,0.717949,0.711538,0.707692
Bile Duct Cancer,0.8,0.8,0.866667,0.8,0.84


In [46]:
fpsEmbedNewPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Kidney Cancer         14
Gastric Cancer        14
Bone Cancer            9
Thyroid Cancer         8
Sarcoma                6
Bile Duct Cancer       6
Rhabdoid               4
Neuroblastoma          3
Prostate Cancer        2
Gallbladder Cancer     1
Name: cancer_type, dtype: int64

# Drug Embed + RNA Raw

## Test set 

In [79]:
drugEncoderPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/drugs/siameseV1/'
modelDir = os.path.join(drugEncoderPath, 'models')
bestDrug = ''
maxThresh = 0.0
for d in os.listdir(modelDir):
    drugPath = os.path.join(modelDir, d)
    try:
        drugEncoder = loadEncoder(drugPath, 'drug')
    except AttributeError:
        continue
    print(d)
    thresh = iterateModels(trainData, trainEff, testData, testTemp.copy(), thresh=maxThresh, 
                            by='cellLine', drug='embed', rna='raw', fusion=False)
    if thresh > maxThresh:
        maxThresh = thresh
        bestDrug = d
    

DrugFewShot_Layers1_Hidden32_DO0-3_AFsigmoid_LR0-001_DR0-99_DS1000
Model: None
	Precision@1: 0.6667
	Precision@2: 0.402
	Precision@3: 0.4052
	Precision@4: 0.348
	Precision@5: 0.3333
	Precision@10: 0.2667


DrugFewShot_Layers2_Hidden32_DO0-3_AFrelu_LR0-01_DR0-99_DS1000
DrugFewShot_Layers2_Hidden32_DO0-3_AFsigmoid_LR0-0001_DR0-99_DS1000
DrugFewShot_Layers2_Hidden16_DO0-1_AFrelu_LR0-01_DR0-99_DS1000
Model: None
	Precision@1: 0.7451
	Precision@2: 0.6471
	Precision@3: 0.5621
	Precision@4: 0.4902
	Precision@5: 0.4588
	Precision@10: 0.3667


DrugFewShot_Layers1_Hidden32_DO0-3_AFsigmoid_LR0-01_DR0-99_DS1000
DrugFewShot_Layers1_Hidden32_DO0-3_AFrelu_LR0-001_DR0-99_DS1000
DrugFewShot_Layers1_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000
DrugFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000
DrugFewShot_Layers1_Hidden32_DO0-3_AFrelu_LR0-01_DR0-99_DS1000
DrugFewShot_Layers2_Hidden64_DO0-3_AFsigmoid_LR0-001_DR0-99_DS1000
DrugFewShot_Layers1_Hidden64_DO0-3_AFsigmoid_LR0-0001_DR0-99_DS10

In [82]:
drugOpts = [bestDrug, 
            'DrugFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000',
            'DrugFewShot_Layers2_Hidden16_DO0-1_AFrelu_LR0-01_DR0-99_DS1000']


In [83]:
bestDrug

'DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000'

In [98]:
drugEncoder = loadEncoder(os.path.join(drugEncoderPath, 'models', bestDrug), 'drug')

### Precision by cell line and top-3 predictions for best model

In [99]:
embedRawTestPred, embedRawTestWrong = iterateModels(trainData, trainEff, testData, testTemp.copy(), thresh=0.2,
                                            modelName='', k=3, by='cellLine', drug='embed', rna='raw', fusion=False)

Average Cell Line precision @ k
Model: 
	Precision@1: 0.5882
	Precision@2: 0.6176
	Precision@3: 0.6078
	Precision@4: 0.6029
	Precision@5: 0.5647
	Precision@10: 0.5231


Top ranked drug for each cell line:
Cell line: ACH-000012; Top drug: CR8-(R)
Cell line: ACH-000062; Top drug: YM-155
Cell line: ACH-000086; Top drug: YM-155
Cell line: ACH-000161; Top drug: YM-155
Cell line: ACH-000164; Top drug: genz-644282
Cell line: ACH-000222; Top drug: YM-155
Cell line: ACH-000280; Top drug: CR8-(R)
Cell line: ACH-000305; Top drug: CR8-(R)
Cell line: ACH-000316; Top drug: YM-155
Cell line: ACH-000320; Top drug: CR8-(R)
Cell line: ACH-000329; Top drug: YM-155
Cell line: ACH-000347; Top drug: YM-155
Cell line: ACH-000368; Top drug: echinomycin

No true effective drugs identified in top 3 for ACH-000376
Cell line: ACH-000376; Top drug: epothilone-d

Cell line: ACH-000421; Top drug: genz-644282
Cell line: ACH-000450; Top drug: CR8-(R)
Cell line: ACH-000467; Top drug: YM-155
Cell line: ACH-000486; Top d

In [66]:
embedRawTestPred[embedRawTestPred.cell_line == 'ACH-000012'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
6468,ACH-000012,Lung Cancer,OTS167,0,0.464295
6520,ACH-000012,Lung Cancer,galeterone,0,0.415898
6459,ACH-000012,Lung Cancer,vincristine,0,0.380638
6337,ACH-000012,Lung Cancer,cabazitaxel,1,0.354823
6421,ACH-000012,Lung Cancer,YM-155,1,0.335933
6383,ACH-000012,Lung Cancer,mebendazole,0,0.334448
6410,ACH-000012,Lung Cancer,epothilone-b,1,0.328762
6541,ACH-000012,Lung Cancer,rubitecan,1,0.316008
6341,ACH-000012,Lung Cancer,genz-644282,1,0.308628
6481,ACH-000012,Lung Cancer,10-hydroxycamptothecin,1,0.302789


### precision by cancer

In [100]:
cancerEmbedRawTest = iterateModels(trainData, trainEff, testData, testTemp.copy(), 
                                modelName='', k=3, by='cancer', drug='embed', rna='raw', fusion=False)

In [101]:
cancerEmbedRawTest

Unnamed: 0,p1,p2,p3,p4,p5
Liver Cancer,1.0,0.75,0.666667,0.625,0.6
Breast Cancer,1.0,0.833333,0.666667,0.5,0.466667
Lung Cancer,0.769231,0.730769,0.641026,0.615385,0.569231
Pancreatic Cancer,0.75,0.625,0.583333,0.625,0.55
Endometrial/Uterine Cancer,0.666667,0.833333,0.888889,0.833333,0.866667
Bladder Cancer,0.666667,0.666667,0.777778,0.75,0.666667
Colon/Colorectal Cancer,0.5,0.5,0.583333,0.6875,0.65
Brain Cancer,0.5,0.5,0.5,0.4375,0.45
Ovarian Cancer,0.5,0.625,0.416667,0.4375,0.5
Esophageal Cancer,0.333333,0.333333,0.555556,0.583333,0.466667


In [69]:
print('Number of cell lines per cancer type in training data')
trainCDR.loc[:, ['DepMap_ID', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Number of cell lines per cancer type in training data


Lung Cancer                   63
Skin Cancer                   25
Brain Cancer                  23
Pancreatic Cancer             21
Ovarian Cancer                21
Colon/Colorectal Cancer       17
Esophageal Cancer             15
Endometrial/Uterine Cancer    14
Head and Neck Cancer          14
Bladder Cancer                14
Breast Cancer                 13
Liver Cancer                  11
Name: cancer_type, dtype: int64

In [70]:
print('Number of cell lines per cancer type in test data')
embedRawTestPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()


Number of cell lines per cancer type in test data


Lung Cancer                   13
Skin Cancer                    5
Brain Cancer                   5
Colon/Colorectal Cancer        4
Ovarian Cancer                 4
Pancreatic Cancer              4
Endometrial/Uterine Cancer     3
Head and Neck Cancer           3
Breast Cancer                  3
Bladder Cancer                 3
Esophageal Cancer              3
Liver Cancer                   2
Name: cancer_type, dtype: int64

## New cancer set 

### Precision by cell line and top-3 predictions

In [102]:
embedRawNewPred, embedRawNewWrong  = iterateModels(trainData, trainEff, newData, newTemp.copy(), 
                                        modelName='', k=3, by='cellLine', drug='embed', rna='raw', fusion=False)

Average Cell Line precision @ k
Model: 
	Precision@1: 0.5538
	Precision@2: 0.6385
	Precision@3: 0.6513
	Precision@4: 0.6346
	Precision@5: 0.5815
	Precision@10: 0.5392


Top ranked drug for each cell line:
Cell line: ACH-000037; Top drug: YM-155
Cell line: ACH-000046; Top drug: YM-155
Cell line: ACH-000052; Top drug: echinomycin
Cell line: ACH-000054; Top drug: echinomycin
Cell line: ACH-000087; Top drug: YM-155
Cell line: ACH-000090; Top drug: YM-155
Cell line: ACH-000096; Top drug: echinomycin
Cell line: ACH-000099; Top drug: CR8-(R)
Cell line: ACH-000141; Top drug: YM-155
Cell line: ACH-000159; Top drug: CR8-(R)
Cell line: ACH-000169; Top drug: echinomycin
Cell line: ACH-000171; Top drug: YM-155
Cell line: ACH-000172; Top drug: YM-155
Cell line: ACH-000174; Top drug: CR8-(R)
Cell line: ACH-000182; Top drug: YM-155
Cell line: ACH-000189; Top drug: echinomycin
Cell line: ACH-000191; Top drug: YM-155
Cell line: ACH-000201; Top drug: CR8-(R)

No true effective drugs identified in top 3 f

In [73]:
embedRawNewWrong

['ACH-000046',
 'ACH-000182',
 'ACH-000209',
 'ACH-000268',
 'ACH-000313',
 'ACH-000411',
 'ACH-000428',
 'ACH-000457',
 'ACH-000484',
 'ACH-000649',
 'ACH-000898',
 'ACH-000977',
 'ACH-001306']

In [74]:
embedRawNewPred[embedRawNewPred.cell_line == 'ACH-000046'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
611,ACH-000046,Kidney Cancer,OTS167,0,0.561894
567,ACH-000046,Kidney Cancer,ZK811752,0,0.522893
455,ACH-000046,Kidney Cancer,cabazitaxel,0,0.448684
555,ACH-000046,Kidney Cancer,YM-155,1,0.428113
508,ACH-000046,Kidney Cancer,mebendazole,0,0.426482
547,ACH-000046,Kidney Cancer,epothilone-b,1,0.42022
704,ACH-000046,Kidney Cancer,CYT-997,0,0.406975
711,ACH-000046,Kidney Cancer,rubitecan,1,0.406063
459,ACH-000046,Kidney Cancer,genz-644282,1,0.397804
634,ACH-000046,Kidney Cancer,10-hydroxycamptothecin,1,0.391232


### precision by cancer

In [75]:
cancerEmbedRawNew = iterateModels(trainData, trainEff, newData, newTemp.copy(), 
                            modelName='', k=3, by='cancer', drug='embed', rna='raw', fusion=False)

In [76]:
cancerEmbedRawNew

Unnamed: 0,p1,p2,p3,p4,p5
Neuroblastoma,1.0,0.666667,0.666667,0.75,0.733333
Gallbladder Cancer,1.0,0.5,0.333333,0.5,0.4
Bone Cancer,0.777778,0.611111,0.666667,0.694444,0.711111
Sarcoma,0.5,0.666667,0.666667,0.708333,0.7
Prostate Cancer,0.5,0.5,0.5,0.5,0.4
Thyroid Cancer,0.5,0.375,0.416667,0.46875,0.5
Gastric Cancer,0.428571,0.5,0.52381,0.589286,0.642857
Kidney Cancer,0.384615,0.346154,0.25641,0.307692,0.323077
Rhabdoid,0.25,0.375,0.5,0.5,0.45
Bile Duct Cancer,0.2,0.2,0.266667,0.35,0.36


In [77]:
fpsEmbedNewPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Kidney Cancer         14
Gastric Cancer        14
Bone Cancer            9
Thyroid Cancer         8
Sarcoma                6
Bile Duct Cancer       6
Rhabdoid               4
Neuroblastoma          3
Prostate Cancer        2
Gallbladder Cancer     1
Name: cancer_type, dtype: int64

# Drug Embed + RNA Embed

## Test set 

In [104]:
drugOpts

['DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000',
 'DrugFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000',
 'DrugFewShot_Layers2_Hidden16_DO0-1_AFrelu_LR0-01_DR0-99_DS1000']

In [105]:
bestRNA

'CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000'

In [108]:
rnaOpts = [bestRNA,
           'CellLineFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000',
           'CellLineFewShot_Layers1_Hidden64_DO0-1_AFsigmoid_LR0-0001_DR0-99_DS1000']

In [109]:
drugDir = os.path.join(drugEncoderPath, 'models')
rnaDir = os.path.join(rnaEncoderPath, 'models')
bestPair = ('', '')
maxThresh = 0.0
for d in drugOpts:
    drugPath = os.path.join(drugDir, d)
    drugEncoder = loadEncoder(drugPath, 'drug')
        
    for r in rnaOpts:
        rnaPath = os.path.join(rnaDir, r)
        rnaEncoder = loadEncoder(rnaPath, 'rna')
        
        print(f"\nDrug Encoder: {d}")
        print(f"RNA Encoder: {r}")
        thresh = iterateModels(trainData, trainEff, testData, testTemp.copy(), thresh=maxThresh, 
                                by='cellLine', drug='embed', rna='embed', fusion=False)
        if thresh > maxThresh:
            bestPair = (d, r)
            maxThresh = thresh



Drug Encoder: DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000
RNA Encoder: CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000
Model: None
	Precision@1: 0.5882
	Precision@2: 0.6078
	Precision@3: 0.6078
	Precision@4: 0.5833
	Precision@5: 0.5569
	Precision@10: 0.5205



Drug Encoder: DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000
RNA Encoder: CellLineFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000
Model: None
	Precision@1: 0.5882
	Precision@2: 0.6275
	Precision@3: 0.6013
	Precision@4: 0.6029
	Precision@5: 0.5647
	Precision@10: 0.5179



Drug Encoder: DrugFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-01_DR0-99_DS1000
RNA Encoder: CellLineFewShot_Layers1_Hidden64_DO0-1_AFsigmoid_LR0-0001_DR0-99_DS1000
Model: None
	Precision@1: 0.5882
	Precision@2: 0.6373
	Precision@3: 0.634
	Precision@4: 0.6176
	Precision@5: 0.5647
	Precision@10: 0.5077



Drug Encoder: DrugFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000
RNA Encoder: Cel

In [110]:
bestPair

('DrugFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000',
 'CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000')

In [111]:
drugEncoder = loadEncoder(os.path.join(drugEncoderPath, 'models', bestPair[0]), 'drug')
rnaEncoder = loadEncoder(os.path.join(rnaEncoderPath, 'models', bestPair[1]), 'rna')

### Precision by cell line and top-3 predictions for best model

In [120]:
embedTestPred, embedTestWrong = iterateModels(trainData, trainEff, testData, testTemp.copy(), thresh=0.2,
                                            modelName='', k=3, by='cellLine', drug='embed', rna='embed', fusion=False)

Average Cell Line precision @ k
Model: 
	Precision@1: 0.6078
	Precision@2: 0.6471
	Precision@3: 0.6667
	Precision@4: 0.6324
	Precision@5: 0.6235
	Precision@10: 0.5846


Top ranked drug for each cell line:
Cell line: ACH-000012; Top drug: OTS167
Cell line: ACH-000062; Top drug: echinomycin
Cell line: ACH-000086; Top drug: OTS167
Cell line: ACH-000161; Top drug: echinomycin
Cell line: ACH-000164; Top drug: OTS167
Cell line: ACH-000222; Top drug: ZK811752
Cell line: ACH-000280; Top drug: dacarbazine
Cell line: ACH-000305; Top drug: echinomycin
Cell line: ACH-000316; Top drug: OTS167
Cell line: ACH-000320; Top drug: OTS167
Cell line: ACH-000329; Top drug: OTS167
Cell line: ACH-000347; Top drug: echinomycin
Cell line: ACH-000368; Top drug: echinomycin
Cell line: ACH-000376; Top drug: nemorubicin
Cell line: ACH-000421; Top drug: OTS167
Cell line: ACH-000450; Top drug: echinomycin
Cell line: ACH-000467; Top drug: OTS167
Cell line: ACH-000486; Top drug: OTS167
Cell line: ACH-000493; Top drug: 

In [121]:
embedTestWrong

['ACH-000510', 'ACH-000663', 'ACH-000899']

In [122]:
embedRawTestPred[embedRawTestPred.cell_line == 'ACH-000510'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
5706,ACH-000510,Lung Cancer,ZK811752,0,0.558391
5745,ACH-000510,Lung Cancer,OTS167,0,0.539123
5769,ACH-000510,Lung Cancer,echinomycin,0,0.512548
5685,ACH-000510,Lung Cancer,YM-155,0,0.502113
5799,ACH-000510,Lung Cancer,rubitecan,0,0.501706
5737,ACH-000510,Lung Cancer,vincristine,0,0.454187
5671,ACH-000510,Lung Cancer,cabazitaxel,1,0.401537
5675,ACH-000510,Lung Cancer,epothilone-b,1,0.377355
5784,ACH-000510,Lung Cancer,beta-lapachone,0,0.341262
5782,ACH-000510,Lung Cancer,CYT-997,0,0.338514


In [123]:
embedRawTestPred[embedRawTestPred.cell_line == 'ACH-000663'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
1219,ACH-000663,Ovarian Cancer,OTS167,0,0.539122
1238,ACH-000663,Ovarian Cancer,echinomycin,0,0.512547
1268,ACH-000663,Ovarian Cancer,rubitecan,0,0.501705
1170,ACH-000663,Ovarian Cancer,genz-644282,0,0.482182
1247,ACH-000663,Ovarian Cancer,KF-38789,0,0.47192
1196,ACH-000663,Ovarian Cancer,epothilone-b,1,0.377354
1211,ACH-000663,Ovarian Cancer,camptothecin,1,0.372306
1167,ACH-000663,Ovarian Cancer,romidepsin,0,0.363884
1250,ACH-000663,Ovarian Cancer,bis(maltolato)oxovanadium(IV),0,0.316161
1180,ACH-000663,Ovarian Cancer,fenbendazole,0,0.302559


In [124]:
embedRawTestPred[embedRawTestPred.cell_line == 'ACH-000899'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
4541,ACH-000899,Skin Cancer,OTS167,0,0.539047
4496,ACH-000899,Skin Cancer,genz-644282,0,0.482107
4521,ACH-000899,Skin Cancer,epothilone-b,0,0.377283
4511,ACH-000899,Skin Cancer,mebendazole,0,0.341791
4553,ACH-000899,Skin Cancer,beta-lapachone,0,0.341194
4552,ACH-000899,Skin Cancer,bis(maltolato)oxovanadium(IV),0,0.316096
4506,ACH-000899,Skin Cancer,fenbendazole,0,0.302496
4545,ACH-000899,Skin Cancer,10-hydroxycamptothecin,1,0.297734
4531,ACH-000899,Skin Cancer,CUDC-907,0,0.29373
4528,ACH-000899,Skin Cancer,alvespimycin,1,0.243624


### precision by cancer

In [125]:
cancerEmbedTest = iterateModels(trainData, trainEff, testData, testTemp.copy(), 
                                modelName='', k=3, by='cancer', drug='embed', rna='embed', fusion=False)

In [126]:
cancerEmbedTest

Unnamed: 0,p1,p2,p3,p4,p5
Bladder Cancer,1.0,1.0,1.0,0.916667,0.933333
Endometrial/Uterine Cancer,1.0,1.0,0.888889,0.833333,0.8
Brain Cancer,1.0,0.875,0.75,0.75,0.75
Colon/Colorectal Cancer,0.75,0.625,0.666667,0.625,0.7
Head and Neck Cancer,0.666667,0.5,0.666667,0.583333,0.533333
Esophageal Cancer,0.666667,0.333333,0.444444,0.5,0.6
Skin Cancer,0.6,0.6,0.6,0.6,0.6
Liver Cancer,0.5,0.75,0.666667,0.625,0.7
Pancreatic Cancer,0.5,0.625,0.583333,0.5625,0.5
Ovarian Cancer,0.5,0.5,0.5,0.5,0.5


In [69]:
print('Number of cell lines per cancer type in training data')
trainCDR.loc[:, ['DepMap_ID', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Number of cell lines per cancer type in training data


Lung Cancer                   63
Skin Cancer                   25
Brain Cancer                  23
Pancreatic Cancer             21
Ovarian Cancer                21
Colon/Colorectal Cancer       17
Esophageal Cancer             15
Endometrial/Uterine Cancer    14
Head and Neck Cancer          14
Bladder Cancer                14
Breast Cancer                 13
Liver Cancer                  11
Name: cancer_type, dtype: int64

In [70]:
print('Number of cell lines per cancer type in test data')
embedRawTestPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()


Number of cell lines per cancer type in test data


Lung Cancer                   13
Skin Cancer                    5
Brain Cancer                   5
Colon/Colorectal Cancer        4
Ovarian Cancer                 4
Pancreatic Cancer              4
Endometrial/Uterine Cancer     3
Head and Neck Cancer           3
Breast Cancer                  3
Bladder Cancer                 3
Esophageal Cancer              3
Liver Cancer                   2
Name: cancer_type, dtype: int64

## New cancer set 

### Precision by cell line and top-3 predictions

In [127]:
embedNewPred, embedNewWrong  = iterateModels(trainData, trainEff, newData, newTemp.copy(), 
                                        modelName='', k=3, by='cellLine', drug='embed', rna='embed', fusion=False)

Average Cell Line precision @ k
Model: 
	Precision@1: 0.5692
	Precision@2: 0.6462
	Precision@3: 0.6923
	Precision@4: 0.6654
	Precision@5: 0.64
	Precision@10: 0.6157


Top ranked drug for each cell line:
Cell line: ACH-000037; Top drug: echinomycin
Cell line: ACH-000046; Top drug: ZK811752
Cell line: ACH-000052; Top drug: echinomycin
Cell line: ACH-000054; Top drug: OTS167
Cell line: ACH-000087; Top drug: OTS167
Cell line: ACH-000090; Top drug: OTS167
Cell line: ACH-000096; Top drug: OTS167
Cell line: ACH-000099; Top drug: OTS167
Cell line: ACH-000141; Top drug: OTS167
Cell line: ACH-000159; Top drug: echinomycin
Cell line: ACH-000169; Top drug: echinomycin
Cell line: ACH-000171; Top drug: OTS167
Cell line: ACH-000172; Top drug: echinomycin
Cell line: ACH-000174; Top drug: ZK811752
Cell line: ACH-000182; Top drug: OTS167
Cell line: ACH-000189; Top drug: OTS167
Cell line: ACH-000191; Top drug: OTS167
Cell line: ACH-000201; Top drug: OTS167
Cell line: ACH-000209; Top drug: rubitecan
Cell 

In [129]:
embedNewWrong

['ACH-000268', 'ACH-000457', 'ACH-000484']

In [130]:
embedNewPred[embedNewPred.cell_line == 'ACH-000268'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
9844,ACH-000268,Bile Duct Cancer,rubitecan,0,0.501705
9834,ACH-000268,Bile Duct Cancer,epothilone-d,0,0.243279
9846,ACH-000268,Bile Duct Cancer,verubulin,0,0.240596
9836,ACH-000268,Bile Duct Cancer,SB-939,0,0.176686
9848,ACH-000268,Bile Duct Cancer,panobinostat,0,0.166827
9847,ACH-000268,Bile Duct Cancer,VE-822,0,0.146621
9839,ACH-000268,Bile Duct Cancer,BGT226,0,0.113549
9838,ACH-000268,Bile Duct Cancer,GSK2126458,0,0.083451
9840,ACH-000268,Bile Duct Cancer,CGS-15943,0,0.061863
9833,ACH-000268,Bile Duct Cancer,GDC-0980,0,0.058504


In [131]:
embedNewPred[embedNewPred.cell_line == 'ACH-000457'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
7104,ACH-000457,Kidney Cancer,echinomycin,0,0.512544
7109,ACH-000457,Kidney Cancer,rubitecan,0,0.501702
7074,ACH-000457,Kidney Cancer,genz-644282,0,0.482179
7069,ACH-000457,Kidney Cancer,romidepsin,1,0.363881
7107,ACH-000457,Kidney Cancer,nemorubicin,1,0.272131
7073,ACH-000457,Kidney Cancer,teniposide,0,0.247746
7089,ACH-000457,Kidney Cancer,epothilone-d,1,0.243277
7068,ACH-000457,Kidney Cancer,valrubicin,0,0.226631
7077,ACH-000457,Kidney Cancer,daunorubicin,0,0.179678
7106,ACH-000457,Kidney Cancer,ixazomib-citrate,0,0.175711


In [132]:
embedNewPred[embedNewPred.cell_line == 'ACH-000484'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
8279,ACH-000484,Kidney Cancer,OTS167,0,0.539121
8272,ACH-000484,Kidney Cancer,YM-155,0,0.502112
8303,ACH-000484,Kidney Cancer,rubitecan,0,0.501705
8257,ACH-000484,Kidney Cancer,genz-644282,0,0.482182
8269,ACH-000484,Kidney Cancer,epothilone-b,0,0.377353
8254,ACH-000484,Kidney Cancer,romidepsin,1,0.363884
8289,ACH-000484,Kidney Cancer,beta-lapachone,0,0.341261
8286,ACH-000484,Kidney Cancer,nemorubicin,0,0.272134
8314,ACH-000484,Kidney Cancer,resminostat,0,0.269591
8307,ACH-000484,Kidney Cancer,verubulin,0,0.240596


### precision by cancer

In [133]:
cancerEmbedNew = iterateModels(trainData, trainEff, newData, newTemp.copy(), 
                            modelName='', k=3, by='cancer', drug='embed', rna='embed', fusion=False)

In [134]:
cancerEmbedNew

Unnamed: 0,p1,p2,p3,p4,p5
Neuroblastoma,1.0,1.0,1.0,0.916667,0.8
Prostate Cancer,1.0,0.75,0.833333,0.75,0.7
Gallbladder Cancer,1.0,0.5,0.333333,0.5,0.4
Bone Cancer,0.888889,0.833333,0.888889,0.805556,0.733333
Sarcoma,0.666667,0.75,0.833333,0.833333,0.766667
Kidney Cancer,0.538462,0.461538,0.461538,0.423077,0.461538
Rhabdoid,0.5,0.75,0.833333,0.8125,0.75
Thyroid Cancer,0.5,0.625,0.625,0.6875,0.65
Gastric Cancer,0.357143,0.607143,0.738095,0.714286,0.685714
Bile Duct Cancer,0.2,0.5,0.466667,0.4,0.48


In [77]:
fpsEmbedNewPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Kidney Cancer         14
Gastric Cancer        14
Bone Cancer            9
Thyroid Cancer         8
Sarcoma                6
Bile Duct Cancer       6
Rhabdoid               4
Neuroblastoma          3
Prostate Cancer        2
Gallbladder Cancer     1
Name: cancer_type, dtype: int64