# Import packages

## General

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## Personal

In [2]:
from rbm.utils import Predictions, Compiler

# Define vars

In [3]:
datasets = ['test', 'newcancer']
metrics = ['precision', 'recall', 'f1', 'auc']
modelType = 'DeepDSC'
basePath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/DeepDSC/customEffScore'

# Define Funcs

## Compiling results

In [4]:
def getResults(predPath, metrics):
    preds = pd.read_csv(predPath, index_col=0)
    Evaluator = Predictions(None)
    cellLineResults = Evaluator._cellLinePerformance(preds, metrics)
    cancerResults = Evaluator._cancerTypePerformance(cellLineResults) 
    return cellLineResults, cancerResults

def iterateModelPreds(basePath, dataset, metrics):
    predDir = os.path.join(basePath, f"{dataset}_preds")
    resultDir = os.path.join(basePath, f"{dataset}_results")
    files = os.listdir(predDir)
    for f in files:
        predPath = os.path.join(predDir, f)
        modelName = "_".join(f.split('_')[:-1])
        cellLinePath = os.path.join(resultDir, f"{modelName}_CLresults.csv")
        cancerPath = os.path.join(resultDir, f"{modelName}_CancerResults.csv")
        cellLineResults, cancerResults = getResults(predPath, metrics)
        cellLineResults.to_csv(cellLinePath)
        cancerResults.to_csv(cancerPath)
        
def compileResults(basePath, modelType, subdir, prefix, suffix='CancerResults.csv'):
    compiler = Compiler(basePath, modelType)
    compiler.compileResults(subdir, prefix, suffix)

In [5]:
def getResultsAndCompile(basePath, modelType, prefix, datasets, metrics):
    for d in datasets:
        iterateModelPreds(basePath, d, metrics)
        subdir = f"{d}_res"
        compileResults(basePath, modelType, subdir, prefix)

## precision@k

In [6]:
def countDrugsK(df, k=1):
    drugCount = {}
    wrong = []
    for cell, subdf in df.groupby(by='cell_line'):
        sortDF = subdf.sort_values(by='pred', ascending=False).reset_index(drop=True)
        drugs = sortDF.loc[:k-1, 'drug']
        for drug in drugs:
            if drug in drugCount.keys():
                drugCount[drug] += 1
            else:
                drugCount[drug] = 1
        drug = drugs[0]

        if sortDF.iloc[:k, :].true.sum() == 0:
            wrong.append(cell)
            print(f"\nNo true effective drugs identified in top {k} for {cell}")
            print(f"Cell line: {sortDF.loc[0, 'cell_line']}; Top drug: {drug}\n")
        else:
            print(f"Cell line: {sortDF.loc[0, 'cell_line']}; Top drug: {drug}")
    return drugCount, wrong

In [7]:
def clPrecision(df, modelName=None, verbose=True, getResults=False, thresh=0.5):
    p1 = []
    p2 = []
    p3 = []
    p4 = []
    p5 = []
    p0 = []
    cellLines = []
    for cell, subdf in df.groupby(by='cell_line'):
        nEff = subdf.true.sum()
        if nEff < 5:
            continue
        cellLines.append(cell)
        sortDF = subdf.sort_values(by='pred', ascending=False)
        p1.append(sortDF.iloc[:1, :].true.sum() / 1)
        p2.append(sortDF.iloc[:2, :].true.sum() / 2)
        p3.append(sortDF.iloc[:3, :].true.sum() / 3)
        p4.append(sortDF.iloc[:4, :].true.sum() / 4)
        p5.append(sortDF.iloc[:5, :].true.sum() / 5)
        if sortDF.true.sum() >= 10:
            p0.append(sortDF.iloc[:9, :].true.sum() / 10)

    if np.mean(p1) >= thresh:
        thresh = np.mean(p1)
        if verbose:
            print(f"Model: {modelName}")
            print(f"\tPrecision@1: {round(thresh, 4)}")
            print(f"\tPrecision@2: {round(np.mean(p2), 4)}")
            print(f"\tPrecision@3: {round(np.mean(p3), 4)}")
            print(f"\tPrecision@4: {round(np.mean(p4), 4)}")
            print(f"\tPrecision@5: {round(np.mean(p5), 4)}")
            print(f"\tPrecision@10: {round(np.mean(p0), 4)}\n\n")
            
    if getResults:
        return [np.mean(p1), np.mean(p2), np.mean(p3), np.mean(p4), np.mean(p5)]
    if verbose:
        return thresh

def precision(predPath, modelName, thresh, by='cellLine'):
    predDF = pd.read_csv(predPath, index_col=0) 
    if by == 'cellLine':
        return clPrecision(predDF, modelName, thresh=thresh)
    else:
        cancers = {}
        for ct, subdf in predDF.groupby(by = 'cancer_type'):
            cancers[ct] = clPrecision(subdf, verbose=False, getResults=True)
        return pd.DataFrame(cancers, index=['p1', 'p2', 'p3', 'p4', 'p5']).T
        
    
def iteratePrecision(basePath, file=None, thresh=0.7, k=1, by='cellLine'):
    if file != None:
        if by == 'cellLine':
            print('Average Cell Line precision @ k')
            precision(os.path.join(basePath, file), file, thresh, by)
            df = pd.read_csv(os.path.join(basePath, file), index_col=0).sort_values(by='pred', ascending = False).reset_index(drop=True)
            print("Top ranked drug for each cell line:")
            counts, wrong = countDrugsK(df, k)
            print(f"\n# of times each drug recommended in top-{k}:")
            counts = sorted(counts.items(), key=lambda x:x[1], reverse=True)
            for drug, cnt in counts:
                print(f"{drug}: {cnt}")
            return df, wrong
        else:
            df = precision(os.path.join(basePath, file), file, thresh, by)
            df.sort_values(by=['p1','p2','p3','p4','p5'], ascending=False, inplace=True)
            return df
        
    else:
        files = os.listdir(basePath)
        print('Average Cell Line precision @ k')
        for f in files:
            thresh = precision(os.path.join(basePath, f), f, thresh, by='cellLine')
            
        print(thresh)
        

# DeepDSC

## Compile

In [8]:
# getResultsAndCompile(basePath, modelType, prefix, datasets, metrics)

## Test data

### Precision@k for all models

In [8]:
# Average of 52 cell lines with cancer types seen during training
iteratePrecision(os.path.join(basePath, 'test_preds'), thresh=0.88)

Average Cell Line precision @ k
Model: DeepDSC_Encoder_Hidden_64_32_AFlkyu_LR1e-05_DR99_DS10_GC1_DNN_Hidden_128_16_DO3_AFrelu_LR001_DR96_DS5_preds.csv.gz
	Precision@1: 0.902
	Precision@2: 0.7843
	Precision@3: 0.6863
	Precision@4: 0.6029
	Precision@5: 0.5412
	Precision@10: 0.4564


0.9019607843137255


### Define best model via above

In [9]:
best = 'DeepDSC_Encoder_Hidden_64_32_AFlkyu_LR1e-05_DR99_DS10_GC1_DNN_Hidden_128_16_DO3_AFrelu_LR001_DR96_DS5_preds.csv.gz'


### Get precision@k for cell lines

In [13]:
testPred, wrongTest = iteratePrecision(os.path.join(basePath, 'test_preds'), file=best, k=3)

Average Cell Line precision @ k
Model: DeepDSC_Encoder_Hidden_64_32_AFlkyu_LR1e-05_DR99_DS10_GC1_DNN_Hidden_128_16_DO3_AFrelu_LR001_DR96_DS5_preds.csv.gz
	Precision@1: 0.902
	Precision@2: 0.7843
	Precision@3: 0.6863
	Precision@4: 0.6029
	Precision@5: 0.5412
	Precision@10: 0.4564


Top ranked drug for each cell line:
Cell line: ACH-000012; Top drug: dolastatin-10
Cell line: ACH-000062; Top drug: dolastatin-10
Cell line: ACH-000086; Top drug: alvespimycin
Cell line: ACH-000161; Top drug: dolastatin-10
Cell line: ACH-000164; Top drug: ouabain
Cell line: ACH-000222; Top drug: dolastatin-10
Cell line: ACH-000280; Top drug: dolastatin-10
Cell line: ACH-000305; Top drug: alvespimycin
Cell line: ACH-000316; Top drug: dolastatin-10
Cell line: ACH-000320; Top drug: dolastatin-10
Cell line: ACH-000329; Top drug: alvespimycin
Cell line: ACH-000347; Top drug: alvespimycin
Cell line: ACH-000368; Top drug: dolastatin-10
Cell line: ACH-000376; Top drug: BGT226
Cell line: ACH-000421; Top drug: dolastat

In [67]:
wrongTest

['ACH-000563']

In [39]:
testPred[testPred.cell_line == 'ACH-000823'].head(25)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
4835,ACH-000823,Esophageal Cancer,dolastatin-10,0,0.203736
6471,ACH-000823,Esophageal Cancer,BGT226,0,0.192077
57,ACH-000823,Esophageal Cancer,romidepsin,1,0.191631
5815,ACH-000823,Esophageal Cancer,WAY-600,0,0.188381
2681,ACH-000823,Esophageal Cancer,peruvoside,0,0.187605
780,ACH-000823,Esophageal Cancer,genz-644282,1,0.187352
6312,ACH-000823,Esophageal Cancer,echinomycin,0,0.185388
3063,ACH-000823,Esophageal Cancer,YM-155,1,0.184808
6752,ACH-000823,Esophageal Cancer,XL388,0,0.184694
6206,ACH-000823,Esophageal Cancer,GSK2126458,0,0.184207


In [47]:
predDF[predDF.cell_line == 'ACH-000563'].sort_values(by='pred', ascending=False).head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
8125,ACH-000563,Lung Cancer,alvespimycin,0,0.160778
8123,ACH-000563,Lung Cancer,tanespimycin,0,0.157999
2103,ACH-000563,Lung Cancer,ouabain,0,0.155691
2702,ACH-000563,Lung Cancer,peruvoside,0,0.155516
808,ACH-000563,Lung Cancer,genz-644282,1,0.155297
758,ACH-000563,Lung Cancer,paclitaxel,1,0.154645
2937,ACH-000563,Lung Cancer,PF-03758309,0,0.153605
6341,ACH-000563,Lung Cancer,echinomycin,1,0.15338
6758,ACH-000563,Lung Cancer,XL388,0,0.152935
6811,ACH-000563,Lung Cancer,deforolimus,0,0.15262


### precision @ k by cancer type

In [72]:
cancerTest = iteratePrecision(os.path.join(basePath, 'test_preds'), file=best, by='cancer')

In [73]:
cancerTest

Unnamed: 0,p1,p2,p3,p4,p5
Liver Cancer,1.0,1.0,1.0,0.75,0.7
Head and Neck Cancer,1.0,1.0,0.888889,0.833333,0.666667
Breast Cancer,1.0,1.0,0.777778,0.583333,0.533333
Bladder Cancer,1.0,0.833333,0.777778,0.666667,0.666667
Ovarian Cancer,1.0,0.75,0.75,0.625,0.6
Colon/Colorectal Cancer,1.0,0.75,0.666667,0.625,0.5
Skin Cancer,1.0,0.6,0.533333,0.4,0.4
Lung Cancer,0.923077,0.807692,0.666667,0.615385,0.553846
Pancreatic Cancer,0.75,0.875,0.75,0.625,0.55
Brain Cancer,0.75,0.75,0.583333,0.5625,0.45


In [63]:
testPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()


Lung Cancer                   13
Skin Cancer                    5
Brain Cancer                   5
Colon/Colorectal Cancer        4
Ovarian Cancer                 4
Pancreatic Cancer              4
Head and Neck Cancer           3
Esophageal Cancer              3
Breast Cancer                  3
Bladder Cancer                 3
Endometrial/Uterine Cancer     3
Liver Cancer                   2
Name: cancer_type, dtype: int64

## New Cancer data

### Precision@k for all models

### Get precision@k for cell lines

In [68]:
newPred, wrongNew = iteratePrecision(os.path.join(basePath, 'newcancer_preds'), file=best, k=3)

Average Cell Line precision @ k
Model: DeepDSC_Encoder_Hidden_64_32_AFlkyu_LR1e-05_DR99_DS10_GC1_DNN_Hidden_128_16_DO3_AFrelu_LR001_DR96_DS5_preds.csv.gz
	Precision@1: 0.8923
	Precision@2: 0.7308
	Precision@3: 0.6667
	Precision@4: 0.6192
	Precision@5: 0.5569
	Precision@10: 0.4902


Top ranked drug for each cell line:
Cell line: ACH-000037; Top drug: dolastatin-10
Cell line: ACH-000046; Top drug: dolastatin-10
Cell line: ACH-000052; Top drug: dolastatin-10
Cell line: ACH-000054; Top drug: romidepsin
Cell line: ACH-000087; Top drug: dolastatin-10
Cell line: ACH-000090; Top drug: dolastatin-10
Cell line: ACH-000096; Top drug: dolastatin-10
Cell line: ACH-000099; Top drug: dolastatin-10
Cell line: ACH-000141; Top drug: dolastatin-10
Cell line: ACH-000159; Top drug: alvespimycin
Cell line: ACH-000169; Top drug: cephalomannine
Cell line: ACH-000171; Top drug: alvespimycin
Cell line: ACH-000172; Top drug: dolastatin-10
Cell line: ACH-000174; Top drug: dolastatin-10
Cell line: ACH-000182; Top 

In [69]:
wrongNew

['ACH-000250', 'ACH-000268', 'ACH-000678']

In [55]:
newPred[newPred.cell_line == 'ACH-000250'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
60,ACH-000250,Kidney Cancer,dolastatin-10,0,0.18204
231,ACH-000250,Kidney Cancer,alvespimycin,0,0.172844
286,ACH-000250,Kidney Cancer,BGT226,0,0.171355
302,ACH-000250,Kidney Cancer,romidepsin,0,0.170947
457,ACH-000250,Kidney Cancer,ouabain,1,0.167452
471,ACH-000250,Kidney Cancer,peruvoside,0,0.167266
480,ACH-000250,Kidney Cancer,genz-644282,0,0.167035
600,ACH-000250,Kidney Cancer,PF-03758309,0,0.165023
627,ACH-000250,Kidney Cancer,YM-155,1,0.16471
668,ACH-000250,Kidney Cancer,docetaxel,0,0.16417


In [56]:
newPred[newPred.cell_line == 'ACH-000268'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
58,ACH-000268,Bile Duct Cancer,BGT226,0,0.18222
179,ACH-000268,Bile Duct Cancer,GSK2126458,0,0.174667
500,ACH-000268,Bile Duct Cancer,epothilone-d,0,0.16664
551,ACH-000268,Bile Duct Cancer,LY3023414,1,0.165651
586,ACH-000268,Bile Duct Cancer,VE-822,0,0.165191
770,ACH-000268,Bile Duct Cancer,CGS-15943,0,0.162896
832,ACH-000268,Bile Duct Cancer,LY2606368,0,0.161998
1104,ACH-000268,Bile Duct Cancer,bardoxolone-methyl,0,0.159127
1121,ACH-000268,Bile Duct Cancer,verubulin,0,0.159006
1193,ACH-000268,Bile Duct Cancer,alvocidib,0,0.158248


In [57]:
newPred[newPred.cell_line == 'ACH-000678'].head(10)

Unnamed: 0,cell_line,cancer_type,drug,true,pred
684,ACH-000678,Gastric Cancer,alvespimycin,0,0.164
909,ACH-000678,Gastric Cancer,tanespimycin,0,0.161177
1138,ACH-000678,Gastric Cancer,ouabain,0,0.158831
1172,ACH-000678,Gastric Cancer,genz-644282,1,0.158431
1174,ACH-000678,Gastric Cancer,peruvoside,0,0.158416
1337,ACH-000678,Gastric Cancer,paclitaxel,0,0.15713
1483,ACH-000678,Gastric Cancer,echinomycin,1,0.156044
1530,ACH-000678,Gastric Cancer,GSK2126458,0,0.155679
1639,ACH-000678,Gastric Cancer,docetaxel,0,0.154905
1821,ACH-000678,Gastric Cancer,10-hydroxycamptothecin,1,0.153777


### precision @ k by cancer type

In [58]:
cancerNew = iteratePrecision(os.path.join(basePath, 'newcancer_preds'), file=best, by='cancer')

In [59]:
cancerNew

Unnamed: 0,p1,p2,p3,p4,p5
Prostate Cancer,1.0,1.0,0.833333,0.75,0.7
Rhabdoid,1.0,0.875,0.75,0.8125,0.75
Thyroid Cancer,1.0,0.8125,0.708333,0.59375,0.5
Bone Cancer,1.0,0.777778,0.703704,0.666667,0.6
Bile Duct Cancer,1.0,0.7,0.533333,0.5,0.44
Neuroblastoma,1.0,0.5,0.444444,0.416667,0.466667
Gallbladder Cancer,1.0,0.5,0.333333,0.25,0.2
Gastric Cancer,0.857143,0.75,0.690476,0.696429,0.628571
Kidney Cancer,0.769231,0.769231,0.717949,0.596154,0.507692
Sarcoma,0.666667,0.416667,0.555556,0.541667,0.533333


In [62]:
newPred.loc[:, ['cell_line', 'cancer_type']].drop_duplicates(keep='first').cancer_type.value_counts()

Kidney Cancer         14
Gastric Cancer        14
Bone Cancer            9
Thyroid Cancer         8
Sarcoma                6
Bile Duct Cancer       6
Rhabdoid               4
Neuroblastoma          3
Prostate Cancer        2
Gallbladder Cancer     1
Name: cancer_type, dtype: int64