# Import

In [1]:
import os, joblib
import numpy as np
import pandas as pd

from itertools import product

import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Helvetica'

import seaborn as sns

from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Input, Concatenate

from scipy.stats import ttest_ind as ttest

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Define funcs

In [3]:
def loadEncoder(path, which='rna'):
    """Load a saved Keras model and return its sub-layer named ``'model'``.

    Parameters
    ----------
    path : str
        Location passed to ``load_model``.
    which : str, default 'rna'
        Prefix used to rename the returned layer to ``f'{which}Encoder'``.

    Returns
    -------
    The extracted layer, or ``None`` if an ``AttributeError`` is raised
    while loading, extracting or renaming it.
    """
    # NOTE(review): the 'model' layer is presumably the shared branch of the
    # siamese network -- confirm against the training code.
    try:
        encoder = load_model(path).get_layer('model')
        encoder._name = f'{which}Encoder'
    except AttributeError:
        return None
    return encoder
    
def getModel(rnaPath=None, drugPath=None, rnaDim=463, drugDim=256):
    """Build a model mapping (drug, RNA) inputs to one concatenated pair embedding.

    Parameters
    ----------
    rnaPath : str or None
        Path given to ``loadEncoder`` for the RNA encoder. When ``None`` the
        raw RNA input is concatenated unchanged.
    drugPath : str or None
        Path given to ``loadEncoder`` for the drug encoder. When ``None`` the
        raw drug input is concatenated unchanged.
    rnaDim : int
        Width of the RNA input vector.
    drugDim : int
        Width of the drug input vector.

    Returns
    -------
    Model
        Keras model with inputs ``[drugInput, rnaInput]`` and the concatenated
        (drug first, RNA second) embedding as output.

    Raises
    ------
    ValueError
        If a path was supplied but ``loadEncoder`` returned ``None`` for it.
    """
    def embed(tensor, path, which):
        # No path means "use the raw features as the embedding".
        if path is None:
            return tensor
        encoder = loadEncoder(path, which=which)
        if encoder is None:
            # Fail with a clear message instead of "NoneType is not callable".
            raise ValueError(f"Could not load {which} encoder from {path!r}")
        return encoder(tensor)

    # Explicit shape tuples are the documented form of the `shape` argument.
    drugInput = Input(shape=(drugDim,))
    rnaInput = Input(shape=(rnaDim,))

    # Drug encoder is loaded before the RNA encoder, as in the original branches.
    drugEmbed = embed(drugInput, drugPath, 'drug')
    rnaEmbed = embed(rnaInput, rnaPath, 'rna')
    pairEmbed = Concatenate()([drugEmbed, rnaEmbed])

    return Model(inputs=[drugInput, rnaInput], outputs=pairEmbed)

## Define paths

## Cell line encoder

In [4]:
# Directory holding the saved cell-line (RNA) few-shot models.
rnaPath = '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/cellLines/siameseV1/models/'

# Two saved model names that differ only in the AF field (relu vs sigmoid).
rnaRFModel = 'CellLineFewShot_Layers2_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'
rnaLMModel = 'CellLineFewShot_Layers2_Hidden64_DO0-1_AFsigmoid_LR0-001_DR0-99_DS1000'

rnaRFPath, rnaLMPath = (os.path.join(rnaPath, modelName)
                        for modelName in (rnaRFModel, rnaLMModel))

## Drug encoder

In [5]:
# Alternative (repo-relative) location: '../../models/drugEncoders/'
drugModel = 'DrugFewShot_Layers1_Hidden64_DO0-1_AFrelu_LR0-001_DR0-99_DS1000'
# drugPath ends up pointing at the saved model itself, not its parent directory.
drugPath = os.path.join(
    '/fs/scratch/PCON0041/PatrickLawrence/cancer-drug-response/fewShot/drugs/siameseV1/models/',
    drugModel,
)

# Load Data

## RNA

In [6]:
# Stack the three expression tables row-wise, in the order test, pretrain, new-cancer.
rnaSources = (
    '../../data/processed/RNA_test_cancergenes.csv',
    '../../data/processed/cellLineEncoderPretrainData.csv',
    '../../data/processed/RNA_newcancer_cancergenes.csv',
)
RNA = pd.concat([pd.read_csv(source, index_col=0) for source in rnaSources])
RNA.head()

Unnamed: 0,XIAP,CDKN2B,ITGB1,TRAF1,GSTM4,LRP5,IL3,HEYL,CXCL12,FN1,...,CALM1,MGST3,ARAF,NFKB1,ZBTB17,FGF23,FZD5,RAC3,GSTA1,LAMB2
ACH-000961,13.154465,8.8491,15.549766,6.920184,12.0727,14.011049,0.130963,10.485365,7.216564,15.074025,...,14.184124,11.098191,11.818052,11.481821,10.725326,0.569748,12.588184,10.000158,6.114922,15.500269
ACH-000984,12.658436,11.099468,16.368337,10.291742,16.10027,13.418132,0.130963,1.764265,6.714064,13.98851,...,12.864645,12.050875,11.611258,12.007825,10.958988,1.569748,9.581694,6.629056,4.52996,13.717964
ACH-000978,12.391499,9.313873,14.526513,8.183218,10.895111,12.847516,0.130963,11.515809,10.011866,12.451502,...,13.081577,12.571796,10.471045,10.505967,10.594366,3.377103,10.755789,8.836714,16.416312,15.371843
ACH-000222,14.216933,13.075022,16.941942,9.317074,9.706432,14.477196,0.130963,1.764265,4.714064,10.887437,...,12.979026,12.599238,10.817039,11.842455,8.791468,1.569748,11.702942,6.244975,2.944997,14.53038
ACH-000164,13.991268,2.579163,17.134937,9.55439,11.698739,14.880114,0.130963,8.914012,6.588533,11.203131,...,12.769463,11.896701,12.273183,12.263763,10.776197,0.569748,9.966419,9.899531,2.944997,13.806081


In [7]:
RNA.shape

(983, 463)

## Drugs

In [8]:
# One annotation row per distinct (name, moa, indication, phase), indexed by drug name.
drugInfo = (
    pd.read_csv('../../data/processed/drugCellLinePairsData.csv')
      [['name', 'moa', 'indication', 'phase']]
      .drop_duplicates(keep='first')
      .set_index('name')
)
print(drugInfo.shape)
drugInfo.head()

(1119, 3)


Unnamed: 0_level_0,moa,indication,phase
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
floxuridine,DNA synthesis inhibitor,colorectal cancer,Launched
valrubicin,"DNA inhibitor, topoisomerase inhibitor",bladder cancer,Launched
belinostat,HDAC inhibitor,peripheral T-cell lymphoma (PTCL),Launched
romidepsin,HDAC inhibitor,cutaneous T-cell lymphoma (CTCL),Launched
dihydroartemisinin,antimalarial agent,malaria,Launched


In [9]:
# Keep only fingerprints for drugs that have an annotation row in drugInfo.
drugs = pd.read_csv('../../data/processed/drug_fingerprints.csv', index_col=0)
drugs = drugs.loc[drugs.index.isin(drugInfo.index)]
print(drugs.shape)

(1119, 256)


## Pairs

In [10]:
# Every (cell line, drug) combination, cell line varying slowest.
pairs = list(product(RNA.index.values, drugs.index.values))

In [11]:
# Tabulate the pairs and order them by cell line, then drug (original index labels are kept).
pairDF = (
    pd.DataFrame(pairs, columns=['cellLine', 'drug'])
      .sort_values(by=['cellLine', 'drug'])
)
print(pairDF.shape)
pairDF.head()

(1099977, 2)


Unnamed: 0,cellLine,drug
557951,ACH-000001,1-azakenpaullone
558108,ACH-000001,1-naphthyl-PP1
558174,ACH-000001,1-phenylbiguanide
557777,ACH-000001,10-deacetylbaccatin
558099,ACH-000001,10-hydroxycamptothecin


## Add cancer type

In [12]:
# Primary disease for every cell line present in pairDF, exposed as 'cancerType'.
cellInfo = (
    pd.read_csv('../../data/processed/cellLineInfo.csv', index_col=0)
      .loc[list(pairDF.cellLine.unique()), ['primary_disease']]
      .rename(columns={'primary_disease': 'cancerType'})
)
cellInfo

Unnamed: 0_level_0,cancerType
DepMap_ID,Unnamed: 1_level_1
ACH-000001,Ovarian Cancer
ACH-000002,Leukemia
ACH-000003,Colon/Colorectal Cancer
ACH-000004,Leukemia
ACH-000005,Leukemia
...,...
ACH-002680,Brain Cancer
ACH-002687,Eye Cancer
ACH-002693,Sarcoma
ACH-002710,Sarcoma


In [13]:
# Cancer types retained for the analysis.
targetCancers = ['Bladder Cancer',
                 'Head and Neck Cancer',
                 'Gastric Cancer',
                 'Prostate Cancer']

# Look up the cancer type of a cell line ID in cellInfo.
getCT = lambda x: cellInfo.loc[x, 'cancerType']

cancerTypes = pairDF.cellLine.apply(getCT)
pairDF.insert(1, 'cancerType', cancerTypes)
pairDF = pairDF[pairDF.cancerType.isin(targetCancers)]

In [14]:
pairDF.shape

(115257, 3)

## Add drug indication

In [15]:
def _drugField(column):
    """Return a lookup that maps a drug name to the lower-cased string of drugInfo[column]."""
    return lambda x: str(drugInfo.loc[x, column]).lower()

getMOA, getPhase, getIndication = (_drugField(column)
                                   for column in ('moa', 'phase', 'indication'))

# Columns are appended in the order moa, phase, indication.
for column, getter in (('moa', getMOA),
                       ('phase', getPhase),
                       ('indication', getIndication)):
    pairDF[column] = pairDF.drug.apply(getter)

In [16]:
# Drop both lookup tables from the namespace.
del drugInfo
del cellInfo

## CDR model

In [17]:
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code --
# only load model artifacts from a trusted source.
cdr = joblib.load('../../models/fsCDR/RF/EmbedDrug-EmbedCell-Concat-RF-CVfold1.joblib')

# Get test preds

The predictions are computed in chunks because the kernel kept dying when trying to get the raw data for everything all at once.

In [21]:
# Rows scored per batch; the full pair table is not materialised at once.
chunkSize = 25000

preds = []
encoder = getModel(rnaRFPath, drugPath)
nPairs = len(pairDF)
print("[INFO] gettting predictions...")
for start in range(0, nPairs, chunkSize):
    # Scale before rounding so the percentage never shows float artefacts.
    print(f"[INFO] {round(100 * start / nPairs, 2)}% of predictions obtained...")
    # .iloc makes the positional slicing explicit: pairDF keeps the index
    # labels of the unfiltered pair table, so labels != positions.
    chunk = pairDF.iloc[start:start + chunkSize]
    data = [drugs.loc[chunk.drug, :].to_numpy(),
            RNA.loc[chunk.cellLine, :].to_numpy()]

    # Column 1 of predict_proba is kept as the prediction score.
    preds.extend(cdr.predict_proba(encoder(data))[:, 1])

print("[INFO] obtained predictions...")

# assign() returns a new frame, so the columns are not written onto a filtered slice.
pairDF = pairDF.assign(pred=preds)

print("[INFO] getting drug rankings...")
# Rank drugs within each cell line, 1 = highest prediction. method='first'
# breaks ties by row order, so equal predictions get reproducible ranks.
pairDF['cellRank'] = (
    pairDF.groupby('cellLine')['pred']
          .rank(method='first', ascending=False)
          .astype(int)
)

print("[INFO] done.")

[INFO] gettting predictions...
[INFO] 0.0% of predictions obtained...
[INFO] 21.69% of predictions obtained...
[INFO] 43.38% of predictions obtained...
[INFO] 65.07% of predictions obtained...
[INFO] 86.76% of predictions obtained...
[INFO] obtained predictions...
[INFO] getting drug rankings...
[INFO] done.


In [22]:
pairDF.head()

Unnamed: 0,cellLine,cancerType,drug,moa,phase,indication,pred,cellRank
1051430,ACH-000090,Prostate Cancer,1-azakenpaullone,glycogen synthase kinase inhibitor,preclinical,,0.016435,579
1051587,ACH-000090,Prostate Cancer,1-naphthyl-PP1,src inhibitor,preclinical,,0.007334,822
1051653,ACH-000090,Prostate Cancer,1-phenylbiguanide,serotonin receptor agonist,preclinical,,0.013188,654
1051256,ACH-000090,Prostate Cancer,10-deacetylbaccatin,antitumor agent,preclinical,,0.014362,620
1051578,ACH-000090,Prostate Cancer,10-hydroxycamptothecin,topoisomerase inhibitor,preclinical,,0.875929,6


In [23]:
# Persist the predictions without the index column.
pairDF.to_csv(path_or_buf='repurposingPreds.csv', index=False)

# For each cancer type, find drugs whose mean rank beats both 50 and the reference FDA drug

In [4]:
# Reload the saved predictions.
pairDF = pd.read_csv(filepath_or_buffer='repurposingPreds.csv')
print(pairDF.shape)
pairDF.head()

(115257, 8)


Unnamed: 0,cellLine,cancerType,drug,moa,phase,indication,pred,cellRank
0,ACH-000090,Prostate Cancer,1-azakenpaullone,glycogen synthase kinase inhibitor,preclinical,,0.016435,579
1,ACH-000090,Prostate Cancer,1-naphthyl-PP1,src inhibitor,preclinical,,0.007334,822
2,ACH-000090,Prostate Cancer,1-phenylbiguanide,serotonin receptor agonist,preclinical,,0.013188,654
3,ACH-000090,Prostate Cancer,10-deacetylbaccatin,antitumor agent,preclinical,,0.014362,620
4,ACH-000090,Prostate Cancer,10-hydroxycamptothecin,topoisomerase inhibitor,preclinical,,0.875929,6


In [5]:
pairDF.cancerType.unique()

array(['Prostate Cancer', 'Gastric Cancer', 'Bladder Cancer',
       'Head and Neck Cancer'], dtype=object)

In [7]:
pairDF.drug.nunique()

1119

In [8]:
103 * 1119

115257

In [15]:
# Reference drug per cancer type, written out explicitly. Zipping against
# pairDF.cancerType.unique() ties the pairing to the row order of the CSV and
# silently mismatches drugs and cancer types if that order ever changes.
bestFDA = [('Prostate Cancer', 'cabazitaxel'),
           ('Gastric Cancer', 'docetaxel'),
           ('Bladder Cancer', 'valrubicin'),
           ('Head and Neck Cancer', 'docetaxel')]

# ranks[cancer type] = mean cellRank of its reference drug, rounded to 1 decimal.
ranks = {}
for ct, d in bestFDA:
    isPair = (pairDF.cancerType == ct) & (pairDF.drug == d)
    meanRank = pairDF.loc[isPair, 'cellRank'].mean()
    ranks[ct] = round(meanRank, 1)
    print(f"Mean rank of highest rated FDA drug ({d}) for {ct}: {ranks[ct]}")

Mean rank of highest rated FDA drug (cabazitaxel) for Prostate Cancer: 17.7
Mean rank of highest rated FDA drug (docetaxel) for Gastric Cancer: 79.4
Mean rank of highest rated FDA drug (valrubicin) for Bladder Cancer: 70.6
Mean rank of highest rated FDA drug (docetaxel) for Head and Neck Cancer: 85.1


In [39]:
ct = 'Bladder Cancer'
# Average only the numeric columns: on pandas >= 2.0 a bare groupby().mean()
# raises on the text columns (cellLine, moa, ...) instead of dropping them.
bladder = (
    pairDF[pairDF.cancerType == ct]
    .groupby(by='drug')[['pred', 'cellRank']]
    .mean()
    .sort_values('cellRank')
)
# Drugs whose mean rank beats both 50 and the reference drug's mean rank.
bladder[bladder.cellRank < min(50, ranks[ct])].head(50)

Unnamed: 0_level_0,pred,cellRank
drug,Unnamed: 1_level_1,Unnamed: 2_level_1
maytansinol-isobutyrate,0.990099,1.294118
triptolide,0.976101,2.117647
10-hydroxycamptothecin,0.896769,4.764706
dolastatin-10,0.874898,5.0
exatecan-mesylate,0.854484,6.411765
SB-743921,0.838908,7.705882
genz-644282,0.741447,11.176471
romidepsin,0.782408,11.529412
YM-155,0.743942,12.058824
cabazitaxel,0.677655,13.411765


In [44]:
rounder = lambda x: round(x, 1)
# Use the explicit key rather than the global `ct`, which holds whatever
# cancer type the most recently executed cell assigned.
bladderCutoff = min(50, ranks['Bladder Cancer'])
for meanRank in bladder[bladder.cellRank < bladderCutoff].cellRank.apply(rounder):
    print(meanRank)

1.3
2.1
4.8
5.0
6.4
7.7
11.2
11.5
12.1
13.4
13.9
14.9
15.3
15.4
16.6
17.0
20.0
23.5
23.6
23.8
25.6
25.7
26.1
27.5
28.6
29.8
30.0
30.2
32.4
33.1
34.2
37.2
38.0
39.2
42.0
42.4
43.2
43.8
44.0
44.8
45.6
45.6
45.6
48.0
49.2


In [40]:
ct = 'Gastric Cancer'
# Average only the numeric columns: on pandas >= 2.0 a bare groupby().mean()
# raises on the text columns (cellLine, moa, ...) instead of dropping them.
gastric = (
    pairDF[pairDF.cancerType == ct]
    .groupby(by='drug')[['pred', 'cellRank']]
    .mean()
    .sort_values('cellRank')
)
# Drugs whose mean rank beats both 50 and the reference drug's mean rank.
gastric[gastric.cellRank < min(50, ranks[ct])].head(50)

Unnamed: 0_level_0,pred,cellRank
drug,Unnamed: 1_level_1,Unnamed: 2_level_1
maytansinol-isobutyrate,0.988621,1.461538
triptolide,0.975248,2.25641
dolastatin-10,0.927624,3.666667
10-hydroxycamptothecin,0.849258,6.717949
exatecan-mesylate,0.840985,7.051282
SB-743921,0.841569,7.358974
romidepsin,0.829173,9.25641
genz-644282,0.747844,10.794872
YM-155,0.753766,11.717949
epothilone-b,0.76134,12.179487


In [46]:
# Use the explicit key rather than the global `ct`, which holds whatever
# cancer type the most recently executed cell assigned.
gastricCutoff = min(50, ranks['Gastric Cancer'])
for drugName in gastric[gastric.cellRank < gastricCutoff].index:
    print(drugName)

maytansinol-isobutyrate
triptolide
dolastatin-10
10-hydroxycamptothecin
exatecan-mesylate
SB-743921
romidepsin
genz-644282
YM-155
epothilone-b
echinomycin
alvespimycin
gemcitabine
camptothecin
cabazitaxel
vincristine
sangivamycin
nemorubicin
BGT226
GZD824
epothilone-d
CUDC-907
oltipraz
OTS167
verubulin
penicillamine-(D)
JNJ-26481585
aztreonam
cefpiramide
rubitecan
ixazomib-citrate
temsirolimus
litronesib
ixazomib
BAY-11-7085
propoxycaine
norgestrel
levonorgestrel
LY2874455
PF-03758309
mitoxantrone
R547
poziotinib
SN-38
KPT-185


In [38]:
ct = 'Head and Neck Cancer'
# Average only the numeric columns: on pandas >= 2.0 a bare groupby().mean()
# raises on the text columns (cellLine, moa, ...) instead of dropping them.
head = (
    pairDF[pairDF.cancerType == ct]
    .groupby(by='drug')[['pred', 'cellRank']]
    .mean()
    .sort_values('cellRank')
)
# Drugs whose mean rank beats both 50 and the reference drug's mean rank.
head[head.cellRank < min(50, ranks[ct])].head(50)

Unnamed: 0_level_0,pred,cellRank
drug,Unnamed: 1_level_1,Unnamed: 2_level_1
maytansinol-isobutyrate,0.994721,1.297297
triptolide,0.981726,2.513514
dolastatin-10,0.963003,2.945946
10-hydroxycamptothecin,0.892577,6.135135
romidepsin,0.884458,6.27027
exatecan-mesylate,0.855277,8.054054
echinomycin,0.847492,8.081081
SB-743921,0.845088,8.675676
YM-155,0.832267,8.864865
epothilone-b,0.815811,9.540541


In [48]:
# Use the explicit key rather than the global `ct`, which holds whatever
# cancer type the most recently executed cell assigned.
headCutoff = min(50, ranks['Head and Neck Cancer'])
for meanRank in head[head.cellRank < headCutoff].cellRank.apply(rounder):
    print(meanRank)

1.3
2.5
2.9
6.1
6.3
8.1
8.1
8.7
8.9
9.5
11.1
11.6
13.6
14.6
15.6
17.4
18.0
19.2
23.6
24.1
24.6
25.0
25.3
26.3
27.7
29.7
29.8
31.0
31.5
32.1
33.4
37.3
37.4
37.6
38.5
38.9
42.1
42.2
42.3
42.5
43.2
45.9
46.0
48.9


In [51]:
ct = 'Prostate Cancer'
# Average only the numeric columns: on pandas >= 2.0 a bare groupby().mean()
# raises on the text columns (cellLine, moa, ...) instead of dropping them.
prostate = (
    pairDF[pairDF.cancerType == ct]
    .groupby(by='drug')[['pred', 'cellRank']]
    .mean()
    .sort_values('cellRank')
)
# Drugs whose mean rank beats both 50 and the reference drug's mean rank.
prostate[prostate.cellRank < min(50, ranks[ct])]

Unnamed: 0_level_0,pred,cellRank
drug,Unnamed: 1_level_1,Unnamed: 2_level_1
maytansinol-isobutyrate,0.993549,1.1
dolastatin-10,0.972543,2.1
triptolide,0.949424,3.3
romidepsin,0.909547,4.8
10-hydroxycamptothecin,0.863173,6.3
echinomycin,0.861907,6.6
exatecan-mesylate,0.830781,8.0
SB-743921,0.831126,8.1
alvespimycin,0.806078,8.8
YM-155,0.805492,9.1


In [53]:
# Use the explicit key rather than the global `ct`, which holds whatever
# cancer type the most recently executed cell assigned.
prostateCutoff = min(50, ranks['Prostate Cancer'])
for meanRank in prostate[prostate.cellRank < prostateCutoff].cellRank.apply(rounder):
    print(meanRank)

1.1
2.1
3.3
4.8
6.3
6.6
8.0
8.1
8.8
9.1
10.2
11.9
13.6
15.4
15.4
16.1


# Get drug info

In [71]:
# Union of the drugs that pass each cancer type's cutoff, plus the three reference drugs.
selected = {'docetaxel', 'valrubicin', 'cabazitaxel'}
for table, cancer in ((bladder, 'Bladder Cancer'),
                      (head, 'Head and Neck Cancer'),
                      (gastric, 'Gastric Cancer'),
                      (prostate, 'Prostate Cancer')):
    selected.update(table[table.cellRank < min(50, ranks[cancer])].index)
drugs = list(selected)

# One annotation row per selected drug, names lower-cased, sorted alphabetically.
drugInfo = (
    pairDF.loc[pairDF.drug.isin(drugs), ['drug', 'moa', 'indication', 'phase']]
          .drop_duplicates(keep='first')
          .assign(drug=lambda frame: frame.drug.str.lower())
          .sort_values(by='drug')
)
drugInfo

Unnamed: 0,drug,moa,indication,phase
4,10-hydroxycamptothecin,topoisomerase inhibitor,,preclinical
443,alvespimycin,hsp inhibitor,,phase 2
64,azd8330,mek inhibitor,,phase 1
476,aztreonam,bacterial cell wall synthesis inhibitor,"urinary tract infections, respiratory tract in...",launched
68,bay-11-7085,nfkb pathway inhibitor,,preclinical
71,bgt226,pi3k inhibitor,,phase 1/phase 2
522,cabazitaxel,microtubule inhibitor,prostate cancer,launched
527,camptothecin,topoisomerase inhibitor,,phase 3
541,cefpiramide,bacterial cell wall synthesis inhibitor,gram-negative bacterial infections,launched
122,cudc-907,pi3k inhibitor,,phase 2


In [66]:
drugInfo

Unnamed: 0,drug,moa,indication,phase
1102,vincristine,tubulin polymerization inhibitor,acute lymphoblastic leukemia (all),launched
1098,verubulin,tubulin polymerization inhibitor,,phase 2
1079,triptolide,rna polymerase inhibitor,,phase 3
1043,thiomersal,other antibiotic,,launched
1029,temsirolimus,mtor inhibitor,renal cell carcinoma (rcc),launched
976,sangivamycin,dna inhibitor,,phase 1
970,rubitecan,topoisomerase inhibitor,,phase 3
969,romidepsin,hdac inhibitor,cutaneous t-cell lymphoma (ctcl),launched
940,propoxycaine,local anesthetic,anesthetic,launched
933,poziotinib,egfr inhibitor,,phase 2
