In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

## Old mutation and drug response dataset

In [2]:
alterations = pd.read_csv("data/Alterations.txt", delim_whitespace  = True)
alterations = alterations.set_index("Description").transpose()
alterations.head()

Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,HNRNPDL_del,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del
127399_SOFT_TISSUE,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
response = pd.read_csv('data/Drug_response.txt', sep = '\t')
response.head()

Unnamed: 0,Description,22RV1_PROSTATE,2313287_STOMACH,42MGBA_CENTRAL_NERVOUS_SYSTEM,451LU_SKIN,5637_URINARY_TRACT,639V_URINARY_TRACT,647V_URINARY_TRACT,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,769P_KIDNEY,...,VMRCRCW_KIDNEY,VMRCRCZ_KIDNEY,WM115_SKIN,WM793_SKIN,WSUDLCL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,WSUNHL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,YAPC_PANCREAS,YH13_CENTRAL_NERVOUS_SYSTEM,YKG1_CENTRAL_NERVOUS_SYSTEM,ZR7530_BREAST
0,(5Z)-7-Oxozeaenol,0.862564,0.759749,0.658579,0.104421,0.821017,0.710453,0.662955,0.578093,0.662305,...,,0.652107,0.489042,0.480574,0.977626,0.564966,0.882111,0.439166,0.650792,0.899007
1,5-Fluorouracil,0.486544,0.606334,0.724452,0.958518,0.86427,0.954015,0.851043,0.504025,0.607007,...,0.966414,0.965504,0.864352,0.933608,0.976159,0.895568,0.926724,0.97995,0.917654,
2,681640,0.957936,0.968253,0.947649,,0.923259,0.98886,0.958651,0.875268,,...,,0.959264,0.979195,0.935878,0.98393,0.979477,0.965043,0.919013,0.890455,0.982976
3,A-443654,,,,,,,,0.388111,,...,,,,,0.841546,0.989981,,,,
4,A-770041,,,,,,,,0.795282,,...,,,,,0.992595,0.986994,,,,


In [4]:
print("Number of different drugs: ",len(response["Description"].unique()))
print("Number of different cell lines: ", len(response.columns.unique()))
print("Total number of tests: ", response.notnull().astype(int).sum().sum())
print("Number of tests and cells per drug compounds:")
response_summary = pd.DataFrame(response.set_index("Description").notnull().astype(int).sum(axis = 1)).sort_values([0], ascending = False)
response_summary.rename(index = str, columns = {0 : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

Number of different drugs:  265
Number of different cell lines:  744
Total number of tests:  159083
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of unique cell populations
Description,Unnamed: 1_level_1
Bleomycin (50 uM),705
SN-38,702
PFI-1,701
UNC0638 (2),701
IOX2,700


In [5]:
# Drug with largest number of cells: Bleomycin (50 uM)

final_old_matrix = pd.merge(alterations, response[response["Description"] == "Bleomycin (50 uM)"].set_index("Description").T, how = "inner", left_index = True, right_index = True)
final_old_matrix.dropna(axis = 0, inplace = True)
final_old_matrix

Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,Bleomycin (50 uM)
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.858908
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208277
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483364
A431_SKIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320193
A4FUK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.912958
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.547133
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.569965
BICR78_UPPER_AERODIGESTIVE_TRACT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.521187
CADOES1_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.660936
CCFSTTG1_CENTRAL_NERVOUS_SYSTEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.480362


In [6]:
# Information about that final matrix:

print("Number of different cell lines: ", len(final_old_matrix.index.unique()))
print("Number of different mutations: ", len(final_old_matrix.columns.unique()))
print("Total number of experiments: ", final_old_matrix.notnull().sum().sum())

Number of different cell lines:  698
Number of different mutations:  64145
Total number of experiments:  44773210


## New dataset

In [7]:
response2 = pd.read_excel("data/GDSC/Fitted_dose_response.xlsx")
response2.head()

Unnamed: 0,DATASET_VERSION,IC50_RESULTS_ID,COSMIC_ID,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,MAX_CONC_MICROMOLAR,MIN_CONC_MICROMOLAR,LN_IC50,AUC,RMSE,Z_SCORE
0,17.3,1,683665,MC-CAR,1,Erlotinib,EGFR,2.0,0.007812,2.453524,0.98261,0.021678,-0.015505
1,17.3,1482,684055,ES3,1,Erlotinib,EGFR,2.0,0.007812,3.376592,0.985169,0.029915,0.779999
2,17.3,1791,684057,ES5,1,Erlotinib,EGFR,2.0,0.007812,3.614664,0.983207,0.031201,0.98517
3,17.3,2177,684059,ES7,1,Erlotinib,EGFR,2.0,0.007812,3.223394,0.984574,0.093857,0.647971
4,17.3,2754,684062,EW-11,1,Erlotinib,EGFR,2.0,0.007812,2.486405,0.946034,0.08728,0.012832


In [8]:
#Information

print("Number of different drugs: ",len(response2["DRUG_NAME"].unique()))
print("Number of different cell lines: ", len(response2["CELL_LINE_NAME"].unique()))
print("Total number of tests: ", len(response2.index))
print("Number of tests and cells per drug compounds:")
response_summary = response2.groupby(["DRUG_NAME"])["CELL_LINE_NAME"].agg(['count','nunique']).sort_values(["nunique"], ascending = False)
response_summary.rename(index = str, columns = {'count': "Number of tests", "nunique" : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

Number of different drugs:  251
Number of different cell lines:  1065
Total number of tests:  224202
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of tests,Number of unique cell populations
DRUG_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Avagacestat,1934,1043
JQ1,1881,1040
CHIR-99021,1879,1040
AZD6482,1864,1038
UNC0638,1930,1038


In [10]:
# Drug with largest number of cells: Avagacestat

conversion = pd.read_excel("data/GDSC/GDSC_CCLE_conversion.xlsx")[["GDSC1000 cosmic id","GDSC1000 name","CCLE name"]] #load the conversion dataframe
final_new_matrix = response2[response2["DRUG_NAME"] == "Avagacestat"] #filter the responses df to keep only the selected drug
final_new_matrix = pd.merge(left = final_new_matrix, right = conversion, how = 'left', left_on = 'COSMIC_ID', right_on = "GDSC1000 cosmic id") #add the corresponding CCLE names
final_new_matrix = final_new_matrix.drop(labels = ["DATASET_VERSION", "IC50_RESULTS_ID", "PUTATIVE_TARGET","MAX_CONC_MICROMOLAR", "MIN_CONC_MICROMOLAR", "RMSE", "Z_SCORE", "GDSC1000 cosmic id", "GDSC1000 name"], axis = 1)
final_new_matrix = final_new_matrix.pivot_table(index = "CCLE name", columns = "DRUG_NAME", values = "AUC")
final_new_matrix = alterations.merge(final_new_matrix[['Avagacestat']], left_index = True, right_index = True).dropna(axis = 0)
final_new_matrix

Unnamed: 0,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,Avagacestat
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.929403
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.957215
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.980485
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.970697
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.983013
CORL23_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.945110
DOV13_OVARY,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.980110
G401_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.960608
G402_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.958368
HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.966866


In [11]:
# Information about that final matrix:

print("Number of different cell lines: ", len(final_new_matrix.index.unique()))
print("Number of different mutations: ", len(final_new_matrix.columns.unique()))
print("Total number of experiments: ", final_new_matrix.notnull().sum().sum())

Number of different cell lines:  383
Number of different mutations:  64145
Total number of experiments:  24567535


In [12]:
response3 = pd.read_csv("data/Depmap Project/GDSC_AUC.csv")
response3['Unnamed: 0'] = response3["Unnamed: 0"].apply(lambda a: re.findall(r"GDSC:(\d+)", a)[0])
response3.rename(index = str, columns = {"Unnamed: 0": "Drugs"}, inplace = True)
response3.set_index("Drugs", inplace = True)
response3.head()

Unnamed: 0_level_0,ACH-002137,ACH-000474,ACH-002089,ACH-000956,ACH-000948,ACH-000323,ACH-001002,ACH-000905,ACH-000973,ACH-000896,...,ACH-002207,ACH-000827,ACH-000534,ACH-001709,ACH-000332,ACH-000469,ACH-000570,ACH-002208,ACH-002317,ACH-000828
Drugs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,0.992474,0.986123,,,,,0.992171,
1001,0.817796,0.943611,0.971663,0.899492,0.939093,0.866271,0.925465,0.92639,0.896797,0.837512,...,0.650388,0.873253,0.934107,0.821713,0.948678,0.960856,0.745632,,0.988175,0.90317
1004,0.681053,0.409378,0.581949,0.600888,0.579856,0.568026,0.344816,0.25364,0.448963,0.692615,...,0.366429,0.596463,0.405591,0.364567,0.849627,0.437981,0.514133,,0.964358,0.82137
1005,0.956814,0.966637,0.792002,0.913204,0.969032,0.923544,0.816539,0.807772,0.710855,0.871049,...,0.846198,0.883452,0.976314,0.920612,0.984952,0.883545,0.829039,,0.988405,0.963967
1006,0.973314,0.509397,0.537315,0.80214,0.56109,0.79677,0.749275,0.695291,0.814927,0.805393,...,0.787294,0.675927,0.908963,0.506381,0.901201,0.612377,0.628725,,0.979273,0.855359


In [13]:
depmap = pd.read_csv("data/Depmap Project/sample_info.csv")#, sep = "\t")#, delim_whitespace  = True)
conversion_dict = depmap[["Broad_ID","aliases"]].set_index("Broad_ID").to_dict()['aliases']
conversion_dict

{'ACH-000004': 'HEL',
 'ACH-000005': 'HEL9217',
 'ACH-000007': 'LS513',
 'ACH-000009': 'C2BBe1',
 'ACH-000011': '253J',
 'ACH-000012': 'HCC827',
 'ACH-000013': 'ONCODG1',
 'ACH-000014': 'A101D',
 'ACH-000015': 'NCI-H1581',
 'ACH-000017': 'SK-BR-3',
 'ACH-000018': 'T24',
 'ACH-000019': 'MCF7',
 'ACH-000021': 'NCI-H1693',
 'ACH-000022': 'PATU8988S',
 'ACH-000025': 'CH-157MN',
 'ACH-000028': 'KPL-1',
 'ACH-000030': 'PC-14',
 'ACH-000035': 'NCI-H1650',
 'ACH-000036': 'U343',
 'ACH-000037': 'S117',
 'ACH-000039': 'SK-N-MC',
 'ACH-000040': 'U-118 MG',
 'ACH-000041': nan,
 'ACH-000042': nan,
 'ACH-000045': 'MV4;11',
 'ACH-000047': 'GCIY',
 'ACH-000052': nan,
 'ACH-000053': nan,
 'ACH-000054': 'HT-1080',
 'ACH-000055': 'D283',
 'ACH-000060': 'Panc 10.05',
 'ACH-000067': 'Hs 683',
 'ACH-000070': '697',
 'ACH-000075': 'U-87 MG',
 'ACH-000078': nan,
 'ACH-000082': 'G292CLONEA141B1_BONE',
 'ACH-000085': 'T3M4',
 'ACH-000086': 'ACCMESO1',
 'ACH-000087': nan,
 'ACH-000092': 'NCI-H2452',
 'ACH-000095

In [14]:
response3.rename(index = str, columns = conversion_dict, inplace = True)
response3.head()

Unnamed: 0_level_0,ACH-002137,ACH-000474,ACH-002089,ACH-000956,2313287,42MGBA,ACH-001002,5637,639V,647V,...,ACH-002207,WM-793,ACH-000534,ACH-001709,nan,YH13,YKG1,ACH-002208,ACH-002317,ACH-000828
Drugs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,0.992474,0.986123,,,,,0.992171,
1001,0.817796,0.943611,0.971663,0.899492,0.939093,0.866271,0.925465,0.92639,0.896797,0.837512,...,0.650388,0.873253,0.934107,0.821713,0.948678,0.960856,0.745632,,0.988175,0.90317
1004,0.681053,0.409378,0.581949,0.600888,0.579856,0.568026,0.344816,0.25364,0.448963,0.692615,...,0.366429,0.596463,0.405591,0.364567,0.849627,0.437981,0.514133,,0.964358,0.82137
1005,0.956814,0.966637,0.792002,0.913204,0.969032,0.923544,0.816539,0.807772,0.710855,0.871049,...,0.846198,0.883452,0.976314,0.920612,0.984952,0.883545,0.829039,,0.988405,0.963967
1006,0.973314,0.509397,0.537315,0.80214,0.56109,0.79677,0.749275,0.695291,0.814927,0.805393,...,0.787294,0.675927,0.908963,0.506381,0.901201,0.612377,0.628725,,0.979273,0.855359


In [15]:
response3 = response3.T
conv_dict2 = response2[["DRUG_ID","DRUG_NAME"]].groupby(by = "DRUG_ID").first().astype(str).to_dict()["DRUG_NAME"]
response3.rename(index = str, columns = conv_dict2, inplace = True)
response3

Drugs,1,1001,1004,1005,1006,1007,1008,1009,1010,1011,...,64,71,83,86,87,88,89,9,91,94
ACH-002137,,0.817796,0.681053,0.956814,0.973314,0.793255,0.961593,0.963326,0.361285,0.948181,...,,,,,,,,,,
ACH-000474,,0.943611,0.409378,0.966637,0.509397,0.339583,0.959671,0.831719,0.749995,,...,,,,,,,,,,
ACH-002089,,0.971663,0.581949,0.792002,0.537315,0.668981,0.908456,0.970879,0.910169,0.978947,...,,,,,,,,,,
ACH-000956,,0.899492,0.600888,0.913204,0.802140,0.700832,0.914641,0.980448,0.980936,0.976109,...,,,,,,,,,,
2313287,,0.939093,0.579856,0.969032,0.561090,0.785138,0.879085,0.984961,0.986605,0.970947,...,,,,,,,,,,
42MGBA,,0.866271,0.568026,0.923544,0.796770,0.559385,0.931492,0.957136,0.980874,0.963165,...,,,,,,,,,,
ACH-001002,,0.925465,0.344816,0.816539,0.749275,0.733372,0.986144,0.964027,0.976748,0.911476,...,,,,,,,,,,
5637,,0.926390,0.253640,0.807772,0.695291,0.297872,0.890535,0.986937,0.963056,0.921170,...,,,,,,,,,,
639V,,0.896797,0.448963,0.710855,0.814927,0.623296,0.991228,0.991692,0.943735,0.967331,...,,,,,,,,,,
647V,,0.837512,0.692615,0.871049,0.805393,0.695986,0.977368,0.956395,0.924450,0.979596,...,,,,,,,,,,


In [16]:
response2[response2["DRUG_NAME"] == "AKT inhibitor VIII"]

Unnamed: 0,DATASET_VERSION,IC50_RESULTS_ID,COSMIC_ID,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,MAX_CONC_MICROMOLAR,MIN_CONC_MICROMOLAR,LN_IC50,AUC,RMSE,Z_SCORE
37937,17.3,64,683665,MC-CAR,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,1.503369,0.937827,0.038962,-0.788336
37938,17.3,569,683667,PFSK-1,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,3.581299,0.983804,0.028401,1.238343
37939,17.3,993,684052,A673,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,2.421323,0.975509,0.012956,0.106978
37940,17.3,1545,684055,ES3,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,2.219081,0.949698,0.055024,-0.090276
37941,17.3,1854,684057,ES5,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,1.263841,0.881716,0.060530,-1.021956
37942,17.3,2240,684059,ES7,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,0.059393,0.776918,0.126590,-2.196696
37943,17.3,2817,684062,EW-11,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,1.660378,0.892721,0.039681,-0.635199
37944,17.3,3396,684072,SK-ES-1,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,2.751544,0.966941,0.086352,0.429054
37945,17.3,3941,684681,NCI-H1395,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,3.816082,0.972802,0.026307,1.467335
37946,17.3,4400,687448,COLO-829,171,AKT inhibitor VIII,"AKT1, AKT2, AKT3",2.56,0.01,2.681174,0.929636,0.110856,0.360419


In [17]:
conv_dict2

{1: 'Erlotinib',
 3: 'Rapamycin',
 5: 'Sunitinib',
 6: 'PHA-665752',
 9: 'MG-132',
 11: 'Paclitaxel',
 17: 'Cyclopamine',
 29: 'AZ628',
 30: 'Sorafenib',
 32: 'Tozasertib',
 34: 'Imatinib',
 35: 'NVP-TAE684',
 37: 'Crizotinib',
 38: 'Saracatinib',
 41: 'S-Trityl-L-cysteine',
 45: 'Z-LLNle-CHO',
 51: 'Dasatinib',
 52: 'GNF-2',
 53: 'CGP-60474',
 54: 'CGP-082996',
 55: 'A-770041',
 56: 'WH-4-023',
 59: 'WZ-1-84',
 60: 'BI-2536',
 62: 'BMS-536924',
 63: 'BMS-509744',
 64: 'CMK',
 71: 'Pyrimethamine',
 83: 'JW-7-52-1',
 86: 'A-443654',
 87: 'GW843682X',
 88: 'Entinostat',
 89: 'Parthenolide',
 91: 'GSK319347A',
 94: 'TGX221',
 104: 'Bortezomib',
 106: 'XMD8-85',
 110: 'Seliciclib',
 111: 'Salubrinal',
 119: 'Lapatinib',
 127: 'GSK269962A',
 133: 'Doxorubicin',
 134: 'Etoposide',
 135: 'Gemcitabine',
 136: 'Mitomycin-C',
 140: 'Vinorelbine ',
 147: 'NSC-87877',
 150: 'Bicalutamide',
 151: 'QS11',
 152: 'CP466722',
 153: 'Midostaurin',
 154: 'CHIR-99021',
 155: 'Ponatinib',
 156: 'AZD6482',


In [18]:
response2[response2["DRUG_ID"] == 1004]

Unnamed: 0,DATASET_VERSION,IC50_RESULTS_ID,COSMIC_ID,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,MAX_CONC_MICROMOLAR,MIN_CONC_MICROMOLAR,LN_IC50,AUC,RMSE,Z_SCORE
138776,17.3,169,683665,MC-CAR,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-5.124198,0.491782,0.141227,-0.716554
138777,17.3,674,683667,PFSK-1,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-5.330421,0.460611,0.100147,-0.853394
138778,17.3,1959,684057,ES5,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-5.314827,0.464284,0.095208,-0.843046
138779,17.3,2345,684059,ES7,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-5.269610,0.469456,0.124850,-0.813042
138780,17.3,2922,684062,EW-11,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-4.779189,0.541586,0.059512,-0.487623
138781,17.3,3501,684072,SK-ES-1,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-4.494415,0.589267,0.064547,-0.298661
138782,17.3,4046,684681,NCI-H1395,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-5.297612,0.469107,0.113796,-0.831623
138783,17.3,4954,687452,5637,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-6.618734,0.253640,0.093489,-1.708255
138784,17.3,5458,687455,RT4,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-2.790807,0.818562,0.122158,0.831770
138785,17.3,5965,687457,SW780,1004,Vinblastine,Microtubule destabiliser,0.1,0.000391,-5.837376,0.384836,0.064693,-1.189784


In [19]:
conversion

Unnamed: 0,GDSC1000 cosmic id,GDSC1000 name,CCLE name
0,906800,697,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
1,687452,5637,5637_URINARY_TRACT
2,924100,22RV1,22RV1_PROSTATE
3,910924,23132-87,
4,687561,42-MG-BA,42MGBA_CENTRAL_NERVOUS_SYSTEM
5,906798,639-V,639V_URINARY_TRACT
6,906797,647-V,
7,910922,769-P,769P_KIDNEY
8,905947,786-0,786O_KIDNEY
9,906795,8305C,8305C_THYROID


In [None]:
#Let's looka at the alterations dataset

print("Are there any NaN values in the mutations dataset? ", alterations.isnull().values.any()) #no Nan values
print(f"Size of the dataset: {alterations.shape}: {alterations.shape[0]} mutations available in {alterations.shape[1]-1} cell populations.")

In [None]:
#Let's look at the response dataset

print("Are there any NaN values in the response dataset? ", response.isnull().values.any()) #no Nan values
print(f"Size of the dataset: {response.shape}: the response to {response.shape[0]} drugs in {response.shape[1]} cell populations.")

In [None]:
proportion_non_nans = (response.count()/response.count()["Description"]).sort_values() #looks at the proportions of NaNs for each cell population
proportion_non_nans

In [None]:
weakest_response = min(response.drop('Description', axis = 1).min())
print(weakest_response) #No response is 0 ! 

In [None]:
#What is the proportion of NaNs? Fill
 
filled_response = response.fillna(-1)
liste = range(0,len(filled_response.columns))
filled_response = filled_response.reset_index(drop = True).drop("Description", axis = 1).T.reset_index(drop=True).T#.rename(columns = map(lambda elem: elem, liste))# for i in len(filled_response))
fig = plt.figure( figsize = (15,10))
sns.heatmap(filled_response, cmap = sns.cm.rocket)
plt.xlabel('Cell populations')
plt.ylabel('Drugs');

### Transpose dataframes for homogeneity

In [None]:
alterations_t = alterations.set_index('Description').transpose()
response_t = response.set_index('Description').transpose()

print('Alterations:')
display(alterations_t.head())

print('Responses:')
response_t.head()

#### Let's first try to see if the number of mutations has any impact on the response (naive)

In [None]:
alterations_t['Number of mutations'] = list(alterations_t.iloc[:,:].sum(axis = 1))

In [None]:
print(f'Number of cells in the alterations database: {len(alterations_t.index)}')
print(f'Number of cells in the response database: {len(response_t.index)}')
alterations_t.head()

##### Let's look at the elements in common (these are the only ones that can be used for training)

In [None]:
df1 = pd.concat([response_t, alterations_t['Number of mutations']], axis = 1, join = 'inner')
df1.head()

In [None]:
print(f'Number of cell populations in common: {len(df1.index)}')
proportion_non_nan_drugs = (df1.count()/len(df1.index)).sort_values() #looks at the proportions of NaNs for each cell population
#noNanDrugs = [drug for drug in df1.columns if df1[drug].isnull().values.any() == False]
proportion_non_nan_drugs #we have NaN's in all columns ! What shall we do with them?