In [128]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

## Old mutation and drug response dataset

In [129]:
alterations = pd.read_csv("data/Alterations.txt", delim_whitespace  = True)
alterations = alterations.set_index("Description").transpose()
alterations.head()

Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,HNRNPDL_del,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del
127399_SOFT_TISSUE,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
response = pd.read_csv('data/Drug_response.txt', sep = '\t')
response.head()

Unnamed: 0,Description,22RV1_PROSTATE,2313287_STOMACH,42MGBA_CENTRAL_NERVOUS_SYSTEM,451LU_SKIN,5637_URINARY_TRACT,639V_URINARY_TRACT,647V_URINARY_TRACT,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,769P_KIDNEY,...,VMRCRCW_KIDNEY,VMRCRCZ_KIDNEY,WM115_SKIN,WM793_SKIN,WSUDLCL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,WSUNHL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,YAPC_PANCREAS,YH13_CENTRAL_NERVOUS_SYSTEM,YKG1_CENTRAL_NERVOUS_SYSTEM,ZR7530_BREAST
0,(5Z)-7-Oxozeaenol,0.862564,0.759749,0.658579,0.104421,0.821017,0.710453,0.662955,0.578093,0.662305,...,,0.652107,0.489042,0.480574,0.977626,0.564966,0.882111,0.439166,0.650792,0.899007
1,5-Fluorouracil,0.486544,0.606334,0.724452,0.958518,0.86427,0.954015,0.851043,0.504025,0.607007,...,0.966414,0.965504,0.864352,0.933608,0.976159,0.895568,0.926724,0.97995,0.917654,
2,681640,0.957936,0.968253,0.947649,,0.923259,0.98886,0.958651,0.875268,,...,,0.959264,0.979195,0.935878,0.98393,0.979477,0.965043,0.919013,0.890455,0.982976
3,A-443654,,,,,,,,0.388111,,...,,,,,0.841546,0.989981,,,,
4,A-770041,,,,,,,,0.795282,,...,,,,,0.992595,0.986994,,,,


In [131]:
print("Number of different drugs: ",len(response["Description"].unique()))
print("Number of different cell lines: ", len(response.columns.unique()))
print("Total number of tests: ", response.notnull().astype(int).sum().sum())
print("Number of tests and cells per drug compounds:")
response_summary = pd.DataFrame(response.set_index("Description").notnull().astype(int).sum(axis = 1)).sort_values([0], ascending = False)
response_summary.rename(index = str, columns = {0 : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

Number of different drugs:  265
Number of different cell lines:  744
Total number of tests:  159083
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of unique cell populations
Description,Unnamed: 1_level_1
Bleomycin (50 uM),705
SN-38,702
PFI-1,701
UNC0638 (2),701
IOX2,700


In [132]:
# Drug with largest number of cells: Bleomycin (50 uM)

final_old_matrix = pd.merge(alterations, response[response["Description"] == "Bleomycin (50 uM)"].set_index("Description").T, how = "inner", left_index = True, right_index = True)
final_old_matrix.dropna(axis = 0, inplace = True)
final_old_matrix

Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,Bleomycin (50 uM)
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.858908
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208277
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483364
A431_SKIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320193
A4FUK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.912958
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.547133
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.569965
BICR78_UPPER_AERODIGESTIVE_TRACT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.521187
CADOES1_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.660936
CCFSTTG1_CENTRAL_NERVOUS_SYSTEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.480362


In [133]:
# Information about that final matrix:

print("Number of different cell lines: ", len(final_old_matrix.index.unique()))
print("Number of different mutations: ", len(final_old_matrix.columns.unique()))
print("Total number of experiments: ", final_old_matrix.notnull().sum().sum())

Number of different cell lines:  698
Number of different mutations:  64145
Total number of experiments:  44773210


## New dataset

In [134]:
response2 = pd.read_excel("data/GDSC/Fitted_dose_response.xlsx")
response2.head()

Unnamed: 0,DATASET_VERSION,IC50_RESULTS_ID,COSMIC_ID,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,MAX_CONC_MICROMOLAR,MIN_CONC_MICROMOLAR,LN_IC50,AUC,RMSE,Z_SCORE
0,17.3,1,683665,MC-CAR,1,Erlotinib,EGFR,2.0,0.007812,2.453524,0.98261,0.021678,-0.015505
1,17.3,1482,684055,ES3,1,Erlotinib,EGFR,2.0,0.007812,3.376592,0.985169,0.029915,0.779999
2,17.3,1791,684057,ES5,1,Erlotinib,EGFR,2.0,0.007812,3.614664,0.983207,0.031201,0.98517
3,17.3,2177,684059,ES7,1,Erlotinib,EGFR,2.0,0.007812,3.223394,0.984574,0.093857,0.647971
4,17.3,2754,684062,EW-11,1,Erlotinib,EGFR,2.0,0.007812,2.486405,0.946034,0.08728,0.012832


In [135]:
# #Information

print("____ Pre-merging information ! _____")
print("Number of different drugs: ",len(response2["DRUG_NAME"].unique()))
print("Number of different cell lines: ", len(response2["CELL_LINE_NAME"].unique()))
print("Total number of tests: ", len(response2.index))
print("Number of tests and cells per drug compounds:")
response_summary = response2.groupby(["DRUG_NAME"])["CELL_LINE_NAME"].agg(['count','nunique']).sort_values(["nunique"], ascending = False)
response_summary.rename(index = str, columns = {'count': "Number of tests", "nunique" : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

____ Pre-merging information ! _____
Number of different drugs:  251
Number of different cell lines:  1065
Total number of tests:  224202
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of tests,Number of unique cell populations
DRUG_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Avagacestat,1934,1043
JQ1,1881,1040
CHIR-99021,1879,1040
AZD6482,1864,1038
UNC0638,1930,1038


Creating the conversion matrix from two different sources.

In [136]:
conversion = pd.read_excel("data/GDSC/GDSC_CCLE_conversion.xlsx")[["GDSC1000 cosmic id","GDSC1000 name","CCLE name"]] #load the conversion dataframe
conversion2 = pd.read_excel("data/Depmap Project/conv2.xlsx", header = 8)[["GDSC1000 cosmic id","GDSC1000 name","CCLE name"]] #load the conversion2 dataframe
conversion3 = response2[["DRUG_ID","DRUG_NAME"]]
conv_tot = pd.concat([conversion, conversion2]) #Concatenate the two
conv_tot.drop_duplicates() #Drop potential duplicates
conv_tot.head()

Unnamed: 0,GDSC1000 cosmic id,GDSC1000 name,CCLE name
0,906800,697,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
1,687452,5637,5637_URINARY_TRACT
2,924100,22RV1,22RV1_PROSTATE
3,910924,23132-87,
4,687561,42-MG-BA,42MGBA_CENTRAL_NERVOUS_SYSTEM


In [137]:
# Finish building the response matrix

response2 = pd.merge(left = response2, right = conv_tot, how = 'inner', left_on = 'COSMIC_ID', right_on = "GDSC1000 cosmic id") #add the corresponding CCLE names
response2 = response2.drop(labels = ["DATASET_VERSION", "IC50_RESULTS_ID", "PUTATIVE_TARGET","MAX_CONC_MICROMOLAR", "MIN_CONC_MICROMOLAR", "RMSE", "Z_SCORE", "GDSC1000 cosmic id", "GDSC1000 name"], axis = 1)
response2 = response2.pivot_table(index = "CCLE name", columns = "DRUG_NAME", values = "AUC")
display(response2.head())

DRUG_NAME,(5Z)-7-Oxozeaenol,5-Fluorouracil,A-443654,A-770041,AICA Ribonucleotide,AKT inhibitor VIII,AR-42,AS601245,AS605240,AT-7519,...,XMD8-92,Y-39983,YK-4-279,YM201636,Z-LLNle-CHO,ZG-10,ZM447439,ZSTK474,Zibotentan,rTRAIL
CCLE name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22RV1_PROSTATE,0.861735,0.486315,,,0.899492,0.789973,0.407386,0.915276,0.83654,0.723344,...,0.967511,0.957602,0.873885,0.81368,,0.841406,0.963793,0.580946,0.982525,0.953259
42MGBA_CENTRAL_NERVOUS_SYSTEM,0.657716,0.72377,,,0.866271,0.930914,0.682441,0.892492,0.949897,0.849353,...,,0.947804,0.74305,0.934114,,,0.865274,0.914677,0.984417,0.918075
5637_URINARY_TRACT,0.820701,0.863951,,,0.92639,0.942392,0.367742,0.868024,0.731431,0.631728,...,,0.932738,0.581957,0.909406,,,0.801499,0.721278,0.989178,0.831758
639V_URINARY_TRACT,0.70962,0.953706,,,0.896797,0.96848,0.836887,0.79618,0.96425,0.988952,...,0.919457,0.973766,0.726722,0.974948,,0.806263,0.972147,0.982849,0.991687,0.888246
697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.577808,0.503924,0.387922,0.795119,0.71425,0.661972,0.173788,0.824592,0.41496,0.205645,...,,0.82964,0.631235,0.773976,0.7534,,0.661727,0.275989,0.990099,0.870943


In [138]:
#Information

print("____ Post-merging information ! _____")
print("Number of different drugs: ",len(response2.columns.unique()))
print("Number of different cell lines: ", len(response2.index.unique()))
print("Total number of tests: ", response2.notnull().sum(axis = 0).sum())
print("Number of tests and cells per drug compounds:")
response_summary = pd.DataFrame(response2.notnull().sum(axis = 0)).sort_values([0], ascending = False)
response_summary.rename(index = str, columns = {0 : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

____ Post-merging information ! _____
Number of different drugs:  251
Number of different cell lines:  387
Total number of tests:  78881
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of unique cell populations
DRUG_NAME,Unnamed: 1_level_1
JQ1,386
Avagacestat,386
CHIR-99021,386
UNC0638,385
Bicalutamide,385


In [139]:
# Drug with largest number of cells: Avagacestat

final_new_matrix = response2.loc[:,["Avagacestat"]] #filter the responses df to keep only the selected drug
final_new_matrix = alterations.merge(final_new_matrix[['Avagacestat']], left_index = True, right_index = True).dropna(axis = 0)
final_new_matrix

Unnamed: 0,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,Avagacestat
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.929403
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.957214
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.980485
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.970697
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.983012
CORL23_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.945110
DOV13_OVARY,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.980110
G401_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.960608
G402_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.958368
HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.966866


In [140]:
# Information about that final matrix:

print("Number of different cell lines: ", len(final_new_matrix.index.unique()))
print("Number of different mutations: ", len(final_new_matrix.columns.unique()))
print("Total number of experiments: ", final_new_matrix.notnull().sum().sum())

Number of different cell lines:  383
Number of different mutations:  64145
Total number of experiments:  24567535


## Depmap dataset

In [141]:
response3 = pd.read_csv("data/Depmap Project/GDSC_AUC.csv")
response3['Unnamed: 0'] = response3["Unnamed: 0"].apply(lambda a: re.findall(r"GDSC:(\d+)", a)[0])
response3.rename(index = str, columns = {"Unnamed: 0": "Description"}, inplace = True)
response3.set_index("Description", inplace = True)
response3.head()

Unnamed: 0_level_0,ACH-002137,ACH-000474,ACH-002089,ACH-000956,ACH-000948,ACH-000323,ACH-001002,ACH-000905,ACH-000973,ACH-000896,...,ACH-002207,ACH-000827,ACH-000534,ACH-001709,ACH-000332,ACH-000469,ACH-000570,ACH-002208,ACH-002317,ACH-000828
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,0.992474,0.986123,,,,,0.992171,
1001,0.817796,0.943611,0.971663,0.899492,0.939093,0.866271,0.925465,0.92639,0.896797,0.837512,...,0.650388,0.873253,0.934107,0.821713,0.948678,0.960856,0.745632,,0.988175,0.90317
1004,0.681053,0.409378,0.581949,0.600888,0.579856,0.568026,0.344816,0.25364,0.448963,0.692615,...,0.366429,0.596463,0.405591,0.364567,0.849627,0.437981,0.514133,,0.964358,0.82137
1005,0.956814,0.966637,0.792002,0.913204,0.969032,0.923544,0.816539,0.807772,0.710855,0.871049,...,0.846198,0.883452,0.976314,0.920612,0.984952,0.883545,0.829039,,0.988405,0.963967
1006,0.973314,0.509397,0.537315,0.80214,0.56109,0.79677,0.749275,0.695291,0.814927,0.805393,...,0.787294,0.675927,0.908963,0.506381,0.901201,0.612377,0.628725,,0.979273,0.855359


In [142]:
## Create a dictionary to convert the column names (ACH-...) into cell population names using the "sample_info" file.

depmap = pd.read_csv("data/Depmap Project/sample_info.csv")
conversion_dict = depmap[["Broad_ID","CCLE_name"]].set_index("Broad_ID").to_dict()['CCLE_name']
conversion_dict = {k: v for k, v in conversion_dict.items() if pd.Series(v).notna().all()} #to get rid of the entries corresponding to nan values

# Replace the names of the cell lines. We notice a lot of missing values. 

response3.rename(index = str, columns = conversion_dict, inplace = True)
response3.head()

Unnamed: 0_level_0,ACH-002137,ACH-000474,ACH-002089,ACH-000956,2313287_STOMACH,42MGBA_CENTRAL_NERVOUS_SYSTEM,ACH-001002,5637_URINARY_TRACT,639V_URINARY_TRACT,647V_URINARY_TRACT,...,ACH-002207,WM793_SKIN,ACH-000534,ACH-001709,YAPC_PANCREAS,YH13_CENTRAL_NERVOUS_SYSTEM,YKG1_CENTRAL_NERVOUS_SYSTEM,ACH-002208,ACH-002317,ACH-000828
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,0.992474,0.986123,,,,,0.992171,
1001,0.817796,0.943611,0.971663,0.899492,0.939093,0.866271,0.925465,0.92639,0.896797,0.837512,...,0.650388,0.873253,0.934107,0.821713,0.948678,0.960856,0.745632,,0.988175,0.90317
1004,0.681053,0.409378,0.581949,0.600888,0.579856,0.568026,0.344816,0.25364,0.448963,0.692615,...,0.366429,0.596463,0.405591,0.364567,0.849627,0.437981,0.514133,,0.964358,0.82137
1005,0.956814,0.966637,0.792002,0.913204,0.969032,0.923544,0.816539,0.807772,0.710855,0.871049,...,0.846198,0.883452,0.976314,0.920612,0.984952,0.883545,0.829039,,0.988405,0.963967
1006,0.973314,0.509397,0.537315,0.80214,0.56109,0.79677,0.749275,0.695291,0.814927,0.805393,...,0.787294,0.675927,0.908963,0.506381,0.901201,0.612377,0.628725,,0.979273,0.855359


In [143]:
# Replace the names of the drugs

response3 = response3.T
conv_dict2 = conversion3.groupby(by = "DRUG_ID").first().to_dict()["DRUG_NAME"]
conv_dict2 = {str(k):v for k,v in conv_dict2.items()}
response3.rename(index = str, columns = conv_dict2, inplace = True)
response3.head()

Description,Erlotinib,AICA Ribonucleotide,Vinblastine,Cisplatin,Cytarabine,Docetaxel,Methotrexate,Tretinoin,Gefitinib,Navitoclax,...,CMK,Pyrimethamine,JW-7-52-1,A-443654,GW843682X,Entinostat,Parthenolide,MG-132,GSK319347A,TGX221
ACH-002137,,0.817796,0.681053,0.956814,0.973314,0.793255,0.961593,0.963326,0.361285,0.948181,...,,,,,,,,,,
ACH-000474,,0.943611,0.409378,0.966637,0.509397,0.339583,0.959671,0.831719,0.749995,,...,,,,,,,,,,
ACH-002089,,0.971663,0.581949,0.792002,0.537315,0.668981,0.908456,0.970879,0.910169,0.978947,...,,,,,,,,,,
ACH-000956,,0.899492,0.600888,0.913204,0.80214,0.700832,0.914641,0.980448,0.980936,0.976109,...,,,,,,,,,,
2313287_STOMACH,,0.939093,0.579856,0.969032,0.56109,0.785138,0.879085,0.984961,0.986605,0.970947,...,,,,,,,,,,


In [144]:
#Information

print("Number of different drugs: ",len(response3.columns))
print("Number of different cell lines: ", len(response3.index))
print("Total number of tests: ", response3.notnull().sum(axis = 0).sum())
print("Number of cell populations per drug compounds:")
drug_nb = pd.DataFrame(response3.notnull().astype(int).sum(axis = 0)).sort_values([0], ascending = False)
drug_nb.rename(index = str, columns = {0 : 'Number of unique cell populations'}, inplace = True)
display(drug_nb.head())

Number of different drugs:  266
Number of different cell lines:  969
Total number of tests:  208734
Number of cell populations per drug compounds:


Unnamed: 0_level_0,Number of unique cell populations
Description,Unnamed: 1_level_1
SN-38,935
Bleomycin (50 uM),914
UNC0638,910
PFI-1,910
Piperlongumine,908


In [145]:
# Drug with largest number of cells: SN-38

print("Initial number")
final_depmap_matrix = response3.loc[:,["SN-38"]] #filter the responses df to keep only the selected drug
final_depmap_matrix = alterations.merge(final_depmap_matrix[['SN-38']], left_index = True, right_index = True).dropna(axis = 0)
final_depmap_matrix

Initial number


Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,SN-38
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.559201
A431_SKIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527349
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272082
BICR78_UPPER_AERODIGESTIVE_TRACT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.449346
CADOES1_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.544522
CCFSTTG1_CENTRAL_NERVOUS_SYSTEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.864838
CORL23_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.563115
D283MED_CENTRAL_NERVOUS_SYSTEM,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.542599
ESO26_OESOPHAGUS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.643066
FLO1_OESOPHAGUS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.814089


In [146]:
# Information about that final matrix:

print("Number of different cell lines: ", len(final_depmap_matrix.index.unique()))
print("Number of different mutations: ", len(final_depmap_matrix.columns.unique()))
print("Total number of experiments: ", final_depmap_matrix.notnull().sum().sum())

Number of different cell lines:  324
Number of different mutations:  64145
Total number of experiments:  20782980


# Summary

In [147]:
summary = pd.DataFrame(index = ["Number of different drugs","Number of different cell lines","Total number of tests","Final number of cell lines","Final number of mutations","Final number of experiments"])

summary['Old Dataset'] = [len(response["Description"].unique()), len(response.columns.unique()), response.notnull().astype(int).sum().sum(), len(final_old_matrix.index.unique()), len(final_old_matrix.columns.unique()), final_old_matrix.notnull().sum().sum()]
summary['New Dataset'] = [len(response2.columns.unique()), len(response2.index.unique()), response2.notnull().sum(axis = 0).sum(), len(final_new_matrix.index.unique()), len(final_new_matrix.columns.unique()), final_new_matrix.notnull().sum().sum()]
summary['Depmap Dataset'] = [len(response3.columns), len(response3.index), response3.notnull().sum(axis = 0).sum(), len(final_depmap_matrix.index.unique()), len(final_depmap_matrix.columns.unique()), final_depmap_matrix.notnull().sum().sum()]

summary

Unnamed: 0,Old Dataset,New Dataset,Depmap Dataset
Number of different drugs,265,251,266
Number of different cell lines,744,387,969
Total number of tests,159083,78881,208734
Final number of cell lines,698,383,324
Final number of mutations,64145,64145,64145
Final number of experiments,44773210,24567535,20782980


## A new idea

Another way of creating the feature matrix would be to merge all three datasets together. We noticed that the main problem was usually coming from how the cell names were converted. If these do not overlap much, we could have a larger final dataset.

We will attempt this we Bleomycin, one of the drugs with the largest number of cells for all three datasets.

In [178]:
Bleo1 = response[response["Description"] == "Bleomycin (50 uM)"].set_index("Description").T
Bleo2 = response2.loc[:,["Bleomycin (50 uM)"]]
Bleo3 = response3.loc[:,["Bleomycin (50 uM)"]]

In [179]:
Bleo = pd.concat([Bleo1,Bleo2,Bleo3])
Bleo = Bleo.drop_duplicates()
Bleo = Bleo.reset_index(drop = False)
Bleo = Bleo.groupby(by='index').mean()
Bleo.sort_index()

Unnamed: 0_level_0,Bleomycin (50 uM)
index,Unnamed: 1_level_1
22RV1_PROSTATE,0.858519
2313287_STOMACH,0.400989
42MGBA_CENTRAL_NERVOUS_SYSTEM,0.265196
451LU_SKIN,0.952523
5637_URINARY_TRACT,0.238531
639V_URINARY_TRACT,0.317053
647V_URINARY_TRACT,0.524096
697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.567745
769P_KIDNEY,0.499204
786O_KIDNEY,0.316241


In [180]:
final_matrix = alterations.merge(Bleo[['Bleomycin (50 uM)']], left_index = True, right_index = True).dropna(axis = 0)
final_matrix

Unnamed: 0,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,Bleomycin (50 uM)
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.858519
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207704
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157651
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.482731
A431_SKIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319110
A4FUK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.912958
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.546792
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.569768
BICR78_UPPER_AERODIGESTIVE_TRACT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.520658
CADOES1_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.660276


We end up with 710 cells. This is only a tiny improvement compared to the first dataset, which had 698 cell populations with the Bleomyicin drug. 

It appears that an important work should be carried to improve the conversion from GDSC --> CCLE cell names.