In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

## Old mutation and drug response dataset

In [2]:
alterations = pd.read_csv("data/Alterations.txt", delim_whitespace  = True)
alterations = alterations.set_index("Description").transpose()
alterations.head()

Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,HNRNPDL_del,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del
127399_SOFT_TISSUE,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
response = pd.read_csv('data/Drug_response.txt', sep = '\t')
response.head()

Unnamed: 0,Description,22RV1_PROSTATE,2313287_STOMACH,42MGBA_CENTRAL_NERVOUS_SYSTEM,451LU_SKIN,5637_URINARY_TRACT,639V_URINARY_TRACT,647V_URINARY_TRACT,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,769P_KIDNEY,...,VMRCRCW_KIDNEY,VMRCRCZ_KIDNEY,WM115_SKIN,WM793_SKIN,WSUDLCL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,WSUNHL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,YAPC_PANCREAS,YH13_CENTRAL_NERVOUS_SYSTEM,YKG1_CENTRAL_NERVOUS_SYSTEM,ZR7530_BREAST
0,(5Z)-7-Oxozeaenol,0.862564,0.759749,0.658579,0.104421,0.821017,0.710453,0.662955,0.578093,0.662305,...,,0.652107,0.489042,0.480574,0.977626,0.564966,0.882111,0.439166,0.650792,0.899007
1,5-Fluorouracil,0.486544,0.606334,0.724452,0.958518,0.86427,0.954015,0.851043,0.504025,0.607007,...,0.966414,0.965504,0.864352,0.933608,0.976159,0.895568,0.926724,0.97995,0.917654,
2,681640,0.957936,0.968253,0.947649,,0.923259,0.98886,0.958651,0.875268,,...,,0.959264,0.979195,0.935878,0.98393,0.979477,0.965043,0.919013,0.890455,0.982976
3,A-443654,,,,,,,,0.388111,,...,,,,,0.841546,0.989981,,,,
4,A-770041,,,,,,,,0.795282,,...,,,,,0.992595,0.986994,,,,


In [125]:
[response.loc[i,"Description"] for i in response.index if "Af" in response.loc[i,"Description"]]

'Afatinib (1)'

In [4]:
print("Number of different drugs: ",len(response["Description"].unique()))
print("Number of different cell lines: ", len(response.columns.unique()))
print("Total number of tests: ", response.notnull().astype(int).sum().sum())
print("Number of tests and cells per drug compounds:")
response_summary = pd.DataFrame(response.set_index("Description").notnull().astype(int).sum(axis = 1)).sort_values([0], ascending = False)
response_summary.rename(index = str, columns = {0 : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

Number of different drugs:  265
Number of different cell lines:  744
Total number of tests:  159083
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of unique cell populations
Description,Unnamed: 1_level_1
Bleomycin (50 uM),705
SN-38,702
PFI-1,701
UNC0638 (2),701
IOX2,700


In [5]:
# Drug with largest number of cells: Bleomycin (50 uM)

final_old_matrix = pd.merge(alterations, response[response["Description"] == "Bleomycin (50 uM)"].set_index("Description").T, how = "inner", left_index = True, right_index = True)
final_old_matrix.dropna(axis = 0, inplace = True)
final_old_matrix

Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,Bleomycin (50 uM)
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.858908
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208277
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.483364
A431_SKIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320193
A4FUK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.912958
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.547133
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.569965
BICR78_UPPER_AERODIGESTIVE_TRACT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.521187
CADOES1_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.660936
CCFSTTG1_CENTRAL_NERVOUS_SYSTEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.480362


In [6]:
# Information about that final matrix:

print("Number of different cell lines: ", len(final_old_matrix.index.unique()))
print("Number of different mutations: ", len(final_old_matrix.columns.unique()))
print("Total number of experiments: ", final_old_matrix.notnull().sum().sum())

Number of different cell lines:  698
Number of different mutations:  64145
Total number of experiments:  44773210


## New dataset

In [7]:
response2 = pd.read_excel("data/GDSC/Fitted_dose_response.xlsx")
response2.head()

Unnamed: 0,DATASET_VERSION,IC50_RESULTS_ID,COSMIC_ID,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,MAX_CONC_MICROMOLAR,MIN_CONC_MICROMOLAR,LN_IC50,AUC,RMSE,Z_SCORE
0,17.3,1,683665,MC-CAR,1,Erlotinib,EGFR,2.0,0.007812,2.453524,0.98261,0.021678,-0.015505
1,17.3,1482,684055,ES3,1,Erlotinib,EGFR,2.0,0.007812,3.376592,0.985169,0.029915,0.779999
2,17.3,1791,684057,ES5,1,Erlotinib,EGFR,2.0,0.007812,3.614664,0.983207,0.031201,0.98517
3,17.3,2177,684059,ES7,1,Erlotinib,EGFR,2.0,0.007812,3.223394,0.984574,0.093857,0.647971
4,17.3,2754,684062,EW-11,1,Erlotinib,EGFR,2.0,0.007812,2.486405,0.946034,0.08728,0.012832


In [8]:
# #Information

print("____ Pre-merging information ! _____")
print("Number of different drugs: ",len(response2["DRUG_NAME"].unique()))
print("Number of different cell lines: ", len(response2["CELL_LINE_NAME"].unique()))
print("Total number of tests: ", len(response2.index))
print("Number of tests and cells per drug compounds:")
response_summary = response2.groupby(["DRUG_NAME"])["CELL_LINE_NAME"].agg(['count','nunique']).sort_values(["nunique"], ascending = False)
response_summary.rename(index = str, columns = {'count': "Number of tests", "nunique" : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

____ Pre-merging information ! _____
Number of different drugs:  251
Number of different cell lines:  1065
Total number of tests:  224202
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of tests,Number of unique cell populations
DRUG_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Avagacestat,1934,1043
JQ1,1881,1040
CHIR-99021,1879,1040
AZD6482,1864,1038
UNC0638,1930,1038


Creating the conversion matrix from two different sources.

In [9]:
conversion = pd.read_excel("data/GDSC/GDSC_CCLE_conversion.xlsx")[["GDSC1000 cosmic id","GDSC1000 name","CCLE name"]] #load the conversion dataframe
conversion2 = pd.read_excel("data/Depmap Project/conv2.xlsx", header = 8)[["GDSC1000 cosmic id","GDSC1000 name","CCLE name"]] #load the conversion2 dataframe
conversion3 = response2[["DRUG_ID","DRUG_NAME"]]
conv_tot = pd.concat([conversion, conversion2]) #Concatenate the two
conv_tot.drop_duplicates() #Drop potential duplicates
conv_tot.head()

Unnamed: 0,GDSC1000 cosmic id,GDSC1000 name,CCLE name
0,906800,697,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
1,687452,5637,5637_URINARY_TRACT
2,924100,22RV1,22RV1_PROSTATE
3,910924,23132-87,
4,687561,42-MG-BA,42MGBA_CENTRAL_NERVOUS_SYSTEM


In [10]:
# Finish building the response matrix

response2 = pd.merge(left = response2, right = conv_tot, how = 'inner', left_on = 'COSMIC_ID', right_on = "GDSC1000 cosmic id") #add the corresponding CCLE names
response2 = response2.drop(labels = ["DATASET_VERSION", "IC50_RESULTS_ID", "PUTATIVE_TARGET","MAX_CONC_MICROMOLAR", "MIN_CONC_MICROMOLAR", "RMSE", "Z_SCORE", "GDSC1000 cosmic id", "GDSC1000 name"], axis = 1)
response2 = response2.pivot_table(index = "CCLE name", columns = "DRUG_NAME", values = "AUC")
display(response2.head())

DRUG_NAME,(5Z)-7-Oxozeaenol,5-Fluorouracil,A-443654,A-770041,AICA Ribonucleotide,AKT inhibitor VIII,AR-42,AS601245,AS605240,AT-7519,...,XMD8-92,Y-39983,YK-4-279,YM201636,Z-LLNle-CHO,ZG-10,ZM447439,ZSTK474,Zibotentan,rTRAIL
CCLE name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22RV1_PROSTATE,0.861735,0.486315,,,0.899492,0.789973,0.407386,0.915276,0.83654,0.723344,...,0.967511,0.957602,0.873885,0.81368,,0.841406,0.963793,0.580946,0.982525,0.953259
42MGBA_CENTRAL_NERVOUS_SYSTEM,0.657716,0.72377,,,0.866271,0.930914,0.682441,0.892492,0.949897,0.849353,...,,0.947804,0.74305,0.934114,,,0.865274,0.914677,0.984417,0.918075
5637_URINARY_TRACT,0.820701,0.863951,,,0.92639,0.942392,0.367742,0.868024,0.731431,0.631728,...,,0.932738,0.581957,0.909406,,,0.801499,0.721278,0.989178,0.831758
639V_URINARY_TRACT,0.70962,0.953706,,,0.896797,0.96848,0.836887,0.79618,0.96425,0.988952,...,0.919457,0.973766,0.726722,0.974948,,0.806263,0.972147,0.982849,0.991687,0.888246
697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.577808,0.503924,0.387922,0.795119,0.71425,0.661972,0.173788,0.824592,0.41496,0.205645,...,,0.82964,0.631235,0.773976,0.7534,,0.661727,0.275989,0.990099,0.870943


In [11]:
#Information

print("____ Post-merging information ! _____")
print("Number of different drugs: ",len(response2.columns.unique()))
print("Number of different cell lines: ", len(response2.index.unique()))
print("Total number of tests: ", response2.notnull().sum(axis = 0).sum())
print("Number of tests and cells per drug compounds:")
response_summary = pd.DataFrame(response2.notnull().sum(axis = 0)).sort_values([0], ascending = False)
response_summary.rename(index = str, columns = {0 : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

____ Post-merging information ! _____
Number of different drugs:  251
Number of different cell lines:  387
Total number of tests:  78881
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of unique cell populations
DRUG_NAME,Unnamed: 1_level_1
JQ1,386
Avagacestat,386
CHIR-99021,386
UNC0638,385
Bicalutamide,385


In [12]:
# Drug with largest number of cells: Avagacestat

final_new_matrix = response2.loc[:,["Avagacestat"]] #filter the responses df to keep only the selected drug
final_new_matrix = alterations.merge(final_new_matrix[['Avagacestat']], left_index = True, right_index = True).dropna(axis = 0)
final_new_matrix

Unnamed: 0,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,Avagacestat
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.929403
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.957214
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.980485
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.970697
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.983012
CORL23_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.945110
DOV13_OVARY,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.980110
G401_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.960608
G402_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.958368
HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.966866


In [13]:
# Information about that final matrix:

print("Number of different cell lines: ", len(final_new_matrix.index.unique()))
print("Number of different mutations: ", len(final_new_matrix.columns.unique()))
print("Total number of experiments: ", final_new_matrix.notnull().sum().sum())

Number of different cell lines:  383
Number of different mutations:  64145
Total number of experiments:  24567535


## Depmap dataset

In [228]:
response3 = pd.read_csv("data/Depmap Project/GDSC_AUC.csv")
response3['Unnamed: 0'] = response3["Unnamed: 0"].apply(lambda a: re.findall(r"GDSC:(\d+)", a)[0])
response3.rename(index = str, columns = {"Unnamed: 0": "Description"}, inplace = True)
response3.set_index("Description", inplace = True)
response3.head()

Unnamed: 0_level_0,ACH-002137,ACH-000474,ACH-002089,ACH-000956,ACH-000948,ACH-000323,ACH-001002,ACH-000905,ACH-000973,ACH-000896,...,ACH-002207,ACH-000827,ACH-000534,ACH-001709,ACH-000332,ACH-000469,ACH-000570,ACH-002208,ACH-002317,ACH-000828
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,0.992474,0.986123,,,,,0.992171,
1001,0.817796,0.943611,0.971663,0.899492,0.939093,0.866271,0.925465,0.92639,0.896797,0.837512,...,0.650388,0.873253,0.934107,0.821713,0.948678,0.960856,0.745632,,0.988175,0.90317
1004,0.681053,0.409378,0.581949,0.600888,0.579856,0.568026,0.344816,0.25364,0.448963,0.692615,...,0.366429,0.596463,0.405591,0.364567,0.849627,0.437981,0.514133,,0.964358,0.82137
1005,0.956814,0.966637,0.792002,0.913204,0.969032,0.923544,0.816539,0.807772,0.710855,0.871049,...,0.846198,0.883452,0.976314,0.920612,0.984952,0.883545,0.829039,,0.988405,0.963967
1006,0.973314,0.509397,0.537315,0.80214,0.56109,0.79677,0.749275,0.695291,0.814927,0.805393,...,0.787294,0.675927,0.908963,0.506381,0.901201,0.612377,0.628725,,0.979273,0.855359


In [229]:
## Create a dictionary to convert the column names (ACH-...) into cell population names using the "sample_info" file.

depmap1 = pd.read_csv("data/Depmap Project/sample_info.csv")
depmap2 = pd.read_csv("data/CCLE/CCLE_Cell_lines.txt", sep = "\t")
conversion_dict1 = depmap1[["Broad_ID","CCLE_name"]].set_index("Broad_ID")
conversion_dict2 = depmap2[["depMapID","CCLE_ID"]].set_index("depMapID")
conversion_dict = pd.concat([conversion_dict1,conversion_dict2.rename(index = str, columns = {"CCLE_ID":"CCLE_name"})]).reset_index(drop=False).drop_duplicates().set_index("index").to_dict()['CCLE_name']
conversion_dict = {k: v for k, v in conversion_dict.items() if pd.Series(v).notna().all()} #to get rid of the entries corresponding to nan values

# Replace the names of the cell lines. We notice a lot of missing values. 

response3.rename(index = str, columns = conversion_dict, inplace = True)
response3.head()

Unnamed: 0_level_0,ACH-002137,ACH-000474,201T_LUNG,22RV1_PROSTATE,2313287_STOMACH,42MGBA_CENTRAL_NERVOUS_SYSTEM,451LU_SKIN,5637_URINARY_TRACT,639V_URINARY_TRACT,647V_URINARY_TRACT,...,WM35_SKIN,WM793_SKIN,WSUDLCL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,WSUNHL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,YAPC_PANCREAS,YH13_CENTRAL_NERVOUS_SYSTEM,YKG1_CENTRAL_NERVOUS_SYSTEM,YMB1E_BREAST,YT_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ZR7530_BREAST
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,0.992474,0.986123,,,,,0.992171,
1001,0.817796,0.943611,0.971663,0.899492,0.939093,0.866271,0.925465,0.92639,0.896797,0.837512,...,0.650388,0.873253,0.934107,0.821713,0.948678,0.960856,0.745632,,0.988175,0.90317
1004,0.681053,0.409378,0.581949,0.600888,0.579856,0.568026,0.344816,0.25364,0.448963,0.692615,...,0.366429,0.596463,0.405591,0.364567,0.849627,0.437981,0.514133,,0.964358,0.82137
1005,0.956814,0.966637,0.792002,0.913204,0.969032,0.923544,0.816539,0.807772,0.710855,0.871049,...,0.846198,0.883452,0.976314,0.920612,0.984952,0.883545,0.829039,,0.988405,0.963967
1006,0.973314,0.509397,0.537315,0.80214,0.56109,0.79677,0.749275,0.695291,0.814927,0.805393,...,0.787294,0.675927,0.908963,0.506381,0.901201,0.612377,0.628725,,0.979273,0.855359


In [230]:
# Replace the names of the drugs

response3 = response3.T
conv_dict2 = conversion3.groupby(by = "DRUG_ID").first().to_dict()["DRUG_NAME"]
conv_dict2 = {str(k):v for k,v in conv_dict2.items()}
response3.rename(index = str, columns = conv_dict2, inplace = True)
response3.head()

Description,Erlotinib,AICA Ribonucleotide,Vinblastine,Cisplatin,Cytarabine,Docetaxel,Methotrexate,Tretinoin,Gefitinib,Navitoclax,...,CMK,Pyrimethamine,JW-7-52-1,A-443654,GW843682X,Entinostat,Parthenolide,MG-132,GSK319347A,TGX221
ACH-002137,,0.817796,0.681053,0.956814,0.973314,0.793255,0.961593,0.963326,0.361285,0.948181,...,,,,,,,,,,
ACH-000474,,0.943611,0.409378,0.966637,0.509397,0.339583,0.959671,0.831719,0.749995,,...,,,,,,,,,,
201T_LUNG,,0.971663,0.581949,0.792002,0.537315,0.668981,0.908456,0.970879,0.910169,0.978947,...,,,,,,,,,,
22RV1_PROSTATE,,0.899492,0.600888,0.913204,0.80214,0.700832,0.914641,0.980448,0.980936,0.976109,...,,,,,,,,,,
2313287_STOMACH,,0.939093,0.579856,0.969032,0.56109,0.785138,0.879085,0.984961,0.986605,0.970947,...,,,,,,,,,,


In [231]:
#Information

print("Number of different drugs: ",len(response3.columns))
print("Number of different cell lines: ", len(response3.index))
print("Total number of tests: ", response3.notnull().sum(axis = 0).sum())
print("Number of cell populations per drug compounds:")
drug_nb = pd.DataFrame(response3.notnull().astype(int).sum(axis = 0)).sort_values([0], ascending = False)
drug_nb.rename(index = str, columns = {0 : 'Number of unique cell populations'}, inplace = True)
display(drug_nb.head())

Number of different drugs:  266
Number of different cell lines:  969
Total number of tests:  208734
Number of cell populations per drug compounds:


Unnamed: 0_level_0,Number of unique cell populations
Description,Unnamed: 1_level_1
SN-38,935
Bleomycin (50 uM),914
UNC0638,910
PFI-1,910
Piperlongumine,908


In [232]:
# Drug with largest number of cells: SN-38

final_depmap_matrix = response3.loc[:,["SN-38"]] #filter the responses df to keep only the selected drug
final_depmap_matrix = alterations.merge(final_depmap_matrix[['SN-38']], left_index = True, right_index = True).dropna(axis = 0)
final_depmap_matrix

Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,SN-38
22RV1_PROSTATE,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.588378
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.473547
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.720125
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.559201
A431_SKIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527349
A4FUK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.582721
A673_BONE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272082
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.267989
BICR78_UPPER_AERODIGESTIVE_TRACT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.449346
CADOES1_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.544522


In [233]:
# Information about that final matrix:

print("Number of different cell lines: ", len(final_depmap_matrix.index.unique()))
print("Number of different mutations: ", len(final_depmap_matrix.columns.unique()))
print("Total number of experiments: ", final_depmap_matrix.notnull().sum().sum())

Number of different cell lines:  852
Number of different mutations:  64145
Total number of experiments:  54651540


# Summary

In [234]:
summary = pd.DataFrame(index = ["Number of different drugs","Number of different cell lines","Total number of tests","Final number of cell lines","Final number of mutations","Final number of experiments"])

summary['Old Dataset'] = [len(response["Description"].unique()), len(response.columns.unique()), response.notnull().astype(int).sum().sum(), len(final_old_matrix.index.unique()), len(final_old_matrix.columns.unique()), final_old_matrix.notnull().sum().sum()]
summary['New Dataset'] = [len(response2.columns.unique()), len(response2.index.unique()), response2.notnull().sum(axis = 0).sum(), len(final_new_matrix.index.unique()), len(final_new_matrix.columns.unique()), final_new_matrix.notnull().sum().sum()]
summary['Depmap Dataset'] = [len(response3.columns), len(response3.index), response3.notnull().sum(axis = 0).sum(), len(final_depmap_matrix.index.unique()), len(final_depmap_matrix.columns.unique()), final_depmap_matrix.notnull().sum().sum()]

summary

Unnamed: 0,Old Dataset,New Dataset,Depmap Dataset
Number of different drugs,265,251,266
Number of different cell lines,744,387,969
Total number of tests,159083,78881,208734
Final number of cell lines,698,383,852
Final number of mutations,64145,64145,64145
Final number of experiments,44773210,24567535,54651540


## A new idea

Another way of creating the feature matrix would be to merge all three datasets together. We noticed that the main problem was usually coming from how the cell names were converted. If these do not overlap much, we could have a larger final dataset.

We will attempt this with Bleomycin, one of the drugs with the largest number of cells for all three datasets.

In [235]:
#Create a better response matrix
response1 = response.set_index("Description")
response1 = response1.T

In [236]:
def process_double_drugs(response1, response3):
    
    print("Number of responses in Afatinib (1): ",response1["Afatinib (1)"].notnull().sum())
    print("Number of responses in Afatinib (2): ",response1["Afatinib (2)"].notnull().sum())
    print("We will pick the second one!")
    
    # Dealing with response1
    response1["Refametinib"] = response1["Refametinib (1)"]
    response1["Afatinib"] = response1["Afatinib (2)"]
    
    # Dealing with response3
    response3["Refametinib_true"] = pd.DataFrame(response3["Refametinib"].iloc[:,0])
    response3["Afatinib_true"] = pd.DataFrame(response3["Afatinib"].iloc[:,1])
    response3 = response3.drop(labels = ["Afatinib", "Refametinib"], axis = 1)
    response3 = response3.rename(columns = {"Refametinib_true":"Refametinib", "Afatinib_true":"Afatinib"})
    
    print("Done!")
    
    return response1, response3

response1, response3 = process_double_drugs(response1, response3) #Don't run this cell twice!

Number of responses in Afatinib (1):  634
Number of responses in Afatinib (2):  688
We will pick the second one!


In [237]:
DRUG_NAMES = {"CI-1040": ["MYC_mut","RB1_mut","ERBB2_amp","BRAF_mut","KRAS_mut","NRAS_mut"],
              "PD0325901": ["MYC_mut","RB1_mut","ERBB2_amp","BRAF_mut","KRAS_mut","NRAS_mut"],
              "Refametinib":["MYC_mut","RB1_mut","ERBB2_amp","BRAF_mut","KRAS_mut","NRAS_mut"],
              "VX-11e":["RB1_mut","ERBB2_amp","CCND1_amp","BRAF_mut","KRAS_mut","NRAS_mut"],
              "Afatinib":["KRAS_mut","NRAS_mut","EGFR_amp","ERBB2_amp","FOXP3_del"],
              "Pelitinib":["BRAF_mut","RB1_mut","MAPK1_del","MYC_mut","EGFR_mut","CDKN1B_del"]
             }

In [238]:
def create_drug_dataframe(drug_names, response1, response2, response3, save_df = True):
    
    for drug in drug_names:
        print(drug,"...")
        
        #rep1 = response[response["Description"] == drug].set_index("Description").T
        rep1 = response1.loc[:,[drug]]
        rep2 = response2.loc[:,[drug]] # is it really the same AUC?
        rep3 = response3.loc[:,[drug]]

        Rep = pd.concat([rep1,rep2,rep3])
        Rep = Rep.drop_duplicates()
        Rep = Rep.reset_index(drop = False)
        Rep = Rep.groupby(by='index').mean()
        
        Rep.sort_index().head()

        filtered_matrix = alterations.loc[:,drug_names[drug]].merge(Rep[[drug]], left_index = True, right_index = True).dropna(axis = 0)
        
        if save_df:
            filtered_matrix.to_csv(f'data/Final matrices/{drug}.csv', index_label = "Cell_line")

    print("Done! Here's the final matrix of your last drug:")
    
    return filtered_matrix 

In [239]:
display(create_drug_dataframe(DRUG_NAMES, response1, response2, response3, save_df = True))

CI-1040 ...
PD0325901 ...
Refametinib ...
VX-11e ...
Afatinib ...
Pelitinib ...
Done! Here's the final matrix of your last drug:


Unnamed: 0,BRAF_mut,RB1_mut,MAPK1_del,MYC_mut,EGFR_mut,CDKN1B_del,Pelitinib
22RV1_PROSTATE,1.0,0.0,0.0,0.0,0.0,0.0,0.975606
A204_SOFT_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.885947
A253_SALIVARY_GLAND,0.0,0.0,0.0,0.0,0.0,0.0,0.853298
A427_LUNG,0.0,0.0,0.0,0.0,0.0,0.0,0.942586
A431_SKIN,0.0,0.0,0.0,0.0,0.0,0.0,0.525721
A4FUK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,1.0,0.0,0.0,0.950802
A673_BONE,1.0,0.0,0.0,0.0,0.0,0.0,0.982829
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.078683
BICR78_UPPER_AERODIGESTIVE_TRACT,0.0,0.0,0.0,0.0,0.0,0.0,0.935165
CADOES1_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.875099


We end up with 864 cells. This is a good improvement compared to the first dataset, which had 698 cell populations (with the Bleomyicin drug, not SR-38). 

It appears that an important work should be carried to improve the conversion from GDSC --> CCLE cell names.

# New conversion matrix and CTRP

Trying to increase the size of the dataset by using a new conversion matrix found online. Adding the CTRP drug sensitivity dataset to increase the size of our matrix.

Ok, the conversion matrix is actually the same as the one used above --> did not help. Let's just focus on the CTRP dataset. We will import the CTRP data and use conversion matrices to convert the drug names to "normal" names and the cell names to CCLE (so it can be associated with our mutations).

In [73]:
conversion4 = pd.read_excel("data/Depmap Project/GDSC-CCLE-CTRP_conversion.xlsx")
conversion4 = conversion4[["CCLE name","CTRP master ccl id"]].dropna()
conversion4.head()

Unnamed: 0,CCLE name,CTRP master ccl id
1,5637_URINARY_TRACT,3.0
2,22RV1_PROSTATE,7.0
4,42MGBA_CENTRAL_NERVOUS_SYSTEM,10.0
5,639V_URINARY_TRACT,14.0
7,769P_KIDNEY,16.0


In [74]:
ctrp21 = pd.read_csv("data/CTRP v2.1 [2016]/v21.data.auc_sensitivities.txt", sep = "\t")
#ctrp2 = pd.read_csv("data/Depmap Project/CTRP/v20.data.curves_post_qc.txt", sep = "\t")
ctrp21

Unnamed: 0,experiment_id,area_under_curve,master_cpd_id,master_ccl_id
0,1,14.7820,1788,130
1,2,15.6800,1788,569
2,3,15.4000,1788,682
3,4,14.3880,1788,9
4,5,14.8430,1788,61
5,6,16.5820,1788,62
6,8,16.9680,1788,111
7,9,14.5410,1788,115
8,10,14.3970,1788,119
9,11,14.5020,1788,455


In [75]:
ctrp_compound = pd.read_csv("data/CTRP v2.1 [2016]/v21.meta.per_compound.txt", sep = "\t")
ctrp_compound = ctrp_compound[["master_cpd_id","cpd_name"]]
ctrp_compound.head()

Unnamed: 0,master_cpd_id,cpd_name
0,1788,CIL55
1,3588,BRD4132
2,12877,BRD6340
3,17712,ML006
4,18311,Bax channel blocker


In [76]:
ctrp21 = ctrp21.merge(conversion4, left_on = 'master_ccl_id', right_on = 'CTRP master ccl id').drop(labels = ['master_ccl_id','CTRP master ccl id'], axis = 1)
ctrp21 = ctrp21.merge(ctrp_compound, on = 'master_cpd_id').drop(labels = ['master_cpd_id'], axis = 1)
ctrp21

Unnamed: 0,experiment_id,area_under_curve,CCLE name,cpd_name
0,1,14.7820,CAS1_CENTRAL_NERVOUS_SYSTEM,CIL55
1,6,16.5820,BFTC909_KIDNEY,CIL55
2,11,14.5020,HUCCT1_BILIARY_TRACT,CIL55
3,13,14.7700,RPMI7951_SKIN,CIL55
4,16,13.8750,HDQP1_BREAST,CIL55
5,25,14.7490,BCPAP_THYROID,CIL55
6,35,14.3280,8305C_THYROID,CIL55
7,39,14.7520,KMRC2_KIDNEY,CIL55
8,44,14.8030,A375_SKIN,CIL55
9,421,14.9490,A375_SKIN,CIL55


In [77]:
# Create the final CTRP dataset

ctrp21 = ctrp21.groupby(by = ["CCLE name","cpd_name"]).mean().reset_index(drop = False)
ctrp21 = ctrp21.pivot(index = "CCLE name", columns = "cpd_name", values = "area_under_curve")
ctrp21

cpd_name,16-beta-bromoandrosterone,"1S,3R-RSL-3",3-Cl-AHPC,968,A-804598,AA-COCF3,ABT-199,ABT-737,AC55649,AGK-2,...,trifluoperazine,triptolide,tubastatin A,valdecoxib,vandetanib,veliparib,vincristine,vorapaxar,vorinostat,zebularine
CCLE name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22RV1_PROSTATE,14.565000,9.299700,10.259000,14.376,,12.043000,,12.9650,13.845000,15.266000,...,13.715000,12.347000,13.6230,12.451000,12.847000,13.509000,4.940900,13.320000,10.4180,13.110000
42MGBA_CENTRAL_NERVOUS_SYSTEM,14.917000,6.217400,8.920100,,14.1470,14.543000,15.1720,14.9800,,10.679000,...,14.385000,5.298400,,13.313000,12.770000,14.746000,8.214600,13.306000,12.3190,11.009000
5637_URINARY_TRACT,14.140000,7.763800,9.375100,14.045,,12.344000,,12.7460,13.304000,15.000000,...,13.529000,12.190000,12.0940,12.128000,10.357000,12.513000,6.547800,12.466000,10.8280,13.275000
639V_URINARY_TRACT,15.446000,13.301000,10.052000,15.448,,12.671000,,15.7390,15.679000,14.703000,...,14.521000,13.486000,15.9490,14.640000,13.810000,15.535000,10.170000,16.209000,13.8650,13.772000
769P_KIDNEY,14.252000,7.599100,9.591700,14.915,,13.280000,,13.4200,14.751000,15.958000,...,14.482000,13.046000,12.9690,13.316000,11.697000,14.768000,10.355000,14.516000,12.2340,13.477000
786O_KIDNEY,14.749000,5.981400,10.551000,,14.4980,14.793000,14.6080,12.9600,14.746000,14.633000,...,14.316000,12.378000,,13.490000,12.259000,15.288000,9.090200,14.384000,12.7890,13.009000
8305C_THYROID,15.176000,4.635600,10.687000,14.145,,11.355000,,13.9430,15.000000,14.233000,...,14.151000,11.449000,14.6480,12.665000,12.162000,14.401000,6.431200,14.226000,11.7810,13.777000
8505C_THYROID,13.815000,2.954500,10.243000,14.864,,12.168000,,13.9510,15.194000,14.099000,...,14.246000,13.080000,14.6820,12.928000,13.017000,15.000000,8.219400,14.205000,11.3680,14.443000
8MGBA_CENTRAL_NERVOUS_SYSTEM,14.767000,8.038500,10.804000,,14.3730,15.000000,15.0000,,14.417000,15.253000,...,14.446000,13.741000,,12.350000,14.433000,14.734000,11.185000,14.527000,14.0110,14.662000
A172_CENTRAL_NERVOUS_SYSTEM,14.612000,9.538700,12.221000,,14.2290,15.454000,13.1190,15.0000,15.058000,13.779000,...,14.501000,11.342000,,13.591000,13.062000,14.433000,3.577200,13.198000,12.7020,14.034000


In [78]:
# Information

print("Number of different drugs: ",len(ctrp21.columns.unique()))
print("Number of different cell lines: ", len(ctrp21.index.unique()))
print("Total number of tests: ", ctrp21.notnull().sum(axis = 0).sum())
print("Number of tests and cells per drug compounds:")
response_summary = pd.DataFrame(ctrp21.notnull().sum(axis = 0)).sort_values([0], ascending = False)
response_summary.rename(index = str, columns = {0 : 'Number of unique cell populations'}, inplace = True)
display(response_summary.head())

Number of different drugs:  481
Number of different cell lines:  303
Total number of tests:  120074
Number of tests and cells per drug compounds:


Unnamed: 0_level_0,Number of unique cell populations
cpd_name,Unnamed: 1_level_1
leptomycin B,302
vincristine,301
SNX-2112,300
3-Cl-AHPC,300
SB-225002,300


Now let's check whether we have drugs in common between all of these four datasets. It might be that CTRP has investigated different drugs and so it wouldn't be possible to merge them. 

In [79]:
drugs1 = set([item.lower() for item in response["Description"]])
drugs2 = set([item.lower() for item in response2.columns])
drugs3 = set([item.lower() for item in response3.columns])
drugs4 = set([item.lower() for item in ctrp21.columns])

common = list(drugs2.intersection(drugs1).intersection(drugs3).intersection(drugs4))
common

['tubastatin a',
 'dasatinib',
 'quizartinib',
 'linsitinib',
 'bosutinib',
 'paclitaxel',
 'azd8055',
 'azd7762',
 'temsirolimus',
 'parthenolide',
 'veliparib',
 'osi-027',
 'navitoclax',
 'gemcitabine',
 'ruxolitinib',
 'mk-2206',
 'osi-930',
 'tanespimycin',
 'tretinoin',
 'trametinib',
 'docetaxel',
 'mg-132',
 'zstk474',
 'saracatinib',
 'imatinib',
 'pha-793887',
 'sunitinib',
 'belinostat',
 'pi-103',
 'axitinib',
 'tamoxifen',
 'nilotinib',
 'pik-93',
 'ku-55933',
 'bi-2536',
 'tpca-1',
 'cabozantinib',
 'pevonedistat',
 'tivozanib',
 'serdemetan',
 'sorafenib',
 'nvp-tae684',
 'daporinad',
 'masitinib',
 'entinostat',
 'piperlongumine',
 'sn-38',
 'bexarotene',
 'crizotinib',
 'pac-1',
 'gefitinib',
 'temozolomide',
 'snx-2112',
 'pazopanib',
 'erlotinib',
 'dabrafenib',
 'methotrexate',
 'etoposide',
 'doxorubicin',
 'bms-754807',
 'bms-345541',
 'lapatinib',
 'linifanib',
 'bortezomib',
 'vorinostat']

We notice plenty of drugs in common! It should therefore be possible to push all four datasets in common and maybe end with an even bigger dataset. However, by looking at ctrp21, we see that the AUC value is different from the one of the first three datasets. Initially, the values were all comprised between 0 and 1, whereas in the CTRP data they are all around 15 or so. How can we transform those values so they are all on the same scale?

In [80]:
# Checking which of these drugs are the most common in the datasets

pd.DataFrame(ctrp21.loc[:,common].notnull().sum(axis = 0)).sort_values([0], ascending = False).head() 

#note that this is not going to work well as is because most of the drug names have been lowered for better association

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0_level_0,0
cpd_name,Unnamed: 1_level_1
tanespimycin,300
veliparib,299
piperlongumine,297
dasatinib,297
saracatinib,296


## QAPC: a new winner?

I found this new guy by reading a paper [this one](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5239501/). You can find the dataset [here](http://tanlab.ucdenver.edu/QAPC/).

They tried to aggregate the data coming from all three datasets (CCLE, GDSC, CTRP). It is possible to download it on the website, but only one drug at a time. 

I didn't spend hours looking at which drug to pick so we are going to look at Erlotinib.

In [81]:
erlotinib = pd.read_excel("data/QAPC Project/Erlotinib.xls")
erlotinib.head()

Unnamed: 0,Database,Drug_name,Cell_line,Concentrations_nM,Responses,Minimal_tested_concentration_nM,Maximal_tested_concentration_nM,EC50_minimum_asymptote,EC50_maximum_asymptote,EC50_slope,...,EC50_nM,AUC_EC50,IC50_minimum_asymptote,IC50_maximum_asymptote,IC50_slope,IC50_calculated_nM,IC50_nM,AUC_IC50,EC50_nM_uncap,IC50_nM_uncap
0,CCLE,Erlotinib,1321N1,"2.5,8,25,80,250,800,2530,8000","-8.75,-5.04,-14.7,-3.3,3.9,10,16,26",2.5,8000,-13.644741,99.9999,0.364079,...,8000.0,0.282146,0.0,99.99999,0.741126,29316.915961,8000.0,0.173848,inf,inf
1,CCLE,Erlotinib,22RV1,"2.5,8,25,80,250,800,2530,8000","-4.74,16,-11.8,2.7,-6,9.7,-0.32,15",2.5,8000,0.0001,100.0,4.466694,...,8000.0,0.029489,1e-06,100.0,4.054069,12273.832957,8000.0,0.03186,inf,inf
2,CCLE,Erlotinib,42MGBA,"2.5,8,25,80,250,800,2530,8000","-6.65,-29.2,5.6,-8.33,2.1,-14.7,44,45",2.5,8000,-8.529698,44.998665,18.495501,...,2041.983631,0.366791,0.0,99.99999,1.105826,7259.286677,7259.286677,0.314146,2041.983631,7259.286677
3,CCLE,Erlotinib,5637,"2.5,8,25,80,250,800,2530,8000","-9.58,7,9.1,2.5,5.2,39,49,77",2.5,8000,-0.809706,100.0,0.851203,...,2082.238457,0.576638,0.0,99.99999,0.87039,2134.407865,2134.407865,0.573743,2082.238457,2134.407865
4,CCLE,Erlotinib,639V,8258025080025308000,"8.3,-1.24,10,-4.47,11,-5.74,27",8.0,8000,0.0001,99.9999,12.967876,...,8000.0,0.022432,1e-06,100.0,12.837047,8644.451769,8000.0,0.022644,inf,inf


In [82]:
erlo = erlotinib[["Drug_name","Cell_line","AUC_IC50"]]
erlo.head()

Unnamed: 0,Drug_name,Cell_line,AUC_IC50
0,Erlotinib,1321N1,0.173848
1,Erlotinib,22RV1,0.03186
2,Erlotinib,42MGBA,0.314146
3,Erlotinib,5637,0.573743
4,Erlotinib,639V,0.022644


In [83]:
erlo = erlo.groupby(["Drug_name","Cell_line"]).mean().reset_index(drop=False)
erlo = erlo.pivot(index = "Cell_line", values = "AUC_IC50", columns = "Drug_name")

print("Length of matrix: ", len(erlo.index))
erlo.head()

Length of matrix:  1083


Drug_name,Erlotinib
Cell_line,Unnamed: 1_level_1
1321N1,0.173848
2004,0.323526
22RV1,0.131851
2313287,0.389909
253J,0.701416


In [84]:
alt = alterations.reset_index()
alt["Cell lines"] = alt["index"].apply(lambda a: a.split("_")[0])
alt = alt.drop(labels = ["index"], axis = 1).set_index("Cell lines")

print("Length of matrix: ", len(alt.index))
alt.head()

Length of matrix:  1389


Description,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,HNRNPDL_del,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del
Cell lines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
127399,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22RV1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
erlo_matrix = alt.merge(erlo[['Erlotinib']], left_index = True, right_index = True).dropna(axis = 0)

print("Length of matrix: ", len(erlo_matrix.index))
erlo_matrix.head()

Length of matrix:  973


Unnamed: 0,PLCH2_mut,UBE4B_mut,ADGRB2_mut,ZSCAN20_mut,SZT2_mut,MOB3C_mut,ZFYVE9_mut,ST6GALNAC3_mut,TCHH_mut,HRNR_mut,...,DMTF1_del,PPP4R1_del,CDH1_del,SLC12A6_del,PTBP3_del,KCNE2_del,DGCR2_del,CASP8AP2_del,SCO2_del,Erlotinib
22RV1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.131851
2313287,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389909
253J,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.701416
253JBV,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.522633
42MGBA,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284011


In [88]:
SAVE = True

if SAVE:
    erlo_matrix.to_csv('data/Depmap Project/final_matrix_erlotinib.csv')