In [2]:
import pandas as pd
import numpy as np

# CCLE Proteomics - Protein Levels

In [7]:
# Load the CCLE RPPA protein-level matrix. The first CSV column has no header
# (pandas labels it "Unnamed: 0") and holds the cell-line names, so it is
# given an explicit name for the later merge.
ccle_p = (
    pd.read_csv("/content/CCLE_RPPA_20181003.csv")
    .rename(columns={"Unnamed: 0": "CCLE_Name"})
)
ccle_p

Unnamed: 0,CCLE_Name,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,4E-BP1_pT70,53BP1,A-Raf_pS299_Caution,...,Tuberin_pT1462,VAV1_Caution,VEGFR2,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102
0,DMS53_LUNG,-0.104888,0.060414,0.309068,-0.075506,0.230359,0.198304,-0.030541,0.455889,0.090484,...,-0.099433,-0.486715,-1.147858,0.133876,-0.075812,-0.144388,-1.090303,-2.109324,0.178104,0.246541
1,SW1116_LARGE_INTESTINE,0.358504,-0.180291,-0.041237,-0.286629,-0.877406,-1.026948,-0.462761,-0.011197,0.605330,...,-0.109777,0.349330,0.770148,0.984297,-0.168138,-0.004905,0.189294,-0.283593,0.255972,-0.121134
2,NCIH1694_LUNG,0.028738,0.071902,-0.094847,0.285069,1.321551,0.620703,-0.439484,0.195007,0.036221,...,0.154344,-0.478189,-1.185530,1.273013,-0.240413,0.476633,-1.367465,-2.525695,-0.137880,-0.451282
3,P3HR1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.120039,-0.066802,-0.128007,-0.552081,-0.292428,-1.415935,-0.138858,-0.066122,-0.346564,...,0.040106,5.923830,-3.893832,-2.499188,0.632758,0.025639,-1.189180,-3.056863,0.025997,-0.465205
4,HUT78_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,-0.268997,-0.060281,-0.137881,-0.398729,-0.095622,-0.533905,0.054245,-0.573022,-0.162968,...,-0.466919,5.475880,-0.561973,-0.500953,-0.261494,0.358679,-0.951686,-3.247388,-0.151424,-0.145426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,UO31_KIDNEY,0.043231,0.093158,0.105461,0.011264,-0.264052,-0.079559,-0.000355,-0.308669,-0.025941,...,-0.109095,0.137608,0.618270,-0.541645,0.263695,-0.092934,0.433857,0.327542,0.108921,-0.197684
895,SF268_CENTRAL_NERVOUS_SYSTEM,-0.034597,0.065964,-0.329024,-0.540973,0.492108,0.137051,-0.003109,-0.276884,-0.390067,...,-0.339392,-0.054790,-0.271650,-1.735560,-0.397684,-0.166362,2.550478,3.225039,0.136907,0.426637
896,SF539_CENTRAL_NERVOUS_SYSTEM,-0.250998,0.420490,-0.334213,-0.156368,-0.219208,-0.187704,0.240884,0.352163,0.052992,...,0.094319,0.346651,-0.367292,-1.851276,0.069242,0.165163,-0.174660,-0.811089,0.067923,0.237027
897,SNB75_CENTRAL_NERVOUS_SYSTEM,-0.139833,0.194831,-0.135708,-0.434248,0.208941,-0.071338,0.145042,-0.457499,0.032008,...,0.159241,-0.140213,0.436948,-1.476417,-0.390487,-0.163100,0.407310,0.390911,-0.034470,0.153921


# **Cell Lines Map**
This dataset provides the Broad_ID (DepMap_ID), the cancer cell line name, and the COSMIC_ID, a unique cell-line identifier used for tracking drug response data (it is the key for joining with the GDSC1 dataset later).

In [8]:
# Load the DepMap cell-line annotation table and discard the columns that
# are not needed for the mapping.
unneeded_cols = ['Aliases', 'Sanger ID', 'Subtype Disease', 'Gender', 'Source']
ccle_info = (
    pd.read_csv("/content/DepMap-2018q3-celllines.csv")
    .drop(columns=unneeded_cols)
)

ccle_info
# 1673 rows × 4 columns


Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,Primary Disease
0,ACH-000557,AML193_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,Leukemia
1,ACH-001000,1321N1_CENTRAL_NERVOUS_SYSTEM,,Brain Cancer
2,ACH-000198,EOL1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,906856.0,Leukemia
3,ACH-000956,22RV1_PROSTATE,924100.0,Prostate Cancer
4,ACH-000948,2313287_STOMACH,910924.0,Gastric Cancer
...,...,...,...,...
1668,ACH-002393,CROAP3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,Lymphoma
1669,ACH-002394,GEO_LARGE_INTESTINE,,Colon Cancer
1670,ACH-002395,HUH6CLONE5_LIVER,,Liver Cancer
1671,ACH-002396,SARC9371_BONE,,Bone Cancer


# **Perform the mapping**

In [9]:
# Count how many cell lines in the mapping file also appear in the
# CCLE proteomics table, matching on CCLE_Name.
proteomics_names = ccle_p['CCLE_Name']
common_values = ccle_info['CCLE_Name'].isin(proteomics_names)  # boolean mask over ccle_info rows
match_count = common_values.sum()  # each True counts as 1
print(f"Number of matches: {match_count}")

# 882 / 899 rows can be matched (98%)

Number of matches: 882


In [13]:
# Attach the cell-line annotations to the protein levels, keyed on CCLE_Name.
# how='right' keeps every row of ccle_info; annotation rows with no protein
# data are filled with NaN and then removed by dropna, which discards every
# row that has a missing value in ANY column.
ccle_p_merged = (
    ccle_p
    .merge(ccle_info, how='right', on='CCLE_Name')
    .dropna()
)

# Merged proteomics + cell-line annotation frame
ccle_p_merged

#614 x 218 columns


Unnamed: 0,CCLE_Name,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,4E-BP1_pT70,53BP1,A-Raf_pS299_Caution,...,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102,Broad_ID,COSMIC_ID,Primary Disease
2,EOL1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.080966,0.064610,0.274801,1.166335,0.407919,1.098686,0.120650,-1.263650,0.112731,...,-0.680183,-0.133874,-0.090526,-1.059197,-2.598467,-0.240786,0.421736,ACH-000198,906856.0,Leukemia
3,22RV1_PROSTATE,0.108839,0.289584,0.045012,-0.396465,-0.390874,-0.753768,-0.078452,0.407514,-0.079914,...,2.422486,0.128371,0.574750,-0.699541,-1.009748,0.222496,0.090848,ACH-000956,924100.0,Prostate Cancer
4,2313287_STOMACH,0.133986,0.024192,0.676852,-0.471224,-0.684351,-0.904702,-0.123291,-3.971552,0.037784,...,1.138665,0.569062,-0.138691,0.570754,1.104430,-0.138848,-0.542029,ACH-000948,910924.0,Gastric Cancer
7,42MGBA_CENTRAL_NERVOUS_SYSTEM,0.426952,0.043067,0.223144,0.189454,0.346910,0.398910,0.274279,-0.763783,-0.178467,...,-1.727513,-0.017163,0.000119,0.920747,1.573710,-0.082276,-0.416194,ACH-000323,687561.0,Brain Cancer
9,5637_URINARY_TRACT,0.098320,0.468521,-0.472370,-0.124265,0.005014,-0.280498,-0.241964,0.662198,1.225230,...,4.397344,-0.151337,0.327659,0.701985,-0.503184,0.143011,0.011590,ACH-000905,687452.0,Bladder Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1347,YAPC_PANCREAS,0.102917,-0.167794,0.426968,-0.306847,-0.333987,-0.697848,-0.420424,0.145249,0.046873,...,0.249305,-0.130076,-0.315953,0.473735,1.288702,-0.157701,-0.079704,ACH-000332,909904.0,Pancreatic Cancer
1352,YH13_CENTRAL_NERVOUS_SYSTEM,-0.272537,-0.165541,-0.597877,-0.608083,-0.088654,-0.550212,-0.011799,0.236974,-0.441684,...,-1.819498,0.140461,0.309842,0.464327,0.835997,0.206393,0.390297,ACH-000469,909905.0,Brain Cancer
1353,YKG1_CENTRAL_NERVOUS_SYSTEM,-0.292245,0.105827,-0.336982,-0.207507,0.264828,0.272116,0.285140,0.482773,-0.261400,...,-2.123078,0.156974,0.202440,-0.488378,-0.146093,0.033230,0.322731,ACH-000570,687592.0,Brain Cancer
1356,ZR7530_BREAST,-0.040076,-0.265286,1.017082,-0.666393,-0.367123,-0.784569,-0.575248,-0.494944,0.066445,...,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,ACH-000828,909907.0,Breast Cancer


In [37]:
# Put the identifier/annotation columns first, in this order, and keep every
# remaining (protein) column in its existing relative order.
# A single label-based selection replaces four copy-pasted drop/insert passes
# and avoids mutating the frame in place; .loc raises KeyError if any of the
# listed columns is missing.
front_cols = ['Broad_ID', 'CCLE_Name', 'COSMIC_ID', 'Primary Disease']  # COSMIC_ID -> join with GDSC data later
other_cols = [col for col in ccle_p_merged.columns if col not in front_cols]
ccle_p_merged = ccle_p_merged.loc[:, front_cols + other_cols]

ccle_p_merged

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,Primary Disease,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,...,Tuberin_pT1462,VAV1_Caution,VEGFR2,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102
2,ACH-000198,EOL1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,906856.0,Leukemia,0.080966,0.064610,0.274801,1.166335,0.407919,1.098686,...,0.202414,5.917212,-0.637708,-0.680183,-0.133874,-0.090526,-1.059197,-2.598467,-0.240786,0.421736
3,ACH-000956,22RV1_PROSTATE,924100.0,Prostate Cancer,0.108839,0.289584,0.045012,-0.396465,-0.390874,-0.753768,...,0.012693,0.118457,-0.831351,2.422486,0.128371,0.574750,-0.699541,-1.009748,0.222496,0.090848
4,ACH-000948,2313287_STOMACH,910924.0,Gastric Cancer,0.133986,0.024192,0.676852,-0.471224,-0.684351,-0.904702,...,-0.127784,-0.121144,0.330006,1.138665,0.569062,-0.138691,0.570754,1.104430,-0.138848,-0.542029
7,ACH-000323,42MGBA_CENTRAL_NERVOUS_SYSTEM,687561.0,Brain Cancer,0.426952,0.043067,0.223144,0.189454,0.346910,0.398910,...,-0.101686,0.068242,-1.559875,-1.727513,-0.017163,0.000119,0.920747,1.573710,-0.082276,-0.416194
9,ACH-000905,5637_URINARY_TRACT,687452.0,Bladder Cancer,0.098320,0.468521,-0.472370,-0.124265,0.005014,-0.280498,...,-0.214886,0.480196,0.614684,4.397344,-0.151337,0.327659,0.701985,-0.503184,0.143011,0.011590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1347,ACH-000332,YAPC_PANCREAS,909904.0,Pancreatic Cancer,0.102917,-0.167794,0.426968,-0.306847,-0.333987,-0.697848,...,-0.200756,3.597559,-0.059164,0.249305,-0.130076,-0.315953,0.473735,1.288702,-0.157701,-0.079704
1352,ACH-000469,YH13_CENTRAL_NERVOUS_SYSTEM,909905.0,Brain Cancer,-0.272537,-0.165541,-0.597877,-0.608083,-0.088654,-0.550212,...,-0.306649,-0.183262,-0.435001,-1.819498,0.140461,0.309842,0.464327,0.835997,0.206393,0.390297
1353,ACH-000570,YKG1_CENTRAL_NERVOUS_SYSTEM,687592.0,Brain Cancer,-0.292245,0.105827,-0.336982,-0.207507,0.264828,0.272116,...,0.960009,-0.259248,-0.944004,-2.123078,0.156974,0.202440,-0.488378,-0.146093,0.033230,0.322731
1356,ACH-000828,ZR7530_BREAST,909907.0,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,-0.367123,-0.784569,...,0.727708,0.256894,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006


In [39]:
# Rename 'Primary Disease' to 'cancer_type' and store COSMIC_ID as int64.
# astype converts the whole column in one vectorised step instead of calling
# np.int64 on each value through apply; like the per-value call, it raises if
# the column still contains NaN.
ccle_p_merged = (
    ccle_p_merged
    .rename(columns={"Primary Disease": "cancer_type"})
    .astype({"COSMIC_ID": np.int64})
)

ccle_p_merged

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,...,Tuberin_pT1462,VAV1_Caution,VEGFR2,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102
2,ACH-000198,EOL1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,906856,Leukemia,0.080966,0.064610,0.274801,1.166335,0.407919,1.098686,...,0.202414,5.917212,-0.637708,-0.680183,-0.133874,-0.090526,-1.059197,-2.598467,-0.240786,0.421736
3,ACH-000956,22RV1_PROSTATE,924100,Prostate Cancer,0.108839,0.289584,0.045012,-0.396465,-0.390874,-0.753768,...,0.012693,0.118457,-0.831351,2.422486,0.128371,0.574750,-0.699541,-1.009748,0.222496,0.090848
4,ACH-000948,2313287_STOMACH,910924,Gastric Cancer,0.133986,0.024192,0.676852,-0.471224,-0.684351,-0.904702,...,-0.127784,-0.121144,0.330006,1.138665,0.569062,-0.138691,0.570754,1.104430,-0.138848,-0.542029
7,ACH-000323,42MGBA_CENTRAL_NERVOUS_SYSTEM,687561,Brain Cancer,0.426952,0.043067,0.223144,0.189454,0.346910,0.398910,...,-0.101686,0.068242,-1.559875,-1.727513,-0.017163,0.000119,0.920747,1.573710,-0.082276,-0.416194
9,ACH-000905,5637_URINARY_TRACT,687452,Bladder Cancer,0.098320,0.468521,-0.472370,-0.124265,0.005014,-0.280498,...,-0.214886,0.480196,0.614684,4.397344,-0.151337,0.327659,0.701985,-0.503184,0.143011,0.011590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1347,ACH-000332,YAPC_PANCREAS,909904,Pancreatic Cancer,0.102917,-0.167794,0.426968,-0.306847,-0.333987,-0.697848,...,-0.200756,3.597559,-0.059164,0.249305,-0.130076,-0.315953,0.473735,1.288702,-0.157701,-0.079704
1352,ACH-000469,YH13_CENTRAL_NERVOUS_SYSTEM,909905,Brain Cancer,-0.272537,-0.165541,-0.597877,-0.608083,-0.088654,-0.550212,...,-0.306649,-0.183262,-0.435001,-1.819498,0.140461,0.309842,0.464327,0.835997,0.206393,0.390297
1353,ACH-000570,YKG1_CENTRAL_NERVOUS_SYSTEM,687592,Brain Cancer,-0.292245,0.105827,-0.336982,-0.207507,0.264828,0.272116,...,0.960009,-0.259248,-0.944004,-2.123078,0.156974,0.202440,-0.488378,-0.146093,0.033230,0.322731
1356,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,-0.367123,-0.784569,...,0.727708,0.256894,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006


In [41]:
# Restrict the merged table to the breast-cancer cell lines
is_breast = ccle_p_merged['cancer_type'].eq('Breast Cancer')
ccle_breast_cancer_p = ccle_p_merged[is_breast]

ccle_breast_cancer_p

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,...,Tuberin_pT1462,VAV1_Caution,VEGFR2,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102
52,ACH-000248,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,0.044849,0.562434,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833
84,ACH-000536,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,-0.189922,-0.245781,...,0.916645,0.234112,0.490172,1.683944,-0.360089,0.293556,0.420244,0.128395,-0.391478,-0.055709
85,ACH-000927,BT474_BREAST,946359,Breast Cancer,0.776948,0.133628,0.258031,-0.6477,-0.480913,-0.211219,...,-0.065356,0.167518,-1.096166,1.737479,-0.11259,-0.027335,0.722628,1.913122,-0.175997,0.098205
86,ACH-000818,BT483_BREAST,949093,Breast Cancer,0.16083,0.014679,0.714691,1.154134,0.721715,1.552618,...,0.177582,0.527053,0.418898,-0.105524,-0.027436,0.496821,-0.258944,-1.943863,-0.120873,-0.667962
109,ACH-000212,CAL120_BREAST,906826,Breast Cancer,-0.018208,0.041425,-0.267999,1.616594,2.239583,1.927073,...,0.31184,-0.432027,0.670415,-1.327077,-0.014489,-0.053325,0.62591,1.021331,0.197539,0.775375
111,ACH-000902,CAL148_BREAST,924106,Breast Cancer,0.139363,0.056533,-0.226143,-0.026084,0.209709,0.131478,...,0.036881,-0.045182,0.060924,1.039848,-0.072051,0.473225,-0.498352,-1.470915,-0.408396,-0.455894
115,ACH-000856,CAL51_BREAST,910927,Breast Cancer,-0.01147,-0.159283,-0.628657,-0.193565,0.142774,-0.211827,...,-0.108068,-0.674662,0.607445,-0.078239,-0.354872,0.21211,0.617087,0.662649,-0.204806,0.297277
120,ACH-000857,CAL851_BREAST,910852,Breast Cancer,-0.173878,-0.111235,-0.131147,-0.456831,0.427919,0.238743,...,-0.343061,0.275918,0.837948,-0.025885,-0.430762,-0.262779,0.231815,0.918823,0.319674,0.206713
124,ACH-000783,CAMA1_BREAST,946382,Breast Cancer,0.395372,0.059579,0.529464,1.078023,1.010414,1.211523,...,0.747527,0.200036,0.513535,5.250939,-0.167643,0.506204,-0.143612,-0.19742,0.094575,-0.41106
234,ACH-000258,DU4475_BREAST,906844,Breast Cancer,-0.1796,0.13904,0.215091,0.324853,0.077344,-0.238779,...,-0.40891,4.003459,-0.634562,3.856702,-0.32643,-0.395935,-0.904689,-3.45847,-0.251527,0.013273


In [42]:
# Renumber the rows 0..n-1 after filtering; drop=True discards the old index
# labels instead of keeping them as a column. The result is rebound to the
# name (no inplace=True) so the boolean-mask slice is not mutated in place.
ccle_breast_cancer_p = ccle_breast_cancer_p.reset_index(drop=True)

ccle_breast_cancer_p

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,...,Tuberin_pT1462,VAV1_Caution,VEGFR2,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102
0,ACH-000248,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,0.044849,0.562434,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833
1,ACH-000536,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,-0.189922,-0.245781,...,0.916645,0.234112,0.490172,1.683944,-0.360089,0.293556,0.420244,0.128395,-0.391478,-0.055709
2,ACH-000927,BT474_BREAST,946359,Breast Cancer,0.776948,0.133628,0.258031,-0.6477,-0.480913,-0.211219,...,-0.065356,0.167518,-1.096166,1.737479,-0.11259,-0.027335,0.722628,1.913122,-0.175997,0.098205
3,ACH-000818,BT483_BREAST,949093,Breast Cancer,0.16083,0.014679,0.714691,1.154134,0.721715,1.552618,...,0.177582,0.527053,0.418898,-0.105524,-0.027436,0.496821,-0.258944,-1.943863,-0.120873,-0.667962
4,ACH-000212,CAL120_BREAST,906826,Breast Cancer,-0.018208,0.041425,-0.267999,1.616594,2.239583,1.927073,...,0.31184,-0.432027,0.670415,-1.327077,-0.014489,-0.053325,0.62591,1.021331,0.197539,0.775375
5,ACH-000902,CAL148_BREAST,924106,Breast Cancer,0.139363,0.056533,-0.226143,-0.026084,0.209709,0.131478,...,0.036881,-0.045182,0.060924,1.039848,-0.072051,0.473225,-0.498352,-1.470915,-0.408396,-0.455894
6,ACH-000856,CAL51_BREAST,910927,Breast Cancer,-0.01147,-0.159283,-0.628657,-0.193565,0.142774,-0.211827,...,-0.108068,-0.674662,0.607445,-0.078239,-0.354872,0.21211,0.617087,0.662649,-0.204806,0.297277
7,ACH-000857,CAL851_BREAST,910852,Breast Cancer,-0.173878,-0.111235,-0.131147,-0.456831,0.427919,0.238743,...,-0.343061,0.275918,0.837948,-0.025885,-0.430762,-0.262779,0.231815,0.918823,0.319674,0.206713
8,ACH-000783,CAMA1_BREAST,946382,Breast Cancer,0.395372,0.059579,0.529464,1.078023,1.010414,1.211523,...,0.747527,0.200036,0.513535,5.250939,-0.167643,0.506204,-0.143612,-0.19742,0.094575,-0.41106
9,ACH-000258,DU4475_BREAST,906844,Breast Cancer,-0.1796,0.13904,0.215091,0.324853,0.077344,-0.238779,...,-0.40891,4.003459,-0.634562,3.856702,-0.32643,-0.395935,-0.904689,-3.45847,-0.251527,0.013273


In [43]:
# Check the single-omics dataset for repeated cell lines: if
# nunique == count == size, every Broad_ID is non-null and unique.
# Only the last expression of a cell is displayed, so the first two figures
# are printed explicitly instead of being computed and silently discarded.
broad_ids = ccle_breast_cancer_p['Broad_ID']
print(f"unique Broad_IDs: {broad_ids.nunique()}, non-null Broad_IDs: {broad_ids.count()}")
broad_ids.size

42

# **GDSC1_IC50 Dataset JOIN**

Drug response
The Genomics of Drug Sensitivity in Cancer (GDSC) dataset provides IC50 values for various drugs across multiple cancer cell lines. IC50 (half-maximal inhibitory concentration) is a measure of how much of a drug is needed to inhibit cell growth by 50%. These data help researchers compare how sensitive different cell lines are to each drug.

GDSC 1 and GDSC 2
GDSC1 is the first release of the GDSC dataset; it covers more drugs and more cell lines than GDSC2.

In [44]:
# GDSC1 fitted dose-response table (drug response).
# The DATASET column is removed right after loading: this file holds GDSC1 data only.
gdsc1 = (
    pd.read_csv("/content/GDSC1_fitted_dose_response_27Oct23.csv")
    .drop(columns=['DATASET'])
)

In [45]:
# Display the GDSC1 dose-response table (DATASET column already removed)
gdsc1
# columns of interest: cosmic id, cell line name, drug name

Unnamed: 0,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.966813,0.985678,0.026081,1.299144
1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.692090,0.972690,0.110059,0.156076
2,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.477990,0.944459,0.087019,-0.035912
3,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.033564,0.950758,0.016290,-0.434437
4,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.966007,0.954778,0.180255,0.401702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333156,342,15911377,1659823,SNU-1040,SIDM00217,COREAD,1531,I-CBP112,"EP300, CBP",Chromatin histone acetylation,1005,Y,0.039063,10.0,5.085294,0.972251,0.040661,0.860626
333157,342,15912122,1660035,SNU-61,SIDM00194,COREAD,1531,I-CBP112,"EP300, CBP",Chromatin histone acetylation,1005,Y,0.039063,10.0,5.725399,0.976109,0.045453,1.785602
333158,342,15912431,1660036,SNU-81,SIDM00193,COREAD,1531,I-CBP112,"EP300, CBP",Chromatin histone acetylation,1005,Y,0.039063,10.0,4.930753,0.970851,0.038612,0.637308
333159,342,15912739,1674021,SNU-C5,SIDM00498,COREAD,1531,I-CBP112,"EP300, CBP",Chromatin histone acetylation,1005,Y,0.039063,10.0,4.551784,0.972330,0.042649,0.089683


In [48]:
# Trim the table to the columns needed downstream
# (all rows are kept; only columns are discarded).
keep_cols = ['DRUG_ID', 'COSMIC_ID', 'DRUG_NAME', 'LN_IC50']
gdsc1 = gdsc1[keep_cols]

# LN_IC50 is going to be the target feature

In [49]:
# Display the trimmed drug-response table (DRUG_ID, COSMIC_ID, DRUG_NAME, LN_IC50)
gdsc1

Unnamed: 0,DRUG_ID,COSMIC_ID,DRUG_NAME,LN_IC50
0,1,684057,Erlotinib,3.966813
1,1,684059,Erlotinib,2.692090
2,1,684062,Erlotinib,2.477990
3,1,684072,Erlotinib,2.033564
4,1,687448,Erlotinib,2.966007
...,...,...,...,...
333156,1531,1659823,I-CBP112,5.085294
333157,1531,1660035,I-CBP112,5.725399
333158,1531,1660036,I-CBP112,4.930753
333159,1531,1674021,I-CBP112,4.551784


# **Dataset explanation**
There are multiple drug responses for the same cancer cell line, and a single drug may be tested across multiple cell lines.

There are 402 distinct DRUG_IDs (378 distinct drug names), and each drug was tested against multiple cancer cell lines. E.g., DRUG_ID 1 (Erlotinib) was tested against 393 cancer cell lines.

In [50]:
# no. of drugs tested, counted as distinct DRUG_ID values
gdsc1['DRUG_ID'].nunique()

402

In [52]:
# no. of distinct drug names; compare with the DRUG_ID count to see whether
# names and IDs map one-to-one
gdsc1['DRUG_NAME'].nunique()

378

In [53]:
# Number of cancer cell lines the GDSC 1 data represents (distinct COSMIC_ID values)
gdsc1['COSMIC_ID'].nunique()

970

In [54]:
# No. of distinct COSMIC_ID values (cancer cell lines tested) for each DRUG_ID
gdsc1.groupby('DRUG_ID')['COSMIC_ID'].nunique()

Unnamed: 0_level_0,COSMIC_ID
DRUG_ID,Unnamed: 1_level_1
1,393
3,357
5,396
6,405
9,398
...,...
1526,861
1527,868
1529,692
1530,834


In [55]:
# Some drug names are associated with more than one DRUG_ID; list those names.
# NOTE(review): the reason is not visible in this table - possibly the same
# compound was screened more than once under separate IDs; confirm against
# the GDSC drug annotation.
ids_per_name = gdsc1.groupby('DRUG_NAME')['DRUG_ID'].nunique()
ids_per_name[ids_per_name > 1]

Unnamed: 0_level_0,DRUG_ID
DRUG_NAME,Unnamed: 1_level_1
AKT inhibitor VIII,2
AZD4547,2
AZD6482,2
AZD7762,2
Afatinib,2
Avagacestat,2
BMS-536924,2
Bicalutamide,2
CHIR-99021,2
Cisplatin,2


# **Merging proteomics breast cancer dataset with GDSC 1**

In [67]:
# Attach the GDSC1 drug responses to the breast-cancer proteomics table, keyed on COSMIC_ID.
# how='left' keeps every breast-cancer cell line, including any with no GDSC1 record.
proteo_gdsc1 = ccle_breast_cancer_p.merge(gdsc1, how='left', on='COSMIC_ID')

# Each row is one (breast cancer cell line, drug) pair: the cell line's protein levels + the drug response (LN_IC50)


In [68]:
# 3 columns added by the merge (DRUG_ID, DRUG_NAME, LN_IC50); COSMIC_ID is the shared join key

proteo_gdsc1

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,...,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102,DRUG_ID,DRUG_NAME,LN_IC50
0,ACH-000248,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,133,Doxorubicin,-2.020833
1,ACH-000248,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,134,Etoposide,1.047349
2,ACH-000248,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,135,Gemcitabine,-3.499537
3,ACH-000248,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,136,Mitomycin-C,-0.633531
4,ACH-000248,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,140,Vinorelbine,-5.551392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14127,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,-0.367123,-0.784569,...,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,1490,SN-38,-0.692983
14128,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,-0.367123,-0.784569,...,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,1494,SN-38,-0.448730
14129,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,-0.367123,-0.784569,...,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,1495,Olaparib,5.755940
14130,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,-0.367123,-0.784569,...,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,1496,Cisplatin,4.176769


In [69]:
# There will be multiple drug responses per breast cancer cell line:
# count the distinct DRUG_ID values recorded for each Broad_ID
proteo_gdsc1.groupby('Broad_ID')['DRUG_ID'].nunique()

Unnamed: 0_level_0,DRUG_ID
Broad_ID,Unnamed: 1_level_1
ACH-000019,345
ACH-000111,384
ACH-000117,352
ACH-000147,345
ACH-000148,352
ACH-000196,261
ACH-000212,347
ACH-000223,354
ACH-000248,324
ACH-000258,383


In [70]:
# Drop every row that contains a missing value, then report how many rows remain
proteo_gdsc1 = proteo_gdsc1.dropna()
proteo_gdsc1.shape[0]

# No nulls in the dataset


14132

In [71]:
# Place DRUG_ID and DRUG_NAME at column positions 1 and 2 (directly after the
# first column), leaving all other columns in their existing relative order.
# One label-based selection replaces the repeated in-place drop/insert passes;
# .loc raises KeyError if either drug column is missing.
drug_cols = ['DRUG_ID', 'DRUG_NAME']
remaining_cols = [col for col in proteo_gdsc1.columns if col not in drug_cols]
proteo_gdsc1 = proteo_gdsc1.loc[:, remaining_cols[:1] + drug_cols + remaining_cols[1:]]

In [72]:
# final proteomics + drug response dataset
proteo_gdsc1

# 14,132 rows correct


Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,...,VAV1_Caution,VEGFR2,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102,LN_IC50
0,ACH-000248,133,Doxorubicin,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0.562434,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,-2.020833
1,ACH-000248,134,Etoposide,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0.562434,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,1.047349
2,ACH-000248,135,Gemcitabine,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0.562434,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,-3.499537
3,ACH-000248,136,Mitomycin-C,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0.562434,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,-0.633531
4,ACH-000248,140,Vinorelbine,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0.562434,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,-5.551392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14127,ACH-000828,1490,SN-38,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0.256894,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,-0.692983
14128,ACH-000828,1494,SN-38,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0.256894,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,-0.448730
14129,ACH-000828,1495,Olaparib,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0.256894,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,5.755940
14130,ACH-000828,1496,Cisplatin,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0.256894,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,4.176769


# **Drug info dataset (Join DRUG_ID with PubCHEM)**
# ISOSMILES
Retrieve only GDSC1

In [73]:
# Load the drug annotation table and preview its first rows
drug_info = pd.read_csv("/content/Drug_Info.csv")
drug_info.head()

Unnamed: 0,Drug Id,Name,Synonyms,Targets,Target pathway,PubCHEM,Datasets,number of cell lines,Screening site
0,1242,(5Z)-7-Oxozeaenol,"5Z-7-Oxozeaenol, LL-Z1640-2",TAK1,"Other, kinases",9863776.0,GDSC1,899,SANGER
1,1824,123138,,,Unclassified,,GDSC2,717,SANGER
2,1820,123829,,,Unclassified,,GDSC2,717,SANGER
3,1836,150412,,,Unclassified,,GDSC2,717,SANGER
4,179,5-Fluorouracil,5-FU,Antimetabolite (DNA & RNA),Other,3385.0,GDSC1,907,MGH


In [74]:
# Standardise the two column names used for the join.
# Note the leading space in " PubCHEM" and " Datasets": these labels are
# referenced exactly as they appear in the loaded frame.
drug_info = drug_info.rename(columns={"Drug Id": "DRUG_ID", " PubCHEM": "PubCHEM"})

# Keep GDSC 1 drugs only, and just the two columns the JOIN needs
is_gdsc1 = drug_info[" Datasets"] == 'GDSC1'
drug_info = drug_info.loc[is_gdsc1, ['DRUG_ID', 'PubCHEM']]

# Keep only rows whose PubCHEM entry consists purely of digits
has_numeric_id = drug_info['PubCHEM'].astype(str).str.isdigit()
drug_info = drug_info[has_numeric_id]

drug_info.head()

Unnamed: 0,DRUG_ID,PubCHEM
0,1242,9863776
4,179,3385
20,86,10172943
21,55,9549184
29,1001,65110


# **Merge with the proteomic drug response**

In [75]:
# Attach the PubCHEM identifier to every (cell line, drug) row via DRUG_ID.
# The left join keeps rows whose drug has no entry in drug_info (PubCHEM is
# NaN for them); dropna then discards every row with a missing value.
# Note: dropna does not remove duplicate rows.
proteo_gdsc1 = proteo_gdsc1.merge(drug_info, how='left', on='DRUG_ID').dropna()

proteo_gdsc1

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,...,VEGFR2,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102,LN_IC50,PubCHEM
0,ACH-000248,133,Doxorubicin,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,-2.020833,31703
1,ACH-000248,134,Etoposide,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,1.047349,36462
2,ACH-000248,135,Gemcitabine,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,-3.499537,60750
3,ACH-000248,136,Mitomycin-C,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,-0.633531,5746
4,ACH-000248,140,Vinorelbine,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,-0.196016,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,-5.551392,5311497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14084,ACH-000828,1377,Afatinib,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,-0.335696,10184653
14085,ACH-000828,1378,Bleomycin (50 uM),ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,6.689241,5460769
14128,ACH-000828,1494,SN-38,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,-0.448730,104842
14129,ACH-000828,1495,Olaparib,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0.265081,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,5.755940,23725625


In [76]:
# PubCHEM is str data type: tally the Python type of every value in the column to confirm
proteo_gdsc1["PubCHEM"].apply(type).value_counts()

Unnamed: 0_level_0,count
PubCHEM,Unnamed: 1_level_1
<class 'str'>,8078


# **PubCHEM data**
Each PubCHEM ID represents a specific drug compound that was used

# **ISOSMILES**
Isomeric SMILES: a text encoding of each drug's molecular structure. Structural features can then be related to how the drug interacts with specific genes.

Combining with multi-omics helps to better understand drug response mechanisms.

E.g., if two drugs have similar ISOSMILES representations but one is more effective in breast cancer than the other, the structural differences may explain why.

In [87]:
# Load the PubChem compound table
drug_data = pd.read_csv("/content/PubChemCompound.csv")

In [88]:
# Display the PubChem compound table
drug_data

Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,...,gpidcnt,gpfamilycnt,meshheadings,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,annotation
0,1,Acetyl-DL-carnitine,Acetyl-DL-carnitine|acetylcarnitine|DL-Acetylc...,203.24,C9H17NO4,66.4,214.0,0.4,14,0,...,3332,1232,Acetylcarnitine,Interactions and Pathways|Chemical and Physica...,10,,20050623,10X CHEM|3WAY PHARM INC|A2B Chem|AA BLOCKS|Aba...,Chemical Vendors|Curation Efforts|Governmental...,D002491 - Central Nervous System Agents > D018...
1,4,1-Amino-2-propanol,1-Aminopropan-2-ol|78-96-6|1-AMINO-2-PROPANOL|...,75.11,C3H9NO,46.3,22.9,-1.0,5,2,...,75460,31266,,Biological Test Results|Interactions and Pathw...,15,155|157|161|165|167|175|1188|23443|158688|6516...,20050326,001Chemical|10X CHEM|3B Scientific (Wuhan) Cor...,Chemical Vendors|Curation Efforts|Governmental...,
2,5,3-Amino-2-oxopropyl phosphate,3-Amino-2-oxopropyl phosphate|3-amino-2-oxopro...,169.07,C3H8NO5P,110.0,162.0,-5.0,10,3,...,7,4,,Interactions and Pathways|Classification|Liter...,5,,20050601,AAA Chemistry|ABI Chem|BenchChem|BIND|BioCyc|C...,Chemical Vendors|Curation Efforts|Governmental...,
3,6,Dinitrochlorobenzene,"1-chloro-2,4-dinitrobenzene|2,4-Dinitrochlorob...",202.55,C6H3ClN2O4,91.6,224.0,2.3,13,0,...,12139,4796,Dinitrochlorobenzene,Biological Test Results|Interactions and Pathw...,17,155|157|161|165|167|175|179|192|220|300|302|11...,20050326,10X CHEM|3B Scientific (Wuhan) Corp|3WAY PHARM...,Chemical Vendors|Curation Efforts|Governmental...,C308 - Immunotherapeutic Agent > C2139 - Immun...
4,7,9-Ethyladenine,9-Ethyladenine|2715-68-6|9-ethyl-9h-purin-6-am...,163.18,C7H9N5,69.6,162.0,0.2,12,1,...,284,138,,Biological Test Results|Interactions and Pathw...,9,22829|32339|33597|155650|155653|155654|156497|...,20050326,001Chemical|10X CHEM|4C Pharma Scientific Inc|...,Chemical Vendors|Curation Efforts|Governmental...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1061,2431,Bretylium,Bretylium|59-41-6|Bretylium tolsylate|Bretyliu...,243.16,C11H17BrN+,0.0,156.0,2.8,13,0,...,6155,1541,,Biological Test Results|Interactions and Pathw...,13,1332|2062|496817|496818|496819|496820|496821|4...,20050325,001Chemical|10X CHEM|3B Scientific (Wuhan) Cor...,Chemical Vendors|Curation Efforts|Governmental...,C78272 - Agent Affecting Nervous System > C297...
1062,2432,"2-[(12-Hydroxy-1,3,11,24,31,41,44-heptamethyl-...","2-[(12-Hydroxy-1,3,11,24,31,41,44-heptamethyl-...",895.10,C50H70O14,156.0,1940.0,3.7,64,1,...,0,0,,Interactions and Pathways|Literature|Pharmacol...,5,,20050325,ABI Chem|Alfa Chemistry|BenchChem|Biocore|BOC ...,Chemical Vendors|Curation Efforts|Legacy Depos...,
1063,2435,Brimonidine,"brimonidine|59803-98-4|UK 14,304|Bromoxidine|5...",292.13,C11H10BrN5,62.2,308.0,0.6,17,2,...,13678,2697,Brimonidine Tartrate,Biological Test Results|Interactions and Pathw...,16,155|157|161|165|167|175|200|206|212|220|256|26...,20050325,001Chemical|10X CHEM|3B Scientific (Wuhan) Cor...,Chemical Vendors|Curation Efforts|Governmental...,C78272 - Agent Affecting Nervous System > C297...
1064,2438,3-(1-methylpiperidin-4-yl)-1H-indol-5-ol,57477-39-1|BRL 54443|3-(1-methylpiperidin-4-yl...,230.31,C14H18N2O,39.3,263.0,1.5,17,2,...,69,20,,Biological Test Results|Interactions and Pathw...,7,357|410|411|444|445|446|447|448|450|451|526|53...,20050325,001Chemical|10X CHEM|4C Pharma Scientific Inc|...,Chemical Vendors|Curation Efforts|Governmental...,




In [111]:
# Reduce the PubChem table to the two columns needed for the merge below:
# the compound ID (renamed to PubCHEM to match proteo_gdsc1) and its SMILES string.
# The ID header is matched both as " cid" (leading space, the spelling this cell
# originally targeted) and as "cid"; rename() ignores whichever key is absent.
# Built as one chain so the dtype conversion never assigns into a column
# selection, which is what triggers pandas' SettingWithCopyWarning.
drug_data = (
    drug_data
    .rename(columns={" cid": "PubCHEM", "cid": "PubCHEM"})
    .loc[:, ['PubCHEM', 'smiles']]
    # PubCHEM is stored as str in proteo_gdsc1, so the join key must be str here too
    .assign(PubCHEM=lambda frame: frame['PubCHEM'].astype(str))
)

drug_data.head()

Unnamed: 0,PubCHEM,smiles
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,4,CC(CN)O
2,5,C(C(=O)COP(=O)(O)O)N
3,6,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
4,7,CCN1C=NC2=C(N=CN=C21)N


# **Merge with proteomics drug response data**

In [112]:
# Attach each compound's SMILES string with a left join on the PubCHEM key,
# then discard every row that still has a missing value in any column
# (this includes rows whose PubCHEM ID found no match in drug_data)
# and renumber the index from 0.
proteo_gdsc1_iso = (
    proteo_gdsc1
    .merge(drug_data, how='left', on=['PubCHEM'])
    .dropna(axis=0)
    .reset_index(drop=True)
)

proteo_gdsc1_iso

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,...,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102,LN_IC50,PubCHEM,smiles
0,ACH-000248,150,Bicalutamide,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,1.992325,2375,CC(CS(=O)(=O)C1=CC=C(C=C1)F)(C(=O)NC2=CC(=C(C=...
1,ACH-000248,1049,PD173074,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,3.788251,0.384709,-0.186325,-0.032694,-0.635673,0.086141,-0.221833,3.672991,1401,CCN(CC)CCCCNC1=NC2=NC(=C(C=C2C=N1)C3=CC(=CC(=C...
2,ACH-000536,150,Bicalutamide,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,1.683944,-0.360089,0.293556,0.420244,0.128395,-0.391478,-0.055709,2.966103,2375,CC(CS(=O)(=O)C1=CC=C(C=C1)F)(C(=O)NC2=CC(=C(C=...
3,ACH-000536,1049,PD173074,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,1.683944,-0.360089,0.293556,0.420244,0.128395,-0.391478,-0.055709,3.706382,1401,CCN(CC)CCCCNC1=NC2=NC(=C(C=C2C=N1)C3=CC(=CC(=C...
4,ACH-000536,1502,Bicalutamide,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,1.683944,-0.360089,0.293556,0.420244,0.128395,-0.391478,-0.055709,5.169668,2375,CC(CS(=O)(=O)C1=CC=C(C=C1)F)(C(=O)NC2=CC(=C(C=...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,ACH-000554,150,Bicalutamide,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,-0.359968,-0.083193,-0.498993,-0.343300,-0.783833,-0.138098,0.448448,3.118816,2375,CC(CS(=O)(=O)C1=CC=C(C=C1)F)(C(=O)NC2=CC(=C(C=...
113,ACH-000554,1049,PD173074,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,-0.359968,-0.083193,-0.498993,-0.343300,-0.783833,-0.138098,0.448448,3.045996,1401,CCN(CC)CCCCNC1=NC2=NC(=C(C=C2C=N1)C3=CC(=CC(=C...
114,ACH-000554,1502,Bicalutamide,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,-0.359968,-0.083193,-0.498993,-0.343300,-0.783833,-0.138098,0.448448,3.969894,2375,CC(CS(=O)(=O)C1=CC=C(C=C1)F)(C(=O)NC2=CC(=C(C=...
115,ACH-000828,150,Bicalutamide,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,3.671256,-0.220346,-0.679478,-0.548776,-1.837067,-0.030909,0.003006,3.353186,2375,CC(CS(=O)(=O)C1=CC=C(C=C1)F)(C(=O)NC2=CC(=C(C=...


In [114]:
# Number of rows with a non-null COSMIC_ID for each DRUG_ID in the merged table,
# i.e. how many cell-line measurements remain per drug after the SMILES merge
proteo_gdsc1_iso.groupby('DRUG_ID')['COSMIC_ID'].count()

Unnamed: 0_level_0,COSMIC_ID
DRUG_ID,Unnamed: 1_level_1
150,41
1049,39
1502,37


In [115]:
# Sanity check: counting PubCHEM per DRUG_ID should give the same totals as the COSMIC_ID count
proteo_gdsc1_iso.groupby('DRUG_ID')['PubCHEM'].count()

# Note: the same compound can appear under more than one DRUG_ID
# (in the merged table, DRUG_ID 150 and 1502 are both Bicalutamide, PubCHEM 2375)

Unnamed: 0_level_0,PubCHEM
DRUG_ID,Unnamed: 1_level_1
150,41
1049,39
1502,37


# **Converting isomeric SMILES to fingerprint bits (binary features)**

In [116]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [117]:
# Check which NumPy release the runtime currently provides
import numpy

installed_numpy_version = numpy.__version__
print(installed_numpy_version)

2.0.2


In [118]:
# NOTE(review): in the recorded run this downgrade attempt failed — pip downloaded the
# numpy 1.21.0 source archive and the wheel build errored — so the installed NumPy was
# left unchanged. The RDKit cells further down were still executed afterwards.
# TODO confirm whether a NumPy pin is needed at all; if not, remove this cell.
pip install numpy==1.21

Collecting numpy==1.21
  Downloading numpy-1.21.0.zip (10.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: numpy
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for numpy [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for numpy (pyproject.toml) ... [?25l[?25herror
[31m  ERROR: Failed building wheel for numpy[0m[31m
[0mFailed to build numpy
[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projec

In [119]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [121]:
# Morgan fingerprint settings used for every compound
MORGAN_RADIUS = 2
MORGAN_N_BITS = 256


def smiles_to_morgan_bits(smiles, radius=MORGAN_RADIUS, n_bits=MORGAN_N_BITS):
    """Return the Morgan bit-vector fingerprint of a SMILES string as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES representation of the compound.
    radius : int
        Morgan fingerprint radius.
    n_bits : int
        Length of the folded bit vector.

    Returns
    -------
    numpy.ndarray
        1-D int64 array of length ``n_bits`` containing 0/1 values.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    # Build the RDKit molecule object; MolFromSmiles returns None on a parse failure
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    # Morgan fingerprint as a fixed-length bit vector
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)

    # Copy the bits into a NumPy array sized to the fingerprint length
    bits = np.zeros((n_bits,), dtype=np.int64)
    AllChem.DataStructs.ConvertToNumpyArray(fp, bits)
    return bits


# Many rows share the same compound, so fingerprint each distinct SMILES only once
fingerprint_by_smiles = {
    smiles: smiles_to_morgan_bits(smiles)
    for smiles in proteo_gdsc1_iso['smiles'].unique()
}

# One fingerprint array per row of proteo_gdsc1_iso, in row order
arr = [fingerprint_by_smiles[smiles] for smiles in proteo_gdsc1_iso['smiles']]



In [122]:
# Stack the per-row fingerprint arrays into a frame:
# one row per entry of arr, one column per fingerprint bit position
morgan_data = pd.DataFrame(data=arr)

# Show the resulting bit matrix
morgan_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
113,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
114,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
115,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


# **Merge morgan data with proteomic drug response dataframe**

In [123]:
# Append the fingerprint bit columns to the drug-response rows.
# join() aligns on the row index, so no key column is needed: proteo_gdsc1_iso was
# re-indexed from 0 and morgan_data was built from one array per row in the same order.
proteo_gdsc1_iso_morgan = proteo_gdsc1_iso.join(other=morgan_data, how='left')

proteo_gdsc1_iso_morgan

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,...,246,247,248,249,250,251,252,253,254,255
0,ACH-000248,150,Bicalutamide,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0,0,0,0,0,1,0,0,0,0
1,ACH-000248,1049,PD173074,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0,0,0,0,0,1,0,0,0,1
2,ACH-000536,150,Bicalutamide,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,0,0,0,0,0,1,0,0,0,0
3,ACH-000536,1049,PD173074,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,0,0,0,0,0,1,0,0,0,1
4,ACH-000536,1502,Bicalutamide,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,ACH-000554,150,Bicalutamide,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,0,0,0,0,0,1,0,0,0,0
113,ACH-000554,1049,PD173074,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,0,0,0,0,0,1,0,0,0,1
114,ACH-000554,1502,Bicalutamide,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,0,0,0,0,0,1,0,0,0,0
115,ACH-000828,150,Bicalutamide,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0,0,0,0,0,1,0,0,0,0


# **Final processing**

In [124]:
# Move LN_IC50 to the last column position:
# pop() removes the column and returns it; assigning it back appends it at the end.
col_to_move = proteo_gdsc1_iso_morgan.pop('LN_IC50')
proteo_gdsc1_iso_morgan['LN_IC50'] = col_to_move

proteo_gdsc1_iso_morgan

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,...,247,248,249,250,251,252,253,254,255,LN_IC50
0,ACH-000248,150,Bicalutamide,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0,0,0,0,1,0,0,0,0,1.992325
1,ACH-000248,1049,PD173074,AU565_BREAST,910704,Breast Cancer,0.061856,0.026326,-0.293987,-0.982703,...,0,0,0,0,1,0,0,0,1,3.672991
2,ACH-000536,150,Bicalutamide,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,0,0,0,0,1,0,0,0,0,2.966103
3,ACH-000536,1049,PD173074,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,0,0,0,0,1,0,0,0,1,3.706382
4,ACH-000536,1502,Bicalutamide,BT20_BREAST,906801,Breast Cancer,0.151272,-0.026378,0.236972,-0.071538,...,0,0,0,0,1,0,0,0,0,5.169668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,ACH-000554,150,Bicalutamide,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,0,0,0,0,1,0,0,0,0,3.118816
113,ACH-000554,1049,PD173074,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,0,0,0,0,1,0,0,0,1,3.045996
114,ACH-000554,1502,Bicalutamide,UACC893_BREAST,909778,Breast Cancer,-0.099328,0.092439,-0.000268,0.461909,...,0,0,0,0,1,0,0,0,0,3.969894
115,ACH-000828,150,Bicalutamide,ZR7530_BREAST,909907,Breast Cancer,-0.040076,-0.265286,1.017082,-0.666393,...,0,0,0,0,1,0,0,0,0,3.353186


In [127]:
# Remove the columns that are not wanted in the final table (modifies the frame in place)
to_drop = ['smiles', 'cancer_type', 'Broad_ID']
proteo_gdsc1_iso_morgan.drop(columns=to_drop, inplace=True)

proteo_gdsc1_iso_morgan

Unnamed: 0,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,...,247,248,249,250,251,252,253,254,255,LN_IC50
0,150,Bicalutamide,AU565_BREAST,910704,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,0,0,0,0,1,0,0,0,0,1.992325
1,1049,PD173074,AU565_BREAST,910704,0.061856,0.026326,-0.293987,-0.982703,-0.559906,-1.254418,...,0,0,0,0,1,0,0,0,1,3.672991
2,150,Bicalutamide,BT20_BREAST,906801,0.151272,-0.026378,0.236972,-0.071538,-0.189922,-0.245781,...,0,0,0,0,1,0,0,0,0,2.966103
3,1049,PD173074,BT20_BREAST,906801,0.151272,-0.026378,0.236972,-0.071538,-0.189922,-0.245781,...,0,0,0,0,1,0,0,0,1,3.706382
4,1502,Bicalutamide,BT20_BREAST,906801,0.151272,-0.026378,0.236972,-0.071538,-0.189922,-0.245781,...,0,0,0,0,1,0,0,0,0,5.169668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,150,Bicalutamide,UACC893_BREAST,909778,-0.099328,0.092439,-0.000268,0.461909,0.698914,0.702645,...,0,0,0,0,1,0,0,0,0,3.118816
113,1049,PD173074,UACC893_BREAST,909778,-0.099328,0.092439,-0.000268,0.461909,0.698914,0.702645,...,0,0,0,0,1,0,0,0,1,3.045996
114,1502,Bicalutamide,UACC893_BREAST,909778,-0.099328,0.092439,-0.000268,0.461909,0.698914,0.702645,...,0,0,0,0,1,0,0,0,0,3.969894
115,150,Bicalutamide,ZR7530_BREAST,909907,-0.040076,-0.265286,1.017082,-0.666393,-0.367123,-0.784569,...,0,0,0,0,1,0,0,0,0,3.353186


In [130]:
# Write the final table to a CSV file in the current working directory.
# NOTE(review): to_csv is called with its default index setting, so the row index is
# written as an extra unnamed first column — confirm downstream readers expect that.
proteo_gdsc1_iso_morgan.to_csv('FINAL_proteo_gdsc1_iso_morgan.csv')

# **Useful information & Checks**

In [131]:
# Number of distinct drugs (DRUG_ID values) present in the final table
proteo_gdsc1_iso_morgan['DRUG_ID'].nunique()

3

In [132]:
# Check: count the distinct cell lines (COSMIC_ID values) remaining in the final table
# to confirm none were lost during the merges
proteo_gdsc1_iso_morgan['COSMIC_ID'].nunique()

# Expected 42 -> matches

42

In [133]:
# Number of distinct PubCHEM compound IDs present in the final table
proteo_gdsc1_iso_morgan['PubCHEM'].nunique()

2

In [134]:
# Rows with a non-null PubCHEM value for each DRUG_ID in the final table
proteo_gdsc1_iso_morgan.groupby('DRUG_ID')['PubCHEM'].count()

Unnamed: 0_level_0,PubCHEM
DRUG_ID,Unnamed: 1_level_1
150,41
1049,39
1502,37
