# Transcriptomics + GDSC2+ isoSMILES

In [1]:
import pandas as pd
import numpy as np

# Raw data
## CCLE Transcriptomics - gene expression

In [2]:
# Load the CCLE gene-expression matrix. The CSV column that pandas labels
# "Unnamed: 0" is renamed to "Broad_ID" so it can serve as the join key below.
ccle_t = (
    pd.read_csv("../Raw_files/CCLE_Transcriptomics.csv")
    .rename(columns={"Unnamed: 0": "Broad_ID"})
)

In [3]:
# Preview the expression table: a Broad_ID column followed by one column per gene
ccle_t
# 1406 rows × 19222 columns

Unnamed: 0,Broad_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
0,ACH-001113,4.331992,0.000000,7.364397,2.792855,4.470537,0.028569,1.226509,3.042644,6.499686,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.000000,0.275007,0.0,0.000000,0.000000
1,ACH-001289,4.566815,0.584963,7.106537,2.543496,3.504620,0.000000,0.189034,3.813525,4.221104,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.000000,0.014355,0.0,0.000000,0.000000
2,ACH-001339,3.150560,0.000000,7.379032,2.333424,4.227279,0.056584,1.310340,6.687061,3.682573,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.000000,0.084064,0.0,0.000000,0.042644
3,ACH-001538,5.085340,0.000000,7.154109,2.545968,3.084064,0.000000,5.868143,6.165309,4.489928,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.000000,0.028569,0.0,0.000000,0.000000
4,ACH-000242,6.729145,0.000000,6.537607,2.456806,3.867896,0.799087,7.208381,5.569856,7.127014,...,1.117695,2.358959,0.084064,1.910733,0.000000,0.000000,0.464668,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,ACH-000285,0.056584,0.000000,6.604071,3.264536,4.972693,0.411426,0.097611,0.704872,4.829850,...,2.229588,0.084064,1.310340,3.039138,0.344828,0.000000,0.000000,0.0,0.475085,0.042644
1402,ACH-002669,3.109361,0.000000,7.031219,1.541019,3.664483,0.014355,3.624101,6.805292,4.472488,...,0.189034,0.400538,0.356144,1.327687,0.000000,0.000000,0.014355,0.0,0.000000,0.000000
1403,ACH-001858,4.390943,0.000000,7.013127,1.887525,3.252476,0.028569,3.286881,6.902074,5.410748,...,1.097611,0.400538,0.613532,1.992768,0.704872,0.000000,1.464668,0.0,0.000000,0.526069
1404,ACH-001997,5.057017,0.000000,7.814935,2.538538,3.893362,0.028569,4.078951,6.971429,4.469886,...,0.831877,0.847997,1.292782,2.153805,0.687061,0.000000,0.000000,0.0,0.000000,0.000000


## Cell Lines Map
This dataset provides the Broad_ID (DepMap_ID), the cancer cell line name, and the COSMIC_ID, a unique identifier for tracking drug response data (used to join with the GDSC2 dataset later).

In [4]:
# Load the DepMap cell-line annotation table and discard the columns that are
# not needed for the ID mapping.
ccle_info = pd.read_csv("../Raw_files/DepMap-2018q3-celllines.csv").drop(
    columns=['Aliases', 'Sanger ID', 'Subtype Disease', 'Gender', 'Source']
)

ccle_info
# 1673 rows × 4 columns

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,Primary Disease
0,ACH-000557,AML193_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,Leukemia
1,ACH-001000,1321N1_CENTRAL_NERVOUS_SYSTEM,,Brain Cancer
2,ACH-000198,EOL1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,906856.0,Leukemia
3,ACH-000956,22RV1_PROSTATE,924100.0,Prostate Cancer
4,ACH-000948,2313287_STOMACH,910924.0,Gastric Cancer
...,...,...,...,...
1668,ACH-002393,CROAP3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,Lymphoma
1669,ACH-002394,GEO_LARGE_INTESTINE,,Colon Cancer
1670,ACH-002395,HUH6CLONE5_LIVER,,Liver Cancer
1671,ACH-002396,SARC9371_BONE,,Bone Cancer


## Perform the mapping

In [5]:
# Check how many cell lines of the CCLE transcriptomics table also appear in
# the cell-line mapping file.

# Broad_ID is the ACH-prefixed identifier present in both tables
common_values = ccle_t['Broad_ID'].isin(ccle_info['Broad_ID'])
match_count = common_values.sum()  # True counts as 1, so the sum is the number of matches
print(f"Number of matches: {match_count}")

# 1227 / 1406 rows can be matched (87%)

Number of matches: 1227


In [6]:
# Attach the cell-line annotations (CCLE_Name, COSMIC_ID, Primary Disease) to
# the transcriptomics table, matching rows on Broad_ID. pd.merge defaults to an
# inner join, so only cell lines present in both tables are kept.
ccle_t_name = pd.merge(ccle_t, ccle_info, on=['Broad_ID'])

# Drop only the rows that have no COSMIC_ID: that column is the key for the
# later join with the GDSC drug-response data. Restricting dropna to this
# column avoids discarding a cell line because of a missing value in some
# unrelated column.
ccle_t_name = ccle_t_name.dropna(subset=['COSMIC_ID'])

# Merged frame: gene expression + cell-line annotations
ccle_t_name

# 19225 columns expected (19222 from the expression table + 3 annotation columns).
# The recorded run, which dropped rows with a NaN in ANY column, had 697 rows.

Unnamed: 0,Broad_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),CCLE_Name,COSMIC_ID,Primary Disease
3,ACH-000242,6.729145,0.000000,6.537607,2.456806,3.867896,0.799087,7.208381,5.569856,7.127014,...,1.910733,0.000000,0.000000,0.464668,0.000000,0.000000,0.000000,RT4_URINARY_TRACT,687455.0,Bladder Cancer
4,ACH-000708,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,0.097611,4.888013,4.926474,...,1.891419,0.201634,0.000000,0.000000,0.000000,0.000000,0.000000,SNU283_LARGE_INTESTINE,1659929.0,Colon Cancer
5,ACH-000327,3.337711,0.000000,5.927185,1.944858,2.678072,0.014355,3.089159,6.011227,3.642702,...,1.327687,0.000000,0.124328,0.176323,0.000000,0.084064,0.238787,NCIH1395_LUNG,684681.0,Lung NSCLC
6,ACH-000233,0.056584,0.000000,6.093602,3.970854,3.731183,0.028569,6.092969,3.033863,3.422233,...,3.157044,0.226509,0.000000,0.000000,0.056584,0.000000,0.000000,DEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,906836.0,Lymphoma
9,ACH-000528,4.512227,0.000000,7.099821,2.843984,4.672425,0.014355,0.815575,6.709015,3.982765,...,2.592158,0.000000,0.028569,0.042644,0.000000,0.028569,0.000000,ABC1_LUNG,906791.0,Lung NSCLC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,ACH-000114,3.263034,0.028569,6.525912,2.784504,2.629939,0.565597,4.770829,5.781097,5.506208,...,1.782409,0.356144,0.000000,0.790772,0.000000,0.000000,0.014355,SU8686_PANCREAS,1240218.0,Pancreatic Cancer
1220,ACH-001578,6.344828,3.400538,7.211207,2.533563,4.373648,0.042644,0.014355,5.785027,3.058316,...,1.970854,0.201634,0.000000,0.028569,0.000000,0.000000,0.201634,NCCIT_TESTES,908441.0,Embryonal Cancer
1222,ACH-000973,4.328406,0.000000,7.058749,1.891419,3.529821,0.000000,3.878725,6.432792,4.698774,...,2.403268,0.150560,0.042644,0.014355,0.000000,0.042644,0.084064,639V_URINARY_TRACT,906798.0,Bladder Cancer
1224,ACH-000750,3.533563,0.000000,6.488322,1.823749,3.308885,0.014355,0.137504,5.020591,4.536053,...,3.008989,0.238787,0.000000,0.124328,0.000000,0.000000,0.201634,LOXIMVI_SKIN,905974.0,Skin Cancer


In [7]:
# Reorder the annotation columns so that they sit directly after Broad_ID
# (positions 1, 2 and 3). COSMIC_ID is the key for the later join with GDSC.
for position, column in enumerate(['CCLE_Name', 'COSMIC_ID', 'Primary Disease'], start=1):
    # pop removes the column from the frame in place and returns its values
    col_to_move = ccle_t_name.pop(column)
    ccle_t_name.insert(position, column, col_to_move)

In [8]:
# Rename "Primary Disease" to "cancer_type"
ccle_t_name = ccle_t_name.rename(columns={"Primary Disease": "cancer_type"})

# Store COSMIC_ID as int64 (it was displayed with a trailing ".0" after the merge)
ccle_t_name['COSMIC_ID'] = ccle_t_name['COSMIC_ID'].astype(np.int64)

In [9]:
# Keep only the breast cancer and colon cancer cell lines
ccle_t_breast_colon = ccle_t_name.loc[
    ccle_t_name['cancer_type'].isin(['Breast Cancer', 'Colon Cancer'])
]

## Final Dataframe: data of Transcriptomics - gene expression

In [10]:
# Renumber the rows 0..n-1 after the filtering step
ccle_t_breast_colon = ccle_t_breast_colon.reset_index(drop=True)

ccle_t_breast_colon

# 91 rows × 19225 columns in the recorded output

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
0,ACH-000708,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,...,1.367371,1.480265,0.321928,1.891419,0.201634,0.000000,0.000000,0.000000,0.000000,0.000000
1,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,3.646163,0.000000,...,0.918386,0.565597,0.422233,1.257011,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ACH-000421,SW837_LARGE_INTESTINE,909755,Colon Cancer,4.730640,0.333424,7.068778,2.946731,4.745775,0.189034,...,0.189034,2.090853,0.000000,1.709291,0.000000,0.028569,1.007196,0.028569,0.028569,0.000000
3,ACH-000117,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,3.670161,0.014355,...,1.257011,1.464668,0.263034,1.726831,0.000000,0.000000,0.014355,0.028569,0.000000,0.084064
4,ACH-000999,SNU1040_LARGE_INTESTINE,1659823,Colon Cancer,4.489286,0.475085,6.759955,2.451541,3.305971,1.464668,...,0.555816,0.695994,0.536053,0.659925,0.650765,0.000000,0.014355,0.070389,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,ACH-001399,SW626_OVARY,909753,Colon Cancer,5.181898,0.000000,7.835103,1.722466,3.699330,0.014355,...,1.244887,1.594549,0.238787,2.169925,0.748461,0.000000,0.000000,0.000000,0.000000,0.000000
87,ACH-000350,COLO678_LARGE_INTESTINE,910689,Colon Cancer,4.548437,0.000000,6.777420,4.746850,4.615887,0.056584,...,0.356144,0.000000,0.613532,1.981853,0.000000,0.000000,0.000000,0.000000,0.070389,0.000000
88,ACH-000970,SNUC5_LARGE_INTESTINE,1674021,Colon Cancer,3.855990,0.000000,6.507953,2.545968,3.621759,0.070389,...,1.176323,1.580145,0.137504,2.456806,0.000000,0.014355,0.028569,0.014355,0.014355,0.000000
89,ACH-000288,BT549_BREAST,905951,Breast Cancer,3.390943,0.000000,7.142924,2.169925,4.195348,0.014355,...,0.713696,0.941106,0.298658,2.933573,0.389567,0.226509,0.000000,0.097611,0.000000,0.000000


In [11]:
# Count the distinct cell lines (Broad_ID) to check whether any cell line is repeated
ccle_t_breast_colon['Broad_ID'].nunique()

91

----

# GDSC2_IC50 Dataset  <span style="color:red">JOIN </span> 
## Drug response
The Genomics of Drug Sensitivity in Cancer (GDSC) dataset provides IC50 values for various drugs across multiple cancer cell lines. IC50 (half-maximal inhibitory concentration) is a measure of how much of a drug is needed to inhibit cell growth by 50%. This data helps researchers relate the molecular features of a cell line to its sensitivity to each drug.

### GDSC 1 and GDSC 2
GDSC1 is the first release of the GDSC dataset; it covers more drugs and more cell lines than GDSC2. This notebook uses the GDSC2 release.

### COSMIC_ID is the unique identifier assigned to each cancer cell line in GDSC dataset

In [12]:
# GDSC2 drug-response table (fitted dose-response results). The first column,
# DATASET, is removed right after loading because it is not used below.
gdsc2 = (
    pd.read_csv("../Raw_files/GDSC2_fitted_dose_response_27Oct23.csv")
    .drop(columns=['DATASET'])
)

In [13]:
gdsc2
# Identifier columns of interest: COSMIC_ID, CELL_LINE_NAME, DRUG_NAME

# Response measurements: LN_IC50, AUC, RMSE, Z_SCORE

Unnamed: 0,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,343,15946310,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-1.463887,0.930220,0.089052,0.433123
1,343,15946548,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-4.869455,0.614970,0.111351,-1.421100
2,343,15946830,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-3.360586,0.791072,0.142855,-0.599569
3,343,15947087,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-5.044940,0.592660,0.135539,-1.516647
4,343,15947369,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-3.741991,0.734047,0.128059,-0.807232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242031,343,16188242,1659928,SNU-175,SIDM00216,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.127082,0.976746,0.074498,0.156872
242032,343,16188695,1660034,SNU-407,SIDM00214,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,8.576377,0.913378,0.057821,-1.626959
242033,343,16188953,1660035,SNU-61,SIDM00194,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.519636,0.975001,0.058090,0.608442
242034,343,16189493,1674021,SNU-C5,SIDM00498,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.694579,0.969969,0.101013,0.809684


In [14]:
# Keep only the columns needed downstream (AUC, RMSE and Z_SCORE could be
# examined as well). LN_IC50 is going to be the target feature.
gdsc2 = gdsc2.loc[:, ['DRUG_ID', 'COSMIC_ID', 'DRUG_NAME', 'LN_IC50']]

In [16]:
# Preview the trimmed drug-response table
gdsc2

Unnamed: 0,DRUG_ID,COSMIC_ID,DRUG_NAME,LN_IC50
0,1003,683667,Camptothecin,-1.463887
1,1003,684052,Camptothecin,-4.869455
2,1003,684057,Camptothecin,-3.360586
3,1003,684059,Camptothecin,-5.044940
4,1003,684062,Camptothecin,-3.741991
...,...,...,...,...
242031,2499,1659928,N-acetyl cysteine,10.127082
242032,2499,1660034,N-acetyl cysteine,8.576377
242033,2499,1660035,N-acetyl cysteine,10.519636
242034,2499,1674021,N-acetyl cysteine,10.694579


### Dataset explanation
Each cancer cell line has responses for multiple drugs, and each drug was tested against multiple cell lines.

There are 295 drug IDs (286 distinct drug names), and each drug was tested against multiple cancer cell lines. E.g., DRUG_ID 1003 (Camptothecin) was tested against 968 cancer cell lines.

In [17]:
# Number of distinct drug IDs tested
gdsc2['DRUG_ID'].nunique()

295

In [18]:
# Number of distinct drug names
gdsc2['DRUG_NAME'].nunique()

286

In [19]:
# Number of distinct cancer cell lines (COSMIC_ID) in the GDSC2 table
gdsc2['COSMIC_ID'].nunique()

969

In [20]:
# Number of distinct cell lines (COSMIC_ID) tested for each drug
gdsc2.groupby('DRUG_ID')['COSMIC_ID'].nunique()

DRUG_ID
1003    968
1004    741
1005    760
1006    743
1007    967
       ... 
2362    731
2438    732
2439    732
2498    735
2499    735
Name: COSMIC_ID, Length: 295, dtype: int64

In [30]:
# Drug names that are associated with more than one DRUG_ID.
# NOTE(review): the reason is not visible in this table — possibly the same
# compound registered under separate IDs; confirm against the GDSC documentation.
gdsc2.groupby('DRUG_NAME')['DRUG_ID'].nunique().loc[lambda x: x > 1]

DRUG_NAME
Acetalax        2
Dactinomycin    2
Docetaxel       2
Fulvestrant     2
GSK343          2
Oxaliplatin     2
Selumetinib     2
Ulixertinib     2
Uprosertib      2
Name: DRUG_ID, dtype: int64

----------

# Merging transcriptomics breast and colon cancer dataset with GDSC2

In [31]:
# Combine gene expression with drug response. The inner join on COSMIC_ID
# yields one row per (cell line, drug response) pair.
ccle_t_breast_colon_gdsc2 = ccle_t_breast_colon.merge(gdsc2, on='COSMIC_ID')

# Each row holds a breast or colon cancer cell line, its transcriptomics
# (gene expression) and its response to one drug (LN_IC50)

In [32]:
# The merge appended 3 columns at the end: DRUG_ID, DRUG_NAME, LN_IC50
ccle_t_breast_colon_gdsc2

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),...,DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),DRUG_ID,DRUG_NAME,LN_IC50
0,ACH-000708,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,...,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,1003,Camptothecin,-0.914349
1,ACH-000708,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,...,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,1007,Docetaxel,-4.355127
2,ACH-000708,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,...,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,1008,Methotrexate,3.263943
3,ACH-000708,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,...,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,1009,Tretinoin,5.600959
4,ACH-000708,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,...,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,1010,Gefitinib,1.837465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23105,ACH-000934,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.000000,6.736740,2.885574,3.766595,0.028569,...,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,2362,THR-103,3.454157
23106,ACH-000934,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.000000,6.736740,2.885574,3.766595,0.028569,...,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,2438,ascorbate (vitamin C),10.699975
23107,ACH-000934,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.000000,6.736740,2.885574,3.766595,0.028569,...,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,2439,glutathione,9.380823
23108,ACH-000934,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.000000,6.736740,2.885574,3.766595,0.028569,...,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,2498,alpha-lipoic acid,6.883694


In [34]:
# Number of distinct drugs with a response for each breast/colon cancer cell line
ccle_t_breast_colon_gdsc2.groupby('Broad_ID')['DRUG_ID'].nunique()

Broad_ID
ACH-000007    222
ACH-000009    280
ACH-000019    283
ACH-000111    282
ACH-000117    281
             ... 
ACH-000997    282
ACH-000998    281
ACH-000999    148
ACH-001345    279
ACH-001399    279
Name: DRUG_ID, Length: 88, dtype: int64

In [35]:
# Bring the drug identifier columns forward: DRUG_ID to position 1 and
# DRUG_NAME to position 2, directly after Broad_ID.
for position, column in enumerate(['DRUG_ID', 'DRUG_NAME'], start=1):
    # pop removes the column from the frame in place and returns its values
    col_to_move = ccle_t_breast_colon_gdsc2.pop(column)
    ccle_t_breast_colon_gdsc2.insert(position, column, col_to_move)

In [36]:
# Transcriptomics + drug-response dataset after reordering the columns
ccle_t_breast_colon_gdsc2

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),LN_IC50
0,ACH-000708,1003,Camptothecin,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,1.480265,0.321928,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,-0.914349
1,ACH-000708,1007,Docetaxel,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,1.480265,0.321928,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,-4.355127
2,ACH-000708,1008,Methotrexate,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,1.480265,0.321928,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,3.263943
3,ACH-000708,1009,Tretinoin,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,1.480265,0.321928,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,5.600959
4,ACH-000708,1010,Gefitinib,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,1.480265,0.321928,1.891419,0.201634,0.0,0.000000,0.000000,0.0,0.0,1.837465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23105,ACH-000934,2362,THR-103,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.000000,6.736740,2.885574,...,0.815575,0.389567,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,3.454157
23106,ACH-000934,2438,ascorbate (vitamin C),MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.000000,6.736740,2.885574,...,0.815575,0.389567,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,10.699975
23107,ACH-000934,2439,glutathione,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.000000,6.736740,2.885574,...,0.815575,0.389567,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,9.380823
23108,ACH-000934,2498,alpha-lipoic acid,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.000000,6.736740,2.885574,...,0.815575,0.389567,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,6.883694


----

# Drug info dataset (Join DRUG_ID with PubCHEM)

### isoSMILES

Entries from both GDSC1 and GDSC2 are kept (the GDSC1-only filter in the cell below is commented out).

In [37]:
# Drug annotation table; it links each GDSC drug ID to a PubCHEM compound ID
drug_info = pd.read_csv("../Raw_files/Drug_info.csv")
drug_info.head()

Unnamed: 0,Drug Id,Name,Synonyms,Targets,Target pathway,PubCHEM,Datasets,number of cell lines,Screening site
0,1242,(5Z)-7-Oxozeaenol,"5Z-7-Oxozeaenol, LL-Z1640-2",TAK1,"Other, kinases",9863776.0,GDSC1,899,SANGER
1,1824,123138,,,Unclassified,,GDSC2,717,SANGER
2,1820,123829,,,Unclassified,,GDSC2,717,SANGER
3,1836,150412,,,Unclassified,,GDSC2,717,SANGER
4,179,5-Fluorouracil,5-FU,Antimetabolite (DNA & RNA),Other,3385.0,GDSC1,907,MGH


In [38]:
# Normalise the two column names needed for the joins. The key " PubCHEM"
# (with a leading space) is spelled exactly as in the original rename call.
drug_info = drug_info.rename(columns={"Drug Id": "DRUG_ID", " PubCHEM": "PubCHEM"})

# No filter on the Datasets column is applied: rows of every dataset are kept.

# Only the drug ID and the PubCHEM ID are needed for the JOIN
drug_info = drug_info[['DRUG_ID', 'PubCHEM']]

# Keep the rows whose PubCHEM entry consists purely of digits
has_numeric_pubchem = drug_info['PubCHEM'].map(lambda value: str(value).isdigit())
drug_info = drug_info[has_numeric_pubchem]

drug_info

Unnamed: 0,DRUG_ID,PubCHEM
0,1242,9863776
4,179,3385
5,1073,3385
20,86,10172943
21,55,9549184
...,...,...
684,45,16760646
688,1050,9914412
689,1050,9914412
690,223,11647372


## Merge with the transcript_gdsc2

In [39]:
# Add the PubCHEM ID of each drug (inner join on DRUG_ID)
ccle_t_breast_colon_gdsc2 = ccle_t_breast_colon_gdsc2.merge(drug_info, on='DRUG_ID')

In [40]:
# Remove rows containing any missing value, then remove exact duplicate rows.
# (drug_info lists some DRUG_ID / PubCHEM pairs more than once, so the merge
# with it repeats the matching rows.)
ccle_t_breast_colon_gdsc2 = ccle_t_breast_colon_gdsc2.dropna().drop_duplicates()

In [41]:
ccle_t_breast_colon_gdsc2

# Row counts observed for this frame:
# before removing duplicates = 19663 rows × 19229 columns
# after removing duplicates = 14418 rows × 19229 columns

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),LN_IC50,PubCHEM
0,ACH-000708,1003,Camptothecin,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,0.321928,1.891419,0.201634,0.000000,0.000000,0.000000,0.000000,0.000000,-0.914349,24360
1,ACH-000828,1003,Camptothecin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,...,0.422233,1.257011,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.824561,24360
2,ACH-000421,1003,Camptothecin,SW837_LARGE_INTESTINE,909755,Colon Cancer,4.730640,0.333424,7.068778,2.946731,...,0.000000,1.709291,0.000000,0.028569,1.007196,0.028569,0.028569,0.000000,-0.932695,24360
3,ACH-000117,1003,Camptothecin,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,...,0.263034,1.726831,0.000000,0.000000,0.014355,0.028569,0.000000,0.084064,1.555769,24360
4,ACH-000999,1003,Camptothecin,SNU1040_LARGE_INTESTINE,1659823,Colon Cancer,4.489286,0.475085,6.759955,2.451541,...,0.536053,0.659925,0.650765,0.000000,0.014355,0.070389,0.000000,0.000000,0.764638,24360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19658,ACH-001399,2359,GSK2830371,SW626_OVARY,909753,Colon Cancer,5.181898,0.000000,7.835103,1.722466,...,0.238787,2.169925,0.748461,0.000000,0.000000,0.000000,0.000000,0.000000,6.993705,70983932
19659,ACH-000350,2359,GSK2830371,COLO678_LARGE_INTESTINE,910689,Colon Cancer,4.548437,0.000000,6.777420,4.746850,...,0.613532,1.981853,0.000000,0.000000,0.000000,0.000000,0.070389,0.000000,7.730321,70983932
19660,ACH-000970,2359,GSK2830371,SNUC5_LARGE_INTESTINE,1674021,Colon Cancer,3.855990,0.000000,6.507953,2.545968,...,0.137504,2.456806,0.000000,0.014355,0.028569,0.014355,0.014355,0.000000,6.135335,70983932
19661,ACH-000288,2359,GSK2830371,BT549_BREAST,905951,Breast Cancer,3.390943,0.000000,7.142924,2.169925,...,0.298658,2.933573,0.389567,0.226509,0.000000,0.097611,0.000000,0.000000,6.211845,70983932


In [43]:
# Tally the Python type of every PubCHEM value; all of them are str, so the
# join key on the PubChem side must be str as well
ccle_t_breast_colon_gdsc2["PubCHEM"].apply(type).value_counts()

<class 'str'>    14418
Name: PubCHEM, dtype: int64

----

# PubCHEM data

Each PubCHEM ID represents a specific drug compound that was used -> **used to join with Drug information dataset**

### isoSMILES

An isomeric SMILES (isoSMILES) string describes the molecular structure of a drug; that structure determines how the drug can interact with its targets.

Combining with multi-omics helps to better understand drug response mechanisms.

E.g., if two drugs have similar isoSMILES representations but one is more effective in breast cancer than the other, the structural differences may explain why.

In [44]:
# PubChem compound list; the next cell uses its "cid" and "isosmiles" columns
drug_data = pd.read_csv("../Raw_files/PubChem_compound_list.csv")

In [45]:
# Keep the compound ID and its isomeric SMILES string, naming the ID column
# "PubCHEM" so that it matches the join key of the drug-response frame. The ID
# is cast to str because the PubCHEM values on the other side of the join are
# strings.
# Everything happens in one chained expression that returns a new frame, so no
# column is assigned on a column-subset of another frame (that pattern can
# raise SettingWithCopyWarning).
drug_data = (
    drug_data
    .rename(columns={"cid": "PubCHEM"})
    .loc[:, ['PubCHEM', 'isosmiles']]
    .astype({'PubCHEM': str})
)

drug_data

Unnamed: 0,PubCHEM,isosmiles
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,6,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
2,11,C(CCl)Cl
3,34,C(CCl)O
4,38,CC(C)(CO)C(=O)C(=O)O
...,...,...
19531,170855691,[H+].C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H...
19532,170907453,C[C@H]1C(=O)N[C@H]2CSSC[C@H]3C(=O)N[C@H](C(=O)...
19533,170907974,CC[C@H](C)[C@@H](C(=O)N[C@@H](CCC(=O)O)C(=O)N[...
19534,170908028,CC[C@H](C)[C@H]1C(=O)N[C@@H](C(=O)N[C@@H](C(=O...


--------

# Merge with transcript_gdsc2 data

In [46]:
# Attach the isomeric SMILES string of each drug through its PubCHEM ID
# (inner join), then renumber the rows of the result.
ccle_t_breast_colon_gdsc2_iso = (
    ccle_t_breast_colon_gdsc2
    .merge(drug_data, on='PubCHEM')
    .reset_index(drop=True)
)

ccle_t_breast_colon_gdsc2_iso

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),LN_IC50,PubCHEM,isosmiles
0,ACH-000708,1003,Camptothecin,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,1.891419,0.201634,0.000000,0.000000,0.000000,0.000000,0.000000,-0.914349,24360,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...
1,ACH-000828,1003,Camptothecin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,...,1.257011,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.824561,24360,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...
2,ACH-000421,1003,Camptothecin,SW837_LARGE_INTESTINE,909755,Colon Cancer,4.730640,0.333424,7.068778,2.946731,...,1.709291,0.000000,0.028569,1.007196,0.028569,0.028569,0.000000,-0.932695,24360,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...
3,ACH-000117,1003,Camptothecin,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,...,1.726831,0.000000,0.000000,0.014355,0.028569,0.000000,0.084064,1.555769,24360,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...
4,ACH-000999,1003,Camptothecin,SNU1040_LARGE_INTESTINE,1659823,Colon Cancer,4.489286,0.475085,6.759955,2.451541,...,0.659925,0.650765,0.000000,0.014355,0.070389,0.000000,0.000000,0.764638,24360,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,ACH-001399,1378,Bleomycin (50 uM),SW626_OVARY,909753,Colon Cancer,5.181898,0.000000,7.835103,1.722466,...,2.169925,0.748461,0.000000,0.000000,0.000000,0.000000,0.000000,4.728102,5460769,CC1=C(N=C(N=C1N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)...
8470,ACH-000350,1378,Bleomycin (50 uM),COLO678_LARGE_INTESTINE,910689,Colon Cancer,4.548437,0.000000,6.777420,4.746850,...,1.981853,0.000000,0.000000,0.000000,0.000000,0.070389,0.000000,4.655638,5460769,CC1=C(N=C(N=C1N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)...
8471,ACH-000970,1378,Bleomycin (50 uM),SNUC5_LARGE_INTESTINE,1674021,Colon Cancer,3.855990,0.000000,6.507953,2.545968,...,2.456806,0.000000,0.014355,0.028569,0.014355,0.014355,0.000000,3.691268,5460769,CC1=C(N=C(N=C1N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)...
8472,ACH-000288,1378,Bleomycin (50 uM),BT549_BREAST,905951,Breast Cancer,3.390943,0.000000,7.142924,2.169925,...,2.933573,0.389567,0.226509,0.000000,0.097611,0.000000,0.000000,5.583365,5460769,CC1=C(N=C(N=C1N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)...


In [47]:
# Number of rows (cell-line responses with a non-null COSMIC_ID) available for each drug
ccle_t_breast_colon_gdsc2_iso.groupby('DRUG_ID')['COSMIC_ID'].count()

DRUG_ID
1003    87
1004    76
1005    82
1006    76
1007    87
        ..
2045    81
2046    81
2048    86
2106    75
2169    48
Name: COSMIC_ID, Length: 104, dtype: int64

In [48]:
# Non-null PubCHEM entries per drug; should equal the per-drug COSMIC_ID counts
ccle_t_breast_colon_gdsc2_iso.groupby('DRUG_ID')['PubCHEM'].count()

# NOTE(review): original remark — "for each drug, there are a couple of variants";
# what "variants" refers to is not evident from this output.

DRUG_ID
1003    87
1004    76
1005    82
1006    76
1007    87
        ..
2045    81
2046    81
2048    86
2106    75
2169    48
Name: PubCHEM, Length: 104, dtype: int64

# Converting isoSMILES to bit vectors (Morgan fingerprints)

In [49]:
# Show the installed NumPy version (the next cell keeps a commented-out pin to numpy 1.21)
import numpy
print(numpy.__version__)

1.24.3


In [36]:
#pip install numpy==1.21

In [50]:
from rdkit import Chem
from rdkit.DataStructs import ConvertToNumpyArray
from rdkit.Chem import AllChem

In [51]:
# Length of every Morgan bit vector; shared by the generator and the
# all-zero fallback so the two can never disagree.
FP_SIZE = 256

# Initialize Morgan fingerprint generator (radius=2, FP_SIZE bits)
morgan_generator = AllChem.GetMorganGenerator(radius=2, fpSize=FP_SIZE)

# The frame repeats the same drug SMILES on many rows (one row per tested
# cell line), so each distinct value is fingerprinted once and then reused.
fp_cache = {}
arr = []

for smiles in ccle_t_breast_colon_gdsc2_iso['isosmiles']:
    if smiles not in fp_cache:
        # Non-string entries (e.g. NaN from a missing isoSMILES) are not valid
        # input for MolFromSmiles; route them to the invalid branch instead.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

        # Start from zeros: this is also the fallback for invalid SMILES.
        fp_array = np.zeros((FP_SIZE,), dtype=np.int64)
        if mol is not None:
            # Generate the fingerprint and copy its bits into the numpy array
            fp = morgan_generator.GetFingerprint(mol)
            ConvertToNumpyArray(fp, fp_array)
        else:
            # Reported once per distinct value; affected rows keep all zeros.
            print(f"Invalid SMILES: {smiles}")
        fp_cache[smiles] = fp_array

    arr.append(fp_cache[smiles])

# Convert list of arrays into a single (n_rows, FP_SIZE) numpy array;
# np.vstack rejects an empty list, so an empty frame yields a 0-row matrix.
fingerprints = np.vstack(arr) if arr else np.empty((0, FP_SIZE), dtype=np.int64)

In [52]:
# One row of fingerprint bits per row of the drug-response frame, in the same
# order. The frame's own index is reused so that a later index-based join
# lines the bits up with the right rows even if that index is not 0..n-1.
morgan_data = pd.DataFrame(arr, index=ccle_t_breast_colon_gdsc2_iso.index)

# Holds the bit value of the isosmiles for each corresponding row
morgan_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,1,1,1,1,1,1
8470,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,1,1,1,1,1,1
8471,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,1,1,1,1,1,1
8472,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,1,1,1,1,1,1


## Merge morgan data with multi-omics drug response dataframe

In [53]:
# Both frames are aligned on their row index, so an index-based left join
# (the default for DataFrame.join) is enough; no key-based merge is needed.
ccle_t_breast_colon_gdsc2_iso = ccle_t_breast_colon_gdsc2_iso.join(
    morgan_data,
    how="left",
)

ccle_t_breast_colon_gdsc2_iso

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,246,247,248,249,250,251,252,253,254,255
0,ACH-000708,1003,Camptothecin,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,0,0,1,0,1,0,0,0,0,0
1,ACH-000828,1003,Camptothecin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,...,0,0,1,0,1,0,0,0,0,0
2,ACH-000421,1003,Camptothecin,SW837_LARGE_INTESTINE,909755,Colon Cancer,4.730640,0.333424,7.068778,2.946731,...,0,0,1,0,1,0,0,0,0,0
3,ACH-000117,1003,Camptothecin,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,...,0,0,1,0,1,0,0,0,0,0
4,ACH-000999,1003,Camptothecin,SNU1040_LARGE_INTESTINE,1659823,Colon Cancer,4.489286,0.475085,6.759955,2.451541,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,ACH-001399,1378,Bleomycin (50 uM),SW626_OVARY,909753,Colon Cancer,5.181898,0.000000,7.835103,1.722466,...,0,0,0,0,1,1,1,1,1,1
8470,ACH-000350,1378,Bleomycin (50 uM),COLO678_LARGE_INTESTINE,910689,Colon Cancer,4.548437,0.000000,6.777420,4.746850,...,0,0,0,0,1,1,1,1,1,1
8471,ACH-000970,1378,Bleomycin (50 uM),SNUC5_LARGE_INTESTINE,1674021,Colon Cancer,3.855990,0.000000,6.507953,2.545968,...,0,0,0,0,1,1,1,1,1,1
8472,ACH-000288,1378,Bleomycin (50 uM),BT549_BREAST,905951,Breast Cancer,3.390943,0.000000,7.142924,2.169925,...,0,0,0,0,1,1,1,1,1,1


# Final processing

In [54]:
# Step 1 of moving LN_IC50 to the last position: take the column out of the
# frame (pop returns it and removes it in place) and keep it for re-insertion.
col_to_move = ccle_t_breast_colon_gdsc2_iso.pop('LN_IC50')

NameError: name 'ccle_t_breast_cancer_gdsc2_iso' is not defined

In [55]:
# Step 2 of moving LN_IC50: put the saved column back as the last column.
last_position = ccle_t_breast_colon_gdsc2_iso.shape[1]
ccle_t_breast_colon_gdsc2_iso.insert(
    loc=last_position,
    column='LN_IC50',
    value=col_to_move,
)

ccle_t_breast_colon_gdsc2_iso

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,247,248,249,250,251,252,253,254,255,LN_IC50
0,ACH-000708,1003,Camptothecin,SNU283_LARGE_INTESTINE,1659929,Colon Cancer,4.272023,0.189034,7.022923,2.555816,...,0,1,0,1,0,0,0,0,0,-0.914349
1,ACH-000828,1003,Camptothecin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,...,0,1,0,1,0,0,0,0,0,2.824561
2,ACH-000421,1003,Camptothecin,SW837_LARGE_INTESTINE,909755,Colon Cancer,4.730640,0.333424,7.068778,2.946731,...,0,1,0,1,0,0,0,0,0,-0.932695
3,ACH-000117,1003,Camptothecin,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,...,0,1,0,1,0,0,0,0,0,1.555769
4,ACH-000999,1003,Camptothecin,SNU1040_LARGE_INTESTINE,1659823,Colon Cancer,4.489286,0.475085,6.759955,2.451541,...,0,1,0,1,0,0,0,0,0,0.764638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,ACH-001399,1378,Bleomycin (50 uM),SW626_OVARY,909753,Colon Cancer,5.181898,0.000000,7.835103,1.722466,...,0,0,0,1,1,1,1,1,1,4.728102
8470,ACH-000350,1378,Bleomycin (50 uM),COLO678_LARGE_INTESTINE,910689,Colon Cancer,4.548437,0.000000,6.777420,4.746850,...,0,0,0,1,1,1,1,1,1,4.655638
8471,ACH-000970,1378,Bleomycin (50 uM),SNUC5_LARGE_INTESTINE,1674021,Colon Cancer,3.855990,0.000000,6.507953,2.545968,...,0,0,0,1,1,1,1,1,1,3.691268
8472,ACH-000288,1378,Bleomycin (50 uM),BT549_BREAST,905951,Breast Cancer,3.390943,0.000000,7.142924,2.169925,...,0,0,0,1,1,1,1,1,1,5.583365


In [56]:
# Drop unnecessary columns (removed from the frame in place)
to_drop = ['isosmiles', 'cancer_type', 'Broad_ID']
ccle_t_breast_colon_gdsc2_iso.drop(columns=to_drop, inplace=True)

ccle_t_breast_colon_gdsc2_iso

Unnamed: 0,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),...,247,248,249,250,251,252,253,254,255,LN_IC50
0,1003,Camptothecin,SNU283_LARGE_INTESTINE,1659929,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,...,0,1,0,1,0,0,0,0,0,-0.914349
1,1003,Camptothecin,ZR7530_BREAST,909907,3.472488,0.000000,5.959306,3.878725,3.646163,0.000000,...,0,1,0,1,0,0,0,0,0,2.824561
2,1003,Camptothecin,SW837_LARGE_INTESTINE,909755,4.730640,0.333424,7.068778,2.946731,4.745775,0.189034,...,0,1,0,1,0,0,0,0,0,-0.932695
3,1003,Camptothecin,EFM192A_BREAST,1290798,3.444932,0.056584,7.722193,2.978196,3.670161,0.014355,...,0,1,0,1,0,0,0,0,0,1.555769
4,1003,Camptothecin,SNU1040_LARGE_INTESTINE,1659823,4.489286,0.475085,6.759955,2.451541,3.305971,1.464668,...,0,1,0,1,0,0,0,0,0,0.764638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,1378,Bleomycin (50 uM),SW626_OVARY,909753,5.181898,0.000000,7.835103,1.722466,3.699330,0.014355,...,0,0,0,1,1,1,1,1,1,4.728102
8470,1378,Bleomycin (50 uM),COLO678_LARGE_INTESTINE,910689,4.548437,0.000000,6.777420,4.746850,4.615887,0.056584,...,0,0,0,1,1,1,1,1,1,4.655638
8471,1378,Bleomycin (50 uM),SNUC5_LARGE_INTESTINE,1674021,3.855990,0.000000,6.507953,2.545968,3.621759,0.070389,...,0,0,0,1,1,1,1,1,1,3.691268
8472,1378,Bleomycin (50 uM),BT549_BREAST,905951,3.390943,0.000000,7.142924,2.169925,4.195348,0.014355,...,0,0,0,1,1,1,1,1,1,5.583365


-----------

In [58]:
# Remove the PubCHEM column from the frame in place.
to_drop = ['PubCHEM']
ccle_t_breast_colon_gdsc2_iso.drop(columns=to_drop, inplace=True)

ccle_t_breast_colon_gdsc2_iso

Unnamed: 0,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),...,247,248,249,250,251,252,253,254,255,LN_IC50
0,1003,Camptothecin,SNU283_LARGE_INTESTINE,1659929,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,...,0,1,0,1,0,0,0,0,0,-0.914349
1,1003,Camptothecin,ZR7530_BREAST,909907,3.472488,0.000000,5.959306,3.878725,3.646163,0.000000,...,0,1,0,1,0,0,0,0,0,2.824561
2,1003,Camptothecin,SW837_LARGE_INTESTINE,909755,4.730640,0.333424,7.068778,2.946731,4.745775,0.189034,...,0,1,0,1,0,0,0,0,0,-0.932695
3,1003,Camptothecin,EFM192A_BREAST,1290798,3.444932,0.056584,7.722193,2.978196,3.670161,0.014355,...,0,1,0,1,0,0,0,0,0,1.555769
4,1003,Camptothecin,SNU1040_LARGE_INTESTINE,1659823,4.489286,0.475085,6.759955,2.451541,3.305971,1.464668,...,0,1,0,1,0,0,0,0,0,0.764638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,1378,Bleomycin (50 uM),SW626_OVARY,909753,5.181898,0.000000,7.835103,1.722466,3.699330,0.014355,...,0,0,0,1,1,1,1,1,1,4.728102
8470,1378,Bleomycin (50 uM),COLO678_LARGE_INTESTINE,910689,4.548437,0.000000,6.777420,4.746850,4.615887,0.056584,...,0,0,0,1,1,1,1,1,1,4.655638
8471,1378,Bleomycin (50 uM),SNUC5_LARGE_INTESTINE,1674021,3.855990,0.000000,6.507953,2.545968,3.621759,0.070389,...,0,0,0,1,1,1,1,1,1,3.691268
8472,1378,Bleomycin (50 uM),BT549_BREAST,905951,3.390943,0.000000,7.142924,2.169925,4.195348,0.014355,...,0,0,0,1,1,1,1,1,1,5.583365


# Output into CSV

In [59]:
# Write the final table to a CSV in the current working directory.
# index=True is the pandas default, spelled out here: the row index is
# written as the first (unnamed) column of the file.
ccle_t_breast_colon_gdsc2_iso.to_csv(
    path_or_buf="CC_BC_trans_gdsc2_isoSMILES.csv",
    index=True,
)