# Transcriptomics + GDSC1 + isoSMILES

In [1]:
import pandas as pd
import numpy as np

# Raw data
## CCLE Transcriptomics - gene expression

In [2]:
# Load the CCLE gene-expression matrix. The first CSV column has no header
# (pandas names it "Unnamed: 0"); it holds the cell-line ID, so label it Broad_ID.
ccle_t = pd.read_csv("../Raw_files/CCLE_Transcriptomics.csv").rename(
    columns={"Unnamed: 0": "Broad_ID"}
)

In [3]:
# Preview the expression matrix: one row per cell line (Broad_ID), one column per gene
ccle_t
# 1406 rows × 19222 columns

Unnamed: 0,Broad_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
0,ACH-001113,4.331992,0.000000,7.364397,2.792855,4.470537,0.028569,1.226509,3.042644,6.499686,...,2.689299,0.189034,0.201634,2.130931,0.555816,0.000000,0.275007,0.0,0.000000,0.000000
1,ACH-001289,4.566815,0.584963,7.106537,2.543496,3.504620,0.000000,0.189034,3.813525,4.221104,...,1.286881,1.049631,0.321928,1.464668,0.632268,0.000000,0.014355,0.0,0.000000,0.000000
2,ACH-001339,3.150560,0.000000,7.379032,2.333424,4.227279,0.056584,1.310340,6.687061,3.682573,...,0.594549,1.097611,0.831877,2.946731,0.475085,0.000000,0.084064,0.0,0.000000,0.042644
3,ACH-001538,5.085340,0.000000,7.154109,2.545968,3.084064,0.000000,5.868143,6.165309,4.489928,...,0.214125,0.632268,0.298658,1.641546,0.443607,0.000000,0.028569,0.0,0.000000,0.000000
4,ACH-000242,6.729145,0.000000,6.537607,2.456806,3.867896,0.799087,7.208381,5.569856,7.127014,...,1.117695,2.358959,0.084064,1.910733,0.000000,0.000000,0.464668,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,ACH-000285,0.056584,0.000000,6.604071,3.264536,4.972693,0.411426,0.097611,0.704872,4.829850,...,2.229588,0.084064,1.310340,3.039138,0.344828,0.000000,0.000000,0.0,0.475085,0.042644
1402,ACH-002669,3.109361,0.000000,7.031219,1.541019,3.664483,0.014355,3.624101,6.805292,4.472488,...,0.189034,0.400538,0.356144,1.327687,0.000000,0.000000,0.014355,0.0,0.000000,0.000000
1403,ACH-001858,4.390943,0.000000,7.013127,1.887525,3.252476,0.028569,3.286881,6.902074,5.410748,...,1.097611,0.400538,0.613532,1.992768,0.704872,0.000000,1.464668,0.0,0.000000,0.526069
1404,ACH-001997,5.057017,0.000000,7.814935,2.538538,3.893362,0.028569,4.078951,6.971429,4.469886,...,0.831877,0.847997,1.292782,2.153805,0.687061,0.000000,0.000000,0.0,0.000000,0.000000


## Cell Lines Map
This dataset provides the Broad_ID (DepMap_ID), the cancer cell line name (CCLE_Name), and the COSMIC_ID, a unique identifier used to track drug response data (it is used later to join with the GDSC1 dataset).

In [4]:
# Columns of the cell-line map that this notebook does not use
unused_columns = ['Aliases', 'Sanger ID', 'Subtype Disease', 'Gender', 'Source']

# Load the DepMap cell-line map and keep only the identifier / disease columns
ccle_info = pd.read_csv("../Raw_files/DepMap-2018q3-celllines.csv").drop(columns=unused_columns)

ccle_info
# 1673 rows × 4 columns

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,Primary Disease
0,ACH-000557,AML193_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,Leukemia
1,ACH-001000,1321N1_CENTRAL_NERVOUS_SYSTEM,,Brain Cancer
2,ACH-000198,EOL1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,906856.0,Leukemia
3,ACH-000956,22RV1_PROSTATE,924100.0,Prostate Cancer
4,ACH-000948,2313287_STOMACH,910924.0,Gastric Cancer
...,...,...,...,...
1668,ACH-002393,CROAP3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,Lymphoma
1669,ACH-002394,GEO_LARGE_INTESTINE,,Colon Cancer
1670,ACH-002395,HUH6CLONE5_LIVER,,Liver Cancer
1671,ACH-002396,SARC9371_BONE,,Bone Cancer


## Perform the mapping

In [5]:
# How many transcriptomics rows have a Broad_ID (ACH-… ID) that also appears
# in the cell-line mapping file?
known_ids = ccle_info['Broad_ID']
common_values = ccle_t['Broad_ID'].isin(known_ids)  # boolean mask, one entry per transcriptomics row
match_count = common_values.sum()                   # True counts as 1
print(f"Number of matches: {match_count}")

# 1227 / 1406 rows can be matched (87%)

Number of matches: 1227


In [6]:
# Inner-join the transcriptomics frame with the cell-line map on Broad_ID,
# so only cell lines present in both files are kept.
ccle_t_name = pd.merge(ccle_t, ccle_info, on=['Broad_ID'])


# Drop only the rows that have no COSMIC_ID: that key is required for the later
# join with GDSC1. Restricting dropna to this column avoids silently discarding
# a cell line because of a missing value in any of the other ~19k columns.
ccle_t_name = ccle_t_name.dropna(subset=['COSMIC_ID'])

# Final merged frame ~ multi-omics data
ccle_t_name

# Recorded run (when rows with a NaN in *any* column were dropped):
# 697 rows × 19225 columns — re-check the row count after re-running.

Unnamed: 0,Broad_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),CCLE_Name,COSMIC_ID,Primary Disease
3,ACH-000242,6.729145,0.000000,6.537607,2.456806,3.867896,0.799087,7.208381,5.569856,7.127014,...,1.910733,0.000000,0.000000,0.464668,0.000000,0.000000,0.000000,RT4_URINARY_TRACT,687455.0,Bladder Cancer
4,ACH-000708,4.272023,0.189034,7.022923,2.555816,3.841973,0.000000,0.097611,4.888013,4.926474,...,1.891419,0.201634,0.000000,0.000000,0.000000,0.000000,0.000000,SNU283_LARGE_INTESTINE,1659929.0,Colon Cancer
5,ACH-000327,3.337711,0.000000,5.927185,1.944858,2.678072,0.014355,3.089159,6.011227,3.642702,...,1.327687,0.000000,0.124328,0.176323,0.000000,0.084064,0.238787,NCIH1395_LUNG,684681.0,Lung NSCLC
6,ACH-000233,0.056584,0.000000,6.093602,3.970854,3.731183,0.028569,6.092969,3.033863,3.422233,...,3.157044,0.226509,0.000000,0.000000,0.056584,0.000000,0.000000,DEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,906836.0,Lymphoma
9,ACH-000528,4.512227,0.000000,7.099821,2.843984,4.672425,0.014355,0.815575,6.709015,3.982765,...,2.592158,0.000000,0.028569,0.042644,0.000000,0.028569,0.000000,ABC1_LUNG,906791.0,Lung NSCLC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,ACH-000114,3.263034,0.028569,6.525912,2.784504,2.629939,0.565597,4.770829,5.781097,5.506208,...,1.782409,0.356144,0.000000,0.790772,0.000000,0.000000,0.014355,SU8686_PANCREAS,1240218.0,Pancreatic Cancer
1220,ACH-001578,6.344828,3.400538,7.211207,2.533563,4.373648,0.042644,0.014355,5.785027,3.058316,...,1.970854,0.201634,0.000000,0.028569,0.000000,0.000000,0.201634,NCCIT_TESTES,908441.0,Embryonal Cancer
1222,ACH-000973,4.328406,0.000000,7.058749,1.891419,3.529821,0.000000,3.878725,6.432792,4.698774,...,2.403268,0.150560,0.042644,0.014355,0.000000,0.042644,0.084064,639V_URINARY_TRACT,906798.0,Bladder Cancer
1224,ACH-000750,3.533563,0.000000,6.488322,1.823749,3.308885,0.014355,0.137504,5.020591,4.536053,...,3.008989,0.238787,0.000000,0.124328,0.000000,0.000000,0.201634,LOXIMVI_SKIN,905974.0,Skin Cancer


In [7]:
# Move CCLE_Name, COSMIC_ID (needed later for the GDSC join) and Primary Disease
# to the front, directly after Broad_ID (positions 1, 2 and 3).
for position, column in enumerate(['CCLE_Name', 'COSMIC_ID', 'Primary Disease'], start=1):
    # pop() removes the column and returns it; insert() puts it back at `position`
    ccle_t_name.insert(position, column, ccle_t_name.pop(column))

In [8]:
# Give the disease column a shorter, code-friendly name
ccle_t_name = ccle_t_name.rename(columns={"Primary Disease": "cancer_type"})

# COSMIC_ID is displayed as a float (e.g. 687455.0) after the merge;
# cast it to int64 so it can be matched against the integer IDs in GDSC
ccle_t_name['COSMIC_ID'] = ccle_t_name['COSMIC_ID'].astype(np.int64)

In [9]:
# Keep only the breast-cancer cell lines
is_breast_cancer = ccle_t_name['cancer_type'].eq('Breast Cancer')
ccle_t_breast_cancer = ccle_t_name.loc[is_breast_cancer]

## Final Dataframe: Multi-omics data of Transcriptomics - gene expression

In [10]:
# Renumber the rows 0..n-1 after filtering (the old index is discarded)
ccle_t_breast_cancer = ccle_t_breast_cancer.reset_index(drop=True)

ccle_t_breast_cancer

# 45 rows × 19225 columns

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),...,H3C2 (8358),H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038)
0,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,3.646163,0.0,...,0.918386,0.565597,0.422233,1.257011,0.0,0.0,0.0,0.0,0.0,0.0
1,ACH-000117,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,3.670161,0.014355,...,1.257011,1.464668,0.263034,1.726831,0.0,0.0,0.014355,0.028569,0.0,0.084064
2,ACH-000554,UACC893_BREAST,909778,Breast Cancer,3.98823,0.0,6.234195,4.189825,4.377818,0.070389,...,1.367371,0.344828,0.505891,1.427606,0.536053,0.0,0.0,0.0,0.0,0.0
3,ACH-000276,HCC38_BREAST,749717,Breast Cancer,3.934517,0.0,6.519479,2.57289,4.303781,0.084064,...,3.017922,2.260026,1.182692,2.286881,0.970854,0.056584,2.776104,0.214125,0.056584,0.333424
4,ACH-000818,BT483_BREAST,949093,Breast Cancer,3.528571,0.0,6.583308,3.72465,3.235727,0.070389,...,0.454176,0.097611,0.475085,1.454176,0.056584,0.028569,0.056584,0.028569,0.028569,0.0
5,ACH-000856,CAL51_BREAST,910927,Breast Cancer,5.705425,0.0,6.413289,3.298658,4.052242,0.124328,...,1.608809,0.0,0.485427,2.646163,0.201634,0.0,0.056584,0.0,0.124328,0.0
6,ACH-000223,HCC1937_BREAST,749714,Breast Cancer,4.903038,0.0,7.177719,2.744161,4.648465,0.15056,...,2.440952,1.695994,0.367371,1.505891,0.214125,0.137504,0.704872,0.0,0.0,0.0
7,ACH-000019,MCF7_BREAST,905946,Breast Cancer,2.403268,0.0,7.490249,2.606442,3.177918,0.014355,...,3.419539,1.731183,0.250962,2.003602,0.356144,0.0,0.0,0.0,0.0,0.0
8,ACH-000330,EFM19_BREAST,906851,Breast Cancer,3.485427,0.028569,6.984134,2.560715,3.806324,0.485427,...,0.659925,1.316146,0.321928,1.722466,0.0,0.0,0.0,0.0,0.0,0.0
9,ACH-000725,HCC202_BREAST,1290906,Breast Cancer,3.049631,0.0,6.811214,3.667892,4.337711,0.056584,...,3.364572,1.510962,0.594549,1.847997,0.042644,0.097611,0.014355,0.0,0.097611,0.0


In [11]:
# Count the distinct Broad_IDs; a result equal to the row count (45)
# means no cell line appears more than once
ccle_t_breast_cancer['Broad_ID'].nunique()

45

----

# GDSC1_IC50 Dataset  <span style="color:red">JOIN </span> 
## Drug response
The Genomics of Drug Sensitivity in Cancer (GDSC) dataset provides IC50 values for various drugs across multiple cancer cell lines. IC50 (half-maximal inhibitory concentration) is a measure of how much of a drug is needed to inhibit cell growth by 50%. This data helps researchers relate the molecular characteristics of a cell line to its sensitivity to a drug.

### GDSC 1 and GDSC 2
GDSC 1 is the first release of the GDSC dataset; it covers more drugs and more cell lines than GDSC 2.

### COSMIC_ID is the unique identifier assigned to each cancer cell line in GDSC dataset

In [12]:
# GDSC dataset ~ drug response.
# The DATASET column is dropped straight away: this file holds GDSC1 data only.
gdsc1 = pd.read_csv("../Raw_files/GDSC1_IC50.csv").drop(columns=['DATASET'])

In [13]:
gdsc1
# Identifier columns of interest: COSMIC_ID (cell line), CELL_LINE_NAME, DRUG_ID / DRUG_NAME

# Response metrics: LN_IC50, AUC, RMSE, Z_SCORE

Unnamed: 0,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.966813,0.985678,0.026081,1.299144
1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.692090,0.972690,0.110059,0.156076
2,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.477990,0.944459,0.087019,-0.035912
3,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.033564,0.950758,0.016290,-0.434437
4,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.966007,0.954778,0.180255,0.401702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333156,342,15911377,1659823,SNU-1040,SIDM00217,COREAD,1531,I-CBP112,"EP300, CBP",Chromatin histone acetylation,1005,Y,0.039063,10.0,5.085294,0.972251,0.040661,0.860626
333157,342,15912122,1660035,SNU-61,SIDM00194,COREAD,1531,I-CBP112,"EP300, CBP",Chromatin histone acetylation,1005,Y,0.039063,10.0,5.725399,0.976109,0.045453,1.785602
333158,342,15912431,1660036,SNU-81,SIDM00193,COREAD,1531,I-CBP112,"EP300, CBP",Chromatin histone acetylation,1005,Y,0.039063,10.0,4.930753,0.970851,0.038612,0.637308
333159,342,15912739,1674021,SNU-C5,SIDM00498,COREAD,1531,I-CBP112,"EP300, CBP",Chromatin histone acetylation,1005,Y,0.039063,10.0,4.551784,0.972330,0.042649,0.089683


In [14]:
# Keep only the columns needed downstream (AUC, RMSE and Z_SCORE could be explored later)

gdsc1 = gdsc1.loc[:, ['DRUG_ID', 'COSMIC_ID', 'DRUG_NAME', 'LN_IC50']]

# LN_IC50 is going to be the target feature

In [15]:
# Preview the trimmed drug-response frame (DRUG_ID, COSMIC_ID, DRUG_NAME, LN_IC50)
gdsc1

Unnamed: 0,DRUG_ID,COSMIC_ID,DRUG_NAME,LN_IC50
0,1,684057,Erlotinib,3.966813
1,1,684059,Erlotinib,2.692090
2,1,684062,Erlotinib,2.477990
3,1,684072,Erlotinib,2.033564
4,1,687448,Erlotinib,2.966007
...,...,...,...,...
333156,1531,1659823,I-CBP112,5.085294
333157,1531,1660035,I-CBP112,5.725399
333158,1531,1660036,I-CBP112,4.930753
333159,1531,1674021,I-CBP112,4.551784


### Dataset explanation
Each cancer cell line has responses for multiple drugs, and a single drug is tested across multiple cell lines.

There are 402 drug IDs, and each drug was tested against multiple cancer cell lines. E.g., DRUG_ID 1 (Erlotinib) was tested against 393 cancer cell lines.

In [16]:
# Number of distinct DRUG_IDs screened in GDSC1
gdsc1['DRUG_ID'].nunique()

402

In [17]:
# Number of distinct drug names; the recorded value (378) is lower than the
# number of DRUG_IDs (402), i.e. some names are attached to more than one ID
gdsc1['DRUG_NAME'].nunique()

378

In [18]:
# Number of distinct cancer cell lines (COSMIC_IDs) that GDSC1 was tested against
gdsc1['COSMIC_ID'].nunique()

970

In [19]:
# Number of distinct COSMIC_IDs (cancer cell lines tested) for each DRUG_ID
gdsc1.groupby('DRUG_ID')['COSMIC_ID'].nunique()

DRUG_ID
1       393
3       357
5       396
6       405
9       398
       ... 
1526    861
1527    868
1529    692
1530    834
1531    830
Name: COSMIC_ID, Length: 402, dtype: int64

In [20]:
# Some drug names are associated with more than one DRUG_ID.
# NOTE(review): presumably the same compound screened more than once under
# separate IDs — verify against the GDSC documentation.
ids_per_name = gdsc1.groupby('DRUG_NAME')['DRUG_ID'].nunique()
ids_per_name[ids_per_name > 1]

DRUG_NAME
AKT inhibitor VIII    2
AZD4547               2
AZD6482               2
AZD7762               2
Afatinib              2
Avagacestat           2
BMS-536924            2
Bicalutamide          2
CHIR-99021            2
Cisplatin             2
Doxorubicin           2
GSK269962A            2
Gemcitabine           2
JQ1                   2
NG-25                 2
Olaparib              2
PLX-4720              2
Pictilisib            2
QL-XII-47             2
Refametinib           2
SB505124              2
SN-38                 2
Selumetinib           2
UNC0638               2
Name: DRUG_ID, dtype: int64

----------

# Merging transcriptomics breast cancer dataset with GDSC 1

In [21]:
# Inner-join the multi-omics frame with the GDSC drug-response frame on COSMIC_ID
# to obtain a multi-omics + drug sensitivity dataset

ccle_t_breast_cancer_gdsc1 = ccle_t_breast_cancer.merge(gdsc1, on='COSMIC_ID')

# Each row is one (breast cancer cell line, drug) pair: the cell line's gene
# expression values plus that drug's response (LN_IC50)

In [22]:
# The merge appended 3 columns from GDSC1 at the right: DRUG_ID, DRUG_NAME, LN_IC50
ccle_t_breast_cancer_gdsc1

Unnamed: 0,Broad_ID,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),...,DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),DRUG_ID,DRUG_NAME,LN_IC50
0,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,133,Doxorubicin,-3.317780
1,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,134,Etoposide,0.750978
2,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,135,Gemcitabine,-2.043029
3,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,136,Mitomycin-C,-0.579304
4,ACH-000828,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,140,Vinorelbine,-4.121744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15215,ACH-000934,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.0,6.736740,2.885574,3.766595,0.028569,...,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,1526,Refametinib,2.736710
15216,ACH-000934,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.0,6.736740,2.885574,3.766595,0.028569,...,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,1527,Pictilisib,-0.580975
15217,ACH-000934,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.0,6.736740,2.885574,3.766595,0.028569,...,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,1529,Pevonedistat,-0.281137
15218,ACH-000934,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.0,6.736740,2.885574,3.766595,0.028569,...,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,1530,PFI-3,4.731167


In [23]:
# Number of distinct drugs (DRUG_IDs) with a response for each breast cancer cell line
ccle_t_breast_cancer_gdsc1.groupby('Broad_ID')['DRUG_ID'].nunique()

Broad_ID
ACH-000019    345
ACH-000111    384
ACH-000117    352
ACH-000147    345
ACH-000148    352
ACH-000196    261
ACH-000212    347
ACH-000223    354
ACH-000248    324
ACH-000258    383
ACH-000276    354
ACH-000277    352
ACH-000288    346
ACH-000330    344
ACH-000349    358
ACH-000352    353
ACH-000374    168
ACH-000536    351
ACH-000554    341
ACH-000568    200
ACH-000573    342
ACH-000621    347
ACH-000624    345
ACH-000643    346
ACH-000668    346
ACH-000691    397
ACH-000699    350
ACH-000711    300
ACH-000725    312
ACH-000755    394
ACH-000759    352
ACH-000768    346
ACH-000783    347
ACH-000818    353
ACH-000828    340
ACH-000849    350
ACH-000856    345
ACH-000857    346
ACH-000859    352
ACH-000876    336
ACH-000902    217
ACH-000910    346
ACH-000927    398
ACH-000930    354
ACH-000934    345
Name: DRUG_ID, dtype: int64

In [24]:
# Move DRUG_ID and DRUG_NAME to the front, directly after Broad_ID (positions 1 and 2)
for position, column in enumerate(['DRUG_ID', 'DRUG_NAME'], start=1):
    # pop() removes the column and returns it; insert() puts it back at `position`
    ccle_t_breast_cancer_gdsc1.insert(position, column, ccle_t_breast_cancer_gdsc1.pop(column))

In [25]:
# Transcriptomics + drug-response dataset after reordering the drug columns
ccle_t_breast_cancer_gdsc1

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,H3C3 (8352),AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),LN_IC50
0,ACH-000828,133,Doxorubicin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,...,0.565597,0.422233,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,-3.317780
1,ACH-000828,134,Etoposide,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,...,0.565597,0.422233,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.750978
2,ACH-000828,135,Gemcitabine,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,...,0.565597,0.422233,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,-2.043029
3,ACH-000828,136,Mitomycin-C,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,...,0.565597,0.422233,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,-0.579304
4,ACH-000828,140,Vinorelbine,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.0,5.959306,3.878725,...,0.565597,0.422233,1.257011,0.000000,0.0,0.000000,0.000000,0.0,0.0,-4.121744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15215,ACH-000934,1526,Refametinib,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.0,6.736740,2.885574,...,0.815575,0.389567,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,2.736710
15216,ACH-000934,1527,Pictilisib,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.0,6.736740,2.885574,...,0.815575,0.389567,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,-0.580975
15217,ACH-000934,1529,Pevonedistat,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.0,6.736740,2.885574,...,0.815575,0.389567,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,-0.281137
15218,ACH-000934,1530,PFI-3,MDAMB361_BREAST,908121,Breast Cancer,1.855990,0.0,6.736740,2.885574,...,0.815575,0.389567,1.967169,0.565597,0.0,0.263034,0.263034,0.0,0.0,4.731167


----

# Drug info dataset (Join DRUG_ID with PubCHEM)

### isoSMILES

Retrieve only GDSC1

In [26]:
# Drug annotation table: maps each GDSC drug ID to its PubCHEM compound ID
drug_info = pd.read_csv("../Raw_files/Drug_info.csv")
drug_info.head()

Unnamed: 0,Drug Id,Name,Synonyms,Targets,Target pathway,PubCHEM,Datasets,number of cell lines,Screening site
0,1242,(5Z)-7-Oxozeaenol,"5Z-7-Oxozeaenol, LL-Z1640-2",TAK1,"Other, kinases",9863776.0,GDSC1,899,SANGER
1,1824,123138,,,Unclassified,,GDSC2,717,SANGER
2,1820,123829,,,Unclassified,,GDSC2,717,SANGER
3,1836,150412,,,Unclassified,,GDSC2,717,SANGER
4,179,5-Fluorouracil,5-FU,Antimetabolite (DNA & RNA),Other,3385.0,GDSC1,907,MGH


In [27]:
def _pubchem_as_text(value):
    """Return a PubCHEM cell as text, writing integer-valued floats without '.0'.

    pandas parses a numeric column that contains gaps as float, in which case
    9863776 is held as 9863776.0 and would fail a plain `str(x).isdigit()` test.
    Strings and every other value are passed through `str()` unchanged.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# rename the columns (the raw PubCHEM header carries a leading space)
drug_info = drug_info.rename(columns={"Drug Id": "DRUG_ID", " PubCHEM": "PubCHEM"})

# Filter only GDSC 1 and trim the dataset to the columns necessary for the JOIN
drug_info = drug_info.loc[drug_info[" Datasets"] == 'GDSC1', ['DRUG_ID', 'PubCHEM']]

# retrieve only numeric PubCHEM values; anything that is not a single
# all-digit ID (missing values, free text, lists of IDs) is dropped.
# The column is stored as text so it can be merged with a string PubCHEM key.
pubchem_text = drug_info['PubCHEM'].map(_pubchem_as_text).astype(str)
has_single_cid = pubchem_text.str.isdigit().astype(bool)
drug_info = drug_info.assign(PubCHEM=pubchem_text)[has_single_cid]

drug_info.head()

Unnamed: 0,DRUG_ID,PubCHEM
0,1242,9863776
4,179,3385
20,86,10172943
21,55,9549184
29,1001,65110


## Merge with the transcript_gdsc1

In [28]:
# Inner-join on DRUG_ID to attach the PubCHEM ID of each drug
ccle_t_breast_cancer_gdsc1 = ccle_t_breast_cancer_gdsc1.merge(drug_info, on='DRUG_ID')


ccle_t_breast_cancer_gdsc1

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,AC098582.1 (8916),DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),LN_IC50,PubCHEM
0,ACH-000828,133,Doxorubicin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,...,0.422233,1.257011,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-3.317780,31703
1,ACH-000117,133,Doxorubicin,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,...,0.263034,1.726831,0.000000,0.000000,0.014355,0.028569,0.000000,0.084064,-3.354291,31703
2,ACH-000554,133,Doxorubicin,UACC893_BREAST,909778,Breast Cancer,3.988230,0.000000,6.234195,4.189825,...,0.505891,1.427606,0.536053,0.000000,0.000000,0.000000,0.000000,0.000000,-0.466619,31703
3,ACH-000276,133,Doxorubicin,HCC38_BREAST,749717,Breast Cancer,3.934517,0.000000,6.519479,2.572890,...,1.182692,2.286881,0.970854,0.056584,2.776104,0.214125,0.056584,0.333424,-3.105127,31703
4,ACH-000818,133,Doxorubicin,BT483_BREAST,949093,Breast Cancer,3.528571,0.000000,6.583308,3.724650,...,0.475085,1.454176,0.056584,0.028569,0.056584,0.028569,0.028569,0.000000,1.818639,31703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8702,ACH-000111,127,GSK269962A,HCC1187_BREAST,749711,Breast Cancer,5.241840,0.201634,5.615299,3.090853,...,0.422233,1.250962,0.505891,0.111031,0.070389,0.000000,0.111031,0.000000,1.913253,16095342
8703,ACH-000755,127,GSK269962A,HCC2218_BREAST,749716,Breast Cancer,3.587365,0.000000,6.563463,2.636915,...,0.150560,0.773996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.910669,16095342
8704,ACH-000902,127,GSK269962A,CAL148_BREAST,924106,Breast Cancer,4.041769,0.000000,5.964168,3.155425,...,0.613532,2.272023,0.214125,0.000000,0.000000,0.028569,0.000000,0.000000,2.184206,16095342
8705,ACH-000927,127,GSK269962A,BT474_BREAST,946359,Breast Cancer,2.238787,0.000000,7.446174,4.002703,...,0.321928,1.790772,0.669027,0.000000,0.028569,0.070389,0.000000,0.000000,3.668434,16095342


In [29]:
# Count the Python type of every PubCHEM value; in the recorded run all of them are str
ccle_t_breast_cancer_gdsc1["PubCHEM"].apply(type).value_counts()

<class 'str'>    8707
Name: PubCHEM, dtype: int64

----

# PubCHEM data

Each PubCHEM ID represents a specific drug compound that was used -> **used to join with Drug information dataset**

### isoSMILES

Isomeric SMILES strings describe the molecular structure of each drug; that structure in turn influences how the drug interacts with its targets.

Combining with multi-omics helps to better understand drug response mechanisms.

E.g., if two drugs have similar isoSMILES representations but one is more effective in breast cancer than the other, the structural differences may explain why.

In [30]:
# PubChem compound list; the columns used below are `cid` and `isosmiles`
drug_data = pd.read_csv("../Raw_files/PubChem_compound_list.csv")

In [31]:
# Rename cid to PubCHEM, keep only the two columns needed for the merge, and
# convert PubCHEM to str so it has the same type as the PubCHEM key on the
# drug-response side.
# Built as one chain ending in .assign(): writing the converted column into a
# column-subset slice (`df = df[[...]]; df[col] = ...`) is the pattern that
# raises SettingWithCopyWarning.
drug_data = (
    drug_data
    .rename(columns={"cid": "PubCHEM"})
    .loc[:, ['PubCHEM', 'isosmiles']]
    .assign(PubCHEM=lambda frame: frame['PubCHEM'].astype(str))
)

drug_data.head()

Unnamed: 0,PubCHEM,isosmiles
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,6,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl
2,11,C(CCl)Cl
3,34,C(CCl)O
4,38,CC(C)(CO)C(=O)C(=O)O


--------

# Merge with transcript_gdsc1 data

In [32]:
# Inner-join on PubCHEM to attach the isosmiles string of each drug, then
# renumber the rows 0..n-1
ccle_t_breast_cancer_gdsc1_iso = (
    ccle_t_breast_cancer_gdsc1
    .merge(drug_data, on='PubCHEM')
    .reset_index(drop=True)
)


ccle_t_breast_cancer_gdsc1_iso

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,DUS4L-BCAP29 (115253422),C8orf44-SGK3 (100533105),ELOA3B (728929),NPBWR1 (2831),ELOA3D (100506888),ELOA3 (162699),CDR1 (1038),LN_IC50,PubCHEM,isosmiles
0,ACH-000828,133,Doxorubicin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,...,1.257011,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-3.317780,31703,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...
1,ACH-000117,133,Doxorubicin,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,...,1.726831,0.000000,0.000000,0.014355,0.028569,0.000000,0.084064,-3.354291,31703,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...
2,ACH-000554,133,Doxorubicin,UACC893_BREAST,909778,Breast Cancer,3.988230,0.000000,6.234195,4.189825,...,1.427606,0.536053,0.000000,0.000000,0.000000,0.000000,0.000000,-0.466619,31703,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...
3,ACH-000276,133,Doxorubicin,HCC38_BREAST,749717,Breast Cancer,3.934517,0.000000,6.519479,2.572890,...,2.286881,0.970854,0.056584,2.776104,0.214125,0.056584,0.333424,-3.105127,31703,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...
4,ACH-000818,133,Doxorubicin,BT483_BREAST,949093,Breast Cancer,3.528571,0.000000,6.583308,3.724650,...,1.454176,0.056584,0.028569,0.056584,0.028569,0.028569,0.000000,1.818639,31703,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,ACH-000568,119,Lapatinib,UACC812_BREAST,910910,Breast Cancer,2.482848,0.000000,7.835545,2.266037,...,1.321928,0.000000,0.000000,0.097611,0.000000,0.000000,0.000000,1.574658,208908,CS(=O)(=O)CCNCC1=CC=C(O1)C2=CC3=C(C=C2)N=CN=C3...
4451,ACH-000111,119,Lapatinib,HCC1187_BREAST,749711,Breast Cancer,5.241840,0.201634,5.615299,3.090853,...,1.250962,0.505891,0.111031,0.070389,0.000000,0.111031,0.000000,2.957433,208908,CS(=O)(=O)CCNCC1=CC=C(O1)C2=CC3=C(C=C2)N=CN=C3...
4452,ACH-000755,119,Lapatinib,HCC2218_BREAST,749716,Breast Cancer,3.587365,0.000000,6.563463,2.636915,...,0.773996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-2.692161,208908,CS(=O)(=O)CCNCC1=CC=C(O1)C2=CC3=C(C=C2)N=CN=C3...
4453,ACH-000902,119,Lapatinib,CAL148_BREAST,924106,Breast Cancer,4.041769,0.000000,5.964168,3.155425,...,2.272023,0.214125,0.000000,0.000000,0.028569,0.000000,0.000000,2.965317,208908,CS(=O)(=O)CCNCC1=CC=C(O1)C2=CC3=C(C=C2)N=CN=C3...


In [33]:
# Number of rows (cell-line responses) per DRUG_ID that remain after the isosmiles merge
ccle_t_breast_cancer_gdsc1_iso.groupby('DRUG_ID')['COSMIC_ID'].count()

DRUG_ID
1        8
5        7
11       7
30       9
32       7
        ..
1498    41
1502    40
1526    39
1527    40
1529    33
Name: COSMIC_ID, Length: 118, dtype: int64

In [34]:
# Sanity check: the per-drug count of PubCHEM values should equal the
# per-drug count of COSMIC_ID values computed in the previous cell
ccle_t_breast_cancer_gdsc1_iso.groupby('DRUG_ID')['PubCHEM'].count()

# NOTE(review): count() counts non-null rows per DRUG_ID, not distinct PubCHEM
# variants; use nunique() if the number of variants per drug is what is wanted.

DRUG_ID
1        8
5        7
11       7
30       9
32       7
        ..
1498    41
1502    40
1526    39
1527    40
1529    33
Name: PubCHEM, Length: 118, dtype: int64

# Converting isoSMILES to bit vectors (Morgan fingerprints)

In [35]:
# Report the numpy version of the running kernel (numpy is already imported
# as `np` in the first cell, so no second import is needed)
print(np.__version__)

1.24.3


In [40]:
# NOTE(review): stale, disabled cell. Its recorded output shows numpy 1.21.0
# replacing 1.20.1, which disagrees with the 1.24.3 printed by the version
# check, and its execution count is out of order. If a pin is really required,
# run `%pip install numpy==1.21` in a dedicated cell at the top of the notebook
# and restart the kernel; otherwise this cell can be removed.
#pip install numpy==1.21

Collecting numpy==1.21
  Downloading numpy-1.21.0-cp38-cp38-win_amd64.whl (14.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.20.1
    Uninstalling numpy-1.20.1:
      Successfully uninstalled numpy-1.20.1
Successfully installed numpy-1.21.0
Note: you may need to restart the kernel to use updated packages.


In [38]:
from rdkit import Chem
from rdkit.DataStructs import ConvertToNumpyArray
from rdkit.Chem import AllChem

In [39]:
# Morgan fingerprint settings, named once so that the generator and the
# all-zero fallback vector cannot drift apart.
MORGAN_RADIUS = 2
MORGAN_FP_SIZE = 256

# Initialize Morgan fingerprint generator (radius=2, fpSize=256)
morgan_generator = AllChem.GetMorganGenerator(radius=MORGAN_RADIUS, fpSize=MORGAN_FP_SIZE)


def smiles_to_morgan_bits(smiles):
    """Return the Morgan bit vector of `smiles` as a 1-D int64 array.

    A value that is not a string, or a SMILES that RDKit cannot parse, is
    reported and mapped to an all-zero vector, so the output keeps exactly
    one fingerprint per input row.
    """
    fp_array = np.zeros((MORGAN_FP_SIZE,), dtype=np.int64)
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        print(f"Invalid SMILES: {smiles}")
        return fp_array
    # Copy the bits of the RDKit fingerprint into the numpy array
    ConvertToNumpyArray(morgan_generator.GetFingerprint(mol), fp_array)
    return fp_array


# Many rows share the same drug, so each distinct SMILES is fingerprinted once
# and the result is reused for every row that carries it.
fingerprint_by_smiles = {}
arr = []
for smiles in ccle_t_breast_cancer_gdsc1_iso['isosmiles']:
    if smiles not in fingerprint_by_smiles:
        fingerprint_by_smiles[smiles] = smiles_to_morgan_bits(smiles)
    arr.append(fingerprint_by_smiles[smiles])

# Convert list of arrays into a single numpy array (one row per input row)
fingerprints = np.vstack(arr)

In [40]:
# One row per row of the drug-response frame, one column (0..255) per fingerprint bit,
# built from the stacked fingerprint array
morgan_data = pd.DataFrame(fingerprints)

# Holds the bit value of the isosmiles for each corresponding row
morgan_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4451,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


## Merge morgan data with multi-omics drug response dataframe

In [42]:
# Row i of morgan_data is the fingerprint of row i of the drug-response frame,
# so a plain index-aligned join is enough (no key column needed).
# DataFrame.join aligns on the index: check that both indexes are identical
# first, otherwise fingerprints would silently be attached to the wrong rows
# or be filled with NaN.
assert morgan_data.index.equals(ccle_t_breast_cancer_gdsc1_iso.index), \
    "morgan_data and ccle_t_breast_cancer_gdsc1_iso must share the same index"
ccle_t_breast_cancer_gdsc1_iso = ccle_t_breast_cancer_gdsc1_iso.join(morgan_data)

ccle_t_breast_cancer_gdsc1_iso

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,246,247,248,249,250,251,252,253,254,255
0,ACH-000828,133,Doxorubicin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,...,0,0,0,1,0,1,0,1,0,0
1,ACH-000117,133,Doxorubicin,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,...,0,0,0,1,0,1,0,1,0,0
2,ACH-000554,133,Doxorubicin,UACC893_BREAST,909778,Breast Cancer,3.988230,0.000000,6.234195,4.189825,...,0,0,0,1,0,1,0,1,0,0
3,ACH-000276,133,Doxorubicin,HCC38_BREAST,749717,Breast Cancer,3.934517,0.000000,6.519479,2.572890,...,0,0,0,1,0,1,0,1,0,0
4,ACH-000818,133,Doxorubicin,BT483_BREAST,949093,Breast Cancer,3.528571,0.000000,6.583308,3.724650,...,0,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,ACH-000568,119,Lapatinib,UACC812_BREAST,910910,Breast Cancer,2.482848,0.000000,7.835545,2.266037,...,0,0,0,0,1,0,0,0,0,1
4451,ACH-000111,119,Lapatinib,HCC1187_BREAST,749711,Breast Cancer,5.241840,0.201634,5.615299,3.090853,...,0,0,0,0,1,0,0,0,0,1
4452,ACH-000755,119,Lapatinib,HCC2218_BREAST,749716,Breast Cancer,3.587365,0.000000,6.563463,2.636915,...,0,0,0,0,1,0,0,0,0,1
4453,ACH-000902,119,Lapatinib,CAL148_BREAST,924106,Breast Cancer,4.041769,0.000000,5.964168,3.155425,...,0,0,0,0,1,0,0,0,0,1


# Final processing

In [43]:
# Re-append LN_IC50 so that the target ends up as the right-most column:
# pop() removes the column, and assigning it back adds it at the end
ccle_t_breast_cancer_gdsc1_iso['LN_IC50'] = ccle_t_breast_cancer_gdsc1_iso.pop('LN_IC50')

ccle_t_breast_cancer_gdsc1_iso

Unnamed: 0,Broad_ID,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,cancer_type,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),...,247,248,249,250,251,252,253,254,255,LN_IC50
0,ACH-000828,133,Doxorubicin,ZR7530_BREAST,909907,Breast Cancer,3.472488,0.000000,5.959306,3.878725,...,0,0,1,0,1,0,1,0,0,-3.317780
1,ACH-000117,133,Doxorubicin,EFM192A_BREAST,1290798,Breast Cancer,3.444932,0.056584,7.722193,2.978196,...,0,0,1,0,1,0,1,0,0,-3.354291
2,ACH-000554,133,Doxorubicin,UACC893_BREAST,909778,Breast Cancer,3.988230,0.000000,6.234195,4.189825,...,0,0,1,0,1,0,1,0,0,-0.466619
3,ACH-000276,133,Doxorubicin,HCC38_BREAST,749717,Breast Cancer,3.934517,0.000000,6.519479,2.572890,...,0,0,1,0,1,0,1,0,0,-3.105127
4,ACH-000818,133,Doxorubicin,BT483_BREAST,949093,Breast Cancer,3.528571,0.000000,6.583308,3.724650,...,0,0,1,0,1,0,1,0,0,1.818639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,ACH-000568,119,Lapatinib,UACC812_BREAST,910910,Breast Cancer,2.482848,0.000000,7.835545,2.266037,...,0,0,0,1,0,0,0,0,1,1.574658
4451,ACH-000111,119,Lapatinib,HCC1187_BREAST,749711,Breast Cancer,5.241840,0.201634,5.615299,3.090853,...,0,0,0,1,0,0,0,0,1,2.957433
4452,ACH-000755,119,Lapatinib,HCC2218_BREAST,749716,Breast Cancer,3.587365,0.000000,6.563463,2.636915,...,0,0,0,1,0,0,0,0,1,-2.692161
4453,ACH-000902,119,Lapatinib,CAL148_BREAST,924106,Breast Cancer,4.041769,0.000000,5.964168,3.155425,...,0,0,0,1,0,0,0,0,1,2.965317


In [44]:
# Drop unnecessary columns
to_drop = ['isosmiles', 'cancer_type', 'Broad_ID']
ccle_t_breast_cancer_gdsc1_iso = ccle_t_breast_cancer_gdsc1_iso.drop(columns=to_drop)


ccle_t_breast_cancer_gdsc1_iso

Unnamed: 0,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),...,247,248,249,250,251,252,253,254,255,LN_IC50
0,133,Doxorubicin,ZR7530_BREAST,909907,3.472488,0.000000,5.959306,3.878725,3.646163,0.000000,...,0,0,1,0,1,0,1,0,0,-3.317780
1,133,Doxorubicin,EFM192A_BREAST,1290798,3.444932,0.056584,7.722193,2.978196,3.670161,0.014355,...,0,0,1,0,1,0,1,0,0,-3.354291
2,133,Doxorubicin,UACC893_BREAST,909778,3.988230,0.000000,6.234195,4.189825,4.377818,0.070389,...,0,0,1,0,1,0,1,0,0,-0.466619
3,133,Doxorubicin,HCC38_BREAST,749717,3.934517,0.000000,6.519479,2.572890,4.303781,0.084064,...,0,0,1,0,1,0,1,0,0,-3.105127
4,133,Doxorubicin,BT483_BREAST,949093,3.528571,0.000000,6.583308,3.724650,3.235727,0.070389,...,0,0,1,0,1,0,1,0,0,1.818639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,119,Lapatinib,UACC812_BREAST,910910,2.482848,0.000000,7.835545,2.266037,2.707083,0.097611,...,0,0,0,1,0,0,0,0,1,1.574658
4451,119,Lapatinib,HCC1187_BREAST,749711,5.241840,0.201634,5.615299,3.090853,3.732269,0.084064,...,0,0,0,1,0,0,0,0,1,2.957433
4452,119,Lapatinib,HCC2218_BREAST,749716,3.587365,0.000000,6.563463,2.636915,3.640390,0.014355,...,0,0,0,1,0,0,0,0,1,-2.692161
4453,119,Lapatinib,CAL148_BREAST,924106,4.041769,0.000000,5.964168,3.155425,4.427606,0.028569,...,0,0,0,1,0,0,0,0,1,2.965317


-----------

# Output into CSV

In [45]:
# Write the final table to a CSV file in the current working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if downstream readers do not expect
# it — confirm before changing.
ccle_t_breast_cancer_gdsc1_iso.to_csv("ccle_t_breast_cancer_gdsc1_iso.csv")