# Exploring the GDSC dataset

## Dataset notes

- response
  - number of cell lines: 969
  - number of drugs: 295
- gene expression
  - 17737 genes measured
    - 17419 with gene symbol
    - 318 without gene symbol
- drug annotations
  - 608 with identifiable name

## Imports and constants

In [91]:
import pandas as pd
import re
import math

In [70]:
# directory containing the GDSC data files; every file name below is joined onto it
datadir = "/Volumes/OXYTOCIN/datasets/gdsc"

# drug response data
response_fname = "GDSC2_fitted_dose_response_24Jul22.xlsx"
# gene expression data
expr_fname = "Cell_line_RMA_proc_basalExp.txt"
# compound metadata
cmpds_fname = "Drug_list Jan 20 2023.csv"

## Data exploration

### Drug response data

In [17]:
# load the drug response spreadsheet into a DataFrame
response_path = f"{datadir}/{response_fname}"
response = pd.read_excel(response_path)

In [73]:
# show the loaded drug response table (the notebook renders a truncated head/tail view)
response

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC2,401,18945558,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-1.462148,0.930105,0.088999,0.432482
1,GDSC2,401,18945796,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-4.869447,0.614932,0.111423,-1.420322
2,GDSC2,401,18946078,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-3.360684,0.790953,0.142754,-0.599894
3,GDSC2,401,18946335,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-5.045014,0.592624,0.135642,-1.515791
4,GDSC2,401,18946617,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-3.741620,0.733992,0.128066,-0.807038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242031,GDSC2,401,19187490,1659928,SNU-175,SIDM00216,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.134495,0.976798,0.074441,0.159946
242032,GDSC2,401,19187943,1660034,SNU-407,SIDM00214,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,8.575555,0.913182,0.057743,-1.626059
242033,GDSC2,401,19188201,1660035,SNU-61,SIDM00194,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.520666,0.974889,0.058094,0.602364
242034,GDSC2,401,19188741,1674021,SNU-C5,SIDM00498,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.701430,0.970009,0.100980,0.809457


In [18]:
# basic info about response dataset: count the distinct values of each id column
# (dropna=False so a missing value would be counted, exactly as len(unique()) does)
for label, column in (("num cell lines:", "CELL_LINE_NAME"), ("num drugs:", "DRUG_ID")):
    print(label, response[column].nunique(dropna=False))

num cell lines: 969
num drugs: 295


### Expression data

In [67]:
# load the tab-separated gene expression table
expr_path = f"{datadir}/{expr_fname}"
df = pd.read_csv(expr_path, sep="\t")

In [12]:
# number of columns in the expression table
df.shape[1]

1020

In [13]:
# list the column labels of the expression table
# NOTE(review): the trailing 'DATA.905954.1' label looks like the suffix pandas adds
# when a column name is repeated in the file — confirm against the raw header
df.columns

Index(['GENE_SYMBOLS', 'GENE_title', 'DATA.906826', 'DATA.687983',
       'DATA.910927', 'DATA.1240138', 'DATA.1240139', 'DATA.906792',
       'DATA.910688', 'DATA.1240135',
       ...
       'DATA.753584', 'DATA.907044', 'DATA.998184', 'DATA.908145',
       'DATA.1659787', 'DATA.1298157', 'DATA.1480372', 'DATA.1298533',
       'DATA.930299', 'DATA.905954.1'],
      dtype='object', length=1020)

In [34]:
# (rows with a gene symbol, distinct gene symbols, total rows)
gene_symbols = df["GENE_SYMBOLS"]
named_symbols = gene_symbols.dropna()
len(named_symbols), len(set(named_symbols)), len(gene_symbols)

(17419, 17419, 17737)

### Drug metadata

In [80]:
# load the compound metadata CSV
cmpds_path = f"{datadir}/{cmpds_fname}"
cmpds = pd.read_csv(cmpds_path)

In [81]:
# list the column labels of the compound metadata table
cmpds.columns

Index(['Drug Id', 'Name', 'Synonyms', 'Targets', 'Target pathway', 'PubCHEM',
       'Datasets', 'number of cell lines', 'Screening site'],
      dtype='object')

In [111]:
# keep GDSC2 compounds that have a usable PubChem id
# values of the "PubCHEM" column that are excluded in addition to missing entries
no_id_markers = ["None", "none", "several"]
in_gdsc2 = cmpds["Datasets"] == "GDSC2"
has_pubchem_id = cmpds["PubCHEM"].notna() & ~cmpds["PubCHEM"].isin(no_id_markers)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original table (avoids pandas' SettingWithCopyWarning)
cmpds = cmpds[in_gdsc2 & has_pubchem_id].copy()
# an entry may hold several comma-separated ids; keep only the first one
cmpds["PubCHEM_single"] = cmpds["PubCHEM"].map(lambda ids: ids.split(",")[0])
# ids are still strings here, so this ordering is lexicographic, not numeric
cmpds = cmpds.sort_values(by="PubCHEM_single")
print(len(cmpds))

186
