In [None]:
import os.path
import pickle
from collections import Counter

import pandas as pd
import pubchempy as pcp

# File Requirements

This Jupyter notebook preprocesses the raw data and generates the files required by the later modelling steps.

The following files are required in the `data/STEP00` folder:
1. [Cell_line_RMA_proc_basalExp.txt](https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Home.html)
2. [GDSC2_fitted_dose_response_27Oct23.xlsx](https://www.cancerrxgene.org/downloads/bulk_download)
3. [drug_list.csv](https://www.cancerrxgene.org/compounds)

As the cancer cell line expression data is too large to be included in the repository, it must be downloaded separately: download the “RMA normalized expression data for cell-lines” data set via the first link above and place it in the `data/STEP00` folder.

## Load the data


In [None]:
# Locations of the three raw input files (see "File Requirements").
ccl_file = "data/STEP00/Cell_line_RMA_proc_basalExp.txt"
ic50_file = "data/STEP00/GDSC2_fitted_dose_response_27Oct23.xlsx"
drug_file = "data/STEP00/drug_list.csv"

# Only the cell line ID, the drug ID and the response value are used from
# the dose-response table, so the selection is applied right after reading.
ic50_columns = ["COSMIC_ID", "DRUG_ID", "LN_IC50"]
ic50_df = pd.read_excel(ic50_file)[ic50_columns]

# Tab-separated expression table.
ccl_df = pd.read_csv(ccl_file, sep="\t")

# Drug annotation table; whitespace following each delimiter is skipped.
drugs_df = pd.read_csv(drug_file, skipinitialspace=True)

In [None]:
# Preview the first three rows of the reduced dose-response table.
ic50_df.head(3)

In [None]:
# Distinct cell line (COSMIC) IDs and drug IDs that occur in the
# dose-response table; both are reused by the filtering cells below.
ccl = pd.unique(ic50_df["COSMIC_ID"])
drugs = pd.unique(ic50_df["DRUG_ID"])

print(f"Number of unique COSMIC IDs: {ccl.size}")
print(f"Number of unique drug IDs: {drugs.size}")

## Find SMILES codes of drugs

The SMILES codes are looked up via the PubChem IDs of the respective drugs; drugs without a usable PubChem ID are skipped.
The following code cell does not necessarily have to be executed, as `ALL_SMILES.csv` is already available in the GitHub repository.

In [None]:
# Build data/STEP00/ALL_SMILES.csv by querying PubChem once per drug.
# The whole step is skipped when the file is already present.
if not os.path.isfile("data/STEP00/ALL_SMILES.csv"):
    # Rows without a usable PubChem ID are dropped: missing values and the
    # placeholder entries "none", "None" and "several".
    has_pubchem_id = (
        drugs_df["PubCHEM"].notna()
        & ~drugs_df["PubCHEM"].isin(["none", "None", "several"])
    )

    # Explicit copy: the "smiles" column added below must go onto an
    # independent frame, not onto a filtered slice of drugs_df (which
    # triggers pandas' SettingWithCopyWarning).
    smiles_df = drugs_df.loc[has_pubchem_id, ["Drug Id", "PubCHEM", "Datasets"]].copy()

    smiles_codes = []

    for pubchem_field in smiles_df["PubCHEM"]:
        # The field may hold several comma-separated IDs; only the first is used.
        pubchem_id = pubchem_field.split(",")[0].strip()
        compound = pcp.Compound.from_cid(pubchem_id)
        smiles_codes.append(compound.isomeric_smiles)

    smiles_df["smiles"] = smiles_codes

    # The frame's index is written as the first (unnamed) column.
    smiles_df.to_csv("data/STEP00/ALL_SMILES.csv")

else:
    print("File already exists.")

## Filter missing cancer cell line data
Gene expression data is not available for every COSMIC ID, so the cell lines without expression data must be filtered out.

In [None]:
# Collect the expression column of every cell line found in the dose-response
# table. ccl_values stays aligned with ccl: position i holds the expression
# values of ccl[i], or the placeholder "NA" when ccl_df has no matching column.
ccl_values = []
na_cosmic_ids = []

for cosmic_id in ccl:
    # Expression columns are named "DATA.<COSMIC_ID>".
    # (Named column_name rather than id so the builtin id() is not shadowed
    # for the rest of the kernel session.)
    column_name = "DATA." + str(cosmic_id)
    if column_name in ccl_df.columns:
        ccl_values.append(ccl_df[column_name].values.tolist())
    else:
        ccl_values.append("NA")
        na_cosmic_ids.append(cosmic_id)

# Exactly one "NA" placeholder is appended per missing ID, so the number of
# missing cell lines equals the length of na_cosmic_ids.
na_count = len(na_cosmic_ids)

print(f"Number of total cell lines: {len(ccl_values)}")
print(f"Number of filtered cell lines: {len(ccl_values)-na_count}")

## Filter missing drug data



In [None]:
# Look up the SMILES code of every drug found in the dose-response table.
# drugs_values stays aligned with drugs: position i holds the SMILES code of
# drugs[i], or the placeholder "NA" when no usable code is available.
smiles_df = pd.read_csv("data/STEP00/ALL_SMILES.csv")

# Build the Drug Id -> SMILES lookup once instead of scanning the whole column
# for every drug. keep="first" means the first matching row wins when a
# Drug Id is listed more than once.
smiles_by_drug_id = (
    smiles_df.drop_duplicates(subset="Drug Id", keep="first")
    .set_index("Drug Id")["smiles"]
    .to_dict()
)

drugs_values = []
na_drug_ids = []

for drug in drugs:
    smiles = smiles_by_drug_id.get(drug)
    # A drug counts as missing when it is not listed at all OR when its
    # "smiles" field is empty (read back from the CSV as NaN). Otherwise a
    # NaN would be stored as if it were a SMILES code and the drug would
    # never be removed from the pairs.
    if isinstance(smiles, str) and smiles:
        drugs_values.append(smiles)
    else:
        drugs_values.append("NA")
        na_drug_ids.append(drug)

# Exactly one "NA" placeholder is appended per missing drug.
na_count = len(na_drug_ids)

print(f"Number of total drugs: {len(drugs_values)}")
print(f"Number of filtered drugs: {len(drugs_values)-na_count}")

## Remove pairs with missing data

In [None]:
# Step 1: keep only the pairs whose cell line has expression data.
has_expression = ~ic50_df["COSMIC_ID"].isin(na_cosmic_ids)
ic50_df_filtered = ic50_df[has_expression]

print(f"Original number of rows: {len(ic50_df)}")
print(f"Number of rows after filtering NA COSMIC IDs: {len(ic50_df_filtered)}")

# Step 2: of the remaining pairs, keep only those whose drug has a SMILES code.
has_smiles = ~ic50_df_filtered["DRUG_ID"].isin(na_drug_ids)
ic50_df_filtered = ic50_df_filtered[has_smiles]

print(f"Number of rows after filtering NA DRUG IDs: {len(ic50_df_filtered)}")

In [None]:
# Renumber the remaining rows from 0; the old index labels are discarded.
ic50_df_filtered = ic50_df_filtered.reset_index(drop=True)

In [None]:
# Write the filtered pairs to disk; the row index is stored in a column named "index".
ic50_df_filtered.to_csv("data/STEP00/CCL_SMILES_IC50.csv", index_label="index")

## Generate lookup dictionaries

In [None]:
# Pair every ID with the value stored at the same position in the
# corresponding list (COSMIC ID -> expression values, drug ID -> SMILES code).
# Entries holding the "NA" placeholder are included as they are.
ccl_dict = {cosmic_id: expression for cosmic_id, expression in zip(ccl, ccl_values)}
smiles_dict = {drug_id: smiles for drug_id, smiles in zip(drugs, drugs_values)}

# Pickle both lookup tables next to the other STEP00 files.
for pickle_path, lookup in (
    ("data/STEP00/ccl_lookup.pkl", ccl_dict),
    ("data/STEP00/smiles_lookup.pkl", smiles_dict),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(lookup, f)

In [None]:
# Check whether several drugs share the same SMILES code.
# (Original author's note: two drugs share the same SMILES code.)

# Drop the "NA" placeholders first so that missing drugs are not reported as
# one shared code. Note that this rebinds smiles_dict to the filtered mapping.
smiles_dict = {key: value for key, value in smiles_dict.items() if value != "NA"}
values = list(smiles_dict.values())
value_counts = Counter(values)

# Show only the codes used by more than one drug instead of dumping the
# count of every single SMILES string.
duplicated_smiles = {smiles: count for smiles, count in value_counts.items() if count > 1}
print(duplicated_smiles)