In [7]:
# -------------
# Basic Imports
# -------------
import numpy as np, pandas as pd, gzip, csv, os, matplotlib.pyplot as plt, warnings, requests
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.preprocessing import StandardScaler


%matplotlib inline
warnings.filterwarnings("ignore")

# --------------------------
# Imports for Bioinformatics
# --------------------------
import scanpy as sc
import deepchem as dc
dc.__version__

from rdkit import Chem
from rdkit.Chem import RDKFingerprint

# import desc as desc

# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3             
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.7.3 pandas==1.3.5 scikit-learn==1.0.2 statsmodels==0.13.5 pynndescent==0.5.8


--------------------
## **Reading Labels and Smiles**
--------------------

In [8]:
# --------------------------------
# Reading the Smiles & Labels Data
# --------------------------------
smiles = pd.read_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Raw\smiles.csv",header=None)
smiles.columns = ["drug_name", "drug_smiles"]
labels = pd.read_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Raw\labels.csv")
labels = labels.rename(columns={"Unnamed: 0":"information"})
print(f"Shape of Smiles Dataset: {smiles.shape}")
display(smiles.head(3))
print(f"\nShape of labels Dataset: {labels.shape}")
display(labels.head(3))

Shape of Smiles Dataset: (38, 2)


Unnamed: 0,drug_name,drug_smiles
0,5-FU,O=c1[nH]cc(F)c(=O)[nH]1
1,ABT-888,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
2,AZD1775,C=CCn1c(=O)c2cnc(Nc3ccc(N4CCN(C)CC4)cc3)nc2n1-...



Shape of labels Dataset: (23052, 6)


Unnamed: 0,information,drug_a_name,drug_b_name,cell_line,synergy,fold
0,5-FU_ABT-888_A2058,5-FU,ABT-888,A2058,7.69353,2
1,5-FU_ABT-888_A2780,5-FU,ABT-888,A2780,7.778053,2
2,5-FU_ABT-888_A375,5-FU,ABT-888,A375,-1.198505,2


In [9]:
# Making a final Dataframe for Smiles and Labels
# ----------------------------------------------
smiles_labels_final = labels[['drug_a_name', 'drug_b_name', 'cell_line', 'fold']]
smiles_labels_final.head()


Unnamed: 0,drug_a_name,drug_b_name,cell_line,fold
0,5-FU,ABT-888,A2058,2
1,5-FU,ABT-888,A2780,2
2,5-FU,ABT-888,A375,2
3,5-FU,ABT-888,A427,2
4,5-FU,ABT-888,CAOV3,2


In [10]:
# Adding smiles structure for Drug A
smiles_labels_final = pd.merge(smiles_labels_final, smiles, left_on='drug_a_name', right_on='drug_name', how='left')
smiles_labels_final.rename(columns={"drug_smiles":"drug_a_structure"}, inplace=True)


# Adding smiles structure for Drug B
smiles_labels_final = pd.merge(smiles_labels_final, smiles, left_on='drug_b_name', right_on='drug_name', how='left')
smiles_labels_final.rename(columns={"drug_smiles":"drug_b_structure"}, inplace=True)

smiles_labels_final.drop(["drug_name_x", "drug_name_y"], axis=1, inplace=True)

smiles_labels_final.head()

Unnamed: 0,drug_a_name,drug_b_name,cell_line,fold,drug_a_structure,drug_b_structure
0,5-FU,ABT-888,A2058,2,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
1,5-FU,ABT-888,A2780,2,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
2,5-FU,ABT-888,A375,2,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
3,5-FU,ABT-888,A427,2,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
4,5-FU,ABT-888,CAOV3,2,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1


In [14]:
# Converting Final data to csv
# ----------------------------
smiles_labels_final.to_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Processed\smiles_labels_final.csv", index=False)

-------------------------
## **Reading Cancer Cell Lines**
-------------------------

In [25]:
# Read attribute matrix
ccle_attr = pd.read_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Raw\gene_attribute_matrix.txt.gz",sep='\t')
print(ccle_attr.shape)
display(ccle_attr.head())

(18027, 1040)


Unnamed: 0,#,#.1,CellLine,CHL1,HMCB,HS852T,HS695T,A101D,HS294T,SNU466,...,HEL9217,HEL,UT7,SET2,MOLM16,KU812,TF1,MEG01,KYO1,K562
0,#,#,Tissue,skin,skin,skin,skin,skin,skin,central nervous system,...,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue
1,GeneSym,,GeneID/NA,na,na,na,na,na,na,na,...,na,na,na,na,na,na,na,na,na,na
2,FBN1,na,2200,0.000000,0.000000,0.000000,-0.000000,0.000000,0.000000,0.000000,...,-0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,0.000000,-0.000000,-0.000000
3,ITGBL1,na,9358,-0.000000,-0.000000,0.000000,-0.000000,-0.000000,-0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000
4,LRP1,na,4035,-0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000


In [17]:
cell2ind = pd.DataFrame(columns=['index', 'Cell_lines'])
cell2ind['Cell_lines'] = list(ccle_attr.columns[3:])
cell2ind['index'] = list(cell2ind.index)
display("Cell Data: ",cell2ind.head())

gene2ind = pd.DataFrame(columns=['index', 'Genes', 'Gene_id'])
gene2ind['Genes'] = list(ccle_attr['#'].values[2:])
gene2ind['index'] = list(gene2ind.index)
gene2ind['Gene_id'] = list(ccle_attr['CellLine'].values[2:])
display("Gene Data: ",gene2ind.head())

'Cell Data: '

Unnamed: 0,index,Cell_lines
0,0,CHL1
1,1,HMCB
2,2,HS852T
3,3,HS695T
4,4,A101D


'Gene Data: '

Unnamed: 0,index,Genes,Gene_id
0,0,FBN1,2200
1,1,ITGBL1,9358
2,2,LRP1,4035
3,3,LTBP2,4053
4,4,PARVA,55742


In [19]:
# Getting the Cell Expressions
# ----------------------------
cell2exp = pd.DataFrame(columns=['expression'])
l = []
for cells_ind in range(len(cell2ind['Cell_lines'])):
    l.append(list(ccle_attr[cell2ind['Cell_lines'].iloc[cells_ind]].values[2:]))
cell2exp['expression'] = l
display(cell2exp.head())

Unnamed: 0,expression
0,"[0.000000, -0.000000, -0.000000, -0.000000, -0..."
1,"[0.000000, -0.000000, -0.000000, -0.000000, -0..."
2,"[0.000000, 0.000000, 0.000000, -0.000000, 0.00..."
3,"[-0.000000, -0.000000, 0.000000, 0.000000, 0.0..."
4,"[0.000000, -0.000000, 0.000000, -0.000000, 0.0..."


-------------------------
## **Reading Protein-Protein Interaction Data**
-------------------------

In [26]:
pp_int = pd.read_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Raw\PP-Pathways_ppi.csv.gz",header=None)
pp_int.columns = ['Protein1', 'Protein2']
pp_int.head()

Unnamed: 0,Protein1,Protein2
0,1394,2778
1,6331,17999
2,122704,54460
3,2597,2911
4,4790,79155
