In [1]:
# -------------
# Basic Imports
# -------------
import numpy as np, pandas as pd, gzip, csv, os, matplotlib.pyplot as plt, warnings, requests
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.preprocessing import StandardScaler


%matplotlib inline
warnings.filterwarnings("ignore")

# --------------------------
# Imports for Bioinformatics
# --------------------------
import scanpy as sc
import deepchem as dc
dc.__version__

from rdkit import Chem
from rdkit.Chem import RDKFingerprint

# import desc as desc

# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3             
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (c:\Users\nitan\anaconda3\envs\bioinformatics\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. jax requires jaxlib to be installed. See https://github.com/google/jax#installation for installation instructions.


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.7.3 pandas==1.3.5 scikit-learn==1.0.2 statsmodels==0.13.5 pynndescent==0.5.8


--------------------
## **Reading Labels and Smiles**
--------------------

In [2]:
# --------------------------------
# Reading the Smiles & Labels Data
# --------------------------------
smiles = pd.read_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Raw\smiles.csv",header=None)
smiles.columns = ["drug_name", "drug_smiles"]
labels = pd.read_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Raw\labels.csv")
labels = labels.rename(columns={"Unnamed: 0":"information"})
print(f"Shape of Smiles Dataset: {smiles.shape}")
display(smiles.head(3))
print(f"\nShape of labels Dataset: {labels.shape}")
display(labels.head(3))

Shape of Smiles Dataset: (38, 2)


Unnamed: 0,drug_name,drug_smiles
0,5-FU,O=c1[nH]cc(F)c(=O)[nH]1
1,ABT-888,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
2,AZD1775,C=CCn1c(=O)c2cnc(Nc3ccc(N4CCN(C)CC4)cc3)nc2n1-...



Shape of labels Dataset: (23052, 6)


Unnamed: 0,information,drug_a_name,drug_b_name,cell_line,synergy,fold
0,5-FU_ABT-888_A2058,5-FU,ABT-888,A2058,7.69353,2
1,5-FU_ABT-888_A2780,5-FU,ABT-888,A2780,7.778053,2
2,5-FU_ABT-888_A375,5-FU,ABT-888,A375,-1.198505,2


In [3]:
# Making a final Dataframe for Smiles and Labels
# ----------------------------------------------
smiles_labels_final = labels[['drug_a_name', 'drug_b_name', 'cell_line', 'fold', 'synergy']]
smiles_labels_final.head()


Unnamed: 0,drug_a_name,drug_b_name,cell_line,fold,synergy
0,5-FU,ABT-888,A2058,2,7.69353
1,5-FU,ABT-888,A2780,2,7.778053
2,5-FU,ABT-888,A375,2,-1.198505
3,5-FU,ABT-888,A427,2,2.595684
4,5-FU,ABT-888,CAOV3,2,-5.139971


In [4]:
# Adding smiles structure for Drug A
smiles_labels_final = pd.merge(smiles_labels_final, smiles, left_on='drug_a_name', right_on='drug_name', how='left')
smiles_labels_final.rename(columns={"drug_smiles":"drug_a_structure"}, inplace=True)

# Adding smiles structure for Drug B
smiles_labels_final = pd.merge(smiles_labels_final, smiles, left_on='drug_b_name', right_on='drug_name', how='left')
smiles_labels_final.rename(columns={"drug_smiles":"drug_b_structure"}, inplace=True)
smiles_labels_final.drop(["drug_name_x", "drug_name_y"], axis=1, inplace=True)
smiles_labels_final.head()

Unnamed: 0,drug_a_name,drug_b_name,cell_line,fold,synergy,drug_a_structure,drug_b_structure
0,5-FU,ABT-888,A2058,2,7.69353,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
1,5-FU,ABT-888,A2780,2,7.778053,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
2,5-FU,ABT-888,A375,2,-1.198505,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
3,5-FU,ABT-888,A427,2,2.595684,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
4,5-FU,ABT-888,CAOV3,2,-5.139971,O=c1[nH]cc(F)c(=O)[nH]1,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1


In [5]:
# Converting Final data to csv
# ----------------------------
smiles_labels_final.to_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Processed\smiles_labels_final.csv", index=False)

-------------------------
## **Reading Cancer Cell Lines**
-------------------------

In [6]:
# Read attribute matrix
ccle_attr = pd.read_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Raw\gene_attribute_matrix.txt.gz",sep='\t')
print(ccle_attr.shape)
display(ccle_attr.head())

(18027, 1040)


Unnamed: 0,#,#.1,CellLine,CHL1,HMCB,HS852T,HS695T,A101D,HS294T,SNU466,...,HEL9217,HEL,UT7,SET2,MOLM16,KU812,TF1,MEG01,KYO1,K562
0,#,#,Tissue,skin,skin,skin,skin,skin,skin,central nervous system,...,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue
1,GeneSym,,GeneID/NA,na,na,na,na,na,na,na,...,na,na,na,na,na,na,na,na,na,na
2,FBN1,na,2200,0.000000,0.000000,0.000000,-0.000000,0.000000,0.000000,0.000000,...,-0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,0.000000,-0.000000,-0.000000
3,ITGBL1,na,9358,-0.000000,-0.000000,0.000000,-0.000000,-0.000000,-0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000
4,LRP1,na,4035,-0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000


In [7]:
cell2ind = pd.DataFrame(columns=['index', 'Cell_lines'])
cell2ind['Cell_lines'] = list(ccle_attr.columns[3:])
cell2ind['index'] = list(cell2ind.index)
display("Cell Data: ",cell2ind.head())

gene2ind = pd.DataFrame(columns=['index', 'Genes', 'Gene_id'])
gene2ind['Genes'] = list(ccle_attr['#'].values[2:])
gene2ind['index'] = list(gene2ind.index)
gene2ind['Gene_id'] = list(ccle_attr['CellLine'].values[2:])
display("Gene Data: ",gene2ind.head())

'Cell Data: '

Unnamed: 0,index,Cell_lines
0,0,CHL1
1,1,HMCB
2,2,HS852T
3,3,HS695T
4,4,A101D


'Gene Data: '

Unnamed: 0,index,Genes,Gene_id
0,0,FBN1,2200
1,1,ITGBL1,9358
2,2,LRP1,4035
3,3,LTBP2,4053
4,4,PARVA,55742


In [8]:
# Getting the Cell Expressions
# ----------------------------
cell2exp = pd.DataFrame(columns=['expression'])
l = []
for cells_ind in range(len(cell2ind['Cell_lines'])):
    l.append(list(ccle_attr[cell2ind['Cell_lines'].iloc[cells_ind]].values[2:]))
cell2exp['expression'] = l
display(cell2exp.head())

Unnamed: 0,expression
0,"[0.000000, -0.000000, -0.000000, -0.000000, -0..."
1,"[0.000000, -0.000000, -0.000000, -0.000000, -0..."
2,"[0.000000, 0.000000, 0.000000, -0.000000, 0.00..."
3,"[-0.000000, -0.000000, 0.000000, 0.000000, 0.0..."
4,"[0.000000, -0.000000, 0.000000, -0.000000, 0.0..."


-------------------------
## **Reading Protein-Protein Interaction Data**
-------------------------

In [9]:
# Reading the Gene-Ids in the Protein-Protein Interaction Data
# ------------------------------------------------------------
pp_int = pd.read_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Raw\PP-Pathways_ppi.csv.gz",header=None)
pp_int.columns = ['Protein1', 'Protein2']
pp_int.head()

Unnamed: 0,Protein1,Protein2
0,1394,2778
1,6331,17999
2,122704,54460
3,2597,2911
4,4790,79155


-----------------------------------
# **Filtering out the Data**
-----------------------------------

In [11]:
# If any genes are cell line ids, we will remove them
# ---------------------------------------------------

cell_ids = cell2ind['Cell_lines'].unique()
to_drop = []
for i in range(len(gene2ind)):
  if(gene2ind.iloc[i]['Genes'] in cell_ids):
    to_drop.append(i)

print("Number of genes to be Dropped: ",len(to_drop))
gene2ind = gene2ind.drop(to_drop, axis = 0)
print("Shape of gene2ind: ",gene2ind.shape)

Number of genes to be Dropped:  0
Shape of gene2ind:  (18019, 3)


In [18]:
# Considering the Protein-Protein interaction data
#   check and take intersection of gene_ids  
# -----------------------------------------------------------

p1p2 = set(np.concatenate((pp_int['Protein1'].unique(), pp_int['Protein1'].unique())))
print('Total gene ids in pp_int dataset: ', len(p1p2))

Total gene ids in pp_int dataset:  15849


In [21]:
## Check if any gene ids from deepsynergy dataset is missing in this p1p2

common_gene_ids = []; uncommon_gene_ids = [];

for i in range(len(gene2ind)):
  if(gene2ind.iloc[i]['Gene_id'] in p1p2):
    common_gene_ids.append(gene2ind.iloc[i]['Gene_id'])
  else:
    uncommon_gene_ids.append(gene2ind.iloc[i]['Gene_id'])

print('Total Common intersecting gene ids:       ', len(common_gene_ids))
print('Total uncommon non-intersecting gene ids: ', len(uncommon_gene_ids))

Total Common intersecting gene ids:               12579
 and Total uncommon non-intersecting gene ids:    5440


In [24]:
# Considering the ccle_attr (Cancer Cell Line Attribute data)
#   Filtering out by removing gene ids note present in the gene2ind
#   In other words, remove rows where id not present in common gene ids.
# 
old_shape = ccle_attr.shape
to_drop = []
for i in range(2, len(ccle_attr)):
  if(ccle_attr.iloc[i]['CellLine'] not in common_gene_ids):
    to_drop.append(i)

print(f"Dropping {len(to_drop)} genes")
ccle_attr = ccle_attr.drop(to_drop, axis = 0)
print(f"Old Shape of Cancer Cell Line Attributes data: {old_shape}")
print(f"New Shape of Cancer Cell Line Attributes data: {ccle_attr.shape}")
display(ccle_attr.head())

Dropping 0 genes
Old Shape of Cancer Cell Line Attributes data: (12581, 1040)
New Shape of Cancer Cell Line Attributes data: (12581, 1040)


Unnamed: 0,#,#.1,CellLine,CHL1,HMCB,HS852T,HS695T,A101D,HS294T,SNU466,...,HEL9217,HEL,UT7,SET2,MOLM16,KU812,TF1,MEG01,KYO1,K562
0,#,#,Tissue,skin,skin,skin,skin,skin,skin,central nervous system,...,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue,haematopoietic and lymphoid tissue
1,GeneSym,,GeneID/NA,na,na,na,na,na,na,na,...,na,na,na,na,na,na,na,na,na,na
512,LBH,na,81606,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0
513,GLI2,na,2736,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0
514,PAPPA,na,5069,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0


In [25]:
# Considering the Protein-Protein Interaction dataset - pp_int
#   Dropping uncommon genes from pp_int
# --------------------------------------------------------------
to_drop = set()
for i in range(len(pp_int)):
  if(pp_int.iloc[i]['Protein1'] not in common_gene_ids):    to_drop.add(i)
  if(pp_int.iloc[i]['Protein2'] not in common_gene_ids):    to_drop.add(i)

to_drop = list(to_drop)
print("Shape of PP Interaction before dropping: ",pp_int.shape)
pp_int = pp_int.drop(to_drop, axis = 0)
print("Shape of PP Interaction after dropping:  ",pp_int.shape)

Shape of PP Interaction before dropping:  (342353, 2)
Shape of PP Interaction after dropping:   (251482, 2)


In [26]:
# Resetting the indexes for both ccle_attr and pp_int:
# -----------------------------------------------------
ccle_attr = ccle_attr.reset_index().drop(['index'], axis = 1)
pp_int = pp_int.reset_index().drop(['index'], axis = 1)

In [27]:
print('Unique gene ids in pp_int data:      ', len(set(np.concatenate((pp_int['Protein1'].unique(), 
                                                                       pp_int['Protein2'].unique()))) ))
print('Unique gene ids in ccle_attr data:   ', len(ccle_attr['CellLine'][2:].unique()))

Unique gene ids in pp_int data:       12512
Unique gene ids in ccle_attr data:    12579


We observe that these two datasets do not match.

Thus we will remove all extra genes from ccle_attr dataset

In [28]:
genes_in_pp = list(set(np.concatenate((pp_int['Protein1'].unique(), 
                                       pp_int['Protein2'].unique()))))
to_drop = []
old_shape = ccle_attr.shape

for i in range(2, len(ccle_attr)):
  if(ccle_attr.iloc[i]['CellLine'] not in genes_in_pp):
    to_drop.append(i)

print(f'dropping {len(to_drop)} genes')
ccle_attr = ccle_attr.drop(to_drop, axis = 0)
print("Old shape of ccle_attr: ", old_shape)
print("New shape of ccle_attr: ", ccle_attr.shape)


dropping 67 genes
Old shape of ccle_attr:  (12581, 1040)
New shape of ccle_attr:  (12514, 1040)


In [29]:
print('Unique gene ids in pp_int data:   ', len(set(np.concatenate((pp_int['Protein1'].unique(), 
                                                                    pp_int['Protein2'].unique())))))
print('Unique gene ids in gene2ind data: ', len(ccle_attr['CellLine'].iloc[2:].unique()))

Unique gene ids in pp_int data:    12512
Unique gene ids in gene2ind data:  12512


In [30]:
ccle_attr = ccle_attr.reset_index().drop(['index'], axis = 1)
pp_int = pp_int.reset_index().drop(['index'], axis = 1)
print(ccle_attr.shape, 
      pp_int.shape)

(12514, 1040) (251482, 2)


In [31]:
# Saving these two dataset
# ------------------------
ccle_attr.to_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Processed\ccle_attr_updated.csv", index=False)
pp_int.to_csv(r"G:\My Drive\Study\Project-cancer-drug-synergy-prediction\Data\Processed\pp_int_updated.csv", index=False)

-----------------------------
### **Reading Complete**
-----------------------------