In [77]:
# # #--------------------------
# # #-------Installation-------
# # #--------------------------
# !pip3 install episcanpy
# !pip3 install scanpy
# !pip3 install anndata
# !pip install desc
# !pip install display
# !pip3 install --pre deepchem --user
# !pip install nfp
# !pip install --force-reinstall numpy==1.19.5
# !pip install pyarrow

In [2]:
import warnings
warnings.filterwarnings('ignore')


# -------------
# Basic Imports
# -------------
import numpy as np, pandas as pd, gzip, csv, os, matplotlib.pyplot as plt, requests
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.preprocessing import StandardScaler
%matplotlib inline


# --------------------------
# Imports for Bioinformatics
# --------------------------
import scanpy as sc
import episcanpy
# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3             
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

#import episcanpy
from rdkit import Chem
from rdkit.Chem import RDKFingerprint
from rdkit import RDLogger
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import MolsToGridImage
RDLogger.DisableLog("rdApp.*")

import deepchem as dc 
#print("Deepchem version: ",dc.__version__)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.7.3 pandas==1.3.5 scikit-learn==1.0.2 statsmodels==0.13.5 python-igraph==0.10.3 louvain==0.8.0 pynndescent==0.5.8
Deepchem version:  2.7.1


----------------------------

In [73]:
# Reading 
# -------------------------------------
path_raw_gene = (r"G:/My Drive/[Study]/$ Project - ML Bioinformatics/Data/Data - Raw/CosmicCLP_RawGeneExpression.tsv.gz")
raw_gene_exp_df = pd.read_csv(path_raw_gene, sep='\t')
raw_gene_exp_df.head()

Unnamed: 0,SAMPLE_ID,SAMPLE_NAME,GENE_NAME,GENE_EXPRESSION
0,683665,MC-CAR,A1BG,-0.43
1,683665,MC-CAR,A1CF,-0.86
2,683665,MC-CAR,A2M,1.07
3,683665,MC-CAR,A2ML1,0.42
4,683665,MC-CAR,A3GALT2,2.74


In [87]:
path = r"G:\My Drive\[Study]\$ Project - ML Bioinformatics\Data\Data - Raw"

attribute_mtx_path = os.path.join(path, "gene_attribute_matrix.txt.gz")

# Read attribute matrix
ccle_attr = pd.read_csv(attribute_mtx_path, sep='\t')
print(ccle_attr.shape)

(18027, 1040)


In [88]:
cell2ind = pd.DataFrame(columns = ['index','Cell_lines'])
cell2ind['Cell_lines'] = list(ccle_attr.columns[3:])
cell2ind['index'] =  list(cell2ind.index)
display(cell2ind.head())

Unnamed: 0,index,Cell_lines
0,0,CHL1
1,1,HMCB
2,2,HS852T
3,3,HS695T
4,4,A101D


In [89]:
gene2ind = pd.DataFrame(columns = ['index','Genes', 'Gene_id'])
gene2ind['Genes'] =  list(ccle_attr['#'].values[2:])
gene2ind['index'] =  list(gene2ind.index)
gene2ind['Gene_id'] =  list(ccle_attr['CellLine'].values[2:])
display(gene2ind.head())

Unnamed: 0,index,Genes,Gene_id
0,0,FBN1,2200
1,1,ITGBL1,9358
2,2,LRP1,4035
3,3,LTBP2,4053
4,4,PARVA,55742


In [90]:
cell2exp = pd.DataFrame(columns = ['expression'])
l = []
for cells_ind in range(len(cell2ind['Cell_lines'])):
    l.append(list(ccle_attr[cell2ind['Cell_lines'].iloc[cells_ind]].values[2:]))
    
cell2exp['expression'] = l
display(cell2exp.head())

Unnamed: 0,expression
0,"[0.000000, -0.000000, -0.000000, -0.000000, -0..."
1,"[0.000000, -0.000000, -0.000000, -0.000000, -0..."
2,"[0.000000, 0.000000, 0.000000, -0.000000, 0.00..."
3,"[-0.000000, -0.000000, 0.000000, 0.000000, 0.0..."
4,"[0.000000, -0.000000, 0.000000, -0.000000, 0.0..."


In [93]:
# -------------------------------------------
#    Reading the Protein-Protein pathways
# -------------------------------------------
pp_path = os.path.join(path, "pp_int_updated.csv")
pp_int = pd.read_csv(pp_path, header=None)
pp_int.columns = ['Protein1', 'Protein2']
pp_int.drop(index=0, inplace=True)# data.drop(index=0)
pp_int.head()

Unnamed: 0,Protein1,Protein2
1,1394,2778
2,122704,54460
3,2597,2911
4,4790,79155
5,109,27115


In [76]:
# Reading 
# -------------------------------------

path = r"G:\My Drive\[Study]\$ Project - ML Bioinformatics\Data\Data - Raw"

labels_path = os.path.join(path,"labels.csv")
smiles_path = os.path.join(path,"smiles.csv")

df_labels = pd.read_csv(labels_path)
df_smiles = pd.read_csv(smiles_path)

display(df_labels.head())
display(df_smiles.head())

Unnamed: 0.1,Unnamed: 0,drug_a_name,drug_b_name,cell_line,synergy,fold
0,5-FU_ABT-888_A2058,5-FU,ABT-888,A2058,7.69353,2
1,5-FU_ABT-888_A2780,5-FU,ABT-888,A2780,7.778053,2
2,5-FU_ABT-888_A375,5-FU,ABT-888,A375,-1.198505,2
3,5-FU_ABT-888_A427,5-FU,ABT-888,A427,2.595684,2
4,5-FU_ABT-888_CAOV3,5-FU,ABT-888,CAOV3,-5.139971,2


Unnamed: 0,5-FU,O=c1[nH]cc(F)c(=O)[nH]1
0,ABT-888,CC1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1
1,AZD1775,C=CCn1c(=O)c2cnc(Nc3ccc(N4CCN(C)CC4)cc3)nc2n1-...
2,BEZ-235,Cn1c(=O)n(-c2ccc(C(C)(C)C#N)cc2)c2c3cc(-c4cnc5...
3,BORTEZOMIB,CC(C)CC(NC(=O)C(Cc1ccccc1)NC(=O)c1cnccn1)B(O)O
4,CARBOPLATIN,N.N.O=C(O)C1(C(=O)O)CCC1.[Pt]
