# Training Toy Dataset

In [None]:
!git clone https://github.com/GIST-CSBL/DeepConv-DTI

Cloning into 'DeepConv-DTI'...
remote: Enumerating objects: 97, done.[K
remote: Total 97 (delta 0), reused 0 (delta 0), pack-reused 97[K
Unpacking objects: 100% (97/97), done.


In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import timeit

# Ask TensorFlow for the name of the GPU device it can see; anything other
# than '/device:GPU:0' is treated below as "no GPU available".
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  # Stop the cell here: the gpu() benchmark defined below pins its work to
  # '/device:GPU:0'.
  raise SystemError('GPU device not found')

def cpu():
  """Run one 7x7, 32-filter convolution over a random batch on the CPU.

  Returns:
    A scalar tensor holding the sum of all convolution outputs.
  """
  with tf.device('/cpu:0'):
    # Random input of shape (batch, height, width, channels).
    batch = tf.random.normal((100, 100, 100, 3))
    # A fresh (randomly initialised) layer is built on every call.
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    activations = conv_layer(batch)
    return tf.math.reduce_sum(activations)

def gpu():
  """Run one 7x7, 32-filter convolution over a random batch on the first GPU.

  Returns:
    A scalar tensor holding the sum of all convolution outputs.
  """
  with tf.device('/device:GPU:0'):
    # Random input of shape (batch, height, width, channels).
    batch = tf.random.normal((100, 100, 100, 3))
    # A fresh (randomly initialised) layer is built on every call.
    conv_layer = tf.keras.layers.Conv2D(32, 7)
    activations = conv_layer(batch)
    return tf.math.reduce_sum(activations)
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
# timeit executes the statement string in its own namespace, so each function
# is imported from __main__ (the notebook) through the `setup` argument.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
# The ratio is truncated to an integer for display.
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images (batch x height x width x channel). Sum of ten runs.
CPU (s):
3.2399101599999938
GPU (s):
0.03953037900001277
GPU speedup over CPU: 81x


In [None]:
import pandas as pd

In [None]:
traning_comp = pd.read_csv("/content/DeepConv-DTI/toy_examples/training_dataset/training_compound.csv")
traning_comp.head()

Unnamed: 0.1,Unnamed: 0,Compound_ID,SMILES,InChI,Drugbank,IUPHAR,KEGG,morgan_fp_r2,morgan_fp_r3,morgan_fp_r1,Mol2Vec,PubChem_fingerprint
0,62,DRUG00063,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(O)(=O)OP(O...,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,True,True,True,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,7.884823\t-8.122994\t-5.77131\t18.334135\t1.02...,1.0\t1.0\t1.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t1...
1,90,DRUG00091,CN1C=NC2=C1C(=O)N(C)C(=O)N2C,InChI=1S/C8H10N4O2/c1-10-4-9-6-5(10)7(13)12(3)...,True,True,True,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,3.7801247000000004\t-1.3690111999999999\t-2.34...,1.0\t1.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t1...
2,1981,DRUG01982,O[C@@H]1[C@H](O)[C@@H](OP(O)(O)=O)[C@H](OP(O)(...,"InChI=1S/C6H15O15P3/c7-1-2(8)5(20-23(13,14)15)...",True,True,False,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,4.0659432\t-4.6435547\t-3.5455644\t13.78389\t2...,1.0\t1.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t1...
3,4702,DRUG04703,OCC(O)CO,"InChI=1S/C3H8O3/c4-1-3(6)2-5/h3-6H,1-2H2",True,True,False,0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,1.2004018\t0.16939774\t0.16100265\t-1.1320121\...,1.0\t1.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t1...
4,5230,DRUG05231,Clc1c(Cl)cccc1c1nnnn1Cc1cccnc1,InChI=1S/C13H9Cl2N5/c14-11-5-1-4-10(12(11)15)1...,False,True,False,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,2.4320082999999997\t-3.9578580000000003\t-2.03...,1.0\t1.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t1...


In [None]:
!python3 /content/DeepConv-DTI/DeepConvDTI.py /content/dataset/training/training_dti.csv /content/dataset/training/training_compound.csv /content/dataset/training/training_protein.csv --validation -n validation_dataset -i /content/dataset/validation/validation_dti.csv -d /content/dataset/validation/validation_compound.csv -t /content/dataset/validation/validation_protein.csv -W -c 512 128 -w 10 15 20 25 30 -p 128 -f 128 -r 0.0001 -n 30 -v Convolution -l 2500 -L 2048 -D 0 -a elu -F 128 -b 32 -y 0.0001 -o ./validation_output.csv -m ./model.model -e 1

2021-01-10 18:17:54.268742: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
	model parameters summary	
drug_layers          : [512, 128]
protein_strides      : [10, 15, 20, 25, 30]
protein_layers       : [128]     
fc_layers            : [128]     
learning_rate        : 0.0001    
decay                : 0.0001    
activation           : elu       
filters              : 128       
dropout              : 0.0       
prot_vec             : Convolution
prot_len             : 2500      
drug_vec             : morgan_fp 
drug_len             : 2048      
2021-01-10 18:17:55.936178: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-01-10 18:17:55.937062: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-01-10 18:17:55.941130: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successfu

In [None]:
!python /content/DeepConv-DTI/DeepConvDTI.py /content/DeepConv-DTI/toy_examples/training_dataset/training_dti.csv /content/DeepConv-DTI/toy_examples/training_dataset/training_compound.csv /content/DeepConv-DTI/toy_examples/training_dataset/training_protein.csv --predict -n predict -i /content/DeepConv-DTI/toy_examples/test_dataset/test_dti.csv -d /content/DeepConv-DTI/toy_examples/test_dataset/test_compound.csv -t /content/DeepConv-DTI/toy_examples/test_dataset/test_protein.csv -c 512 128 -w 10 15 20 25 30 -p 128 -f 128 -r 0.0001 -n 30 -v Convolution -l 2500 -V morgan_fp_r2 -L 2048 -D 0 -a elu -F 128 -b 32 -y 0.0001 -o ./test_output.csv -m ./model.model -e 15 -W

# Load BIOSNAP DATASET

In [None]:
!git clone https://github.com/kexinhuang12345/MolTrans

Cloning into 'MolTrans'...
^C


In [None]:
# RDKit can only be installed through conda here, so we set up condacolab first
!pip install -q condacolab
import condacolab
condacolab.install()


In [None]:
# RDKIT LIBRARY FOR MORGAN FINGERPRINTS
!conda install -y -c rdkit rdkit;

In [None]:
# Paths of the BIOSNAP train/validation/test CSVs inside the cloned MolTrans
# repository, keyed by split name.
biosnap_datasets = {'training':"/content/MolTrans/dataset/BIOSNAP/full_data/train.csv",
                    'validation':"/content/MolTrans/dataset/BIOSNAP/full_data/val.csv",
                    'test':"/content/MolTrans/dataset/BIOSNAP/full_data/test.csv"}

In [None]:
# Load each BIOSNAP split into its own DataFrame.
train_df = pd.read_csv(biosnap_datasets['training'])
val_df = pd.read_csv(biosnap_datasets['validation'])
test_df = pd.read_csv(biosnap_datasets['test'])

In [None]:
import os 

dataset_dir = "/content/dataset"

def create_dti_csv(df,directory,clas,base_dir=None):
  """Export one DeepConv-DTI style input table from ``df`` as a CSV file.

  The file is written to ``<base_dir>/<directory>/<directory>_<suffix>.csv``
  where ``<suffix>`` is ``dti``, ``compound`` or ``protein`` depending on
  ``clas``. The target directory is created when missing and the DataFrame
  index is written as the first (unnamed) CSV column.

  Args:
    df: DataFrame holding the source columns for the requested table:
      'DrugBank ID', 'Gene', 'Label' for "DTI";
      'DrugBank ID', 'SMILES', 'morgan_fp_r1' for "COMPOUND";
      'Gene', 'Target Sequence' for "PROTEIN".
      It is never modified.
    directory: Split name; used both as sub-directory and as file-name prefix.
    clas: Which table to export: "DTI", "COMPOUND" or "PROTEIN".
    base_dir: Root output directory. Defaults to the module-level
      ``dataset_dir`` when omitted.

  Raises:
    ValueError: If ``clas`` is not one of the three supported table kinds.
    KeyError: If ``df`` lacks one of the required source columns.
  """
  # clas -> (file suffix, source columns in df, column names in the CSV)
  layouts = {
      "DTI": ("_dti.csv",
              ['DrugBank ID','Gene','Label'],
              ['Compound_ID','Protein_ID','Label']),
      "COMPOUND": ("_compound.csv",
                   ['DrugBank ID','SMILES','morgan_fp_r1'],
                   ['Compound_ID','SMILES','morgan_fp_r1']),
      "PROTEIN": ("_protein.csv",
                  ['Gene','Target Sequence'],
                  ['Protein_ID','Sequence']),
  }
  if clas not in layouts:
    # Previously an unknown kind only printed a placeholder and wrote nothing.
    raise ValueError(
        "Unknown table kind {!r}; expected one of {}".format(clas, sorted(layouts)))
  if base_dir is None:
    base_dir = dataset_dir

  suffix, source_columns, output_columns = layouts[clas]

  # Explicit copy: the caller's frame stays untouched and the Label assignment
  # below does not write into a slice of it.
  data = df.loc[:, source_columns].copy()
  data.columns = output_columns
  if clas == "DTI":
    data['Label'] = data['Label'].apply(int)

  # NOTE(review): rows are exported as-is, so an ID that occurs in several
  # interaction rows is repeated in the COMPOUND/PROTEIN files -- confirm the
  # downstream loader tolerates duplicate IDs.
  path = os.path.join(base_dir, directory)
  os.makedirs(path, exist_ok=True)
  data.to_csv(os.path.join(path, directory + suffix))

In [None]:
from rdkit.Chem import AllChem as Chem
import numpy as np

def morgan_fingerprint(df, radius=2, n_bits=2048, column='morgan_fp_r1'):
  """Add a tab-separated Morgan fingerprint string column to ``df``.

  Each entry of ``df['SMILES']`` is parsed with RDKit and turned into a
  chirality-aware Morgan bit vector, which is serialised as its bits joined by
  tab characters. Fingerprints are computed once per distinct SMILES string
  and reused for repeated rows.

  Note: with the defaults the column is called 'morgan_fp_r1' although the
  fingerprint radius is 2; the name is kept because other cells of this
  notebook refer to the column by that name.

  Args:
    df: DataFrame with a 'SMILES' column. It is modified in place.
    radius: Morgan fingerprint radius.
    n_bits: Length of the bit vector.
    column: Name of the column that receives the fingerprint strings.

  Returns:
    The same ``df`` object, now containing ``column``.

  Raises:
    ValueError: If an entry is not a string or RDKit cannot parse it.
  """
  fp_by_smiles = {}
  all_fps = []
  for smiles in df['SMILES'].values:
    if smiles not in fp_by_smiles:
      if not isinstance(smiles, str):
        raise ValueError("SMILES entry is not a string: {!r}".format(smiles))
      mol = Chem.MolFromSmiles(smiles)
      if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError("RDKit could not parse SMILES: {!r}".format(smiles))
      fp = Chem.GetMorganFingerprintAsBitVect(mol, useChirality=True, radius=radius, nBits=n_bits)
      fp_by_smiles[smiles] = "\t".join(map(str, np.array(fp).tolist()))
    all_fps.append(fp_by_smiles[smiles])
  df[column] = all_fps
  return df

# Add the fingerprint column to every split. morgan_fingerprint assigns the
# column on the frame it receives and returns that same frame, so each *_fp
# name refers to the same object as the frame passed in.
train_df_fp = morgan_fingerprint(train_df)
print("Train df finish...")
test_df_fp = morgan_fingerprint(test_df)
print("Test df finish...")
val_df_fp = morgan_fingerprint(val_df)

In [None]:
df_s = {"training":train_df_fp,
        "test":test_df_fp,
        "validation":val_df_fp}

In [None]:
# Write the DTI, compound and protein CSVs for every split. Here `df` is the
# split name (a key of df_s); it is also passed as the output sub-directory.
for df in list(df_s.keys()):
  for c in ("DTI","COMPOUND","PROTEIN"):   
    create_dti_csv(df_s[df],df,c)

In [None]:
deneme = pd.read_csv("/content/dataset/training/training_compound.csv")
deneme.head()

# Train on the BIOSNAP dataset created above, using Google Colab to access the data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Training part

In [None]:
!python3 /content/DeepConv-DTI/DeepConvDTI.py /content/drive/MyDrive/DTI-TEZ/dataset/training/training_dti.csv /content/drive/MyDrive/DTI-TEZ/dataset/training/training_compound.csv /content/drive/MyDrive/DTI-TEZ/dataset/training/training_protein.csv --validation -n validation_dataset -i /content/drive/MyDrive/DTI-TEZ/dataset/validation/validation_dti.csv -d /content/drive/MyDrive/DTI-TEZ/dataset/validation/validation_compound.csv -t /content/drive/MyDrive/DTI-TEZ/dataset/validation/validation_protein.csv -W -c 512 128 -w 10 15 20 25 30 -p 128 -f 128 -r 0.0001 -n 30 -v Convolution -l 2500 -V morgan_fp_r1 -L 2048 -D 0 -a elu -F 128 -b 16 -y 0.0001 -o ./validation_output.csv -m ./model.model -e 15

2021-03-24 20:56:27.769761: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
	model parameters summary	
drug_layers          : [512, 128]
protein_strides      : [10, 15, 20, 25, 30]
protein_layers       : [128]     
fc_layers            : [128]     
learning_rate        : 0.0001    
decay                : 0.0001    
activation           : elu       
filters              : 128       
dropout              : 0.0       
prot_vec             : Convolution
prot_len             : 2500      
drug_vec             : morgan_fp_r1
drug_len             : 2048      
2021-03-24 20:56:29.311724: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-03-24 20:56:29.312577: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-03-24 20:56:29.343563: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] success

In [None]:
!python /content/DeepConv-DTI/predict_with_model.py /content/model.model -n predict -i /content/drive/MyDrive/DTI-TEZ/dataset/test/test_dti.csv -d /content/drive/MyDrive/DTI-TEZ/dataset/test/test_compound.csv -t /content/drive/MyDrive/DTI-TEZ/dataset/test/test_protein.csv -v Convolution -l 2500 -V morgan_fp_r1 -L 2048 -W -o test_result.csv

In [None]:
!python /content/DeepConv-DTI/evaluate_performance.py /content/test_result.csv -n predict -T 0.2

Evaluation of the predict set 
	Sen :  0.9014440433212997
	Spe :  0.6526045487894351
	Acc :  0.7780203784570596
	Precision :  0.7250290360046457
	F1 :  0.8036691342130673


# New Section

In [None]:
import torch
from torch import nn
from torch.nn import MSELoss
from torch.nn import functional as F
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url, check_integrity

from sklearn.model_selection import train_test_split

from functools import partial
import numpy as np


In [None]:
class DeepConvDTIDataset(Dataset):
    """Map-style dataset of (compound fingerprint, encoded protein[, affinity]).

    Each item consists of
      * a float32 tensor with the 2048-bit, radius-2 Morgan fingerprint of the
        compound SMILES,
      * a LongTensor of length ``MAX_TARGET_SEQUENCE_LENGTH`` with the protein
        sequence encoded through ``aminochar2idx`` (truncated or right-padded
        with zeros),
      * in ``mode == "train"`` only, the matching entry of ``affinities``.

    NOTE(review): the fingerprint here is computed without ``useChirality``;
    confirm this is intended if it has to match fingerprints produced elsewhere.
    """

    # Protein sequences are truncated / right-padded to this many residues.
    MAX_TARGET_SEQUENCE_LENGTH = 2500

    # '?' sits at index 0, which is also the value pad_sequence pads with.
    AMINO_ALPHABET = ['?', 'A', 'C', 'B', 'E',
                      'D', 'G', 'L', 'O', 'U',
                      'F', 'I', 'H', 'K', 'M',
                      'N', 'Q', 'P', 'S', 'R',
                      'W', 'V', 'Y', 'X', 'Z',
                      'T']

    # Character -> integer index lookup derived from AMINO_ALPHABET.
    aminochar2idx = dict(zip(AMINO_ALPHABET, range(len(AMINO_ALPHABET))))

    @staticmethod
    def sequence2idx(sequence, index):
        """Return ``[index[c] for c in sequence]``; raises KeyError on unknown symbols."""
        return [index[sequence_character] for sequence_character in sequence]

    @classmethod
    def target2idx(cls, sequence):
        """Encode a protein sequence with the class-level ``aminochar2idx`` table."""
        return cls.sequence2idx(sequence, cls.aminochar2idx)

    def __init__(self, sequences, smiles, affinities=None, mode="train"):
        """Store the parallel input collections.

        Args:
            sequences: Indexable collection of protein sequence strings.
            smiles: Indexable collection of compound SMILES strings.
            affinities: Indexable collection of targets; required when
                ``mode == "train"``.
            mode: ``"train"`` to return targets from ``__getitem__``; any other
                value returns features only.

        Raises:
            ValueError: If the collections differ in length, or if ``mode`` is
                ``"train"`` and ``affinities`` is missing.
        """
        if len(sequences) != len(smiles):
            raise ValueError("sequences and smiles must have the same length")
        if mode == "train" and affinities is None:
            raise ValueError("affinities are required when mode == 'train'")
        if affinities is not None and len(affinities) != len(sequences):
            raise ValueError("affinities and sequences must have the same length")

        self.sequences = sequences
        self.smiles = smiles
        self.affinities = affinities
        self.mode = mode

    def pad_sequence(self, sequence, max_length, index_fn):
        """Encode ``sequence`` with ``index_fn`` and fit it to ``max_length``.

        Longer sequences are truncated, shorter ones are right-padded with 0.
        A LongTensor of exactly ``max_length`` entries is returned in both
        cases (previously a raw string slice came back for long sequences).
        """
        encoded = torch.tensor(index_fn(sequence[:max_length]), dtype=torch.long)
        missing = max_length - encoded.shape[0]
        if missing > 0:
            encoded = F.pad(encoded, pad=(0, missing))
        return encoded

    def __getitem__(self, idx):
        """Return ``(fingerprint, sequence[, affinity])`` for sample ``idx``.

        Raises:
            ValueError: If RDKit cannot parse the SMILES string of this sample.
        """
        sequence = self.pad_sequence(self.sequences[idx], self.MAX_TARGET_SEQUENCE_LENGTH, self.target2idx)

        # The SMILES of *this* sample; the previous code referenced an
        # undefined name `smiles` here.
        mol = Chem.MolFromSmiles(self.smiles[idx])
        if mol is None:
            raise ValueError("RDKit could not parse SMILES: {!r}".format(self.smiles[idx]))
        fingerprint = np.array(Chem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048))
        smile = torch.as_tensor(fingerprint, dtype=torch.float32)

        if self.mode == "train":
            return smile, sequence, self.affinities[idx]

        return smile, sequence

    def __len__(self):
        """Number of samples (length of ``sequences``)."""
        return len(self.sequences)

In [None]:
AMINO_ALPHABET = ['?', 'A', 'C', 'B', 'E',
                      'D', 'G', 'L', 'O', 'U',
                      'F', 'I', 'H', 'K', 'M',
                      'N', 'Q', 'P', 'S', 'R', 
                      'W', 'V', 'Y', 'X', 'Z',
                      'T']

In [None]:
aminochar2idx = dict(zip(AMINO_ALPHABET, range(len(AMINO_ALPHABET))))
aminochar2idx

In [None]:
def sequence2idx(sequence, index):
    """Translate every symbol of ``sequence`` through the lookup ``index``.

    Args:
        sequence: Iterable of symbols (e.g. a string of residue letters).
        index: Mapping from symbol to integer index.

    Returns:
        A list with one looked-up value per symbol, in input order.

    Raises:
        KeyError: If a symbol is missing from ``index``.
    """
    encoded = []
    for symbol in sequence:
        encoded.append(index[symbol])
    return encoded

In [None]:
target2idx = partial(sequence2idx, index=aminochar2idx)
target2idx

functools.partial(<function sequence2idx at 0x7f0bc48c31e0>, index={'?': 0, 'A': 1, 'C': 2, 'B': 3, 'E': 4, 'D': 5, 'G': 6, 'L': 7, 'O': 8, 'U': 9, 'F': 10, 'I': 11, 'H': 12, 'K': 13, 'M': 14, 'N': 15, 'Q': 16, 'P': 17, 'S': 18, 'R': 19, 'W': 20, 'V': 21, 'Y': 22, 'X': 23, 'Z': 24, 'T': 25})

In [None]:
seq_dict = DeepConvDTIDataset.AMINO_ALPHABET
seq_dict

In [None]:
abc = ['A',
       'I',
       'L',
       'V',
       'F',
       'W',
       'Y',
       'N',
       'C',
       'Q',
       'M',
       'S',
       'T']

In [None]:
ds = DeepConvDTIDataset(["".join(seq_dict), "".join(abc)], 
                        ["C=CC(=O)NC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4", "C=CC(=O)NC1=C(C=C2C(=C1)"], 
                        [30, 45])

In [None]:
# Display the three argument lists (sequences, SMILES, affinities) used to
# build the sample DeepConvDTIDataset. The brackets are balanced so the cell
# parses; the previous fragment had an unmatched `]`.
(["".join(seq_dict), "".join(abc)],
 ["C=CC(=O)NC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4", "C=CC(=O)NC1=C(C=C2C(=C1)"],
 [30, 45])

SyntaxError: ignored

In [None]:
# NOTE(review): the recorded output of this cell ends in a SyntaxError, and
# the alias `f` is not used by any cell visible here -- confirm whether this
# package is needed at all.
!pip install functions
import functions as f


Collecting functions
  Downloading https://files.pythonhosted.org/packages/88/6f/7f35add5d405c189d9b9646d298bd50d5db6efdca08af77320b4b0626499/functions-0.7.0.tar.gz
Building wheels for collected packages: functions
  Building wheel for functions (setup.py) ... [?25l[?25hdone
  Created wheel for functions: filename=functions-0.7.0-cp36-none-any.whl size=3064 sha256=7dd1bae36e3469801d63246ea9d00a54e9c2692c47941aec36df291ee8da6e45
  Stored in directory: /root/.cache/pip/wheels/f1/2e/9b/7907d32e006eaf4954ad1a976bff08867851394b26719a6d5f
Successfully built functions
Installing collected packages: functions
Successfully installed functions-0.7.0


SyntaxError: ignored