#### DeepDTA

### DeepDTI

DeepConv-DTI: Prediction of drug-target
interactions via deep learning with
convolution on protein sequences


Paper : https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007129

Code : https://github.com/GIST-CSBL/DeepConv-DTI


Cost, labor-intensive, in vitro and in vivo, in silico-based, local residue patterns of proteins, binding sites, early stage of drug
discovery

Docking Approach using Ray-Casting (DARC)


Chemical space + Genomic spaces = Pharmacological space

Yamanishi - kernel regression method
Bleakley - bipartite local model

Matrix Factorization Methods


cons

similarity-based methods work well for DTIs within
specific protein classes but not for other classes


pros


features-based methods
for proteins, composition, transition, and distribution (CTD).
for drugs, fingerprints.

Quantitative structure-activity relationship (QSAR) models

Simplified molecular-input line entry system (SMILES) and Amino Acid Sequences.

deep-nets

DeepDTI

The composition of amino acids, dipeptides, and tripeptides for proteins and extended-connectivity fingerprint (ECFP) for drugs
MFDR
DL-CPI
DeepDTA

KronRLS
SimBoost

### refs

cost, in silico - https://www.sciencedirect.com/science/article/abs/pii/S0009279706003541?via%3Dihub

## ECFP4, PubChem, PSC, CTD

## ECFP4, PubChem, PSC, CTD

### MolTrans

### PromolNet

In [None]:
!pip install propy3 khmer

import sys
import urllib
import string
from pathlib import Path
from itertools import repeat, product


import pandas as pd

import khmer

from propy import PyPro
from propy.GetProteinFromUniprot import GetProteinSequence


from IPython.display import Image

In [None]:
# Names of the datasets, their splits, and the file extension of the split files.
file_type = "csv"
dataset_identifiers = [
    "train",
    "test",
    "val",
]
datasets = [
    "BIOSNAP",
    "BindingDB",
    "DAVIS",
]

In [None]:
# `import urllib` alone does not load the `urllib.request` submodule, so it is
# imported explicitly before `urlretrieve` is called.
import urllib.request

# Local directory that receives every downloaded dataset.
data_path = Path("data")
data_path.mkdir(exist_ok=True)

datasets = ["BIOSNAP", "BindingDB", "DAVIS"]
dataset_identifiers = ["train", "test", "val"]
file_type = "csv"

BASE_URL = "https://raw.githubusercontent.com/kexinhuang12345/MolTrans/master/dataset/"

col_identifiers = ["DrugBank ID", "Gene", "Label", "SMILES", "Target Sequence"]

for dataset in datasets:
    dataset_dir = data_path / dataset
    dataset_dir.mkdir(exist_ok=True)

    # Download each split and load it immediately, keyed by its own name, so a
    # split can never be paired with the wrong "dataset_type" label (the list
    # order is train/test/val, which previously got unpacked as train/val/test).
    splits = {}
    for data_identifier in dataset_identifiers:
        file_name = data_identifier + "." + file_type
        if dataset == "BIOSNAP":
            # BIOSNAP splits are fetched from its "full_data" sub-folder and
            # stored locally as "full_data_<split>.csv".
            file_category = "full_data"
            remote_path = "/".join([dataset, file_category, file_name])
            local_path = dataset_dir / "_".join([file_category, file_name])
        else:
            remote_path = "/".join([dataset, file_name])
            local_path = dataset_dir / file_name

        urllib.request.urlretrieve(BASE_URL + remote_path, local_path)

        split = pd.read_csv(local_path)
        split["dataset_type"] = data_identifier
        splits[data_identifier] = split

    # Stack the splits in train / val / test order under one fresh 0..n-1 index.
    all_dataset = pd.concat([splits["train"], splits["val"], splits["test"]]).reset_index(drop=True)

    del splits, split

    # Store the row labels that belong to each split as a raw binary index file.
    for dataset_type in dataset_identifiers:
        all_dataset[all_dataset["dataset_type"].isin([dataset_type])].index.values.tofile(dataset_dir / (dataset_type + "_" + "indices.bin"))

    all_dataset.to_csv(dataset_dir / "data.csv")

### Encoding Methods

#### Protein subfamily representation

In [None]:
# Amino-acid alphabets of decreasing size: stage 1 is the full 20-letter
# alphabet, and every later stage partitions those same 20 residues into fewer
# groups (each string is one group of residues).
# Stage 1 previously ended in "...KRG", which listed G twice and left out H.
STAGE_1_ALPHABET = list("LVIMCAGSTPFYWEDNQKRH")
STAGE_2_ALPHABET = ["LVIM","C", "A", "G", "S", "T", "P", "FY", "W", "E", "D", "N", "Q", "KR", "H"]
STAGE_3_ALPHABET = ["LVIM","C", "A", "G", "ST", "P", "FYW", "EDNQ", "KR", "H"]
STAGE_4_ALPHABET = ["LVIMC", "AG", "ST", "P", "FYW", "EDNQ", "KR", "H"]
STAGE_5_ALPHABET = ["LVIMC", "AGSTP", "FYW", "EDNQKRH"]
STAGE_6_ALPHABET = ["LVIMCAGSTPFYW", "EDNQKRH"]

In [None]:
def create_alphabet_merging_rules(mappings,
                                  alphabet_length=20,
                                  mapping_characters=string.ascii_lowercase):
    """Build a residue -> symbol lookup table for a reduced alphabet.

    The i-th group in ``mappings`` is assigned the i-th entry of
    ``mapping_characters``, and every residue of that group maps to that
    symbol.

    Args:
        mappings: Iterable of residue groups, e.g. ``["LVIM", "C", "KR"]``.
        alphabet_length: Unused; kept so existing callers keep working.
        mapping_characters: Indexable sequence of replacement symbols, one
            per group (defaults to ``a``..``z``).

    Returns:
        dict mapping each residue character to its group's symbol.

    Raises:
        IndexError: If there are more groups than replacement symbols.
    """
    rules = dict()

    for position, mapping in enumerate(mappings):
        symbol = mapping_characters[position]
        # Single- and multi-residue groups are handled identically, so the
        # whole symbol is always used (a one-residue group used to receive
        # only the first character of its symbol).
        rules.update(dict.fromkeys(mapping, symbol))

    return rules


def compose_sequence(sequence,
                     mapping_rules):
    """Rewrite ``sequence`` by substituting characters according to ``mapping_rules``.

    When every key is a single character, all rules are applied in one
    simultaneous pass, so the output of one rule is never rewritten by
    another rule (e.g. ``{"A": "B", "B": "C"}`` turns ``"AB"`` into ``"BC"``).
    Rules with longer keys are applied one after another in dict order.

    Args:
        sequence: String to rewrite.
        mapping_rules: dict of ``{text_to_replace: replacement}``.

    Returns:
        The rewritten string; characters without a rule are kept unchanged.
    """
    if all(len(key) == 1 for key in mapping_rules):
        # A translation table needs single-character keys; it walks the
        # string once instead of once per rule.
        return sequence.translate(str.maketrans(mapping_rules))

    # Multi-character keys cannot be expressed as a translation table.
    for key, value in mapping_rules.items():
        sequence = sequence.replace(key, value)

    return sequence

def get_composed_sequences(sequences, rule):
    """Apply one merging rule to every entry of ``sequences`` via its ``apply`` method."""
    def _recode(single_sequence):
        return compose_sequence(single_sequence, rule)

    return sequences.apply(_recode)

def concat_composed_sequences(sequences, rules, columns):
    """Recode ``sequences`` once per rule and place the results side by side.

    Args:
        sequences: Sequences to recode, passed on to ``get_composed_sequences``.
        rules: Iterable of merging rules, one per output column.
        columns: Column labels assigned to the result, in the order of ``rules``.

    Returns:
        The column-wise concatenation of the recoded sequences.
    """
    recoded_columns = []
    for rule in rules:
        recoded_columns.append(get_composed_sequences(sequences, rule))

    side_by_side = pd.concat(recoded_columns, axis=1)
    side_by_side.columns = columns
    return side_by_side

In [None]:
# Load the merged BIOSNAP table from the relative "data" directory rather than
# the Colab-only absolute path "/content/data/...", so the notebook also runs
# outside Colab (assumes the working directory holds the "data" folder).
data = pd.read_csv(Path("data") / "BIOSNAP" / "data.csv")

In [None]:
# One merging-rule table per reduced alphabet, keyed "ALPHABET-2" .. "ALPHABET-6".
rules = {
    f"ALPHABET-{idx + 2}": create_alphabet_merging_rules(alphabet)
    for idx, alphabet in enumerate(
        (
            STAGE_2_ALPHABET,
            STAGE_3_ALPHABET,
            STAGE_4_ALPHABET,
            STAGE_5_ALPHABET,
            STAGE_6_ALPHABET,
        )
    )
}

In [None]:
# Reduced alphabets to encode with, and the merging rules registered for them.
included_alphabets = ["ALPHABET-2", "ALPHABET-3", "ALPHABET-4"]
included_rules = list(map(rules.__getitem__, included_alphabets))

# One column of recoded target sequences per selected alphabet.
concat_composed_sequences(
    data.loc[:, "Target Sequence"],
    included_rules,
    columns=included_alphabets,
)

Unnamed: 0,ALPHABET-2,ALPHABET-3,ALPHABET-4
0,acneaaagamaaaaeacajfcdjjcmdknaakdcgbcndeogimac...,acieaaafahaaaaeacahecdhhchdhiaahdcfbcidejfghac...,abgcaaadafaaaacabafcbbffbfbfgaafbbdabgbchdefab...
1,ahdlheohanhgcdhddegdofdefeaegeccaefdngakeogehf...,agdhgejgaigfcdgddefdjedeeeaefeccaeedifahejfege...,aebfecheagedbbebbcdbhcbcccacdcbbaccbgdafchdcec...
2,acfhajaefncnagaadadfinegadnanjcanacakcdhnoakbc...,acegahaeeiciafaadadegiefadiaihcaiacahcdgijahbc...,abceafaccgbgadaababcegcdabgagfbagabafbbeghafab...
3,adcdaaaadcejgdlaeeccgagkdccfccnaaagceggceaaggc...,adcdaaaadcehfdhaeeccfafhdccecciaaafceffceaaffc...,abbbaaaabbcfdbfaccbbdadfbbbcbbgaaadbcddbcaaddb...
4,ahaifednfeeehnokjnnlahmnankokaaknnnfafcancdjkn...,agageedieeeegijhhiihaghiaihjhaahiiieaecaicdhhi...,aeaeccbgcccceghffggfaefgagfhfaafgggcacbagbbffg...
...,...,...,...
27478,adajcaagacaaacahaaaakaaonnmniccnhggdgagagdadla...,adahcaafacaaacagaaaahaajiihigccigffdfafafdadha...,abafbaadabaaabaeaaaafaahggfgebbgeddbdadadbabfa...
27479,aldgcamgeegeecgecegcccgndiejhbjaocacccnjacnmhi...,ahdfcahfeefeecfecefcccfidgehgbhajcacccihacihgg...,afbdbafdccdccbdcbcdbbbdgbecfeafahbabbbgfabgfee...
27480,ackahgdlkefcemkaclnhcnndcanmnlaojankonhacnhhnm...,achagfdhheecehhachigciidcaihihajhaihjigaciggih...,abfaedbffccbcffabfgebggbbagfgfahfagfhgeabgeegf...
27481,aeennfnfnfnnngmncfelahcahkmemamjhnjchlaakmlnkd...,aeeiieieieiiifhiceehagcaghhehahhgihcghaahhhihd...,accggcgcgcgggdfgbccfaebaeffcfaffegfbefaafffgfb...


#### K-Mer Counter

In [None]:
def extract_kmers(sequence, ksize=2, kfactor=4):
    """Feed ``sequence`` into a fresh khmer ``Counttable`` and return the table.

    Args:
        sequence: String passed to the table's ``consume`` method.
        ksize: k-mer length the table is created with.
        kfactor: Unused; kept so existing callers keep working.

    Returns:
        The populated ``khmer.Counttable``; counts are read with ``.get(kmer)``.
    """
    # Same size as the former ``1e7``, but passed as an int instead of a float.
    table_size = 10_000_000
    table_count = 1

    kmer = khmer.Counttable(ksize, table_size, table_count)
    # NOTE(review): per khmer's docs, bigcount lifts the small default
    # per-k-mer count ceiling - confirm against the installed version.
    kmer.set_use_bigcount(True)
    kmer.consume(sequence)

    return kmer

def get_unique_khmer_set(alphabet, ksize):
    """Return every string of length ``ksize`` over ``alphabet``, sorted.

    Args:
        alphabet: Iterable of symbols; it is read once, so one-shot iterators
            work as well as lists or strings.
        ksize: Length of the generated strings.

    Returns:
        Sorted list of all ``len(alphabet) ** ksize`` combinations.
    """
    return sorted("".join(letters) for letters in product(alphabet, repeat=ksize))

def get_kmer_counts(kmers, items):
    """Look up each entry of ``items`` in ``kmers`` and return ``{item: kmers.get(item)}``."""
    return {item: kmers.get(item) for item in items}


def get_residue_patterns_from_composed_sequence(sequence, items, kmer_size=2):
    """Count the k-mers listed in ``items`` within one recoded sequence.

    Args:
        sequence: Recoded sequence string.
        items: k-mers to report; each should have length ``kmer_size``.
        kmer_size: k-mer length used for counting.

    Returns:
        pandas Series indexed by ``items`` holding the count of each k-mer.
    """
    # Forward kmer_size: it used to be ignored, so counting always ran with
    # the default k of extract_kmers no matter what the caller asked for.
    kmers = extract_kmers(sequence, ksize=kmer_size)
    return pd.Series(get_kmer_counts(kmers, items))


def concat_kmer_patterns(sequence_list, alphabet, kmer_size=2):
    """Build a k-mer count table with one column per sequence.

    Args:
        sequence_list: Iterable of recoded sequence strings.
        alphabet: Symbols of the recoded alphabet; duplicates are dropped.
        kmer_size: k-mer length.

    Returns:
        The per-sequence count Series concatenated column-wise.
    """
    unique_symbols = set(alphabet)
    kmer_vocabulary = get_unique_khmer_set(sorted(unique_symbols), kmer_size)

    per_sequence_counts = []
    for sequence in sequence_list:
        per_sequence_counts.append(
            get_residue_patterns_from_composed_sequence(sequence, kmer_vocabulary, kmer_size)
        )

    return pd.concat(per_sequence_counts, axis=1)

In [None]:
# k-mer counts for the "ALPHABET-4" encoding of the rows labelled 0..100.
concat_kmer_patterns(
    concat_composed_sequences(
        data.loc[:, "Target Sequence"],
        included_rules,
        columns=included_alphabets,
    ).loc[:100, "ALPHABET-4"].values,
    rules["ALPHABET-4"].values(),
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
aa,25,33,17,56,26,32,23,11,34,59,51,15,38,196,37,24,33,34,57,18,16,37,9,181,6,29,43,50,24,50,21,75,44,170,48,44,26,29,25,14,...,22,50,22,52,19,35,86,43,28,22,37,24,46,16,60,46,50,51,48,59,42,29,12,46,51,30,23,19,19,45,43,3,40,44,16,52,50,23,56,49
ab,16,39,25,41,16,28,30,8,23,64,40,27,36,155,33,23,46,63,33,14,10,50,5,166,7,45,35,24,30,25,21,70,35,114,49,34,50,18,35,30,...,37,24,26,20,33,45,83,36,45,30,33,23,63,22,24,63,25,38,34,64,32,46,14,33,50,43,31,14,29,43,54,7,41,29,22,20,25,34,75,49
ac,20,28,16,26,28,31,12,11,56,80,54,28,26,135,39,24,22,34,36,16,20,41,9,165,4,36,35,32,20,34,23,70,45,248,42,38,24,46,28,16,...,26,32,34,33,21,25,66,55,28,19,35,31,42,10,51,42,34,24,41,80,26,27,17,31,60,18,34,8,15,25,44,2,21,43,13,33,34,31,71,58
ad,7,13,7,17,2,9,10,3,20,31,21,9,12,52,16,11,13,13,12,7,4,20,6,69,0,12,14,16,6,20,12,25,11,94,18,15,11,17,8,3,...,5,16,8,18,12,12,35,21,14,9,20,11,16,12,15,16,20,23,10,31,20,15,4,20,29,12,14,7,6,12,16,2,12,16,2,18,20,8,7,16
ae,4,11,14,26,18,21,16,7,27,46,38,22,30,122,32,22,29,19,29,11,15,25,5,125,4,18,17,40,19,27,6,62,27,89,25,24,8,25,21,9,...,28,40,20,26,20,7,43,22,12,10,34,25,28,5,27,28,27,31,37,46,30,23,11,33,44,7,17,7,16,19,30,6,18,34,19,26,27,24,45,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
hd,2,2,1,0,0,1,4,0,1,3,0,1,0,6,2,0,2,0,2,0,0,5,0,6,0,0,2,0,0,3,3,2,1,4,2,1,0,1,2,0,...,1,0,0,2,4,0,3,5,1,1,1,2,1,3,2,1,3,3,1,3,1,1,2,1,4,1,1,1,2,1,0,0,1,2,1,2,3,0,2,2
he,0,3,3,2,3,5,2,1,1,2,1,3,3,4,3,3,4,0,4,0,1,3,4,12,2,4,1,1,1,2,0,1,1,13,3,3,0,5,1,2,...,1,1,3,2,3,1,6,3,2,0,0,1,1,4,4,1,2,6,2,2,7,3,3,1,3,2,3,3,0,7,2,0,2,3,1,2,2,5,3,3
hf,0,2,2,0,4,5,5,0,2,9,1,3,2,21,7,2,11,5,5,1,0,7,3,13,4,6,9,4,1,6,0,3,1,26,3,4,3,2,5,3,...,7,4,3,5,2,3,9,11,4,5,1,4,1,1,3,1,6,10,0,9,7,3,1,0,10,3,6,4,6,6,4,1,3,10,3,5,6,4,2,9
hg,3,4,3,2,1,6,5,1,3,9,1,1,1,16,3,2,6,3,3,1,0,2,4,14,2,4,0,1,0,3,1,2,0,20,2,3,7,0,2,1,...,2,1,2,2,3,0,7,8,5,2,3,0,3,0,5,3,3,0,2,9,4,0,3,6,7,1,0,0,5,3,2,1,2,2,0,2,3,0,2,3


#### Protein composition extraction

In [None]:
# Descriptor names; each is looked up as the method "Get<name>" when extracting.
compositions = [
    "AAComp",
    "DPComp",
    "TPComp",
]

In [None]:
def extract_composition(sequence, composition="AAComp"):
    """Compute one composition descriptor of ``sequence`` as a pandas Series.

    Args:
        sequence: Protein sequence handed to ``PyPro.GetProDes``.
        composition: Descriptor name; the method ``Get<composition>`` of the
            descriptor object is called.

    Returns:
        Series of descriptor values indexed by descriptor key; the index is
        named after the sequence itself.
    """
    descriptor = PyPro.GetProDes(sequence)
    compute = getattr(descriptor, f"Get{composition}")
    values_by_feature = compute()

    feature_index = pd.Index(values_by_feature.keys(), name=sequence)
    return pd.Series(values_by_feature.values(), index=feature_index)

In [None]:
def concat_compositions(sequence, compositions):
    """Stack several composition descriptors of one sequence into a single Series.

    Args:
        sequence: Protein sequence.
        compositions: Iterable of descriptor names accepted by ``extract_composition``.

    Returns:
        The descriptors concatenated end to end, in the order given.
    """
    descriptor_parts = []
    for composition in compositions:
        descriptor_parts.append(extract_composition(sequence, composition))

    stacked = pd.concat(descriptor_parts)
    return stacked.T

In [None]:
# Composition descriptors of the target sequences in rows labelled 0..4, one column each.
pd.concat(
    [
        concat_compositions(data.loc[ix, "Target Sequence"], compositions)
        for ix in range(5)
    ],
    axis=1,
)

Unnamed: 0,0,1,2,3,4
A,5.534,6.911,7.595,13.836,3.404
R,4.743,5.616,3.165,7.966,4.681
N,3.953,3.456,3.481,2.306,4.681
D,5.534,4.536,5.380,4.193,2.979
C,4.743,2.808,1.266,3.354,4.681
...,...,...,...,...,...
VVS,0.000,0.000,0.000,1.000,0.000
VVT,0.000,0.000,0.000,0.000,0.000
VVW,0.000,0.000,0.000,1.000,0.000
VVY,0.000,0.000,0.000,0.000,0.000
