In [None]:
# Core dependencies: TensorFlow for the CNN, RDKit for molecule handling, NumPy for arrays.
import tensorflow as tf
from os import path, getcwd, chdir
# Opens a TF1-compatibility session with a default ConfigProto.
# NOTE(review): `sess` is not referenced again in the visible cells -- confirm it is still needed.
config = tf.compat.v1.ConfigProto()
sess = tf.compat.v1.Session(config = config)
import rdkit
import numpy as np
# NOTE(review): the wildcard import hides where names come from; the visible cells
# reference `Chem` (imported explicitly on the next line) and `rdchem` (imported later).
from rdkit import *
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [None]:
from rdkit.Chem import rdchem
import re
# Chiral-tag name -> integer code; code 0 means "unspecified".
Chiral = {
    "CHI_UNSPECIFIED": 0,
    "CHI_TETRAHEDRAL_CW": 1,
    "CHI_TETRAHEDRAL_CCW": 2,
    "CHI_OTHER": 3,
}

# Hybridization name -> integer code; code 0 means "unspecified".
Hybridization = {
    "UNSPECIFIED": 0,
    "S": 1,
    "SP": 2,
    "SP2": 3,
    "SP3": 4,
    "SP3D": 5,
    "SP3D2": 6,
    "OTHER": 7,
}

# Width of one feature row: an atom half followed by a structure-symbol half.
atomInfo = 21
structInfo = 21
lensize = atomInfo + structInfo

# Atom half used for an explicit 'H' token: only the first slot is set.
H_Vector = [1] + [0] * (atomInfo - 1)
# Lower-case check: the whole string must be a non-empty run of a-z.
lowerReg = re.compile(r'^[a-z]+$')
def islower(s):
    """Return True if `s` matches the pattern ``^[a-z]+$``, otherwise False."""
    return bool(lowerReg.match(s))
# Upper-case check: the whole string must be a non-empty run of A-Z.
upperReg = re.compile(r'^[A-Z]+$')
def isupper(s):
    """Return True if `s` matches the pattern ``^[A-Z]+$``, otherwise False."""
    return bool(upperReg.match(s))
# information of atoms
def _one_hot_skip_zero(table, key):
    """One-hot encode `table[key]` into len(table)-1 slots; code 0 / unknown keys give all zeros."""
    slots = [0] * (len(table) - 1)
    code = table.get(key, 0)
    if code != 0:
        # Codes start at 1 while the slots start at 0, so shift by one.  Indexing with
        # the raw code left slot 0 unused and overflowed on the highest code.
        slots[code - 1] = 1
    return slots

def calc_atom_feature(atom):
    """Build the per-atom feature list for one RDKit-style atom.

    Layout, in order:
      * 5 slots  - one-hot element class: H, C, O, N, anything else
      * 6 slots  - total Hs / 8, total degree / 4, formal charge / 8,
                   total valence / 8, in-ring flag, aromatic flag
      * len(Chiral) - 1 slots        - one-hot chiral tag (zeros when unspecified)
      * len(Hybridization) - 1 slots - one-hot hybridization (zeros when unspecified)

    With the 4-entry `Chiral` and 8-entry `Hybridization` tables defined in this
    notebook that is 5 + 6 + 3 + 7 = 21 values, i.e. `atomInfo`.

    Note: compared with the previous indexing, each set chirality/hybridization
    bit now sits one slot earlier, so models trained on the old layout should be
    retrained.

    Args:
        atom: object exposing GetSymbol, GetTotalNumHs, GetTotalDegree,
            GetFormalCharge, GetTotalValence, IsInRing, GetIsAromatic,
            GetChiralTag and GetHybridization.

    Returns:
        list of numbers as laid out above.
    """
    element_slot = {'H': 0, 'C': 1, 'O': 2, 'N': 3}.get(atom.GetSymbol(), 4)
    feature = [0] * 5
    feature[element_slot] = 1

    feature.append(atom.GetTotalNumHs()/8)
    feature.append(atom.GetTotalDegree()/4)
    feature.append(atom.GetFormalCharge()/8)
    feature.append(atom.GetTotalValence()/8)
    feature.append(atom.IsInRing()*1)
    feature.append(atom.GetIsAromatic()*1)

    # The tables are keyed by the string form of the RDKit enum values.
    feature.extend(_one_hot_skip_zero(Chiral, str(atom.GetChiralTag())))
    feature.extend(_one_hot_skip_zero(Hybridization, str(atom.GetHybridization())))

    return feature

# Symbol -> (feature slot to set, value `flag` takes afterwards).
# Only the charge signs '+' and '-' leave the flag raised.
_STRUCTURE_SYMBOL_SLOTS = {
    '(': (0, 0),
    ')': (1, 0),
    '[': (2, 0),
    ']': (3, 0),
    '.': (4, 0),
    ':': (5, 0),
    '=': (6, 0),
    '#': (7, 0),
    '\\': (8, 0),
    '/': (9, 0),
    '@': (10, 0),
    '+': (11, 1),
    '-': (12, 1),
}

def calc_structure_feature(c,flag,label):
    """Encode one non-letter SMILES character as a `structInfo`-long one-hot list.

    Args:
        c: the character to encode.
        flag: 1 if the previous symbol was '+' or '-', else 0.
        label: list of digits already seen with flag == 0; appended to in place.

    Returns:
        (feature, flag, label). A digit seen while flag == 0 sets slot 19 the
        first time it appears in `label` and slot 20 on later appearances. A
        digit seen while flag != 0 sets slot int(c) + 11 and clears the flag.
        Any other unrecognised character yields all zeros and leaves the flag
        as it was.
    """
    feature = [0]*structInfo

    known = _STRUCTURE_SYMBOL_SLOTS.get(c)
    if known is not None:
        slot, flag = known
        feature[slot] = 1
    elif c.isdigit():
        if flag == 0:
            seen_before = c in label
            if not seen_before:
                label.append(c)
            feature[20 if seen_before else 19] = 1
        else:
            feature[int(c) + 11] = 1
            flag = 0

    return feature, flag, label

def calc_featurevector(mol, smiles,atomsize):
    """Turn a SMILES string into a flat, zero-padded feature list.

    Each kept character of `smiles` becomes one row of `lensize` values: an
    `atomInfo`-long atom half followed by a `structInfo`-long structure half.
    Lower-case characters are skipped. Upper-case characters are treated as
    atoms: 'H' uses the fixed `H_Vector`, every other one consumes the next atom
    of `mol` by index. All remaining characters go through
    `calc_structure_feature`.

    Args:
        mol: molecule providing `GetAtomWithIdx`; its atoms are read in index
            order as atom characters are met (assumes `smiles` lists atoms in
            the same order as `mol` -- TODO confirm against the caller).
        smiles: SMILES string to walk character by character.
        atomsize: number of rows the result is padded to.

    Returns:
        list of length `atomsize * lensize`.

    Raises:
        ValueError: if `smiles` produces more than `atomsize` rows. Previously
            the padding was silently skipped and an over-long list came back,
            which only failed later when the features were reshaped.
    """
    flag = 0
    label = []
    molfeature = []
    atom_idx = 0
    n_rows = 0

    for c in smiles:
        if islower(c):
            continue
        if isupper(c):
            if c == 'H':
                molfeature.extend(H_Vector)
            else:
                molfeature.extend(calc_atom_feature(mol.GetAtomWithIdx(atom_idx)))
                atom_idx += 1
            molfeature.extend([0]*structInfo)
        else:
            molfeature.extend([0]*atomInfo)
            f, flag, label = calc_structure_feature(c, flag, label)
            molfeature.extend(f)
        n_rows += 1

    if n_rows > atomsize:
        raise ValueError(
            "SMILES needs {} feature rows but atomsize is {}: {!r}".format(n_rows, atomsize, smiles))

    # 0-padding up to atomsize rows
    molfeature.extend([0]*((atomsize - n_rows)*lensize))
    return molfeature
# change molecules to features
def _smiles_rooted_or_default(mol, n, kekulize):
    """Write `mol` as isomeric SMILES rooted at atom `n`; fall back to the unrooted form on error."""
    try:
        return Chem.MolToSmiles(mol, kekuleSmiles=kekulize, isomericSmiles=True, rootedAtAtom=int(n))
    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still propagate.
        return Chem.MolToSmiles(mol, kekuleSmiles=kekulize, isomericSmiles=True)

def mol_to_feature(mol,n,atomsize):
    """Featurize one molecule from its SMILES rooted at atom `n`.

    Two SMILES strings are generated with the same rooting: a non-kekulized one
    that is re-parsed into the molecule whose atoms are featurized, and a
    kekulized one that is walked character by character.

    Args:
        mol: RDKit molecule.
        n: atom index to root the SMILES at (the notebook passes -1); if rooting
            fails, the SMILES is written without `rootedAtAtom`.
        atomsize: number of feature rows to pad to.

    Returns:
        The flat feature list produced by `calc_featurevector`.
    """
    defaultSMILES = _smiles_rooted_or_default(mol, n, kekulize=False)
    isomerSMILES = _smiles_rooted_or_default(mol, n, kekulize=True)
    return calc_featurevector(Chem.MolFromSmiles(defaultSMILES), isomerSMILES, atomsize)

def mol_to_allSMILESfeature(mol, atomsize):
    """Featurize `mol` once per atom, each time from the SMILES rooted at that atom.

    Args:
        mol: RDKit molecule.
        atomsize: number of feature rows each vector is padded to.

    Returns:
        list of flat feature lists, one per root atom in index order. Generation
        stops at the first atom whose rooted, non-kekulized SMILES cannot be
        written, so the list may be shorter than the atom count.
    """
    features = []
    for idx in range(mol.GetNumAtoms()):
        try:
            defaultSMILES = Chem.MolToSmiles(mol, kekuleSmiles=False, isomericSmiles=True, rootedAtAtom=idx)
        except Exception:
            # Best effort as before, but without swallowing KeyboardInterrupt/SystemExit.
            break
        isomerSMILES = Chem.MolToSmiles(mol, kekuleSmiles=True, isomericSmiles=True, rootedAtAtom=idx)
        features.append(calc_featurevector(Chem.MolFromSmiles(defaultSMILES), isomerSMILES, atomsize))
    return features

In [None]:
# read the SMILES-MoA data
# Tab-separated table; later cells read the `SMILES` column by name and the
# second column (index 1) as the MoA name.
import pandas as pd
df = pd.read_csv('top_20_MOAs.txt', sep = '\t')

In [None]:
# Map each of the 20 MoA names to its integer class id (0-19).
MOA_class_dictionary = {
    'EGFR inhibitor': 8,
    'HDAC inhibitor': 16,
    'PI3K inhibitor': 13,
    'acetylcholine receptor agonist': 1,
    'acetylcholine receptor antagonist': 4,
    'adrenergic receptor agonist': 18,
    'adrenergic receptor antagonist': 15,
    'bacterial cell wall synthesis inhibitor': 14,
    'benzodiazepine receptor agonist': 10,
    'calcium channel blocker': 5,
    'cyclooxygenase inhibitor': 6,
    'dopamine receptor antagonist': 12,
    'glucocorticoid receptor agonist': 9,
    'glutamate receptor antagonist': 19,
    'histamine receptor antagonist': 17,
    'phosphodiesterase inhibitor': 3,
    'serotonin receptor agonist': 7,
    'serotonin receptor antagonist': 2,
    'sodium channel blocker': 11,
    'topoisomerase inhibitor': 0,
}

In [None]:
# add classes column
# Look up the class id of every MoA name (second column of the table) and assign
# the whole column by name in one step. The earlier row-by-row `iloc[i, 2]` write
# only hit the new column when the file had exactly two columns.
# An MoA name missing from the dictionary still raises KeyError.
df['classes'] = [MOA_class_dictionary[moa_name] for moa_name in df.iloc[:, 1]]

In [None]:
# Hold out 10% of the molecules as the test set, stratified by class,
# with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    df.SMILES,
    df.classes,
    test_size=0.1,
    stratify=df.classes,
    shuffle=True,
    random_state=1000,
)

In [None]:
# 9-fold stratified split of the train+valid pool; keep the index arrays of every fold.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 9)
_pool_x = np.array(list(x_train_valid))
_pool_y = np.array(list(y_train_valid))
train_index_list, valid_index_list = [], []
for train_index, valid_index in skf.split(_pool_x, _pool_y):
    train_index_list.append(train_index)
    valid_index_list.append(valid_index)

In [None]:
number_of_kfold = 0 # pick one of the 9 folds (0-8)
_fold_train_idx = train_index_list[number_of_kfold]
_fold_valid_idx = valid_index_list[number_of_kfold]
_pool_smiles = np.array(list(x_train_valid))
_pool_classes = np.array(list(y_train_valid))

# Positional selection from the train+valid pool for the chosen fold.
x_train = list(_pool_smiles[_fold_train_idx])
x_valid = list(_pool_smiles[_fold_valid_idx])
y_train = list(_pool_classes[_fold_train_idx])
y_valid = list(_pool_classes[_fold_valid_idx])

# The held-out test split becomes plain lists as well.
x_test = list(x_test)
y_test = list(y_test)

In [None]:
# Concatenate in train -> valid -> test order; later cells slice by position in this order.
x_all = [*x_train, *x_valid, *x_test]
y_all = [*y_train, *y_valid, *y_test]

In [None]:
import pandas as pd
# One "<SMILES> <label>" line per molecule, no header and no index column.
pd_x_all = pd.DataFrame(x_all).assign(labels=y_all)
pd_x_all.to_csv('6663.txt', sep=' ', header=False, index=False)

In [None]:
# NOTE(review): `Mf` is not referenced in the visible cells (the featurization
# functions are defined inline above) -- confirm whether this import is still needed.
import SCFPfunctions as Mf
xp = np
file = '6663.txt'
smi = Chem.SmilesMolSupplier(file, delimiter = ' ', titleLine = False)
mols = [mol for mol in smi if mol is not None]

# Later cells cut the feature matrix into train/valid/test purely by position, so a
# molecule dropped here would silently shift every following row against its split.
# Fail loudly instead of training on misaligned data.
if len(mols) != len(x_all):
    raise ValueError(
        "{} of {} SMILES in {!r} could not be parsed; the positional train/valid/test "
        "slices would be misaligned".format(len(x_all) - len(mols), len(x_all), file))

In [None]:
# Featurize every molecule (500 rows each) and read its class label from the
# `_Name` property of the molecule.
F_list = []
T_list = []
for mol in mols:
    F_list.append(mol_to_feature(mol, -1, 500))
    T_list.append(mol.GetProp('_Name'))

# (n_mols, 1, 500, lensize) float features and (n_mols, 1) integer labels.
data_f = xp.asarray(F_list, dtype=xp.float32).reshape(-1, 1, 500, lensize)
data_t = xp.asarray(T_list, dtype=xp.int32).reshape(-1, 1)
dataset = (data_f, data_t)

# Reverse the axes, then bring the molecule axis back to the front:
# (n_mols, 1, 500, lensize) -> (n_mols, lensize, 500, 1).
y = dataset[1]
x = np.moveaxis(dataset[0].T, -1, 0)

In [None]:
# get the train, valid, test sets
# Record the split sizes before x_train/x_valid/x_test are rebound from SMILES
# lists to feature arrays, so no slice bound depends on an already-replaced name.
n_train, n_valid, n_test = len(x_train), len(x_valid), len(x_test)
valid_end = n_train + n_valid
test_end = valid_end + n_test

# Rows of `x` follow the train -> valid -> test order of the SMILES file.
x_train = x[:n_train]
x_valid = x[n_train:valid_end]
x_test = x[valid_end:test_end]

# `y` has shape (n_mols, 1); take the single column so int() receives scalars
# (int() on a 1-element array is deprecated in recent NumPy).
y_list = [int(label_value) for label_value in y[:, 0]]
Y_train = y_list[:n_train]
Y_valid = y_list[n_train:valid_end]
Y_test = y_list[valid_end:test_end]

In [None]:
# create class weights
train_Y = y_train
y_unique = np.unique(np.array(train_Y))
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = y_unique,
                       y = np.array(train_Y))
# The weights come back in the order of `classes`, so key them by the class labels
# themselves. Keying by position (enumerate) is only right when the labels present
# in the fold are exactly 0..k-1 with none missing.
class_weights_dict45 = dict(zip(y_unique.tolist(), class_weights))

In [None]:
# Model architecture: one 3x3 convolution, 3x3 max pooling, dropout before and
# after flattening, then a softmax layer with one unit per class.
drop = 0.8
_cnn_layers = [
    tf.keras.layers.Conv2D(8, (3, 3), activation='relu', input_shape=(42, 500, 1)),
    tf.keras.layers.MaxPool2D((3, 3)),
    tf.keras.layers.Dropout(drop),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(drop),
    tf.keras.layers.Dense(units=len(set(y_list)), activation='softmax'),
]
model = tf.keras.models.Sequential(_cnn_layers)

In [None]:
# set the checkpoint
# Import the callback from tensorflow.keras, like the other callbacks in this
# notebook, so it comes from the same Keras implementation as the tf.keras model
# it is attached to (mixing standalone `keras` with `tf.keras` can break).
from tensorflow.keras.callbacks import ModelCheckpoint
filepath_cnn = './content/CNN_20_MOA_weights.hdf5'
# Keep only the model with the highest validation accuracy seen so far.
checkpoint_cnn = ModelCheckpoint(filepath_cnn, monitor='val_accuracy', verbose=0, save_best_only=True,
              mode='max')

In [None]:
# compile the model
# The model's last layer already applies softmax, so its outputs are probabilities,
# not logits. With from_logits=True the loss would apply softmax a second time,
# flattening the distribution and distorting the gradients.
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3),
       loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
       metrics = ['accuracy'])

In [None]:
# Train with early stopping on validation loss, best-model checkpointing and a
# learning-rate reduction when validation loss plateaus.
from tensorflow.keras.callbacks import EarlyStopping
earlyStopping = EarlyStopping(monitor='val_loss', mode='min', patience=30, verbose=0)
reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.89,
    patience=7,
    min_delta=1e-119,
    verbose=1,
)
history = model.fit(
    x_train,
    np.array(y_train),
    validation_data=(x_valid, np.array(y_valid)),
    class_weight=class_weights_dict45,
    epochs=1800,
    batch_size=64,
    shuffle=True,
    verbose=2,
    callbacks=[earlyStopping, checkpoint_cnn, reduce_lr_loss],
)

In [None]:
# Load the best model
# Use the tensorflow.keras loader so the checkpoint is read by the same Keras
# implementation that built and saved the tf.keras model.
from tensorflow.keras.models import load_model
best_model = load_model(filepath_cnn)

In [None]:
# Evaluate the checkpointed model on the held-out test set.
from sklearn.metrics import classification_report
# Sanity check that the test split is the expected one (first five labels).
assert list(y_test)[0:5] == [14, 12, 6, 13, 14]
_test_scores = best_model.predict(x_test)
_test_predictions = np.array(_test_scores.argmax(-1))
print(classification_report(y_test, _test_predictions))

In [None]:
# Training curves: one figure for accuracy, one for loss, each showing train vs. valid.
import matplotlib.pyplot as plt

for _train_key, _valid_key, _title, _ylabel in (
    ('accuracy', 'val_accuracy', 'model accuracy', 'accuracy'),
    ('loss', 'val_loss', 'model loss', 'loss'),
):
    plt.plot(history.history[_train_key])
    plt.plot(history.history[_valid_key])
    plt.title(_title)
    plt.ylabel(_ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc = 'upper left')
    plt.show()

In [None]:
# References 
# http://www.dna.bio.keio.ac.jp/smiles/
# https://github.com/pharmbio/dl_quantmap/tree/master/cross_validation/CNN