In [None]:
# install rdkit  
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!conda install -q -y -c rdkit rdkit python=3.7
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
import pprint     
pprint.pprint(sys.path)
!python -c "import site; print (site.getsitepackages())"

In [2]:
import pandas as pd
df = pd.read_csv('top_20_MOAs.txt', sep = '\t')
df

Unnamed: 0,SMILES,MOA
0,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(C)=O,adrenergic receptor antagonist
1,COc1cc2nc(nc(N)c2cc1OC)N(C)CCCNC(=O)C1CCCO1,adrenergic receptor antagonist
2,CC(C)NCC(O)COc1ccccc1CC=C,adrenergic receptor antagonist
3,COc1ccccc1N1CCN(CCN2C(=O)c3ccccc3C(C)(C)C2=O)CC1,adrenergic receptor antagonist
4,CC(C)(C)NCC(O)CSc1nc(cs1)-c1ccc(s1)C(N)=O,adrenergic receptor antagonist
...,...,...
1216,OCCCC(O)=O,benzodiazepine receptor agonist
1217,CN1c2ccc(Cl)cc2C(=NC(O)C1=O)c1ccccc1,benzodiazepine receptor agonist
1218,CCN(C(C)=O)c1cccc(c1)-c1ccnc2c(cnn12)C#N,benzodiazepine receptor agonist
1219,CCOC(=O)c1ncc2[nH]c3ccc(OCc4ccccc4)cc3c2c1COC,benzodiazepine receptor agonist


In [3]:
# check the duplicates 
for i in df.SMILES.tolist():
  if df.SMILES.tolist().count(i) != 1:
    print(i)

In [4]:
MOA_class_dictionary = {'EGFR inhibitor': 8,
 'HDAC inhibitor': 16,
 'PI3K inhibitor': 13,
 'acetylcholine receptor agonist': 1,
 'acetylcholine receptor antagonist': 4,
 'adrenergic receptor agonist': 18,
 'adrenergic receptor antagonist': 15,
 'bacterial cell wall synthesis inhibitor': 14,
 'benzodiazepine receptor agonist': 10,
 'calcium channel blocker': 5,
 'cyclooxygenase inhibitor': 6,
 'dopamine receptor antagonist': 12,
 'glucocorticoid receptor agonist': 9,
 'glutamate receptor antagonist': 19,
 'histamine receptor antagonist': 17,
 'phosphodiesterase inhibitor': 3,
 'serotonin receptor agonist': 7,
 'serotonin receptor antagonist': 2,
 'sodium channel blocker': 11,
 'topoisomerase inhibitor': 0}

In [5]:
sorted_classes = list(MOA_class_dictionary.values())
sorted_classes.sort() 
assert sorted_classes == [i for i in range(20)]

In [6]:
# add classes column 
df['classes'] = None
for i in range(df.shape[0]):
  df.iloc[i,2] = MOA_class_dictionary[df.iloc[i,1]]
df

Unnamed: 0,SMILES,MOA,classes
0,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(C)=O,adrenergic receptor antagonist,15
1,COc1cc2nc(nc(N)c2cc1OC)N(C)CCCNC(=O)C1CCCO1,adrenergic receptor antagonist,15
2,CC(C)NCC(O)COc1ccccc1CC=C,adrenergic receptor antagonist,15
3,COc1ccccc1N1CCN(CCN2C(=O)c3ccccc3C(C)(C)C2=O)CC1,adrenergic receptor antagonist,15
4,CC(C)(C)NCC(O)CSc1nc(cs1)-c1ccc(s1)C(N)=O,adrenergic receptor antagonist,15
...,...,...,...
1216,OCCCC(O)=O,benzodiazepine receptor agonist,10
1217,CN1c2ccc(Cl)cc2C(=NC(O)C1=O)c1ccccc1,benzodiazepine receptor agonist,10
1218,CCN(C(C)=O)c1cccc(c1)-c1ccnc2c(cnn12)C#N,benzodiazepine receptor agonist,10
1219,CCOC(=O)c1ncc2[nH]c3ccc(OCc4ccccc4)cc3c2c1COC,benzodiazepine receptor agonist,10


In [7]:
# A function that changes smiles string to fingerprints 
import rdkit
import numpy as np
from rdkit import *
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
def smiles_to_array_to_string(smiles):
  molecules = Chem.MolFromSmiles(smiles) 
  fingerprints = AllChem.GetMorganFingerprintAsBitVect(molecules, 2)
  x_array = []
  arrays  = np.zeros(0,)
  DataStructs.ConvertToNumpyArray(fingerprints, arrays)
  x_array.append(arrays)
  x_array = np.asarray(x_array)
  x_array = list((np.squeeze(x_array)).astype(int))
  string = ''
  for i in x_array:
    string += str(i) 
  return string

In [8]:
# Check the existence of Isomers
assert len(set([smiles_to_array_to_string(i) for i in df.SMILES.tolist()])) == df.shape[0]

In [9]:
df.head(3)

Unnamed: 0,SMILES,MOA,classes
0,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(C)=O,adrenergic receptor antagonist,15
1,COc1cc2nc(nc(N)c2cc1OC)N(C)CCCNC(=O)C1CCCO1,adrenergic receptor antagonist,15
2,CC(C)NCC(O)COc1ccccc1CC=C,adrenergic receptor antagonist,15


In [10]:
# Split out the test set  
from sklearn.model_selection import train_test_split
x_train_valid, x_test, y_train_valid, y_test = train_test_split(df.SMILES, df.classes, test_size =10/100,
 stratify = df.classes, shuffle = True, random_state = 1000)

In [11]:
# kfold
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 9)
skf.get_n_splits(np.array(list(x_train_valid)), np.array(list(y_train_valid)))
train_index_list = []
valid_index_list = []
for train_index, valid_index in skf.split(np.array(list(x_train_valid)), np.array(list(y_train_valid))):
  train_index_list.append(train_index)
  valid_index_list.append(valid_index)

In [12]:
a_list = []
for i in range(9):
  a_list += list(np.array(list(x_train_valid))[valid_index_list[i]])

In [13]:
number_of_kfold = 6 # change the number from 0-8 to get 9 shuffles

In [14]:
  x_train = list(np.array(list(x_train_valid))[train_index_list[ number_of_kfold ]])
  x_valid = list(np.array(list(x_train_valid))[valid_index_list[ number_of_kfold ]])
  y_train = list(np.array(list(y_train_valid))[train_index_list[ number_of_kfold ]])
  y_valid = list(np.array(list(y_train_valid))[valid_index_list[ number_of_kfold ]])
  x_test = list(x_test)
  y_test = list(y_test)

In [15]:
# turn to cannoical  smiles
x_train = [Chem.MolToSmiles(Chem.MolFromSmiles(smi),True) for smi in x_train]
x_valid = [Chem.MolToSmiles(Chem.MolFromSmiles(smi),True) for smi in x_valid]
x_test = [Chem.MolToSmiles(Chem.MolFromSmiles(smi),True) for smi in x_test]

In [16]:
def smiles_to_array(smiles):
  molecules = Chem.MolFromSmiles(smiles) 
  fingerprints = AllChem.GetMorganFingerprintAsBitVect(molecules, 2)
  x_array = []
  arrays = np.zeros(0,)
  DataStructs.ConvertToNumpyArray(fingerprints, arrays)
  x_array.append(arrays)
  x_array = np.asarray(x_array)
  x_array = ((np.squeeze(x_array)).astype(int)) 
  return x_array

In [17]:
train_x = np.zeros((len(x_train), 2048), dtype = np.float32)
for f in range(train_x.shape[0]):
  train_x[f] = smiles_to_array(x_train[f])

In [18]:
valid_x = np.zeros((len(x_valid), 2048), dtype = np.float32)
for f in range(valid_x.shape[0]):
  valid_x[f] = smiles_to_array(x_valid[f])

In [19]:
test_x = np.zeros((len(x_test), 2048), dtype = np.float32)
for f in range(test_x.shape[0]):
  test_x[f] = smiles_to_array(x_test[f])

In [20]:
# Check if there are overlaps
overlap = []
for i in range(train_x.shape[0]):
  for j in range(valid_x.shape[0]):
    if np.array_equal(train_x[i], valid_x[j]) == True:
      overlap.append((i,j))
      print(i,j)

In [21]:
for i in range(test_x.shape[0]):
  for j in range(valid_x.shape[0]):
    if np.array_equal(test_x[i], valid_x[j]) == True:
      overlap.append((i,j))
      print(i,j)

In [22]:
for i in range(train_x.shape[0]):
  for j in range(test_x.shape[0]):
    if np.array_equal(train_x[i], test_x[j]) == True:
      overlap.append((i,j))
      print(i,j)

In [23]:
assert len(overlap) == 0

In [24]:
y_train = np.array(y_train).astype(int)
y_valid = np.array(y_valid).astype(int)
y_test = np.array(y_test).astype(int)

In [25]:
import gc               
gc.collect() 

66

In [26]:
# Create class weights
from sklearn.utils import class_weight
y_unique = np.unique(np.array(y_train))
class_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = y_unique,
                y = np.array(y_train)) 
class_weights_dict45 = dict(enumerate(class_weights))
class_weights_dict45

{0: 1.6827586206896552,
 1: 1.3942857142857144,
 2: 0.8872727272727273,
 3: 0.976,
 4: 0.7283582089552239,
 5: 1.318918918918919,
 6: 0.6177215189873417,
 7: 0.9959183673469387,
 8: 1.525,
 9: 1.318918918918919,
 10: 1.8074074074074074,
 11: 1.5741935483870968,
 12: 0.9568627450980393,
 13: 1.4787878787878788,
 14: 0.6421052631578947,
 15: 0.6421052631578947,
 16: 1.5741935483870968,
 17: 0.8714285714285714,
 18: 0.7393939393939394,
 19: 0.8271186440677966}

In [27]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import classification_report

In [28]:
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
therandomforest = RandomForestClassifier(random_state = 0, class_weight = class_weights_dict45)
print(therandomforest.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, therandomforest.predict(test_x)))

0.7560975609756098               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       0.80      1.00      0.89         4
           2       0.43      0.43      0.43         7
           3       0.50      0.50      0.50         6
           4       0.70      0.88      0.78         8
           5       1.00      0.80      0.89         5
           6       0.77      1.00      0.87        10
           7       0.56      0.83      0.67         6
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          10       1.00      0.50      0.67         4
          11       1.00      0.25      0.40         4
          12       0.83      0.83      0.83         6
          13       0.67      0.50      0.57         4
          14       1.00      0.90      0.95        10
          15       0.70      0.70      0.70        10
          16       1.00      1.00      1.00         4
        

In [29]:
assert list(y_test[0:5])  == [14, 12, 6, 13, 14]
from sklearn.neighbors import KNeighborsClassifier
theneighbor = KNeighborsClassifier(n_neighbors = 1, algorithm = 'kd_tree',)
print(theneighbor.fit(train_x , y_train).score(test_x, y_test),
   classification_report(y_test, theneighbor.predict(test_x)))

0.6585365853658537               precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.50      0.75      0.60         4
           2       0.67      0.57      0.62         7
           3       0.29      0.33      0.31         6
           4       1.00      0.88      0.93         8
           5       1.00      0.80      0.89         5
           6       0.54      0.70      0.61        10
           7       0.33      0.67      0.44         6
           8       0.80      1.00      0.89         4
           9       1.00      1.00      1.00         5
          10       1.00      0.50      0.67         4
          11       0.50      0.50      0.50         4
          12       1.00      0.83      0.91         6
          13       1.00      0.25      0.40         4
          14       1.00      0.80      0.89        10
          15       0.60      0.60      0.60        10
          16       0.60      0.75      0.67         4
        

In [30]:
assert  list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.linear_model import LogisticRegression
thelogisticregression = LogisticRegression(random_state = 0, class_weight = class_weights_dict45) 
print(thelogisticregression.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thelogisticregression.predict(test_x)))

0.7317073170731707               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       0.60      0.75      0.67         4
           2       0.57      0.57      0.57         7
           3       1.00      0.50      0.67         6
           4       0.70      0.88      0.78         8
           5       1.00      0.80      0.89         5
           6       0.73      0.80      0.76        10
           7       0.56      0.83      0.67         6
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          10       1.00      0.50      0.67         4
          11       0.20      0.25      0.22         4
          12       0.71      0.83      0.77         6
          13       0.67      0.50      0.57         4
          14       1.00      0.90      0.95        10
          15       0.70      0.70      0.70        10
          16       1.00      1.00      1.00         4
        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [31]:
assert list(y_test[0:5])  == [14, 12, 6, 13, 14]
from lightgbm import LGBMClassifier
thelgbclassifier = LGBMClassifier(class_weight = class_weights_dict45)
print(thelgbclassifier.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thelgbclassifier.predict(test_x), ))

0.6747967479674797               precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       0.50      0.50      0.50         4
           2       0.38      0.43      0.40         7
           3       0.60      0.50      0.55         6
           4       0.70      0.88      0.78         8
           5       1.00      0.80      0.89         5
           6       0.80      0.80      0.80        10
           7       0.57      0.67      0.62         6
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          10       0.50      0.25      0.33         4
          11       0.25      0.25      0.25         4
          12       0.71      0.83      0.77         6
          13       0.67      0.50      0.57         4
          14       1.00      0.90      0.95        10
          15       0.55      0.60      0.57        10
          16       1.00      1.00      1.00         4
        

In [32]:
pip  install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [33]:
from catboost import CatBoostClassifier
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
thecatboost = CatBoostClassifier(verbose = 0, class_weights = class_weights_dict45, task_type = "GPU")
print(thecatboost.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thecatboost.predict(test_x)))

0.7235772357723578               precision    recall  f1-score   support

           0       0.67      1.00      0.80         4
           1       0.50      0.75      0.60         4
           2       0.44      0.57      0.50         7
           3       0.60      0.50      0.55         6
           4       0.70      0.88      0.78         8
           5       1.00      0.80      0.89         5
           6       0.80      0.80      0.80        10
           7       0.71      0.83      0.77         6
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          10       1.00      0.25      0.40         4
          11       0.25      0.25      0.25         4
          12       0.71      0.83      0.77         6
          13       0.67      0.50      0.57         4
          14       1.00      0.90      0.95        10
          15       0.78      0.70      0.74        10
          16       1.00      1.00      1.00         4
        

In [34]:
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.ensemble import BaggingClassifier
thebagging = BaggingClassifier(base_estimator = therandomforest, random_state = 0)
print(thebagging.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thebagging.predict(test_x),))

0.6991869918699187               precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.75      0.75      0.75         4
           2       0.67      0.29      0.40         7
           3       0.60      0.50      0.55         6
           4       0.54      0.88      0.67         8
           5       1.00      0.80      0.89         5
           6       0.53      1.00      0.69        10
           7       0.44      0.67      0.53         6
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          10       1.00      0.25      0.40         4
          11       1.00      0.25      0.40         4
          12       0.83      0.83      0.83         6
          13       0.50      0.25      0.33         4
          14       1.00      0.90      0.95        10
          15       0.75      0.60      0.67        10
          16       1.00      1.00      1.00         4
        

In [35]:
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.ensemble import StackingClassifier
estimators = [
('therandomforest', therandomforest), 
('thecatboost', thecatboost), 
('thelogisticregression', thelogisticregression)]
thestacking = StackingClassifier(estimators = estimators, final_estimator = therandomforest)
print(thestacking.fit(train_x, y_train).score(test_x, y_test),
  classification_report(y_test, thestacking.predict(test_x),))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.7317073170731707               precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       0.60      0.75      0.67         4
           2       0.57      0.57      0.57         7
           3       0.67      0.67      0.67         6
           4       0.70      0.88      0.78         8
           5       1.00      0.80      0.89         5
           6       0.89      0.80      0.84        10
           7       0.71      0.83      0.77         6
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          10       1.00      0.25      0.40         4
          11       0.20      0.25      0.22         4
          12       0.71      0.83      0.77         6
          13       0.67      0.50      0.57         4
          14       1.00      0.90      0.95        10
          15       0.64      0.70      0.67        10
          16       1.00      1.00      1.00         4
        

In [36]:
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
thevoting = VotingClassifier(estimators = [
('therandomforest', therandomforest), 
('thecatboost', thecatboost), 
('thelogisticregression', thelogisticregression)], 
voting = 'soft', n_jobs = -1)
print(thevoting.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thevoting.predict(test_x)))

0.7398373983739838               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       0.60      0.75      0.67         4
           2       0.44      0.57      0.50         7
           3       0.60      0.50      0.55         6
           4       0.70      0.88      0.78         8
           5       1.00      0.80      0.89         5
           6       0.82      0.90      0.86        10
           7       0.71      0.83      0.77         6
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          10       0.67      0.50      0.57         4
          11       0.33      0.25      0.29         4
          12       0.71      0.83      0.77         6
          13       0.67      0.50      0.57         4
          14       1.00      0.90      0.95        10
          15       0.64      0.70      0.67        10
          16       1.00      1.00      1.00         4
        

In [37]:
assert list(y_test[0:5]) == [14, 12, 6, 13, 14]
from sklearn.ensemble import AdaBoostClassifier  
theadaboost = AdaBoostClassifier(base_estimator = therandomforest)
print(theadaboost.fit(train_x, y_train).score(test_x, y_test),
      classification_report(y_test, theadaboost.predict(test_x)))

0.7317073170731707               precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.60      0.75      0.67         4
           2       0.33      0.29      0.31         7
           3       1.00      0.33      0.50         6
           4       0.67      1.00      0.80         8
           5       1.00      0.80      0.89         5
           6       0.77      1.00      0.87        10
           7       0.33      0.50      0.40         6
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         5
          10       1.00      0.50      0.67         4
          11       0.50      0.25      0.33         4
          12       0.83      0.83      0.83         6
          13       0.50      0.50      0.50         4
          14       1.00      0.90      0.95        10
          15       0.75      0.60      0.67        10
          16       1.00      1.00      1.00         4
        

In [38]:
# References
# https://future-chem.com/rdkit-google-colab/#toc5
# https://www.rdkit.org/docs/index.html