In [None]:
# install rdkit  
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!conda install -q -y -c rdkit rdkit python=3.7
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
import pprint     
pprint.pprint(sys.path)
!python -c "import site; print (site.getsitepackages())"

In [3]:
import pandas as pd
df = pd.read_csv('top_50_MOAs.txt', sep = '\t')
df

Unnamed: 0,SMILES,MOA
0,Oc1ccc(CCNCC2CCc3ccccc3C2=O)cc1,adrenergic receptor antagonist
1,OC(CNCCNC(=O)Nc1ccccc1)COc1ccccc1C#N,adrenergic receptor antagonist
2,O=C1Nc2ccccc2C2=NC(CN3CCN(CC3)c3ccccc3)CN12,adrenergic receptor antagonist
3,O=C(NC1CCN(CCc2c[nH]c3ccccc23)CC1)c1ccccc1,adrenergic receptor antagonist
4,O[C@H](CNC[C@@H](O)[C@@H]1CCc2cc(F)ccc2O1)[C@H...,adrenergic receptor antagonist
...,...,...
1958,Cn1c2ncn(CC(O)=O)c2c(=O)n(C)c1=O,adenosine receptor agonist
1959,CCNC(=O)[C@H]1O[C@H]([C@H](O)[C@@H]1O)n1cnc2c(...,adenosine receptor agonist
1960,CCNC(=O)[C@H]1O[C@H]([C@@H](O)[C@@H]1O)n1cnc2c...,adenosine receptor agonist
1961,Cc1sc(N)c(C(=O)c2cccc(c2)C(F)(F)F)c1C,adenosine receptor agonist


In [4]:
# check the duplicates 
for i in df.SMILES.tolist():
  if df.SMILES.tolist().count(i) != 1:
    print(i)

In [5]:
MOA_class_dictionary =  {'ATPase inhibitor': 45,
 'Aurora kinase inhibitor': 7,
 'CC chemokine receptor antagonist': 13,
 'CDK inhibitor': 37,
 'DNA synthesis inhibitor': 29,
 'EGFR inhibitor': 41,
 'HCV inhibitor': 25,
 'HDAC inhibitor': 26,
 'HSP inhibitor': 23,
 'JAK inhibitor': 32,
 'MEK inhibitor': 5,
 'PARP inhibitor': 16,
 'PI3K inhibitor': 49,
 'PPAR receptor agonist': 9,
 'acetylcholine receptor agonist': 36,
 'acetylcholine receptor antagonist': 1,
 'acetylcholinesterase inhibitor': 46,
 'adenosine receptor agonist': 8,
 'adenosine receptor antagonist': 44,
 'adrenergic receptor agonist': 22,
 'adrenergic receptor antagonist': 10,
 'angiotensin converting enzyme inhibitor': 6,
 'antioxidant': 47,
 'bacterial 30S ribosomal subunit inhibitor': 14,
 'bacterial DNA gyrase inhibitor': 18,
 'bacterial cell wall synthesis inhibitor': 43,
 'benzodiazepine receptor agonist': 33,
 'bromodomain inhibitor': 2,
 'calcium channel blocker': 42,
 'cyclooxygenase inhibitor': 12,
 'cytochrome P450 inhibitor': 30,
 'dopamine receptor agonist': 48,
 'dopamine receptor antagonist': 27,
 'estrogen receptor agonist': 35,
 'glucocorticoid receptor agonist': 4,
 'glutamate receptor agonist': 19,
 'glutamate receptor antagonist': 20,
 'histamine receptor antagonist': 24,
 'histone lysine methyltransferase inhibitor': 11,
 'local anesthetic': 34,
 'monoamine oxidase inhibitor': 17,
 'opioid receptor agonist': 31,
 'phosphodiesterase inhibitor': 40,
 'potassium channel blocker': 15,
 'protein synthesis inhibitor': 28,
 'serotonin receptor agonist': 38,
 'serotonin receptor antagonist': 3,
 'sodium channel blocker': 39,
 'topoisomerase inhibitor': 21,
 'tubulin polymerization inhibitor': 0}  

In [7]:
sorted_classes = list(MOA_class_dictionary.values())
sorted_classes.sort() 
assert sorted_classes == [i for i in range(50)]

In [8]:
# add classes column 
df['classes'] = None
for i in range(df.shape[0]):
  df.iloc[i,2] = MOA_class_dictionary[df.iloc[i,1]]
df

Unnamed: 0,SMILES,MOA,classes
0,Oc1ccc(CCNCC2CCc3ccccc3C2=O)cc1,adrenergic receptor antagonist,10
1,OC(CNCCNC(=O)Nc1ccccc1)COc1ccccc1C#N,adrenergic receptor antagonist,10
2,O=C1Nc2ccccc2C2=NC(CN3CCN(CC3)c3ccccc3)CN12,adrenergic receptor antagonist,10
3,O=C(NC1CCN(CCc2c[nH]c3ccccc23)CC1)c1ccccc1,adrenergic receptor antagonist,10
4,O[C@H](CNC[C@@H](O)[C@@H]1CCc2cc(F)ccc2O1)[C@H...,adrenergic receptor antagonist,10
...,...,...,...
1958,Cn1c2ncn(CC(O)=O)c2c(=O)n(C)c1=O,adenosine receptor agonist,8
1959,CCNC(=O)[C@H]1O[C@H]([C@H](O)[C@@H]1O)n1cnc2c(...,adenosine receptor agonist,8
1960,CCNC(=O)[C@H]1O[C@H]([C@@H](O)[C@@H]1O)n1cnc2c...,adenosine receptor agonist,8
1961,Cc1sc(N)c(C(=O)c2cccc(c2)C(F)(F)F)c1C,adenosine receptor agonist,8


In [9]:
# A function that changes smiles string to fingerprints 
import rdkit
import numpy as np
from rdkit import *
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
def smiles_to_array_to_string(smiles):
  molecules = Chem.MolFromSmiles(smiles) 
  fingerprints = AllChem.GetMorganFingerprintAsBitVect(molecules, 2)
  x_array = []
  arrays  = np.zeros(0,)
  DataStructs.ConvertToNumpyArray(fingerprints, arrays)
  x_array.append(arrays)
  x_array = np.asarray(x_array)
  x_array = list((np.squeeze(x_array)).astype(int))
  string = ''
  for i in x_array:
    string += str(i) 
  return string

In [10]:
# Check the existence of Isomers
assert len(set([smiles_to_array_to_string(i) for i in df.SMILES.tolist()])) == df.shape[0]

In [11]:
df.head(3)

Unnamed: 0,SMILES,MOA,classes
0,Oc1ccc(CCNCC2CCc3ccccc3C2=O)cc1,adrenergic receptor antagonist,10
1,OC(CNCCNC(=O)Nc1ccccc1)COc1ccccc1C#N,adrenergic receptor antagonist,10
2,O=C1Nc2ccccc2C2=NC(CN3CCN(CC3)c3ccccc3)CN12,adrenergic receptor antagonist,10


In [12]:
# Split out the test set  
from sklearn.model_selection import train_test_split
x_train_valid, x_test, y_train_valid, y_test = train_test_split(df.SMILES, df.classes, test_size =10/100,
 stratify = df.classes, shuffle = True, random_state = 1000)

In [13]:
# kfold
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 9)
skf.get_n_splits(np.array(list(x_train_valid)), np.array(list(y_train_valid)))
train_index_list = []
valid_index_list = []
for train_index, valid_index in skf.split(np.array(list(x_train_valid)), np.array(list(y_train_valid))):
  train_index_list.append(train_index)
  valid_index_list.append(valid_index)

In [14]:
a_list = []
for i in range(9):
  a_list += list(np.array(list(x_train_valid))[valid_index_list[i]])

In [15]:
number_of_kfold =  7 # change the number from 0-8 to get 9 shuffles

In [16]:
  x_train = list(np.array(list(x_train_valid))[train_index_list[ number_of_kfold ]])
  x_valid = list(np.array(list(x_train_valid))[valid_index_list[ number_of_kfold ]])
  y_train = list(np.array(list(y_train_valid))[train_index_list[ number_of_kfold ]])
  y_valid = list(np.array(list(y_train_valid))[valid_index_list[ number_of_kfold ]])
  x_test = list(x_test)
  y_test = list(y_test)

In [17]:
# turn to cannoical  smiles
x_train = [Chem.MolToSmiles(Chem.MolFromSmiles(smi),True) for smi in x_train]
x_valid = [Chem.MolToSmiles(Chem.MolFromSmiles(smi),True) for smi in x_valid]
x_test = [Chem.MolToSmiles(Chem.MolFromSmiles(smi),True) for smi in x_test]

In [18]:
def smiles_to_array(smiles):
  molecules = Chem.MolFromSmiles(smiles) 
  fingerprints = AllChem.GetMorganFingerprintAsBitVect(molecules, 2)
  x_array = []
  arrays = np.zeros(0,)
  DataStructs.ConvertToNumpyArray(fingerprints, arrays)
  x_array.append(arrays)
  x_array = np.asarray(x_array)
  x_array = ((np.squeeze(x_array)).astype(int)) 
  return x_array

In [19]:
train_x = np.zeros((len(x_train), 2048), dtype = np.float32)
for f in range(train_x.shape[0]):
  train_x[f] = smiles_to_array(x_train[f])

In [20]:
valid_x = np.zeros((len(x_valid), 2048), dtype = np.float32)
for f in range(valid_x.shape[0]):
  valid_x[f] = smiles_to_array(x_valid[f])

In [21]:
test_x = np.zeros((len(x_test), 2048), dtype = np.float32)
for f in range(test_x.shape[0]):
  test_x[f] = smiles_to_array(x_test[f])

In [22]:
# Check if there are overlaps
overlap = []
for i in range(train_x.shape[0]):
  for j in range(valid_x.shape[0]):
    if np.array_equal(train_x[i], valid_x[j]) == True:
      overlap.append((i,j))
      print(i,j)

In [23]:
for i in range(test_x.shape[0]):
  for j in range(valid_x.shape[0]):
    if np.array_equal(test_x[i], valid_x[j]) == True:
      overlap.append((i,j))
      print(i,j)

In [24]:
for i in range(train_x.shape[0]):
  for j in range(test_x.shape[0]):
    if np.array_equal(train_x[i], test_x[j]) == True:
      overlap.append((i,j))
      print(i,j)

In [25]:
assert len(overlap) == 0

In [26]:
y_train = np.array(y_train).astype(int)
y_valid = np.array(y_valid).astype(int)
y_test = np.array(y_test).astype(int)

In [27]:
import gc               
gc.collect() 

571

In [28]:
# Create class weights
from sklearn.utils import class_weight
y_unique = np.unique(np.array(y_train))
class_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = y_unique,
                y = np.array(y_train)) 
class_weights_dict45 = dict(enumerate(class_weights))
class_weights_dict45

{0: 1.6526315789473685,
 1: 0.46865671641791046,
 2: 1.8470588235294119,
 3: 0.5607142857142857,
 4: 0.8722222222222222,
 5: 1.8470588235294119,
 6: 1.6526315789473685,
 7: 1.8470588235294119,
 8: 1.8470588235294119,
 9: 1.7444444444444445,
 10: 0.4131578947368421,
 11: 1.4952380952380953,
 12: 0.4025641025641026,
 13: 1.3652173913043477,
 14: 1.7444444444444445,
 15: 1.57,
 16: 1.7444444444444445,
 17: 1.4952380952380953,
 18: 1.256,
 19: 1.4952380952380953,
 20: 0.5322033898305085,
 21: 1.0827586206896551,
 22: 0.46865671641791046,
 23: 1.6526315789473685,
 24: 0.5607142857142857,
 25: 1.9625,
 26: 1.0129032258064516,
 27: 0.615686274509804,
 28: 1.1214285714285714,
 29: 1.6526315789473685,
 30: 1.6526315789473685,
 31: 1.6526315789473685,
 32: 1.7444444444444445,
 33: 1.162962962962963,
 34: 1.6526315789473685,
 35: 1.6526315789473685,
 36: 0.9235294117647059,
 37: 1.162962962962963,
 38: 0.6408163265306123,
 39: 1.0129032258064516,
 40: 0.628,
 41: 0.98125,
 42: 0.8722222222222222,

In [29]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import classification_report

In [31]:
assert list(y_test[0:5]) == [17, 8, 7, 22, 4]
therandomforest = RandomForestClassifier(random_state = 0, class_weight = class_weights_dict45)
print(therandomforest.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, therandomforest.predict(test_x)))

0.6091370558375635               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       1.00      0.88      0.93         8
           2       1.00      0.50      0.67         2
           3       0.40      0.57      0.47         7
           4       0.71      1.00      0.83         5
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       0.50      0.50      0.50         2
           9       1.00      0.50      0.67         2
          10       0.50      0.90      0.64        10
          11       1.00      0.33      0.50         3
          12       0.53      0.80      0.64        10
          13       0.00      0.00      0.00         3
          14       0.50      1.00      0.67         2
          15       0.00      0.00      0.00         2
          16       1.00      0.50      0.67         2
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
assert list(y_test[0:5])  ==  [17, 8, 7, 22, 4]
from sklearn.neighbors import KNeighborsClassifier
theneighbor = KNeighborsClassifier(n_neighbors = 1, algorithm = 'kd_tree',)
print(theneighbor.fit(train_x , y_train).score(test_x, y_test),
   classification_report(y_test, theneighbor.predict(test_x)))

0.5736040609137056               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       1.00      0.62      0.77         8
           2       1.00      0.50      0.67         2
           3       0.50      0.71      0.59         7
           4       0.83      1.00      0.91         5
           5       0.67      1.00      0.80         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       1.00      0.50      0.67         2
           9       1.00      0.50      0.67         2
          10       0.89      0.80      0.84        10
          11       1.00      0.33      0.50         3
          12       0.67      0.60      0.63        10
          13       0.33      0.33      0.33         3
          14       0.67      1.00      0.80         2
          15       0.33      0.50      0.40         2
          16       0.33      0.50      0.40         2
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
assert  list(y_test[0:5]) == [17, 8, 7, 22, 4]
from sklearn.linear_model import LogisticRegression
thelogisticregression = LogisticRegression(random_state = 0, class_weight = class_weights_dict45) 
print(thelogisticregression.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thelogisticregression.predict(test_x)))

0.6294416243654822               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.86      0.75      0.80         8
           2       1.00      1.00      1.00         2
           3       0.33      0.71      0.45         7
           4       0.83      1.00      0.91         5
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       1.00      0.50      0.67         2
           9       1.00      0.50      0.67         2
          10       0.60      0.90      0.72        10
          11       1.00      0.33      0.50         3
          12       0.78      0.70      0.74        10
          13       0.00      0.00      0.00         3
          14       0.50      1.00      0.67         2
          15       0.00      0.00      0.00         2
          16       1.00      0.50      0.67         2
        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
assert list(y_test[0:5])  == [17, 8, 7, 22, 4]
from lightgbm import LGBMClassifier
thelgbclassifier = LGBMClassifier(class_weight = class_weights_dict45)
print(thelgbclassifier.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thelgbclassifier.predict(test_x) ))

0.5685279187817259               precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       1.00      0.62      0.77         8
           2       1.00      1.00      1.00         2
           3       0.25      0.57      0.35         7
           4       0.83      1.00      0.91         5
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       1.00      0.50      0.67         2
           9       0.50      0.50      0.50         2
          10       0.82      0.90      0.86        10
          11       0.50      0.33      0.40         3
          12       0.60      0.60      0.60        10
          13       0.00      0.00      0.00         3
          14       0.50      1.00      0.67         2
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
pip  install catboost

Successfully installed catboost-1.0.6 cycler-0.11.0 fonttools-4.37.1 graphviz-0.20.1 kiwisolver-1.4.4 matplotlib-3.5.3 plotly-5.10.0 scipy-1.7.3 tenacity-8.0.1 typing-extensions-4.3.0
[0m

In [37]:
from catboost import CatBoostClassifier
assert list(y_test[0:5]) == [17, 8, 7, 22, 4]
thecatboost = CatBoostClassifier(verbose = 0, class_weights = class_weights_dict45, task_type = "GPU")
print(thecatboost.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thecatboost.predict(test_x)))

0.5532994923857868               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.83      0.62      0.71         8
           2       1.00      1.00      1.00         2
           3       0.44      0.57      0.50         7
           4       0.83      1.00      0.91         5
           5       0.67      1.00      0.80         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       1.00      0.50      0.67         2
           9       1.00      0.50      0.67         2
          10       0.78      0.70      0.74        10
          11       1.00      0.33      0.50         3
          12       0.50      0.40      0.44        10
          13       0.00      0.00      0.00         3
          14       0.50      1.00      0.67         2
          15       0.67      1.00      0.80         2
          16       1.00      0.50      0.67         2
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
assert list(y_test[0:5]) == [17, 8, 7, 22, 4]
from sklearn.ensemble import BaggingClassifier
thebagging = BaggingClassifier(base_estimator = thelogisticregression, random_state = 0)
print(thebagging.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thebagging.predict(test_x)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.6192893401015228               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.75      0.75      0.75         8
           2       1.00      1.00      1.00         2
           3       0.46      0.86      0.60         7
           4       0.83      1.00      0.91         5
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       1.00      0.50      0.67         2
           9       1.00      0.50      0.67         2
          10       0.60      0.90      0.72        10
          11       1.00      0.33      0.50         3
          12       0.78      0.70      0.74        10
          13       0.00      0.00      0.00         3
          14       0.50      1.00      0.67         2
          15       0.00      0.00      0.00         2
          16       1.00      0.50      0.67         2
        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
assert list(y_test[0:5]) ==  [17, 8, 7, 22, 4]
from sklearn.ensemble import StackingClassifier
estimators = [
('therandomforest', therandomforest), 
('thelgbclassifier',thelgbclassifier ), 
('thelogisticregression', thelogisticregression)]
thestacking = StackingClassifier(estimators = estimators, final_estimator = therandomforest)
print(thestacking.fit(train_x, y_train).score(test_x, y_test),
  classification_report(y_test, thestacking.predict(test_x),))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.6700507614213198               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       1.00      0.75      0.86         8
           2       1.00      1.00      1.00         2
           3       0.50      0.86      0.63         7
           4       0.83      1.00      0.91         5
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       1.00      0.50      0.67         2
           9       1.00      1.00      1.00         2
          10       0.69      0.90      0.78        10
          11       1.00      0.33      0.50         3
          12       0.89      0.80      0.84        10
          13       0.00      0.00      0.00         3
          14       0.50      1.00      0.67         2
          15       0.00      0.00      0.00         2
          16       1.00      0.50      0.67         2
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
assert list(y_test[0:5]) == [17, 8, 7, 22, 4]
thevoting = VotingClassifier(estimators = [
('therandomforest', therandomforest), 
('thelogisticregression', thelogisticregression)], 
voting = 'soft', n_jobs = -1)
print(thevoting.fit(train_x, y_train).score(test_x, y_test),
   classification_report(y_test, thevoting.predict(test_x)))

0.6345177664974619               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       1.00      0.75      0.86         8
           2       1.00      1.00      1.00         2
           3       0.33      0.71      0.45         7
           4       0.83      1.00      0.91         5
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       1.00      0.50      0.67         2
           9       1.00      0.50      0.67         2
          10       0.64      0.90      0.75        10
          11       1.00      0.33      0.50         3
          12       0.70      0.70      0.70        10
          13       0.00      0.00      0.00         3
          14       0.50      1.00      0.67         2
          15       0.00      0.00      0.00         2
          16       1.00      0.50      0.67         2
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
assert list(y_test[0:5]) == [17, 8, 7, 22, 4]
from sklearn.ensemble import AdaBoostClassifier  
theadaboost = AdaBoostClassifier(base_estimator = therandomforest)
print(theadaboost.fit(train_x, y_train).score(test_x, y_test),
      classification_report(y_test, theadaboost.predict(test_x)))

0.5989847715736041               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.67      0.75      0.71         8
           2       1.00      0.50      0.67         2
           3       0.29      0.57      0.38         7
           4       0.62      1.00      0.77         5
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         2
           8       0.50      0.50      0.50         2
           9       1.00      0.50      0.67         2
          10       0.64      0.90      0.75        10
          11       1.00      0.33      0.50         3
          12       0.39      0.70      0.50        10
          13       0.00      0.00      0.00         3
          14       0.67      1.00      0.80         2
          15       0.00      0.00      0.00         2
          16       1.00      0.50      0.67         2
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
# References
# https://future-chem.com/rdkit-google-colab/#toc5
# https://www.rdkit.org/docs/index.html