# Comparing iwatobipen solubility results with our classifier#
    - Matt Robinson

https://iwatobipen.wordpress.com/2019/02/01/try-gcn-qspr-with-pytorch-based-graph-library-rdkit-pytorch-dgl/

The prolific and well-known chemoinformatics blogger *iwatobipen* released his result using a graph convolutional network similarly built with pytorch and dgl. Here we compare his results to those obtained with our own gcn built with pytorch and gcn. Our gcn is built includes a few more features somewhat emulating the structure of the DeepChem gcn.

The dataset is a solubility dataset available directly from RDKIT. 

In [1]:
import sys
sys.path.insert(0, '..')

import mygcn

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
from mygcn import features
from mygcn import gcn
from mygcn import train
from mygcn import evaluation

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
import dgl

In [5]:
from rdkit import Chem
from rdkit.Chem import RDConfig
import os

# Getting the Data: #

In [6]:
solcls = {'(A) low':0, '(B) medium':1, '(C) high':2} # note the 3 classes

train_mols = [m for m in Chem.SDMolSupplier(os.path.join(RDConfig.RDDocsDir,'Book/data/solubility.train.sdf'))]
train_smiles = [Chem.MolToSmiles(x) for x in train_mols]
train_y = [solcls[m.GetProp('SOL_classification')] for m in train_mols]

test_mols = [m for m in Chem.SDMolSupplier(os.path.join(RDConfig.RDDocsDir,'Book/data/solubility.test.sdf'))]
test_smiles = [Chem.MolToSmiles(x) for x in test_mols]
test_y = [solcls[m.GetProp('SOL_classification')] for m in test_mols]

In [7]:
trainset_df = pd.DataFrame({'smiles': train_smiles, 'labels': train_y})
trainset_df.to_csv('solubility_classification_train.csv',index=False)

In [8]:
testset_df = pd.DataFrame({'smiles': test_smiles, 'labels': test_y})
testset_df.to_csv('solubility_classification_test.csv',index=False)

In [9]:
print('training set size: ', len(train_smiles))
print('testing_set_size: ', len(test_smiles))

training set size:  1025
testing_set_size:  257


In [10]:
train_dl, valid_dl, _ = features.get_graph_data('solubility_classification_train.csv',
                                      smiles_field='smiles',
                                      labels_field='labels',
                                      train_size=1.0,
                                      valid_size=0.0,
                                      self_edges=True,
                                      edge_features=True,
                                      seed=13)

In [11]:
# note the use of the function here, since the train/test sets are
# already split for us
test_dl, _, _ = features.get_graph_data('solubility_classification_test.csv',
                                    smiles_field='smiles',
                                    labels_field='labels',
                                    train_size=1.0,
                                    valid_size=0.0,
                                    self_edges=True,
                                    edge_features=True,
                                    seed=13)

# Create our DeepChem-like model #

In [12]:
num_features = features.get_num_atom_features() + features.get_num_bond_features()
learning_rate = 0.001

model = gcn.DeepChemGCNClassifier(n_inputs=num_features,
                                 n_hidden=64,
                                 n_hidden_layers=2,
                                 n_outputs=len(solcls),
                                 dropout=0.2)
loss_func = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=learning_rate)

In [13]:
print(model)

DeepChemGCNClassifier(
  (dropout): Dropout(p=0.2)
  (layers): ModuleList(
    (0): GraphConvLayer(
      (linear): Linear(in_features=73, out_features=64, bias=True)
    )
    (1): BatchNormLayer(
      (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): GraphPoolLayer()
    (3): GraphConvLayer(
      (linear): Linear(in_features=64, out_features=64, bias=True)
    )
    (4): BatchNormLayer(
      (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (5): GraphPoolLayer()
  )
  (dense_layer): Linear(in_features=64, out_features=128, bias=True)
  (final_bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (classification_layer): Linear(in_features=128, out_features=3, bias=True)
)


### Note that we will train for only 100 epochs, not 200 as is done in the post ###

In [14]:
%%time
# Note no validation set in this case, matching iwatobipen's training
# valid_dl is just an empty dataloader
train.fit(model, train_dl, valid_dl, loss_func, opt, n_epochs=100,
          report_valid_loss=False)

Epoch 0, train loss 1.0255 valid loss N/A
Epoch 10, train loss 0.5751 valid loss N/A
Epoch 20, train loss 0.4999 valid loss N/A
Epoch 30, train loss 0.4531 valid loss N/A
Epoch 40, train loss 0.4356 valid loss N/A
Epoch 50, train loss 0.4782 valid loss N/A
Epoch 60, train loss 0.5086 valid loss N/A
Epoch 70, train loss 0.3796 valid loss N/A
Epoch 80, train loss 0.3616 valid loss N/A
Epoch 90, train loss 0.4538 valid loss N/A
CPU times: user 7min 42s, sys: 1.46 s, total: 7min 43s
Wall time: 2min 42s


In [15]:
evaluation.evaluate_classifier(model, test_dl, loss_func, classes=[0,1,2])

test_loss:  0.7727183434698317
accuracy:  0.7704280155642024
classification report: 
               precision    recall  f1-score   support

           0       0.84      0.81      0.83       102
           1       0.77      0.69      0.73       115
           2       0.64      0.90      0.75        40

   micro avg       0.77      0.77      0.77       257
   macro avg       0.75      0.80      0.77       257
weighted avg       0.78      0.77      0.77       257

micro auc score and score for each class: 
0  :  0.9412397216951297
1  :  0.854745866503368
2  :  0.9653225806451614
micro  :  0.9157822222895123
bootstrapped micro auc score and score for each class: 
0  :  [0.9200371057513915, 0.9603898538048232]
1  :  [0.8149020580213241, 0.8895757575757576]
2  :  [0.9398275862068965, 0.985309017223911]
micro  :  [0.9008986136115342, 0.9312849578641211]


# Now run our Duvenaud-like model: #

In [36]:
num_features = features.get_num_atom_features() + features.get_num_bond_features()
learning_rate = 0.01 # note change here

model = gcn.DuvenaudGCNClassifier(n_inputs=num_features,
                                  n_hidden=64,
                                  n_outputs=len(solcls),)
loss_func = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=learning_rate)

In [37]:
%%time
train.fit(model, train_dl, valid_dl, loss_func, opt, n_epochs=200,
          report_valid_loss=False)

Epoch 0, train loss 1.0025 valid loss N/A
Epoch 20, train loss 0.8868 valid loss N/A
Epoch 40, train loss 0.8657 valid loss N/A
Epoch 60, train loss 0.8614 valid loss N/A
Epoch 80, train loss 0.8436 valid loss N/A
Epoch 100, train loss 0.8968 valid loss N/A
Epoch 120, train loss 0.8627 valid loss N/A
Epoch 140, train loss 0.8423 valid loss N/A
Epoch 160, train loss 0.8665 valid loss N/A
Epoch 180, train loss 0.8675 valid loss N/A
CPU times: user 6min 23s, sys: 1.82 s, total: 6min 25s
Wall time: 2min 31s


In [39]:
evaluation.evaluate_classifier(model, test_dl, loss_func, multiclass=True)

test_loss:  0.8723989990022447
accuracy:  0.622568093385214
classification report: 
               precision    recall  f1-score   support

           0       0.71      0.66      0.68       102
           1       0.59      0.60      0.59       115
           2       0.52      0.60      0.56        40

   micro avg       0.62      0.62      0.62       257
   macro avg       0.61      0.62      0.61       257
weighted avg       0.63      0.62      0.62       257

