# Comparing PatWalters ESOL results with those from our own regressor #

Walters' nice post on the topic can be found at http://practicalcheminformatics.blogspot.com/2018/09/predicting-aqueous-solubility-its.html

All of his code can also be found in [this github repo](https://github.com/PatWalters/solubility) 

In [55]:
import sys
sys.path.insert(0, '../../')

import mygcn

In [56]:
%reload_ext autoreload
%autoreload 2

In [140]:
from mygcn import features
from mygcn import gcn
from mygcn import train
from mygcn import evaluation

In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
import dgl

In [59]:
from rdkit import Chem
from rdkit.Chem import RDConfig
import os

In [124]:
# had to remove methane, b/c feature code doesnt work for thing only with self node
train_dl, valid_dl, test_dl = features.get_graph_data('delaney.csv',
                                      smiles_field='SMILES',
                                      labels_field='measured log(solubility:mol/L)',
                                      train_size=0.8,
                                      valid_size=0.1,
                                      self_edges=True,
                                      edge_features=True,
                                      seed=13)

In [125]:
num_features = features.get_num_atom_features() + features.get_num_bond_features()
learning_rate = 0.001

model = gcn.DeepChemGCNRegressor(n_inputs=num_features,
                                 n_hidden=64,
                                 n_hidden_layers=2,
                                 n_outputs=1,
                                 dropout=0.1)
loss_func = nn.MSELoss()
opt = optim.Adam(model.parameters(), lr=learning_rate)

In [126]:
print(model)

DeepChemGCNRegressor(
  (dropout): Dropout(p=0.1)
  (layers): ModuleList(
    (0): GraphConvLayer(
      (linear): Linear(in_features=73, out_features=64, bias=True)
    )
    (1): BatchNormLayer(
      (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): GraphPoolLayer()
    (3): GraphConvLayer(
      (linear): Linear(in_features=64, out_features=64, bias=True)
    )
    (4): BatchNormLayer(
      (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (5): GraphPoolLayer()
  )
  (dense_layer): Linear(in_features=64, out_features=128, bias=True)
  (final_bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (prediction_layer): Linear(in_features=128, out_features=1, bias=True)
)


In [127]:
%%time
train.fit(model, train_dl, valid_dl, loss_func, opt, n_epochs=20,
          report_valid_loss=True)

Epoch 0, train loss 0.3790 valid loss 0.34598570539240253
Epoch 2, train loss 0.1984 valid loss 0.2528736842306037
Epoch 4, train loss 0.0607 valid loss 0.24517574644925302
Epoch 6, train loss 0.0405 valid loss 0.24881528134931596
Epoch 8, train loss 0.0329 valid loss 0.28065985127499227
Epoch 10, train loss 0.0290 valid loss 0.28439382084628995
Epoch 12, train loss 0.0267 valid loss 0.2519476371898986
Epoch 14, train loss 0.0247 valid loss 0.2596116149634646
Epoch 16, train loss 0.0214 valid loss 0.2701522007323148
Epoch 18, train loss 0.0215 valid loss 0.2753318276321679
CPU times: user 1min 52s, sys: 329 ms, total: 1min 52s
Wall time: 38.8 s


In [149]:
evaluation.evaluate_regressor(model, test_dl, loss_func)

test_loss:  0.015612033398255058
***
RMSE:  0.6930116
RMSE CI:  (0.5963088449907235, 0.7777826603072564)
***
MAE:  0.5516355
MAE CI:  [0.49128002, 0.6170165]
***
R^2:  0.8293499720270193
R^2 CI:  (0.7745354116512831, 0.8719049552094038)


# Now try on that independent test set #

In [145]:
independent_test_dl,_,_ = features.get_graph_data('dls_100_unique.csv',
                                      smiles_field='SMILES',
                                      labels_field='LogS exp (mol/L)',
                                      train_size=1.0,
                                      valid_size=0.0,
                                      self_edges=True,
                                      edge_features=True,
                                      seed=13)

In [150]:
evaluation.evaluate_regressor(model, independent_test_dl, loss_func)

test_loss:  0.04954068149839129
***
RMSE:  1.1629573
RMSE CI:  (0.9203120304136536, 1.3630719642000126)
***
MAE:  0.9592582
MAE CI:  [0.8299284, 1.1157275]
***
R^2:  0.508747988797586
R^2 CI:  (0.3405562318420812, 0.6520205437477077)


These results seem to agree with those found by Pat Walters