# Basics of DeepChem

## setup

In [1]:
!pip install --pre deepchem
import deepchem as dc
dc.__version__

Collecting deepchem
  Downloading deepchem-2.6.1-py3-none-any.whl (608 kB)
[K     |████████████████████████████████| 608 kB 7.0 MB/s 
[?25hCollecting numpy>=1.21
  Downloading numpy-1.21.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 288 kB/s 
Collecting rdkit-pypi
  Downloading rdkit_pypi-2021.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.6 MB)
[K     |████████████████████████████████| 20.6 MB 1.3 MB/s 
Installing collected packages: numpy, rdkit-pypi, deepchem
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:
      Successfully uninstalled numpy-1.19.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.3.post1 requires numpy<1.20,>=1.16.0, but you have numpy 1.21.5 which is incompatible.
data

'2.6.1'

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
!cp '/content/gdrive/My Drive/all_carboxylics.csv' all_carboxylics.csv

Mounted at /content/gdrive


In [3]:
# use pandas just to extract the property names
import pandas as pd

carboxylics_frame = pd.read_csv('all_carboxylics.csv', index_col='Unnamed: 0')
tasks = list(carboxylics_frame.columns[2:])

# A fully connected model on fingerprints

## data set and loader

Can load directly from csv, immediately converting SMILES to fingerprints.

In [154]:
data_loader = dc.data.CSVLoader(tasks, feature_field='smiles', featurizer=dc.feat.CircularFingerprint())
dataset = data_loader.create_dataset('all_carboxylics.csv')

And split into train, validation and test sets (fractions 0.8, 0.1, 0.1 by default).

In [155]:
splitter = dc.splits.RandomSplitter()
data_train, data_val, data_test = splitter.train_valid_test_split(dataset)

## define a simple fully connected model

The deepchem models include a loss function and an optimizer.
For MultitaskRegressor the loss is an l2 loss.

The default optimizer is Adam with learning rate 1e-3. It cannot be initialized differently, only set later. (stored in model.optimizer)

In [156]:
n_tasks = len(tasks)
n_features = data_train.X.shape[1]
model = dc.models.MultitaskRegressor(n_tasks, n_features,
                                     layer_sizes=[100, 100],
                                     dropouts=0.0,
                                     weight_decay_penalty=0.0,
                                     activation_fns='relu',
                                     )

In [157]:
model.optimizer.learning_rate

0.001

## train the model

Enable early stopping based on r2 score

In [168]:
metric = dc.metrics.Metric(dc.metrics.r2_score)
callback = dc.models.ValidationCallback(data_val, 1000, [metric], save_dir='params_fc',
                                        save_on_minimum=False)

In [169]:
model.fit(data_train, nb_epoch=50, callbacks=callback)

Step 2000 validation: r2_score=0.442748
Step 3000 validation: r2_score=0.462631
Step 4000 validation: r2_score=0.401124
Step 5000 validation: r2_score=0.495744


0.00293416827917099

## evaluate trained model

In [172]:
model.restore('params/checkpoint3.pt')
scores = evaluate_model(model, data_test, tasks)

r2 score over all tasks:  0.47436534927587376
Per task r2 scores:
Dissocation energy (nucleofuge) 0.4787726829218043
Dissociation energy (electrofuge) 0.5408110299297527
Electroaccepting power(w+) 0.43608280760303364
Electrodonating power (w-) 0.6038047631451557
Electronegativity (chi=-mu) 0.5607445326410704
Electronic chemical potential (mu) 0.5333586884533013
Electronic chemical potential (mu+) 0.4358858664066162
Electronic chemical potential (mu-) 0.6988126078836793
Electrophilicity index (w=omega) 0.4093435403385497
Global Dual Descriptor Deltaf+ 0.3846568027797508
Global Dual Descriptor Deltaf- 0.3822147812253719
Hardness (eta) 0.553279539461363
Hyperhardness (gamma) 0.410062048657335
Net Electrophilicity 0.5421300703903269
Softness (S) 0.14552047730099515


# A GraphConv model

This model is based on:

Duvenaud, David K., et al. "Convolutional networks on graphs for 
         learning molecular fingerprints." Advances in neural information processing 
         systems. 2015.

In [174]:
def create_featurized_data(featurizer, splitter=dc.splits.RandomSplitter(), csv_file='all_carboxylics.csv',
                           save_dir='params'):
  data_loader = dc.data.CSVLoader(tasks, feature_field='smiles', featurizer=featurizer)
  dataset = data_loader.create_dataset(csv_file)
  data_train, data_val, data_test = splitter.train_valid_test_split(dataset)
  return {'train': data_train, 'val': data_val, 'test': data_test}

In [175]:
datasets_convmol, callback_convmol = create_featurized_data(
    dc.feat.ConvMolFeaturizer()
)
callback_gc = callback = dc.models.ValidationCallback(datasets_convmol['val'], 1000, [metric], save_dir='params_gc',
                                          save_on_minimum=False)

In [176]:
model_gc = dc.models.GraphConvModel(n_tasks, mode='regression',
                                    )
model_gc.fit(datasets_convmol['train'], nb_epoch=50, callbacks=callback_gc)

  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." %

Step 1000 validation: r2_score=-3.18949
Step 2000 validation: r2_score=-0.819767
Step 3000 validation: r2_score=-0.237069
Step 4000 validation: r2_score=-0.0593696


0.03475967884063721

In [180]:
model_gc.restore('params_gc/ckpt-2.data-00000-of-00001')
scores_gc = evaluate_model(model_gc, datasets_convmol['test'], tasks)

DataLossError: ignored

# A message passing model (not working!)

This model is based on


Vinyals, Oriol, Samy Bengio, and Manjunath Kudlur. “Order matters: Sequence to sequence for sets.” arXiv preprint arXiv:1511.06391 (2015).

In [107]:
(data_train_weave, data_val_weave, data_test_weave), callback_weave = create_featurized_data(
    dc.feat.WeaveFeaturizer()
)

In [128]:
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='Weave', split='index')
train_dataset, valid_dataset, test_dataset = delaney_datasets

'split' is deprecated.  Use 'splitter' instead.


In [129]:
train_dataset.X[0]

<deepchem.feat.mol_graphs.WeaveMol at 0x7f9a44e59090>

In [100]:
tst = data_train_weave.X[0]

In [103]:
tst.get_atom_features().shape

(32, 75)

In [109]:
data_train_weave.get_shape()

((8834,), (8834, 15), (8834, 15), (8834,))

In [125]:
data_train_weave.batch_size

AttributeError: ignored

In [111]:
data_train_weave.get_shard_size()

8192

In [132]:
weavemol = data_train_weave.X[10]
print(weavemol.get_atom_features().shape, weavemol.get_pair_features().shape)
weavemol_d = train_dataset.X[10]
print(weavemol_d.get_atom_features().shape, weavemol_d.get_pair_features().shape)

(16, 75) (256, 14)
(6, 75) (36, 14)


((16, 75), (256, 14))

In [120]:
np.sqrt(529)

23.0

In [122]:
4550 / 23

197.82608695652175

In [133]:
n_pair_feat = 14
n_atom_feat = 75
batch_size = 1
model_mp = dc.models.MPNNModel(n_tasks,  n_pair_feat=n_pair_feat, n_atom_feat=n_atom_feat, batch_size=batch_size,
                               T=3, M=5,
                               mode='regression')
model_mp.fit(data_train_weave, nb_epoch=50, callbacks=callback_weave)



  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


KeyboardInterrupt: ignored

In [None]:
scores_mp = evaluate_model(model_mp, data_test_weave, tasks)

# GAT model

Based on

Petar Veličković, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Liò, and Yoshua Bengio. “Graph Attention Networks.” ICLR 2018.

In [139]:
datasets_molg, callback_molg = create_featurized_data(
    dc.feat.MolGraphConvFeaturizer()
)

In [143]:
!pip install dgl dgllife

Collecting dgllife
  Downloading dgllife-0.2.9.tar.gz (138 kB)
[K     |████████████████████████████████| 138 kB 7.8 MB/s 
Building wheels for collected packages: dgllife
  Building wheel for dgllife (setup.py) ... [?25l[?25hdone
  Created wheel for dgllife: filename=dgllife-0.2.9-py3-none-any.whl size=219056 sha256=931d87a916564c258cbd21c6782c94d3a20d34f104f376712698e689510e7709
  Stored in directory: /root/.cache/pip/wheels/34/83/11/9772c74b559d9182c9083362e8ba8b0201c4963e41e03859fe
Successfully built dgllife
Installing collected packages: dgllife
Successfully installed dgllife-0.2.9


In [144]:
model_gat = dc.models.GATModel(n_tasks, mode='regression')
model_gat.fit(datasets_molg['train'], nb_epoch=50, callbacks=callback_molg)



Step 1000 validation: r2_score=0.124474




Step 2000 validation: r2_score=0.289048




Step 3000 validation: r2_score=0.20342




Step 4000 validation: r2_score=0.282468




0.02322451114654541

In [145]:
scores_gat = evaluate_model(model_gat, datasets_molg['test'], tasks)

r2 score over all tasks:  0.3413150817587046
Per task r2 scores:
Dissocation energy (nucleofuge) 0.3187087124322623
Dissociation energy (electrofuge) 0.4303118098527615
Electroaccepting power(w+) 0.15992480155797995
Electrodonating power (w-) 0.3506445281281272
Electronegativity (chi=-mu) 0.5334632674010562
Electronic chemical potential (mu) 0.4916128001645961
Electronic chemical potential (mu+) 0.38440913866386817
Electronic chemical potential (mu-) 0.4306510489504649
Electrophilicity index (w=omega) 0.27144290518628233
Global Dual Descriptor Deltaf+ 0.16181596402753362
Global Dual Descriptor Deltaf- 0.16236249069721076
Hardness (eta) 0.5073657719394482
Hyperhardness (gamma) 0.42987519462203194
Net Electrophilicity 0.3070782428868345
Softness (S) 0.18005954987011075


# Pagtn model (path augmented gat) (dataset loading takes forever, stopped after ~hour)

In [147]:
datasets_pagtn, callback_pagtn = create_featurized_data(
    dc.feat.PagtnMolGraphFeaturizer()
)

KeyboardInterrupt: ignored

In [None]:
model_pagtn = dc.models.PagtnModel(n_tasks, mode='regression')
model_pagtn.fit(datasets_pagtn['train'], nb_epoch=50, callbacks=callback_pagtn)

In [None]:
scores_pagtn = evaluate_model(model_gat, datasets_pagtn['test'], tasks)

# GCN Kipf, Welling

Based on

Thomas N. Kipf and Max Welling. “Semi-Supervised Classification with Graph Convolutional Networks.” ICLR 2017.

In [149]:
model_gcn = dc.models.GCNModel(n_tasks, mode='regression')
model_gcn.fit(datasets_molg['train'], nb_epoch=50, callbacks=callback_molg)



Step 1000 validation: r2_score=-0.281325




Step 2000 validation: r2_score=0.396938




Step 3000 validation: r2_score=0.444716




Step 4000 validation: r2_score=0.464424




0.05179398536682129

In [150]:
scores_gcn = evaluate_model(model_gcn, datasets_molg['test'], tasks)

r2 score over all tasks:  -1.211065493713938
Per task r2 scores:
Dissocation energy (nucleofuge) -0.19790105009217296
Dissociation energy (electrofuge) -0.09105616306064968
Electroaccepting power(w+) -0.0795073505310182
Electrodonating power (w-) 0.19355112539725516
Electronegativity (chi=-mu) -3.3598989734085674
Electronic chemical potential (mu) -6.257112496426266
Electronic chemical potential (mu+) -3.5700714076756386
Electronic chemical potential (mu-) -2.2775186976163435
Electrophilicity index (w=omega) -0.8631114339731856
Global Dual Descriptor Deltaf+ 0.12978084162137393
Global Dual Descriptor Deltaf- 0.1658149132394543
Hardness (eta) -1.5758233720295594
Hyperhardness (gamma) -0.16546913794063212
Net Electrophilicity -0.0062250417510443246
Softness (S) -0.2114341614620765


In [153]:
model_gcn.model.parameters()

<generator object Module.parameters at 0x7f9a4e560350>

In [None]:
evaluate_model