# DeepChem

- [Installation](https://github.com/deepchem/deepchem#installation)
- [Tutorial](https://deepchem.readthedocs.io/en/latest/get_started/tutorials.html)
- [Sample Notebooks](https://github.com/deepchem/deepchem/tree/master/examples/tutorials) 

#### Creating custom Models with TensorFlow and PyTorch

You can define your own model in TensorFlow (Keras) or PyTorch and wrap it in the corresponding DeepChem model class (`KerasModel` or `TorchModel`).

In [1]:
import pandas as pd

from sklearn.metrics import r2_score

import deepchem as dc
import tensorflow as tf
import torch

# Data

In [4]:
# Load the Delaney dataset through DeepChem's MoleculeNet loader.
# The models in this notebook are fully-connected networks that take one
# fixed-length vector per molecule, so we featurize with "ECFP" instead of
# "GraphConv" features.
tasks, datasets, transformers = dc.molnet.load_delaney(
    featurizer='ECFP',
    splitter='random',
)
train_dataset, valid_dataset, test_dataset = datasets

# Initialize the metrics shared by every evaluation cell
# https://deepchem.readthedocs.io/en/latest/api_reference/metrics.html
metric_r2, metric_mse = (
    dc.metrics.Metric(score_fn)
    for score_fn in (dc.metrics.r2_score, dc.metrics.mean_squared_error)
)
metrics = [metric_r2, metric_mse]

# Targets of the test split, reused by the "Prediction" cells
y_test = test_dataset.y   # 2D array

# Keras Model 

In [5]:
# dc.models.KerasModel is a subclass of DeepChem's Model class that wraps a
# tensorflow.keras.Model, so the network itself is built from ordinary Keras layers.
keras_model = tf.keras.Sequential()
keras_model.add(tf.keras.layers.Dense(1000, activation='relu'))
keras_model.add(tf.keras.layers.Dropout(rate=0.5))
keras_model.add(tf.keras.layers.Dense(1))   # one output unit: regression

# The wrapper also needs a loss function
# (https://deepchem.readthedocs.io/en/latest/api_reference/models.html#losses)
model = dc.models.KerasModel(model=keras_model, loss=dc.models.losses.L2Loss())

In [6]:
# Train the wrapped network on the training split.
# fit() returns a loss value, which is what this cell displays
# (presumably the average training loss; confirm in the DeepChem docs).
model.fit(train_dataset, nb_epoch=50)   # You can increase "nb_epoch"

0.03087562561035156

In [7]:
# Print the layer-by-layer summary of the underlying Keras network
# ("model.model" is the network wrapped by the DeepChem model).
# NOTE(review): the Sequential network was created without an input shape, so it
# presumably is only built once it has seen data in "model.fit"; confirm.
model.model.summary()   # this line must be after "model.fit"

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (100, 1000)               1025000   
_________________________________________________________________
dropout (Dropout)            (100, 1000)               0         
_________________________________________________________________
dense_1 (Dense)              (100, 1)                  1001      
Total params: 1,026,001
Trainable params: 1,026,001
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Score the fitted model on every split with the shared metrics list
split_reports = (
    ('training set score:', train_dataset),
    ('validation set score:', valid_dataset),
    ('test set score:', test_dataset),
)
for report_label, split_dataset in split_reports:
    print(report_label, model.evaluate(split_dataset, metrics))

training set score: {'r2_score': 0.9788902786268423, 'mean_squared_error': 0.02110972137315766}
validation set score: {'r2_score': 0.7094642454833595, 'mean_squared_error': 0.2047975356107964}
test set score: {'r2_score': 0.6895225628555833, 'mean_squared_error': 0.2820879183579153}


### Prediction

In [9]:
# "model.predict" takes a DeepChem dataset object as its input
y_pred = model.predict(test_dataset)   # numpy array [n, 1]

# Score the predictions with scikit-learn; the value is this cell's output
r2_score(y_true=y_test, y_pred=y_pred)

0.6895225628555833

# PyTorch Model 

In [11]:
# TorchModel works just like KerasModel, except it wraps a torch.nn.Module.
# torch.nn.Linear needs its input width up front. Read it from the featurized
# training data instead of hard-coding 1024, so the network keeps matching the
# dataset if the featurizer settings change.
# (assumes X is a 2D [n_samples, n_features] array, as produced by ECFP features)
n_features = train_dataset.X.shape[1]

pytorch_model = torch.nn.Sequential(
    torch.nn.Linear(n_features, 1000),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.Linear(1000, 1)   # single output, regression
)

# As with KerasModel, the wrapper is given the network and a loss function
model = dc.models.TorchModel(pytorch_model, dc.models.losses.L2Loss())

In [12]:
# Show the wrapper's configuration (device, wrapped network, model_dir, optimizer).
# Author's note: printing the model raised an error with scikit-learn >= 0.24;
# this notebook was run with scikit-learn 0.23.
print(model)

TorchModel(device=device(type='cpu'), learning_rate=None, loss=None,
           model=Sequential(
  (0): Linear(in_features=1024, out_features=1000, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=1000, out_features=1, bias=True)
),
           model_dir='/var/folders/5_/xyk2cqhn5yj2yx4nw54_7x240000gn/T/tmp8vtjjiu5',
           optimizer=<deepchem.models.optimizers.Adam object at 0x7fbdc12793d0>)




In [13]:
# Train the wrapped PyTorch network on the training split and keep the
# value returned by fit() so it can be displayed as this cell's output.
loss = model.fit(train_dataset, nb_epoch=50)   # You can increase "nb_epoch"
loss

0.026348528861999513

In [14]:
# Score the fitted model on every split with the shared metrics list
split_reports = (
    ('training set score:', train_dataset),
    ('validation set score:', valid_dataset),
    ('test set score:', test_dataset),
)
for report_label, split_dataset in split_reports:
    print(report_label, model.evaluate(split_dataset, metrics))

training set score: {'r2_score': 0.9804008336837297, 'mean_squared_error': 0.01959916631627031}
validation set score: {'r2_score': 0.7108336373479438, 'mean_squared_error': 0.20383225655376985}
test set score: {'r2_score': 0.6724113366864082, 'mean_squared_error': 0.2976345236604092}


### Prediction

In [15]:
# "model.predict" takes a DeepChem dataset object as its input
y_pred = model.predict(test_dataset)   # numpy array [n, 1]

# Score the predictions with scikit-learn; the value is this cell's output
r2_score(y_true=y_test, y_pred=y_pred)

0.6724113366864082