This code is adapted based on code examples in Ramsundar, Bharath; Eastman, Peter; Walters, Patrick; Pande, Vijay. Deep Learning for the Life Sciences, Chapter 4.

# Installing DeepChem

In [2]:
# Installing RDKit
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit

--2019-07-26 15:22:42--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c84f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

PREFIX=/usr/local
reinstalling: python-3.7.3-h0371630_0 ...
using -f (force) option
Python 3.7.3
reinstalling: ca-certificates-2019.1.23-0 ...
using -f (force) option
reinstalling: libgcc-ng-8.2.0-hdf63c60_1 ...
using -f (force) option
reinstalling: libstdcxx-ng-8.2.0-hdf63c60_1 ...
using -f (force) option
reinstalling: libffi-3.2.1-hd88cf55_4 ...
using -f (force) option
reinstalling: ncurses-6.1-he6710b0_1 ...
using -f (force) option
reinstalling: openssl-1.1.1b-h7b6447c_1 ...
using -f (force) option
reinstalling: xz-5.2.4-h14c3975_4 ...
using -f (force) option
reinstalli

In [0]:
# append rdkit path to current python system path.
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [13]:
# Install DeepChem 
!pip install deepchem



In [0]:
import deepchem as dc

# PDBBind Dataset Description
From [PDBBind Website](http://www.pdbbind.org.cn/): The aim of the PDBbind database is to provide a comprehensive collection of the experimentally measured binding affinity data for all types of biomolecular complexes deposited in the Protein Data Bank (PDB). It thus provides an essential linkage between energetic and structural information of these complexes, which is helpful for various computational and statistical studies on molecular recognition occurred in biological systems.

In [16]:
# load the PDBBind dataset
pdbbind_tasks, pdbbind_datasets, transformers = dc.molnet.load_pdbbind_grid(featurizer="grid", split="random", subset="core")

Loading dataset from disk.
TIMING: dataset construction took 0.063 s
Loading dataset from disk.
TIMING: dataset construction took 0.009 s
Loading dataset from disk.
TIMING: dataset construction took 0.009 s
Loading dataset from disk.


In [0]:
# partition the data into train, validation, and test sets
train_dataset, valid_dataset, test_dataset = pdbbind_datasets

# Building the Model

In [0]:
# Create and train a random forest model for predicting the binding affinity
from sklearn.ensemble import RandomForestRegressor
sklearn_model = RandomForestRegressor(n_estimators=100)
model = dc.models.SklearnModel(sklearn_model, model_dir="pdbbind_rf")
model.fit(train_dataset)

In [24]:
# Evaluate the model using Pearson correlation metric

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
train_scores = model.evaluate(train_dataset, [metric], transformers)
test_scores = model.evaluate(test_dataset, [metric], transformers)

computed_metrics: [0.9606902111693288]
computed_metrics: [0.5124925029164453]


# Displaying the Results

In [23]:
# Print the results (2 decimal )
print("Train Pearson Correlation Score: {:.2f}".format(train_scores["pearson_r2_score"]))
print("Test Pearson Correlation Score: {:.2f}".format(test_scores["pearson_r2_score"]))

Train Pearson Correlation Score: 0.96
Test Pearson Correlation Score: 0.51
