In [1]:
import numpy as np
import pandas as pd
import deepchem as dc
from deepchem.molnet import load_tox21
from deepchem.molnet import load_hiv

#### Load data according to the following:
https://github.com/deepchem/deepchem/blob/master/examples/hiv/hiv_tf_models.py

#### Complete Smiles data is contained here
https://github.com/deepchem/deepchem/blob/master/examples/hiv/HIV.csv

In [14]:
np.random.seed(123)

# Load hiv dataset
n_features = 1024
hiv_tasks, hiv_datasets, transformers = load_hiv()
train_dataset, valid_dataset, test_dataset = hiv_datasets

# obtain smiles strings for each
train_smiles, valid_smiles, test_smiles = train_dataset.ids, valid_dataset.ids, test_dataset.ids

# inspect shape of fingerprinting data and distribution of output
X_train, y_train, w_train = train_dataset.X, train_dataset.y, train_dataset.w

print(len(train_smiles), train_smiles[0])
print(X_train.shape, y_train.shape, w_train.shape) # ~32000 training molecules
print(f'Of all ys, {np.sum(y_train==0)} are 0s and {np.sum(y_train==1)} are 1s')

32901 CCOP(=O)(Nc1cccc(Cl)c1)OCC
(32901, 1024) (32901, 1) (32901, 1)
Of all ys, 31669 are 0s and 1232 are 1s


In [15]:
# train a basic classiciation model
model = dc.models.MultitaskClassifier(
    len(hiv_tasks),
    n_features,
    layer_sizes=[100],
    dropouts=[.25],
    learning_rate=0.001,
    batch_size=100,
    n_epochs=2
)

# Fit trained model
print('fitting')
model.fit(train_dataset)

fitting


0.5813687515258789

In [16]:
# Evaluate model performance 

metric_list = []
metric_list.append(dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean))
#metric_list.append(dc.metrics.Metric(dc.metrics.acc, np.mean))

print("Evaluating model")
# still not really sure what the transformers are doing here
train_scores = model.evaluate(train_dataset, metric_list, transformers)
valid_scores = model.evaluate(valid_dataset, metric_list, transformers)
print(train_scores, valid_scores)

Evaluating model
{'mean-roc_auc_score': 0.9690518168756943} {'mean-roc_auc_score': 0.7780793773270624}
