In [1]:
import numpy as np
import tensorflow as tf
import deepchem as dc



In [2]:
"""
OFFSIDES dataset loader.
"""
from __future__ import division
from __future__ import unicode_literals

import os
import logging
import deepchem

logger = logging.getLogger(__name__)


def load_offsides(featurizer='ECFP', split='index', reload=True, K=4):
  logger.info("About to load ofssides dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "offsides/" + featurizer + "/" + str(split))

  dataset_file = os.path.join("/home/mamonteiro/source-code/Project-LEI/offsides/", "offsides.csv.gz")


  dataset = deepchem.utils.save.load_from_disk(dataset_file)
  logger.info("Columns of dataset: %s" % str(dataset.columns.values))
  logger.info("Number of examples in dataset: %s" % str(dataset.shape[0]))
  OFFSIDES_tasks = dataset.columns.values[1:].tolist()

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return OFFSIDES_tasks, all_dataset, transformers

  # Featurize OFFSIDES dataset
  logger.info("About to featurize OFFSIDES dataset.")
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()

  logger.info("OFFSIDES tasks: %s" % str(OFFSIDES_tasks))
  logger.info("%d tasks in total" % len(OFFSIDES_tasks))

  loader = deepchem.data.CSVLoader(
      tasks=OFFSIDES_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)
  logger.info("%d datapoints in OFFSIDES dataset" % len(dataset))

  # Initialize transformers
  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
  logger.info("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  if split == None:
    return OFFSIDES_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'task': deepchem.splits.TaskSplitter()
  }
  splitter = splitters[split]
  if split == 'task':
    fold_datasets = splitter.k_fold_split(dataset, K)
    all_dataset = fold_datasets
  else:
    train, valid, test = splitter.train_valid_test_split(dataset)
    if reload:
      deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                               transformers)
    all_dataset = (train, valid, test)
  return OFFSIDES_tasks, all_dataset, transformers


In [3]:
offsides_tasks, offsides_datasets, transformers = load_offsides(featurizer='GraphConv',reload=True)
train_dataset, valid_dataset, test_dataset = offsides_datasets

Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.


In [4]:
featurizer = dc.feat.CircularFingerprint(size = 704)

In [5]:
loader = dc.data.CSVLoader(
      tasks=offsides_tasks, smiles_field="smiles",
      featurizer=featurizer)

In [6]:
dataset = loader.featurize('offsides.csv')

Loading raw samples now.
shard_size: 8192
About to start loading CSV from offsides.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.936 s
TIMING: dataset construction took 0.964 s
Loading dataset from disk.


In [7]:
train_dataset.y.shape

(704, 27)

In [8]:
valid_dataset.y.shape

(88, 27)

In [9]:
test_dataset.y.shape

(89, 27)

In [10]:
splitter = dc.splits.RandomSplitter('offsides.csv')
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)
#NOTE THE RENAMING:
valid_dataset, test_dataset = test_dataset, valid_dataset


Computing train/valid/test indices
TIMING: dataset construction took 0.063 s
Loading dataset from disk.
TIMING: dataset construction took 0.015 s
Loading dataset from disk.
TIMING: dataset construction took 0.014 s
Loading dataset from disk.


In [11]:
n_features = train_dataset.y.shape[0]
n_features

704

In [12]:
params_dict = {"activation": ["relu","sigmoid","tahn"],
               "optimizer": ["Adam","RMSprop"],
               "momentum": [.9],
               "penalty": [0.]
              }
n_features = train_dataset.y.shape[0]
def model_builder(model_params, model_dir):
    model = dc.models.MultitaskClassifier(
    len(offsides_tasks), n_features, **model_params)
    return model

In [13]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
optimizer = dc.hyper.HyperparamOpt(model_builder)
best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(params_dict, train_dataset, valid_dataset, [], metric)

Fitting model 1/6
hyperparameters: {'activation': 'relu', 'optimizer': 'Adam', 'momentum': 0.9, 'penalty': 0.0}
Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


computed_metrics: [0.5607876712328768, 0.6791497975708503, 0.3970588235294118, 0.46087636932707354, 0.6721518987341772, 0.5830687830687831, 0.7037037037037037, 0.5404761904761904, 0.5377551020408163, 0.4324009324009324, 0.748995983935743, 0.40588235294117647, 0.2852941176470588, 0.5240506329113925, 0.3827160493827161, 0.43478260869565216, 0.5542857142857143, 0.625, 0.41049382716049376, 0.5604575163398693, 0.6375939849624059, 0.6309931506849316, 0.5562770562770563, 0.6218708827404479, 0.6781376518218624, 0.7943722943722944, 0.5884615384615384]
Model 1/6, Metric mean-roc_auc_score, Validation set 0: 0.555818
	best_validation_score so far: 0.555818
Fitting model 2/6
hyperparameters: {'activation': 'relu', 'optimizer': 'RMSprop', 'momentum': 0.9, 'penalty': 0.0}
computed_metrics: [0.485445205479452, 0.6720647773279352, 0.42238562091503273, 0.4663536776212832, 0.6278481012658228, 0.5857142857142859, 0.712962962962963, 0.7214285714285715, 0.4780612244897959, 0.43123543123543123, 0.6385542168

In [25]:
best_dnn

MultitaskClassifier(activation_fns=None, bias_init_consts=None, dropouts=None,
                    layer_sizes=None, n_classes=2, n_features=704, n_tasks=27,
                    weight_decay_penalty=None, weight_decay_penalty_type=None,
                    weight_init_stddevs=None)

In [26]:
best_hyperparams

('relu', 'RMSprop', 0.9, 0.0)

In [27]:
all_results

{"('relu', 'Adam', 0.9, 0.0)": 0.5558183198038952,
 "('relu', 'RMSprop', 0.9, 0.0)": 0.5645446463307585,
 "('sigmoid', 'Adam', 0.9, 0.0)": 0.5635134647857006,
 "('sigmoid', 'RMSprop', 0.9, 0.0)": 0.5584465280526021,
 "('tahn', 'Adam', 0.9, 0.0)": 0.5482449187579804,
 "('tahn', 'RMSprop', 0.9, 0.0)": 0.5596510493480616}

In [28]:
model=model_builder(params_dict,params_dict)

In [29]:
model.fit(train_dataset, nb_epoch=100)

375.54270987957716

In [30]:
obj=best_dnn.fit(train_dataset,**params_dict,epochs=100)

In [31]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

In [32]:
train_scores = model.evaluate(train_dataset, [metric], transformers)

computed_metrics: [0.9998442471575106, 0.999908650771901, 0.9998328851617215, 0.9991499409681228, 0.9999399453502688, 0.9999107326174088, 0.9991564938260035, 0.9992330753677752, 0.9984292006736624, 1.0, 0.999935231063182, 0.9999640339519493, 0.9994001599573448, 0.9998314674371012, 1.0, 0.9997873471557681, 0.9992788461538462, 0.9999825601674224, 0.9997537205784048, 0.9997415311051161, 0.9998975777129103, 0.9996078667953667, 0.9999226716930065, 0.9996167802354406, 0.9997930307819674, 0.9999267578125, 0.99972286257763]


In [33]:
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

computed_metrics: [0.553082191780822, 0.5850202429149798, 0.38398692810457513, 0.41705790297339596, 0.6784810126582279, 0.42275132275132277, 0.6049382716049383, 0.6857142857142857, 0.5321428571428571, 0.4606643356643356, 0.5582329317269077, 0.4294117647058824, 0.20294117647058824, 0.5075949367088608, 0.49691358024691357, 0.43412384716732544, 0.6000000000000001, 0.46604938271604934, 0.48649691358024694, 0.40604575163398693, 0.5887218045112781, 0.4794520547945206, 0.5124458874458875, 0.5513833992094861, 0.5313765182186234, 0.7012987012987013, 0.5448717948717949]


In [34]:
test_scores = model.evaluate(test_dataset, [metric], transformers)

computed_metrics: [0.5753846153846154, 0.6538734896943852, 0.6461538461538461, 0.49934895833333337, 0.8371794871794871, 0.4908045977011495, 0.5674358974358975, 0.6321138211382114, 0.45682888540031397, 0.7341772151898733, 0.5426829268292683, 0.8353658536585366, 0.6674698795180722, 0.6646341463414634, 0.7123287671232876, 0.6854887674559805, 0.5706349206349206, 0.5989583333333334, 0.7076923076923077, 0.6142857142857143, 0.6119791666666666, 0.6803966437833715, 0.6153846153846154, 0.6414392059553351, 0.6986301369863014, 0.6469760900140646, 0.620137299771167]


In [35]:
print(train_scores)
print(valid_scores)
print(test_scores)

{'mean-roc_auc_score': 0.9996876895212344}
{'mean-roc_auc_score': 0.5118962887635851}
{'mean-roc_auc_score': 0.637325392186871}
