In [1]:
import numpy as np
import tensorflow as tf
import deepchem as dc



In [2]:
sider_tasks, sider_datasets, transformers = dc.molnet.load_sider(featurizer = 'GraphConv')
train_dataset, valid_dataset, test_dataset = sider_datasets

Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.


In [3]:
sider_datasets

(<deepchem.data.datasets.DiskDataset at 0x7f42a51ba710>,
 <deepchem.data.datasets.DiskDataset at 0x7f425a10e278>,
 <deepchem.data.datasets.DiskDataset at 0x7f425a10edd8>)

In [4]:
train_dataset.y.shape

(1141, 27)

In [5]:
valid_dataset.y.shape

(143, 27)

In [6]:
test_dataset.y.shape

(143, 27)

In [7]:
featurizer = dc.feat.CircularFingerprint(size = 1024)

In [8]:
loader = dc.data.CSVLoader(
      tasks=sider_tasks, smiles_field="smiles",
      featurizer=featurizer)

In [9]:
dataset = loader.featurize('sider.csv')

Loading raw samples now.
shard_size: 8192
About to start loading CSV from sider.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 1.953 s
TIMING: dataset construction took 2.012 s
Loading dataset from disk.


In [10]:
sider_tasks

['Hepatobiliary disorders',
 'Metabolism and nutrition disorders',
 'Product issues',
 'Eye disorders',
 'Investigations',
 'Musculoskeletal and connective tissue disorders',
 'Gastrointestinal disorders',
 'Social circumstances',
 'Immune system disorders',
 'Reproductive system and breast disorders',
 'Neoplasms benign, malignant and unspecified (incl cysts and polyps)',
 'General disorders and administration site conditions',
 'Endocrine disorders',
 'Surgical and medical procedures',
 'Vascular disorders',
 'Blood and lymphatic system disorders',
 'Skin and subcutaneous tissue disorders',
 'Congenital, familial and genetic disorders',
 'Infections and infestations',
 'Respiratory, thoracic and mediastinal disorders',
 'Psychiatric disorders',
 'Renal and urinary disorders',
 'Pregnancy, puerperium and perinatal conditions',
 'Ear and labyrinth disorders',
 'Cardiac disorders',
 'Nervous system disorders',
 'Injury, poisoning and procedural complications']

In [11]:
splitter = dc.splits.RandomSplitter('sider.csv')
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)
#NOTE THE RENAMING:
valid_dataset, test_dataset = test_dataset, valid_dataset


Computing train/valid/test indices
TIMING: dataset construction took 0.124 s
Loading dataset from disk.
TIMING: dataset construction took 0.031 s
Loading dataset from disk.
TIMING: dataset construction took 0.028 s
Loading dataset from disk.


In [12]:
n_features = train_dataset.get_data_shape()[0]

In [13]:
params_dict = {"activation": ["relu","sigmoid","tahn"],
               "optimizer": ["Adam","RMSprop"],
               "momentum": [.9],
               "penalty": [0.]
              }

n_features = train_dataset.get_data_shape()[0]
def model_builder(model_params, model_dir):
    model = dc.models.MultitaskClassifier(
    len(sider_tasks), n_features, **model_params)
    return model

In [14]:

metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
optimizer = dc.hyper.HyperparamOpt(model_builder)
best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, [], metric)

Fitting model 1/6
hyperparameters: {'activation': 'relu', 'optimizer': 'Adam', 'momentum': 0.9, 'penalty': 0.0}
Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


computed_metrics: [0.6759368836291914, 0.57277318640955, nan, 0.6665257819103974, 0.6548423423423424, 0.6835499775885253, 0.7216378662659654, 0.5992767915844839, 0.5714285714285714, 0.674559686888454, 0.6651873767258383, 0.6968503937007875, 0.7680180180180181, 0.597463768115942, 0.6898071625344353, 0.6509396241503398, 0.6351575456053069, 0.5798212005108556, 0.5881578947368421, 0.6390022675736962, 0.7094414893617023, 0.6313253012048192, 0.747927031509121, 0.671900826446281, 0.7327823691460056, 0.7642276422764227, 0.6897524967433782]
Model 1/6, Metric mean-roc_auc_score, Validation set 0: 0.664550
	best_validation_score so far: 0.664550
Fitting model 2/6
hyperparameters: {'activation': 'relu', 'optimizer': 'RMSprop', 'momentum': 0.9, 'penalty': 0.0}




computed_metrics: [0.6928994082840236, 0.5654269972451791, nan, 0.6601859678782755, 0.6399211711711712, 0.6842223218287764, 0.7009767092411721, 0.6081525312294542, 0.5704260651629073, 0.6592954990215264, 0.6624753451676528, 0.7249015748031497, 0.7691441441441441, 0.5891304347826087, 0.677961432506887, 0.6493402638944423, 0.6293532338308458, 0.5852490421455938, 0.6111842105263158, 0.6507936507936508, 0.7019060283687943, 0.6257028112449798, 0.7512437810945274, 0.6845041322314049, 0.7203856749311295, 0.7723577235772358, 0.6875814155449413]
Model 2/6, Metric mean-roc_auc_score, Validation set 1: 0.664412
	best_validation_score so far: 0.664550
Fitting model 3/6
hyperparameters: {'activation': 'sigmoid', 'optimizer': 'Adam', 'momentum': 0.9, 'penalty': 0.0}




computed_metrics: [0.6771203155818541, 0.5718549127640036, nan, 0.6781487743026204, 0.6221846846846847, 0.7059614522635589, 0.7054845980465816, 0.5884286653517423, 0.5761904761904761, 0.6626223091976517, 0.6649408284023668, 0.7071850393700787, 0.7744932432432432, 0.5927536231884059, 0.678236914600551, 0.6505397840863655, 0.6616915422885572, 0.5718390804597702, 0.5975877192982456, 0.6482993197278911, 0.7249556737588653, 0.6301204819277109, 0.7437810945273631, 0.6675619834710744, 0.7316345270890725, 0.7882113821138212, 0.6888840642640035]
Model 3/6, Metric mean-roc_auc_score, Validation set 2: 0.665797
	best_validation_score so far: 0.665797
Fitting model 4/6
hyperparameters: {'activation': 'sigmoid', 'optimizer': 'RMSprop', 'momentum': 0.9, 'penalty': 0.0}




computed_metrics: [0.6792899408284024, 0.573921028466483, nan, 0.6559594251901943, 0.6311936936936937, 0.6875840430300313, 0.7096168294515401, 0.591387245233399, 0.5774436090225563, 0.6630136986301369, 0.6464497041420119, 0.7204724409448819, 0.7694256756756757, 0.5873188405797101, 0.6831955922865014, 0.6397441023590564, 0.6492537313432836, 0.5766283524904214, 0.5958333333333333, 0.6312925170068027, 0.7061170212765958, 0.6222891566265061, 0.7529021558872304, 0.6747933884297521, 0.7229109274563819, 0.7634146341463415, 0.6973512809379071]
Model 4/6, Metric mean-roc_auc_score, Validation set 3: 0.661877
	best_validation_score so far: 0.665797
Fitting model 5/6
hyperparameters: {'activation': 'tahn', 'optimizer': 'Adam', 'momentum': 0.9, 'penalty': 0.0}




computed_metrics: [0.6745562130177515, 0.5656565656565657, nan, 0.6868131868131868, 0.6303490990990991, 0.685342895562528, 0.7054845980465815, 0.6173570019723866, 0.5709273182957393, 0.6712328767123288, 0.6442307692307692, 0.7367125984251969, 0.7761824324324323, 0.5615942028985508, 0.703305785123967, 0.6483406637345062, 0.6310116086235489, 0.599616858237548, 0.5767543859649122, 0.6528344671201813, 0.7134308510638298, 0.6359437751004016, 0.7628524046434493, 0.6696280991735537, 0.7224517906336089, 0.7723577235772358, 0.696699956578376]
Model 5/6, Metric mean-roc_auc_score, Validation set 4: 0.665833
	best_validation_score so far: 0.665833
Fitting model 6/6
hyperparameters: {'activation': 'tahn', 'optimizer': 'RMSprop', 'momentum': 0.9, 'penalty': 0.0}




computed_metrics: [0.6798816568047338, 0.5723140495867769, nan, 0.6650464919695689, 0.6368243243243243, 0.685342895562528, 0.7148760330578512, 0.5946745562130178, 0.5606516290726817, 0.6669275929549903, 0.6683925049309665, 0.7362204724409449, 0.7618243243243243, 0.568840579710145, 0.7038567493112948, 0.6505397840863654, 0.6227197346600333, 0.5932311621966795, 0.587280701754386, 0.6546485260770976, 0.7134308510638299, 0.6323293172690763, 0.7487562189054726, 0.6654958677685952, 0.735307621671258, 0.7922764227642276, 0.6873643074250977]
Model 6/6, Metric mean-roc_auc_score, Validation set 5: 0.665348
	best_validation_score so far: 0.665833
computed_metrics: [0.928406199754819, 0.9154449785157672, 0.9487649152187565, 0.9210463232912212, 0.9356331304100618, 0.9118616921047673, 0.9716445471422364, 0.9183550333439187, 0.9267816379617622, 0.9265855817641959, 0.9253679516545641, 0.9607127722688363, 0.9272378962034133, 0.9283527401050056, 0.9297783873214907, 0.9348542520658467, 0.971275273179349

In [15]:
best_dnn

MultitaskClassifier(activation_fns=None, bias_init_consts=None, dropouts=None,
                    layer_sizes=None, n_classes=2, n_features=1024, n_tasks=27,
                    weight_decay_penalty=None, weight_decay_penalty_type=None,
                    weight_init_stddevs=None)

In [16]:
best_hyperparams

('tahn', 'Adam', 0.9, 0.0)

In [17]:
all_results

{"('relu', 'Adam', 0.9, 0.0)": 0.6645497498618181,
 "('relu', 'RMSprop', 0.9, 0.0)": 0.6644123681019535,
 "('sigmoid', 'Adam', 0.9, 0.0)": 0.665796634238483,
 "('sigmoid', 'RMSprop', 0.9, 0.0)": 0.6618770141718782,
 "('tahn', 'Adam', 0.9, 0.0)": 0.6658333895283935,
 "('tahn', 'RMSprop', 0.9, 0.0)": 0.6653482452271642}

In [18]:
model=model_builder(params_dict,params_dict)

In [19]:
model.fit(train_dataset, nb_epoch=100)

149.27905597971446

In [20]:
obj=best_dnn.fit(train_dataset,**params_dict,epochs=100)

In [21]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

In [22]:
train_scores = model.evaluate(train_dataset, [metric], transformers)

computed_metrics: [0.9992345791571439, 0.9999362755808026, 1.0, 0.9999708454810496, 0.9999411816604418, 0.9996122682668753, 0.9999555634553856, 0.9976235842066264, 0.9998596986174626, 0.9998278713215178, 0.998548093636792, 0.9998855835240275, 0.9997605363984674, 0.9969014742588744, 0.9999281609195403, 0.9998780011711887, 0.9998473249143929, 0.999340909807052, 0.9999244696294223, 0.9998406666989368, 0.9999284880876209, 0.9996081848826919, 0.9986471540861872, 0.9999521808612382, 0.9999600752037979, 0.9999506260615397, 0.9996762126525074]


In [23]:
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

computed_metrics: [0.6434418145956607, 0.5626721763085399, nan, 0.6386306001690616, 0.6148648648648649, 0.6351411922904527, 0.7038880540946657, 0.556706114398422, 0.6301378446115289, 0.6377690802348337, 0.6395463510848127, 0.6486220472440944, 0.7157939189189189, 0.6166666666666667, 0.6636363636363636, 0.6157536985205918, 0.6135986733001658, 0.6054438058748404, 0.5910087719298245, 0.5916099773242631, 0.660904255319149, 0.6610441767068271, 0.7404643449419569, 0.6740185950413222, 0.7277318640955004, 0.7658536585365854, 0.6884498480243162]




In [24]:
test_scores = model.evaluate(test_dataset, [metric], transformers)

computed_metrics: [0.5822784810126582, 0.5089580386610089, 0.5340579710144928, 0.5793103448275863, 0.5206611570247933, 0.5994288681204569, 0.5800751879699249, 0.5588235294117647, 0.6313953488372093, 0.6962745098039216, 0.593421052631579, 0.6276978417266188, 0.6825958702064896, 0.586231884057971, 0.6267796610169492, 0.5988662131519275, 0.5467455621301776, 0.5789855072463768, 0.5669312169312168, 0.6448377581120945, 0.5914160401002506, 0.6442778457772337, 0.5332130730050934, 0.600587084148728, 0.6967105263157896, 0.7518248175182483, 0.5408163265306123]


In [25]:
print(train_scores)
print(valid_scores)
print(test_scores)

{'mean-roc_auc_score': 0.9995385189089475}
{'mean-roc_auc_score': 0.6478230291820858}
{'mean-roc_auc_score': 0.6001185821218954}


In [26]:
import seaborn as sns


sns.set(rc={'figure.figsize':(10,7)})
sns.set(style="whitegrid")
ax = sns.barplot(x=[test_scores['mean-roc_auc_score']],
                 y=['First Scenario'])
ax.set(xlim=(0, None))

[(0, 0.6301245112279902)]