In [3]:
import numpy as np
import tensorflow as tf
import deepchem as dc
from sklearn.kernel_ridge import KernelRidge



In [4]:
dataset_file = "../../deepchem/datasets/gdb1k.sdf"
smiles_field = "smiles"
mol_field = "mol"
featurizer = dc.feat.CoulombMatrixEig(23, remove_hydrogens=False)

loader = dc.data.SDFLoader(
      tasks=["atomization_energy"], smiles_field="smiles",
      featurizer=featurizer,
      mol_field="mol")
dataset = loader.featurize(dataset_file)


Loading raw samples now.
shard_size: 8192
Reading structures from ../../deepchem/datasets/gdb1k.sdf.
Currently featurizing feature_type: CoulombMatrixEig
Featurizing sample 0
TIMING: featurizing shard 0 took 0.627 s
TIMING: dataset construction took 0.851 s
Loading dataset from disk.


In [5]:
# Split dataset
random_splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = random_splitter.train_valid_test_split(dataset)

# Apply normalisation transformer
transformers = [
    dc.trans.NormalizationTransformer(transform_X=True, dataset=train_dataset),
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]

for dataset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
        dataset = transformer.transform(dataset)

TIMING: dataset construction took 0.012 s
Loading dataset from disk.
TIMING: dataset construction took 0.009 s
Loading dataset from disk.
TIMING: dataset construction took 0.009 s
Loading dataset from disk.
TIMING: dataset construction took 0.042 s
Loading dataset from disk.
TIMING: dataset construction took 0.012 s
Loading dataset from disk.
TIMING: dataset construction took 0.008 s
Loading dataset from disk.
TIMING: dataset construction took 0.007 s
Loading dataset from disk.
TIMING: dataset construction took 0.009 s
Loading dataset from disk.
TIMING: dataset construction took 0.009 s
Loading dataset from disk.


In [10]:
def krr_model_builder(model_params, model_dir):
    sklearn_model = KernelRidge(**model_params)
    return dc.models.SklearnModel(sklearn_model, model_dir)

params_dict = {
    "kernel": ["laplacian", "linear"],
    "alpha": [0.0001, 0.001, 0.01, 0.01],
    "gamma": [0.0001, 0.001, 0.01, 0.1]
}

metric = dc.metrics.Metric(dc.metrics.mean_absolute_error)

optimizer = dc.hyper.HyperparamOpt(krr_model_builder)
best_krr, best_krr_hyperparams, all_krr_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, transformers,
    metric=metric)




Fitting model 1/32
hyperparameters: {'kernel': 'laplacian', 'gamma': 0.0001, 'alpha': 0.0001}
computed_metrics: [86283.96601157558]
Model 1/32, Metric mean_absolute_error, Validation set 0: 86283.966012
	best_validation_score so far: 86283.966012
Fitting model 2/32
hyperparameters: {'kernel': 'laplacian', 'gamma': 0.0001, 'alpha': 0.001}
computed_metrics: [76013.57938648511]
Model 2/32, Metric mean_absolute_error, Validation set 1: 76013.579386
	best_validation_score so far: 86283.966012
Fitting model 3/32
hyperparameters: {'kernel': 'laplacian', 'gamma': 0.0001, 'alpha': 0.01}
computed_metrics: [71806.19872522898]
Model 3/32, Metric mean_absolute_error, Validation set 2: 71806.198725
	best_validation_score so far: 86283.966012
Fitting model 4/32
hyperparameters: {'kernel': 'laplacian', 'gamma': 0.0001, 'alpha': 0.01}
computed_metrics: [71806.19872522898]
Model 4/32, Metric mean_absolute_error, Validation set 3: 71806.198725
	best_validation_score so far: 86283.966012
Fitting model 5/3

In [None]:
# Metrics to models
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error)
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]