In [13]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import tensorflow as tf
import deepchem as dc
from tensorflow import keras
import joblib
from deepchem.utils.save import load_from_disk
from deepchem.models.sklearn_models import SklearnModel

sns.set()


## Pre-trained models
#
# Every model is loaded from a file in the current working directory.
# Paths are handed straight to the loaders so that no file handle is left
# open by this cell.
#
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading - only point it at model files from a trusted source.

## Keras model - binarised representation
keras_bin = keras.models.load_model("keras_from_mat_sgd_0.001_400_100_18072019.h5")

## Keras model - eigenspectrum representation
keras_eig = keras.models.load_model("keras_from_mat_eigenspectrum_23072019.h5")

## KRR model - binarised representation
krr_bin = joblib.load("krr_binarised_laplacian_24072019.sav")

## KRR model - eigenspectrum representation
krr_eig = joblib.load("krr_eig_laplacian_24072019.sav")


# Registry of models keyed by "<learner>_<representation>".
# Each value is a one-element list; later cells read the model as models[name][0].
models = {
    'keras_bin': [keras_bin],
    'keras_eig': [keras_eig],
    'krr_bin': [krr_bin],
    'krr_eig': [krr_eig],
}

In [2]:
## Load data set
#
# The same SDF file is featurised twice: once with CoulombMatrix
# (-> dataset) and once with CoulombMatrixEig (-> dataset_eig).

dataset_file = "raw_data/gdb7.sdf"
tasks = ["u0_atom"]

# --- CoulombMatrix features ---
featurizer = dc.feat.CoulombMatrix(max_atoms=23)
loader = dc.data.SDFLoader(
    tasks=list(tasks),  # fresh copy, so the two loaders never share a list
    smiles_field="smiles",
    mol_field="mol",
    featurizer=featurizer,
)
dataset = loader.featurize(dataset_file)

# --- CoulombMatrixEig features ---
eig_featurizer = dc.feat.CoulombMatrixEig(23, remove_hydrogens=False)
data_loader_eig = dc.data.SDFLoader(
    tasks=list(tasks),
    smiles_field="smiles",
    featurizer=eig_featurizer,
    mol_field="mol",
)
dataset_eig = data_loader_eig.featurize(dataset_file)


Loading raw samples now.
shard_size: 8192
Reading structures from raw_data/gdb7.sdf.
Currently featurizing feature_type: CoulombMatrix
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 3.635 s
TIMING: dataset construction took 5.626 s
Loading dataset from disk.
Loading raw samples now.
shard_size: 8192
Reading structures from raw_data/gdb7.sdf.
Currently featurizing feature_type: CoulombMatrixEig
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 4.927 s
TIMING: dataset construction took 6.347 s
Loading dataset from disk.


In [9]:
def split_transformer(splitter, trans_func, dataset):
    """Split ``dataset`` and apply the requested transformer to every split.

    Parameters
    ----------
    splitter
        Object exposing ``train_valid_test_split(dataset)`` that returns a
        ``(train, valid, test)`` triple.
    trans_func : str or None
        ``'normaliser'`` builds a ``dc.trans.NormalizationTransformer``
        (``transform_X=True``, ``move_mean=True``); ``'binariser'`` builds a
        ``dc.trans.CoulombFitTransformer``. Both are constructed from the
        training split only. ``None`` returns the splits untransformed.
        Any other value also returns the splits untransformed, but emits a
        ``UserWarning`` so that a misspelt name does not go unnoticed.
    dataset
        Dataset handed to ``splitter``.

    Returns
    -------
    tuple
        ``(train, valid, test)``, transformed in that order when a
        transformer was selected.
    """
    import warnings

    train, valid, test = splitter.train_valid_test_split(dataset)

    if trans_func == 'normaliser':
        transformer = dc.trans.NormalizationTransformer(
            transform_X=True, dataset=train, move_mean=True)
    elif trans_func == 'binariser':
        transformer = dc.trans.CoulombFitTransformer(dataset=train)
    else:
        if trans_func is not None:
            # Keep the historical pass-through behaviour, but make it visible.
            warnings.warn(
                "split_transformer: unrecognised trans_func %r; expected "
                "'normaliser' or 'binariser'. Returning untransformed splits."
                % (trans_func,),
                stacklevel=2,
            )
        return train, valid, test

    # One transformer, built from the training split, is applied to all three.
    return tuple(transformer.transform(split) for split in (train, valid, test))

In [18]:
## Split and transform datasets
#
# One splitter instance is reused for both representations.
# NOTE(review): no seed is passed anywhere in this notebook - confirm that
# the eigenspectrum and binarised splits (and the splits the saved models
# were trained on) contain the same molecules.

splitter = dc.splits.SingletaskStratifiedSplitter(task_number=0)

train_eig, valid_eig, test_eig = split_transformer(splitter, 'normaliser', dataset_eig)
train_bin, valid_bin, test_bin = split_transformer(splitter, 'binariser', dataset)

# Report the shapes of the two training splits.
for training_split in (train_eig, train_bin):
    print(training_split.get_shape())


TIMING: dataset construction took 0.064 s
Loading dataset from disk.
TIMING: dataset construction took 0.024 s
Loading dataset from disk.
TIMING: dataset construction took 0.023 s
Loading dataset from disk.
TIMING: dataset construction took 0.052 s
Loading dataset from disk.


  X = np.nan_to_num((X - self.X_means) / self.X_stds)
  X = np.nan_to_num((X - self.X_means) / self.X_stds)


TIMING: dataset construction took 0.014 s
Loading dataset from disk.
TIMING: dataset construction took 0.015 s
Loading dataset from disk.
TIMING: dataset construction took 0.410 s
Loading dataset from disk.
TIMING: dataset construction took 0.196 s
Loading dataset from disk.
TIMING: dataset construction took 0.178 s
Loading dataset from disk.
TIMING: dataset construction took 2.195 s
Loading dataset from disk.
TIMING: dataset construction took 0.287 s
Loading dataset from disk.
TIMING: dataset construction took 0.267 s
Loading dataset from disk.
((5728, 23), (5728, 1), (5728, 1), (5728,))
((5728, 2094), (5728, 1), (5728, 1), (5728,))


In [19]:
## Test data
X_test = test_eig.X
y_test = test_eig.y

# Number of leading binarised feature columns passed to the *_bin models.
# NOTE(review): presumably the input width those models were trained with;
# the binarised features built in this notebook are wider - confirm that
# truncating columns is the intended way to reconcile the two.
N_BIN_FEATURES = 2089

X_bin_test = test_bin.X[:, :N_BIN_FEATURES]

# The labels hold a single task column, so they are taken as-is. The feature
# slice previously applied here had no effect on the label array.
y_bin_test = test_bin.y

In [23]:
# Predict on the test split with every model; one DataFrame column per model.
from sklearn import metrics

# The first column seeds the frame.
keras_bin_pred = models['keras_bin'][0].predict(X_bin_test)
predictions = pd.DataFrame(keras_bin_pred, columns=['keras_bin'])

# Remaining models, each paired with the feature matrix it is given.
remaining_models = (
    ('krr_bin', X_bin_test),
    ('keras_eig', X_test),
    ('krr_eig', X_test),
)
for model_name, test_features in remaining_models:
    predictions[model_name] = models[model_name][0].predict(test_features)



In [28]:
# Error analysis
#
# For every model, compare its column in `predictions` with the labels of
# the test split that matches its input representation.

errors = {}

for model_name in models:
    # '*_bin' models are scored against the binarised split's labels,
    # all other models against the eigenspectrum split's labels.
    y_true = y_bin_test if 'bin' in model_name else y_test
    y_pred = predictions[model_name]

    rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
    errors[model_name] = {
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        # Legacy key: despite its name it has always held the ROOT mean
        # squared error. Kept unchanged so existing readers still work.
        'mse': rmse,
        # Correctly named key for the same value; prefer this one.
        'rmse': rmse,
    }

errors

{'keras_bin': {'mae': 62.884709861371775, 'mse': 117.16826913806958},
 'keras_eig': {'mae': 42.113606628758944, 'mse': 100.55122848744365},
 'krr_bin': {'mae': 120.56416409810042, 'mse': 150.82757726914818},
 'krr_eig': {'mae': 55.788587067307674, 'mse': 92.79802797436831}}

In [None]:

# Grouped bar chart of the test-set errors: one group per model, one bar per
# metric. Heights are read per model so that every bar shows that model's own
# error rather than a single value repeated at every position.
model_names = list(errors)
pos = list(range(len(model_names)))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 5))

# (key in `errors`, legend label, bar colour).
# errors[...]['mse'] stores the square root of the mean squared error,
# hence the 'RMSE' legend label.
metric_series = [
    ('mae', 'MAE', 'green'),
    ('mse', 'RMSE', 'steelblue'),
]

for offset, (metric_key, legend_label, colour) in enumerate(metric_series):
    ax.bar(
        [p + offset * width for p in pos],
        [errors[name][metric_key] for name in model_names],
        width,
        alpha=0.5,
        color=colour,
        label=legend_label,
    )

# Centre each tick under its pair of bars and name the model it belongs to.
ax.set_xticks([p + width / 2 for p in pos])
ax.set_xticklabels(model_names)
ax.set_xlabel('Model')
ax.set_ylabel('Test-set error')
ax.set_title('Test-set error by model')
ax.legend()

plt.show()

