In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation

Using TensorFlow backend.


In [2]:
from __future__ import division, print_function

import joblib
import numpy as np
from IPython.display import SVG
from scipy.spatial.distance import pdist, squareform, jaccard, cityblock
from scipy import stats

from multiprocessing import Pool
from copy import deepcopy

from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model as LM

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, RidgeCV, BayesianRidge, ElasticNet, Lasso

# Model dictionaries - we're just going to be looking at Ridge and Random Forest here.
model_dict = {'ridge' : {'m' : Ridge, 'kw' : {'fit_intercept':True, 'alpha':0.1}},
              'rf'    : {'m' : RandomForestRegressor, 'kw' : {'n_estimators':100, 'n_jobs':4, 'max_depth':10}}}

# Datasets
from glob import glob
targets = [s.replace('datasets/', '') for s in glob('datasets/*')]

import matplotlib.pyplot as plt
%pylab inline
figsize(20, 10)
import seaborn as sns

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Pull in data for a single target name
def get_data(tgt_name='COX-2'):
    data_dir = 'datasets/' + tgt_name + '/'
    preds = joblib.load(data_dir + tgt_name + '_predsu.npy')
    resps = joblib.load(data_dir + tgt_name + '_respu.npy')
    smiles = joblib.load(data_dir + tgt_name + '.smiu')
    return preds, resps, smiles

In [6]:
preds, resps, _ = get_data('HERG')

In [7]:
preds.shape

(5010, 128)

In [8]:
model = Sequential([
    Dense(32, input_shape=(128,)),
    Activation('relu'),
    Dense(10),
    Activation('relu'),
])

In [9]:
# For a mean squared error regression problem
model.compile(optimizer='rmsprop',
              loss='mse')

https://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/

In [10]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [11]:
# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(128, input_dim=128, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [12]:
seed = 7
numpy.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

In [13]:
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, preds, resps, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: -0.56 (0.06) MSE


In [21]:
estimator.fit(preds, resps, epochs=50, batch_size=5)

<keras.callbacks.History at 0x1a19fda290>

In [18]:
estimator.predict(preds)

array([ 7.35495281,  5.05813503,  5.6678009 , ...,  6.90033531,
        5.33633423,  5.65323162], dtype=float32)

In [19]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model as LM

def regress(response, pred_list, one=False, do_print=True):

    regr = LM.LinearRegression(fit_intercept=one)
    regr.fit(np.asarray(pred_list).swapaxes(0, 1), response)
    
    presp = regr.predict(np.asarray(pred_list).swapaxes(0, 1))
    r2 = r2_score(response, presp)
    if do_print:
        if one:
            rs = ('%6.4f '* (len(pred_list) + 1)) % (tuple(regr.coef_) + (regr.intercept_,))
        else:
            rs = ('%6.4f '* (len(pred_list)) % tuple(regr.coef_))
        print ('Coeffs:       ' + rs)
        print ('R-squared: {:9.4f}'.format(r2))
    else:
        return regr.coef_, r2

In [20]:
regress(resps, [estimator.predict(preds),])

Coeffs:       1.0050 
R-squared:    0.9486


In [22]:
estimator2 = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

In [23]:
estimator2.fit(preds, resps, validation_split=0.33)

<keras.callbacks.History at 0x1a19b21810>

In [24]:
regress(resps, [estimator2.predict(preds),])

Coeffs:       1.0009 
R-squared:    0.7260


In [27]:
2**7

128

In [28]:
# define the model
def larger_model():
    # create model
    model = Sequential()
    model.add(Dense(128, input_dim=128, kernel_initializer='normal', activation='relu'))
    model.add(Dense(16, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [29]:
numpy.random.seed(7)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, preds, resps, cv=kfold)
print("Larger: %.2f (%.2f) MSE" % (results.mean(), results.std()))



Larger: -0.49 (0.03) MSE


In [30]:
pipeline.fit(preds, resps)

Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', <keras.wrappers.scikit_learn.KerasRegressor object at 0x1a17878810>)])

In [31]:
regress(resps, [pipeline.predict(preds),])

Coeffs:       0.9963 
R-squared:    0.9603


In [33]:
pipeline = Pipeline(estimators)

In [38]:
pipeline.fit(preds, resps,)

Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', <keras.wrappers.scikit_learn.KerasRegressor object at 0x1a17878810>)])

In [39]:
regress(resps, [pipeline.predict(preds),])

Coeffs:       0.9982 
R-squared:    0.7652
