In [3]:
from __future__ import division

import gzip
import io
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.utils import np_utils
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

Using TensorFlow backend.


In [2]:
def rmse(preds, y_true=None):
    ''' root-mean-squared error between predictions and targets

    preds  : array-like of predicted values
    y_true : array-like of true values; when omitted the notebook-level
             `y_valid` is used, which is what this function always did before

    Both inputs are flattened before subtracting so that an (n, 1) prediction
    array cannot silently broadcast against an (n,) target into an (n, n)
    matrix. Raises ValueError when the flattened lengths differ.
    '''
    if y_true is None:
        # backward-compatible default: score against the global validation targets
        y_true = y_valid
    preds = np.asarray(preds, dtype=float).ravel()
    y_true = np.asarray(y_true, dtype=float).ravel()
    if preds.shape != y_true.shape:
        raise ValueError("preds and y_true differ in length: %d vs %d"
                         % (preds.size, y_true.size))
    return np.sqrt(np.mean((preds - y_true) ** 2))

In [6]:
def load_train_valid(trainfile, validation_split=0):
    ''' load training data and optionally hold out a validation tail

    trainfile        : path of a CSV readable by pandas that contains a 'pad'
                       column (the regression target), a 'smiles' column and
                       the feature columns
    validation_split : fraction of rows, taken from the END of the file, to
                       return as the validation set; any falsy value
                       (0, 0.0, None) disables the split

    returns (X_train, y_train, X_valid, y_valid): the X's are numpy arrays of
    every column except 'pad' and 'smiles', the y's are slices of the 'pad'
    column, and both *_valid entries are None when no split was requested
    '''
    df_train = pd.read_csv(trainfile)
    # bracket access on purpose: attribute access (df_train.pad) resolves to
    # the DataFrame.pad method on pandas versions that define one
    df_train_gap = df_train['pad']
    features = df_train.drop(['pad', 'smiles'], axis=1).values

    # compute the cut once so X and y can never be split at different rows
    n_rows = features.shape[0]
    n_train = int(n_rows * (1 - validation_split)) if validation_split else n_rows

    X_train = features[:n_train]
    y_train = df_train_gap[:n_train]
    # single-argument print(...) prints the same text on Python 2 and Python 3
    print("DATA LOADED:")
    print("X_train shape: {0}  |  y_train shape: {1}".format(X_train.shape, y_train.shape))

    # truthiness instead of `is not 0`: the identity test was True for 0.0 and
    # then returned empty validation arrays instead of None
    if not validation_split:
        return X_train, y_train, None, None

    X_valid = features[n_train:]
    y_valid = df_train_gap[n_train:]
    print("X_valid shape: {0}  |  y_valid shape: {1}".format(X_valid.shape, y_valid.shape))
    return X_train, y_train, X_valid, y_valid

# keep the trailing 20% of the rows as the validation set
X_train, y_train, X_valid, y_valid = load_train_valid(
    'Data/morganfingerprint_train_100K.csv.gz',
    validation_split=0.2,
)

DATA LOADED:
X_train shape: (81920, 2048)  |  y_train shape: (81920,)
X_valid shape: (20481, 2048)  |  y_valid shape: (20481,)


In [34]:
def load_test(testfile):
    ''' read the test CSV and return its feature columns as a numpy array

    the 'Id' and 'smiles' columns are discarded; every other column is kept
    '''
    feature_frame = pd.read_csv(testfile).drop(['Id', 'smiles'], axis=1)
    return feature_frame.values

# raw test set; load_test drops the 'Id' and 'smiles' columns
X_test = load_test(testfile='test.csv.gz')

In [13]:
# this file is read without a header row; column 0 is dropped and the
# remaining columns are kept as a plain numpy array
X_test = (
    pd.read_csv('Data/morganfingerprint_test_800K.csv.gz', header=None)
    .drop(0, axis=1)
    .values
)

In [None]:
# linear regression - original dataframe
LR = LinearRegression()
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_valid)
LR_rmse = rmse(LR_pred)
print "Linear Regression RMSE:", LR_rmse

In [7]:
# linear regression - different fingerprints
LR = LinearRegression()
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_valid)
LR_rmse = rmse(LR_pred)
print "Linear Regression RMSE:", LR_rmse

Linear Regression RMSE: 110.832719538


In [4]:
# random forest - original dataframe
RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_valid)
RF_rmse = rmse(RF_pred)
print "Random Forest Regression RMSE:", RF_rmse

Random Forest Regression RMSE: 0.273389041375


In [8]:
# random forest - different fingerprints
RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_valid)
RF_rmse = rmse(RF_pred)
print "Random Forest Regression RMSE:", RF_rmse

Random Forest Regression RMSE: 0.109962810099


In [21]:
# persist the fitted forest; the context manager flushes and closes the file
# even if pickling fails part-way (the bare open() call never closed it)
# NOTE(review): the load cell further down reads "Models/RF_100k.p" -- confirm
# the file is moved there, or align the two paths
with open('RF_100k.p', 'wb') as rf_file:
    pickle.dump(RF, rf_file)

In [27]:
# neural net: one hidden ReLU layer of 128 units feeding a single linear
# output, trained with mean-squared-error loss
model = Sequential()
# take the input width from the data instead of hard-coding 256, so the
# network matches whichever fingerprint file was loaded into X_train
model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')
# validation_split here holds out 10% of X_train for per-epoch monitoring;
# it is separate from the X_valid / y_valid hold-out
history = model.fit(X_train, y_train, batch_size=128, epochs=20, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [23]:
# the model is compiled with mean_squared_error and no extra metrics, so the
# value shown is presumably the validation MSE (not RMSE) -- confirm
model.evaluate(x=X_valid, y=y_valid)



0.076093788065314291

In [28]:
# ---- save model ----
# serialise the trained network to 'nn_model.h5' so it can be reloaded later
model.save(filepath='nn_model.h5')

In [3]:
# restore the network previously written by model.save
model = load_model(filepath='nn_model.h5')

In [8]:
# restore the pickled random forest; the context manager closes the file
# handle, which the bare open() call left dangling
# NOTE: unpickling runs arbitrary code -- only load files you produced yourself
with open("Models/RF_100k.p", "rb") as rf_file:
    RF = pickle.load(rf_file)

In [None]:
def prediction_generator():
    ''' lazily yield one random-forest prediction per line of test.csv.gz

    Skips the header line, takes the second comma-separated field of every
    remaining line as the SMILES string, turns it into a radius-2 Morgan
    bit-vector fingerprint and feeds that fingerprint to the fitted `RF`.

    Relies on notebook-level names: `RF`, plus rdkit's `Chem`, `AllChem` and
    `DataStructs`, which must already be imported when this is called.
    NOTE(review): assumes the fingerprint settings used here match the ones
    that produced the training features -- confirm before trusting the output.
    '''
    # `with` closes the gzip handle once the generator is exhausted or closed
    with gzip.open('test.csv.gz', 'rb') as f1:
        next(f1, None)  # skip the header row
        fbuf = io.BufferedReader(f1)
        for line in fbuf:
            smile = line.decode('utf-8').split(',', 2)[1]
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smile), 2)
            # placeholder array that ConvertToNumpyArray fills with the bits
            arr = np.zeros((1,))
            DataStructs.ConvertToNumpyArray(fingerprint, arr)
            # predict from the fingerprint just computed; the previous code
            # passed X_test[0, :] and so repeated the first row's prediction
            yield RF.predict(arr.reshape(1, -1))[0]

In [None]:
# write "Id,Prediction" rows with 1-based ids; `filename` and `predictions`
# must already be defined in the notebook namespace
with open(filename, "w") as f:
    f.write("Id,Prediction\n")
    for row_id, p in enumerate(predictions, 1):
        f.write(",".join([str(row_id), str(p)]) + "\n")

In [43]:
# one batched forward pass over the whole test matrix; the previous loop
# called predict() once per row AND always indexed X_test[0, :], so every Id
# was written with the first row's prediction
nn_pred = model.predict(X_test)[:, 0]  # column 0 of the model output, one value per row
with open('nn_preds.csv', 'w') as f:
    f.write("Id,Prediction\n")
    for i, pred_curr in enumerate(nn_pred):
        # ids in the output file are 1-based
        f.write(str(i+1) + ',' + str(pred_curr) + '\n')

In [16]:
def write_to_file(filename, predictions):
    ''' write predictions to `filename` as a two-column CSV

    the first line is the header "Id,Prediction"; each following line holds a
    1-based running id and str() of the corresponding prediction
    '''
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            str(row_id) + "," + str(pred) + "\n"
            for row_id, pred in enumerate(predictions, 1)
        )

In [18]:
# score the full test matrix with the forest and write the submission file
RF_pred = RF.predict(X_test)
write_to_file(filename='Predictions/rf.csv', predictions=RF_pred)