In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import pickle

In [2]:
# Load the training and test sets into pandas DataFrames.
df_train, df_test = map(pd.read_csv, ("train.csv", "test.csv"))


In [49]:
def smiles_to_fingerprint(smiles, method='Morgan', radius=2, n_bits=2048):
    """Convert a SMILES string into a fingerprint feature vector.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    method : str, optional
        Fingerprint type. "Morgan" builds an RDKit Morgan bit-vector
        fingerprint. "other" is an unimplemented placeholder: it prints a
        message and returns None.
    radius : int, optional
        Morgan fingerprint radius. Defaults to 2, the value that used to be
        hard-coded.
    n_bits : int, optional
        Length of the bit vector. Defaults to 2048, which matches the column
        count the rest of this notebook expects.

    Returns
    -------
    numpy.ndarray or None
        The fingerprint bits copied into a NumPy array for "Morgan";
        None for the "other" placeholder.

    Raises
    ------
    ValueError
        If `smiles` cannot be parsed by RDKit, or `method` is not recognised.
    """
    if method == "Morgan":
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of an opaque error further down.
        if mol is None:
            raise ValueError("could not parse SMILES string: %r" % (smiles,))
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        # ConvertToNumpyArray fills (and resizes) the destination array.
        fp_arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, fp_arr)
        return fp_arr
    elif method == "other":
        # Placeholder for a second fingerprint type; nothing is computed yet.
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("another fingerprint")
        return None
    raise ValueError("unknown fingerprint method: %r" % (method,))

In [46]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV.

    The file starts with the header ``Id,Prediction``. Each prediction then
    gets its own row, with ids counting up from 1 in iteration order.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            "{0},{1}\n".format(row_id, str(value))
            for row_id, value in enumerate(predictions, 1)
        )

In [73]:
def do_forest(forest, smiles, y, fold=None, test_size=0.1, random_state=None):
    """Fit `forest` on fingerprints of `smiles` and print the hold-out MSE.

    Parameters
    ----------
    forest : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place.
    smiles : pandas.Series
        SMILES strings, one molecule per entry.
    y : array-like
        Regression targets, positionally aligned with `smiles`.
    fold : slice or positional indexer, optional
        Positional subset of rows to use. None (the default) uses every row.
    test_size : float, optional
        Fraction of the selected rows held out for evaluation. Defaults to
        0.1, the value that used to be hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` to make the split
        reproducible. None keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The mean squared error on the held-out rows is printed.
    """
    # Work with a plain array so targets are always indexed by position.
    y = np.asarray(y)
    # Compare against None explicitly: truthiness raises on array indexers
    # and would treat an empty fold as "use everything".
    if fold is not None:
        # .iloc keeps the SMILES selection positional, matching the targets.
        smiles = smiles.iloc[fold]
        y = y[fold]
    fps = smiles.apply(smiles_to_fingerprint).tolist()
    # Stack the per-molecule vectors; the width comes from the fingerprints
    # themselves rather than a hard-coded 2048.
    X = np.asarray(fps, dtype=float)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    forest.fit(X_train, y_train)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(mean_squared_error(y_test, forest.predict(X_test)))

In [39]:
# Convert the test set to fingerprints for submission and cache the matrix on
# disk (np.save appends the ".npy" extension to the given name).
test_fps = df_test.smiles.apply(smiles_to_fingerprint)
# Build the matrix from the fingerprints themselves instead of filling an
# uninitialised array with a hard-coded row count, so a test set of a
# different size cannot leave garbage rows behind or overflow the buffer.
X_test = np.asarray(test_fps.tolist(), dtype=float)
np.save('Data/test_morgans',X_test)

In [78]:
# Reload the test-set fingerprint matrix from its .npy file on disk.
X_test = np.load('Data/test_morgans.npy')

In [20]:
# Create a 50-tree random forest with warm_start enabled.
# NOTE(review): per the scikit-learn docs, warm_start=True lets later fit()
# calls add trees to the existing ensemble instead of rebuilding it - confirm
# against the installed version.
forest = RandomForestRegressor(n_estimators=50, warm_start=True)
# Fit on the first 100,000 training rows; do_forest prints the MSE measured on
# the 10% of those rows it holds out.
do_forest(forest, df_train.smiles,df_train.gap, slice(0,100000))

0.0100673940578


In [23]:
# Persist the fitted 50-tree forest. The context manager closes the file even
# if pickling fails part-way through.
with open('Data/morgan_50t.forest', 'wb') as f:
    pickle.dump(forest, f)

In [44]:
# Reload the pickled 50-tree forest; the context manager closes the handle,
# which was previously left open.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('Data/morgan_50t.forest', 'rb') as f:
    forest = pickle.load(f)

In [47]:
# Predict on the test-set fingerprints with the current forest and write the
# results as an Id,Prediction CSV.
write_to_file('Data/morgan_50t_forest.csv', forest.predict(X_test))

In [74]:
# Raise the tree count to 100 and fit on training rows 100,000-199,999.
# NOTE(review): the forest is constructed with warm_start=True, which per the
# scikit-learn docs should keep the existing trees and only train the newly
# added ones on this fold - confirm.
forest.set_params(n_estimators=100)
do_forest(forest, df_train.smiles, df_train.gap.values, slice(100000,200000))

0.00878610689185


In [75]:
# Raise the tree count to 150 and fit on training rows 200,000-299,999.
# NOTE(review): as with the previous fold, this relies on warm_start=True
# adding trees to the existing ensemble rather than refitting it - confirm.
forest.set_params(n_estimators=150)
do_forest(forest, df_train.smiles, df_train.gap.values, slice(200000,300000))

0.00875536170404


In [None]:
# Persist the 150-tree forest. The context manager closes the file even if
# pickling fails part-way through.
with open('Data/morgan_150t.forest', 'wb') as f:
    pickle.dump(forest, f)

In [80]:
# Predict on the test-set fingerprints with the current forest and write the
# results as an Id,Prediction CSV.
write_to_file('Data/morgan_150t_forest.csv', forest.predict(X_test))