In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

## Experiments on small sample

In [17]:
df_sample_train, df_sample_test = train_test_split(df_train, train_size=0.01, test_size=0.01)
print df_sample_train.shape
print df_sample_test.shape
df_sample_train.head()

(10000, 258)
(10000, 258)


Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
426390,C1=CC2=C([SiH2]1)C=C([SiH2]2)c1cc2ncccc2c2=CCC...,1,0,0,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1.79
162601,[nH]1c2cc(-c3cncs3)c3cocc3c2c2c3cocc3c3C=C[SiH...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.79
879608,c1cc2[se]c3c(ccc4cc(-c5cccc6c[nH]cc56)c5=CCC=c...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,2.07
816701,[nH]1ccc2c3nsnc3c3c4c5nsnc5c(cc4c4=CCC=c4c3c12...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.58
949758,C1=Cc2c([SiH2]1)c1c3cocc3c(cc1c1nsnc21)-c1scc2...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.45


In [67]:
#ngram_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)
#ngram_vectorizer.fit(df_train.smiles)
X_train = ngram_vectorizer.transform(df_sample_train.smiles)
X_test = ngram_vectorizer.transform(df_sample_test.smiles)
y_train = df_sample_train.gap.values
y_test = df_sample_test.gap.values

In [21]:
params = {'n_estimators': [20,100,200]}
rfr = GridSearchCV(estimator=RandomForestRegressor(), param_grid=params, cv=2)
rfr.fit (X_train, y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_estimators': [20, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [32]:
def score(gridsearch):
    print('RMSE'),
    print mean_squared_error(gridsearch.predict(X_test), y_test)
    print('BEST'),
    print gridsearch.best_params_
score (rfr)

RMSE 0.0319368402442
BEST {'n_estimators': 200}


In [29]:
from sklearn.linear_model import *
params = {'alpha': [ 1e-5, 1e-4, 1e-3, 0.1, 1]}
ridge = GridSearchCV(estimator=Ridge(), param_grid=params)
ridge.fit (X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [33]:
score(ridge)

RMSE 0.0370029264426
BEST {'alpha': 0.001}


In [68]:
lasso = GridSearchCV(estimator=Lasso(max_iter=10000), param_grid=params)
lasso.fit (X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [35]:
score(lasso)

RMSE 0.0378248773261
BEST {'alpha': 1e-05}


In [36]:
elastic = GridSearchCV(estimator=ElasticNet(), param_grid=params)
elastic.fit (X_train, y_train)
score(elastic)

RMSE 0.0381340201797
BEST {'alpha': 1e-05}


## ALL DATA

In [6]:
fourgram_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4), lowercase=False)
fourgram_vectorizer.fit(df_train.smiles)

TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [102]:
X_train, X_test,y_train ,y_test = train_test_split(df_train.smiles, df_train.gap.values, test_size=0.1)

In [103]:
#X_train = fourgram_vectorizer.transform(X_train)
#X_test3g = ngram_vectorizer.transform(X_test)
X_test4g = fourgram_vectorizer.transform(X_test)

In [61]:
ridge = Ridge(alpha=0.001)
ridge.fit(X_train, y_train)
mean_squared_error(ridge.predict(X_test), y_test)

0.022809802525883563

In [7]:
X = fourgram_vectorizer.transform(df_train.smiles)

In [64]:
ridge.fit(X, df_train.gap.values)

Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

In [66]:
mean_squared_error(y_test, ridge.predict(X_test))

0.022750743792671384

## RandomForest in batches

In [9]:
def regress(forest, fold, trees):
    X = fourgram_vectorizer.transform(df_train.smiles[fold])
    y = df_train.gap.values[fold]
    forest.set_params(n_estimators=trees)
    forest.fit(X,y)
def rmse(true, pred):
    return mean_squared_error(true, pred)

In [None]:
rfr = RandomForestRegressor(warm_start=True)
trees = 20 
regress(rfr,slice(0, 100000), trees)
#print rmse(y_test,rfr.predict(X_test4g))

In [None]:
trees = 40
regress(rfr,slice(100000, 200000), trees)
print rmse(y_test,rfr.predict(X_test4g))

In [None]:
trees = 60
regress(rfr,slice(200000, 300000), trees)
print rmse(y_test,rfr.predict(X_test4g))

In [None]:
trees = 80
regress(rfr,slice(300000, 400000), trees)
print rmse(y_test,rfr.predict(X_test4g))

In [None]:
trees = 100
regress(rfr,slice(400000, 500000), trees)
print rmse(y_test,rfr.predict(X_test4g))

In [None]:
trees = 120
regress(rfr,slice(500000, 600000), trees)
print rmse(y_test,rfr.predict(X_test4g))

In [None]:
trees = 140
regress(rfr,slice(600000, 700000), trees)
print rmse(y_test,rfr.predict(X_test4g))

In [None]:
trees = 160
regress(rfr,slice(700000, 800000), trees)
print rmse(y_test,rfr.predict(X_test4g))

## Save to disc

In [65]:
import pickle
f = open('1-4grams_ALLs.ridge', 'wb')
pickle.dump(ridge, f)
#f = open('1-3grams_80000s.rfr', 'rb')
#rfr = pickle.load(f)

In [53]:
mean_squared_error(y_test, rfr.predict(X_test))

0.032025732949881439

## Submission

In [70]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [73]:
test_pred = fourgram_vectorizer.transform(df_test.smiles)
write_to_file("1-4grams_ALLs.ridge.csv", ridge.predict(test_pred))