In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [6]:
df,_ = train_test_split(df_train, train_size=0.1)
print df.shape
df.head()


(100000, 258)


Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
187239,[nH]1cccc1-c1cc2cc3[se]c4ccccc4c3cc2c2=C[SiH2]...,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1.96
872787,[nH]1c2C=C([SiH2]c2c2cc3c4c[nH]cc4ccc3cc12)c1c...,1,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1.99
556801,[nH]1-c2cc[se]c2-c2[SiH2]c3ccc4c[nH]cc4c3-c12,1,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,2.35
256612,c1ccc([se]1)-c1cc2cnc3c4ccccc4[se]c3c2c2cscc12,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,2.5
516063,c1cnc(s1)-c1sc(c2[SiH2]ccc12)-c1sc(-c2cccc3nsn...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.29


In [7]:
X = df.drop(['gap'], axis=1)
X.shape

(100000, 257)

In [8]:
y = df.gap.values
y.shape

(100000,)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Baselines

In [10]:
lr = LinearRegression()
lr.fit(X_train.drop(['smiles'], axis=1), y_train)
lr_pred = lr.predict(X_test.drop(['smiles'], axis=1))
mean_squared_error(y_test,lr_pred)

In [12]:
rf = RandomForestRegressor()
rf.fit(X_train.drop(['smiles'], axis=1), y_train)
rf_pred = rf.predict(X_test.drop(['smiles'], axis=1))
mean_squared_error(y_test,rf_pred)

0.076909157274511933

## NGRAMS

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
ngram_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), min_df=1)
ngram_vectorizer.fit(X.smiles)

TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
ngrams_train = ngram_vectorizer.transform(X_train.smiles)
ngrams_test = ngram_vectorizer.transform(X_test.smiles)

In [18]:
rfr = RandomForestRegressor()
rfr.fit(ngrams_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [19]:
rfr_pred = rfr.predict(ngrams_test)
mean_squared_error(y_test, rfr_pred)

0.033493121783189701

## Submission

In [20]:
rfr_test_pred = rfr.predict(ngram_vectorizer.transform(df_test.smiles))

In [21]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [22]:
write_to_file("rfr_ngrams_1.csv", rfr_test_pred)
