In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [14]:
#import rdkit
import rdkit.Chem.AllChem
#import rdkit.Chem.Draw

In [28]:
from collections import Counter

In [15]:
%%time
# Load the training table from train.csv in the working directory
# (the recorded timing below shows this read takes over a minute).
df_train = pd.read_csv("train.csv")

Wall time: 1min 19s


In [16]:
%%time
# Work on a random subset of 100000 rows (df_train.sample(frac=0.1) is the
# fractional alternative). A fixed random_state makes the subset, and every
# number derived from it, reproducible after a kernel restart.
# Rebinding the name releases the last reference to the full frame, so no
# temporary variable or explicit `del` is required.
df_train = df_train.sample(n=100000, random_state=0)

Wall time: 1.67 s


In [17]:
# All columns except the first and the last one are treated as the provided
# features (presumably the first is the SMILES string and the last the target
# -- verify against train.csv).
orig_feature_names = df_train.columns[1:-1].tolist()
# Separate list that the feature-engineering cells extend; the original list
# is kept unchanged.
feature_names = list(orig_feature_names)

In [18]:
%%time
# Parse every SMILES string into an RDKit molecule object, one per row.
df_train['molecule'] = [rdkit.Chem.MolFromSmiles(smiles_str) for smiles_str in df_train['smiles'].values]

Wall time: 1min 43s


In [19]:
%%time
# Feature: number of atoms RDKit reports for each molecule.
df_train['atom_count'] = [len(molecule.GetAtoms()) for molecule in df_train['molecule'].values]
feature_names.append('atom_count')

Wall time: 590 ms


In [20]:
%%time
# Feature: character length of the SMILES string.
df_train['len'] = [len(smiles_str) for smiles_str in df_train['smiles'].values]
feature_names.append('len')

Wall time: 44 ms


In [21]:
%%time
# Feature: how many '=' characters appear in the SMILES string.
df_train['equals'] = [smiles_str.count('=') for smiles_str in df_train['smiles'].values]
feature_names.append('equals')

Wall time: 114 ms


In [22]:
%%time
# Feature: number of bonds per molecule whose RDKit bond type is DOUBLE.
# Compare against the named enum member rather than its raw integer value 2,
# and count directly instead of building a NumPy array for every molecule.
double_bond = rdkit.Chem.rdchem.BondType.DOUBLE
df_train['double'] = [
    sum(1 for bnd in molecule.GetBonds() if bnd.GetBondType() == double_bond)
    for molecule in df_train['molecule']
]
feature_names.append('double')

Wall time: 36.4 s


In [23]:
%%time
# Feature: number of bonds per molecule whose RDKit bond type is TRIPLE.
# Compare against the named enum member rather than its raw integer value 3,
# and count directly instead of building a NumPy array for every molecule.
triple_bond = rdkit.Chem.rdchem.BondType.TRIPLE
df_train['triple'] = [
    sum(1 for bnd in molecule.GetBonds() if bnd.GetBondType() == triple_bond)
    for molecule in df_train['molecule']
]
feature_names.append('triple')

Wall time: 33.5 s


In [25]:
%%time
# One column per atom index below the largest atom count in the sample:
# column 'ring <i>' holds NumAtomRings(i) from each molecule's ring info.
max_atoms = df_train['atom_count'].values.max()
for atom_idx in range(max_atoms):
    column = 'ring ' + str(atom_idx)
    df_train[column] = [molecule.GetRingInfo().NumAtomRings(atom_idx) for molecule in df_train['molecule']]
    feature_names.append(column)

Wall time: 13.9 s


In [56]:
%%time
# For every atomic number 1..150 collect, row by row, how many atoms of that
# element each molecule contains (0 when the element is absent).
atomic_number_range = range(1, 151)
element_count = {atomic_num: [] for atomic_num in atomic_number_range}
for molecule in df_train['molecule']:
    per_molecule = Counter(atom.GetAtomicNum() for atom in molecule.GetAtoms())
    for atomic_num in atomic_number_range:
        element_count[atomic_num].append(per_molecule[atomic_num])

Wall time: 29.9 s


In [63]:
%%time
# Keep only the elements that occur at least once in the sample and add one
# 'count <atomic number>' column for each of them.
elements = [atomic_num for atomic_num, per_row in element_count.items() if sum(per_row) > 0]
print(elements)
for atomic_num in elements:
    column = 'count ' + str(atomic_num)
    df_train[column] = element_count[atomic_num]
    feature_names.append(column)

Wall time: 368 ms


In [None]:
# commented out because this is just the sum of the 50 columns above, so it creates linear dependence and doesn't add information
#%%time
#max_atoms = max(df_train['atom_count'].values)
#df_train['ring total'] = map(lambda m : sum([m.GetRingInfo().NumAtomRings(r) for r in range(max_atoms)]), df_train['molecule'])
#feature_names.append('ring total')

In [29]:
#df_train['bond_count'] = map(lambda s : len(rdkit.Chem.MolFromSmiles(s).GetBonds()), df_train['smiles'].values)
#feature_names.append('bond_count')

In [129]:
# Replace the current row labels with a fresh default index; drop=True
# discards the old labels instead of keeping them as a column.
df_train = df_train.reset_index(drop=True)

In [130]:
%%time
# NumPy views of the data: the provided features only, the provided plus
# engineered features, and the 'gap' target.
X_train_old, X_train = (df_train[cols].values for cols in (orig_feature_names, feature_names))
Y_train = df_train['gap'].values

Wall time: 6.01 s


In [144]:
# Hold-out split: 70% of the rows are moved to the validation frame and the
# remaining 30% stay in df_train. A dedicated seeded generator makes the split
# reproducible across kernel restarts.
# NOTE: this cell overwrites df_train, so running it twice shrinks the
# training set again.
split_rng = np.random.RandomState(0)
n_rows = len(df_train)
validation_indices = split_rng.choice(n_rows, size=int(0.7 * n_rows), replace=False)
# Boolean row selector: True marks validation rows.
mask = np.zeros(n_rows, dtype=bool)
mask[validation_indices] = True
df_valid = df_train[mask]
df_train = df_train[~mask]

In [149]:
%%time
# Rebuild the training arrays from the rows that remain in df_train.
X_train_old, X_train = (df_train[cols].values for cols in (orig_feature_names, feature_names))
Y_train = df_train['gap'].values

Wall time: 307 ms


In [150]:
%%time
# Validation arrays, built from df_valid with the same column lists as the
# training arrays.
X_valid_old = df_valid.loc[:, orig_feature_names].values
X_valid = df_valid.loc[:, feature_names].values
Y_valid = df_valid.loc[:, 'gap'].values

Wall time: 144 ms


In [153]:
%%time
# Ordinary least squares on the engineered feature matrix, scored on the
# validation rows.
LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred_valid = LR.predict(X_valid)
# mean_squared_error returns the mean squared error (no square root), so the
# result is named `mse`; use np.sqrt(mse) if the RMSE is wanted.
mse = mean_squared_error(Y_valid, LR_pred_valid)
print(mse)

0.0464585375732
Wall time: 7.14 s


In [164]:
%%time
# Random forest with default hyper-parameters, scored on the validation rows.
# The seed is fixed so that the reported score can be reproduced.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train, Y_train)
RF_pred_valid = RF.predict(X_valid)
# mean_squared_error returns the mean squared error (no square root), so the
# result is named `mse`; use np.sqrt(mse) if the RMSE is wanted.
mse = mean_squared_error(Y_valid, RF_pred_valid)
print(mse)

0.0268039374252
Wall time: 14.9 s


In [162]:
%%time
# Ridge regression; RidgeCV chooses the regularisation strength from the
# listed candidate alphas. Scored on the validation rows.
RR = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000])
RR.fit(X_train, Y_train)
RR_pred_valid = RR.predict(X_valid)
# mean_squared_error returns the mean squared error (no square root), so the
# result is named `mse`; use np.sqrt(mse) if the RMSE is wanted.
mse = mean_squared_error(Y_valid, RR_pred_valid)
print(mse)

0.0464545132387
Wall time: 4.42 s


In [26]:
# Parse the SMILES string of the first row into an example molecule.
first_smiles = df_train['smiles'].iloc[0]
mol = rdkit.Chem.MolFromSmiles(first_smiles)

In [36]:
# The import cell has `import rdkit.Chem.Draw` commented out, so the submodule
# is imported here where it is used. It needs PIL to be installed (see the
# ImportError recorded below); importing it locally keeps that optional
# dependency from breaking the rest of the notebook.
import rdkit.Chem.Draw
rdkit.Chem.Draw.ShowMol(mol)

ImportError: No module named PIL

In [81]:
# Atomic number of each atom in the example molecule.
print([a.GetAtomicNum() for a in mol.GetAtoms()])
# Number of bonds whose type is SINGLE (shown as the cell's result).
single_bond = rdkit.Chem.rdchem.BondType.SINGLE
sum(1 for bond in mol.GetBonds() if bond.GetBondType() == single_bond)

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 16, 7, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6]


1

In [3]:
"""
Read in train and test as Pandas DataFrames
"""
# Both CSV files are read from the working directory, train first, then test.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [7]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [20]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [6]:
# Keep the regression target as an array before its column is removed.
Y_train = df_train['gap'].values
# Number of training rows; used later as the position where the test rows
# start in the stacked matrix.
test_idx = len(df_train)
# Remove the 'Id' column from the test frame and the 'gap' column from the
# training frame.
df_test = df_test.drop('Id', axis=1)
df_train = df_train.drop('gap', axis=1)

In [50]:
# Stack the training rows on top of the test rows so that feature engineering
# can be applied to both in one pass.
df_all = pd.concat([df_train, df_test])
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [45]:
"""
Example Feature Engineering

This calculates the length of each SMILES string and adds a feature column with those lengths.
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


In [51]:
# Remove the 'smiles' column, then split the stacked matrix back into its
# training part (first test_idx rows) and test part (the rest).
df_all = df_all.drop('smiles', axis=1)
vals = df_all.values
X_train, X_test = vals[:test_idx], vals[test_idx:]
# Report the shapes of the resulting arrays.
for label, arr in (("Train features:", X_train), ("Train gap:", Y_train), ("Test features:", X_test)):
    print("%s %s" % (label, arr.shape))

Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)


In [38]:
# Fit ordinary least squares on the training matrix (fit returns the
# estimator itself) and predict the test rows.
LR = LinearRegression().fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

In [47]:
# Random forest with default hyper-parameters, fitted on the training matrix.
# The seed is fixed so that the submitted predictions can be regenerated
# exactly.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

In [40]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV file.

    The first line is the header ``Id,Prediction``. Each following line holds
    a 1-based running number and ``str()`` of the corresponding prediction.
    An existing file at `filename` is overwritten.

    Parameters
    ----------
    filename : path of the file to create.
    predictions : iterable of values, written in iteration order.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            "%d,%s\n" % (row_id, value)
            for row_id, value in enumerate(predictions, 1)
        )

In [48]:
# One submission file per model: linear regression first, then random forest.
for out_name, model_pred in (("sample1.csv", LR_pred), ("sample2.csv", RF_pred)):
    write_to_file(out_name, model_pred)