In [209]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
import math

In [190]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [191]:
#Only taking 100.000 rows
df,_ = train_test_split(df_train, train_size=0.1)
print df.shape
df.head()

(100000, 258)


Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
97764,[nH]1ccc2c3cscc3c3c(sc4cc(-c5scc6cc[nH]c56)c5c...,0,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.95
519261,C1=Cc2ccc3c(oc4cc(-c5cccc6ccccc56)c5cscc5c34)c...,1,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,2.37
423690,[nH]1ccc2ncc3cc(oc3c12)-c1scc2[nH]ccc12,0,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,2.41
609263,c1cnc2c(c1)c1=CCC=c1c1c3ncc(cc3cnc21)-c1cccc2c...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,2.11
733641,C1=Cc2cnc3c4cnc(cc4c4nsnc4c3c2[SiH2]1)-c1ccccc1,1,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,2.46


In [192]:
#Dropping Output Variable "GAP"
X = df.drop(['gap'], axis=1)
X.shape

(100000, 257)

In [193]:
#Storing "GAP" Variable into y
y = df.gap.values
y.shape

(100000,)

# RDKIT - Fingerprints

In [194]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
import numpy

In [195]:
# From: http://www.rdkit.org/docs/Cookbook.html
mols = []
for smile in df.smiles:
    mols.append(Chem.MolFromSmiles(smile))
#molsdf = pd.DataFrame(mols, columns=["mols"])

In [198]:
#Function:    createFingerprint
#Parameter:   Fingerprint-Method as String and molsdf as Array
#Return:      Data frame the fingerprint as feature columns
def createFingerprint (method, mols):
    if method == "Morgan":
        # generate fingeprints: Morgan fingerprint with radius 2
        fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]

        # convert the RDKit explicit vectors into numpy arrays
        np_fps = []
        for fp in fps:
          arr = numpy.zeros((1,))
          DataStructs.ConvertToNumpyArray(fp, arr)
          np_fps.append(arr)

        #Create new DF with Morgan Fingerprints
        fingerprintsdf = pd.DataFrame(np_fps)
        return fingerprintsdf
    
    elif method == "other":
        print "another fingerprint"

## Morgan Fingerprint 

In [199]:
morganDF = createFingerprint("Morgan", mols)
morganDF.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [229]:
#New Train/Test Sets
X_train, X_test, y_train, y_test = train_test_split(morganDF, y, test_size=0.2)

### Linear Regression

In [230]:
LR = LinearRegression()
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_test)

In [232]:
print LR_pred.shape
print y_test.shape

(20000,)
(20000,)


In [238]:
print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LR_pred)

Explained variance score: 1 is perfect prediction: 0.90
RMSE: 327049413832493504.00


In [None]:
write_to_file("MorganFingerprint_LinearRegression.csv", LR_pred)

### Ridge Regression with CV

In [236]:
#Conduct a Ridge Regression with 5-fold cross validation
from sklearn.linear_model import RidgeCV 
LR_ridge = RidgeCV(alphas=(0.001, 0.01, 0.1, 1.0, 10.0, 100.0), cv=5)
LR_ridge.fit(X_train, y_train)
LR_ridge_pred = LR_ridge.predict(X_test)

In [240]:
print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR_ridge.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LR_ridge_pred)

Explained variance score: 1 is perfect prediction: 0.90
RMSE: 0.02


In [251]:
write_to_file("MorganFingerprint_RidgeRegression_cv.csv", LR_ridge_pred)

### Lasso CV

In [246]:
from sklearn.linear_model import LassoCV
LassoRegressionCV = LassoCV(cv=5)
LassoRegressionCV.fit(X_train, y_train)
LassoRegressionCV_pred = LassoRegressionCV.predict(X_test)

In [247]:
print("Explained variance score: 1 is perfect prediction: %0.2f") % (LassoRegressionCV.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LassoRegressionCV_pred)

Explained variance score: 1 is perfect prediction: 0.89
RMSE: 0.02


In [252]:
write_to_file("MorganFingerprint_Lasso_cv.csv", LassoRegressionCV_pred)

### ElasticNet CV

In [248]:
from sklearn.linear_model import ElasticNetCV
ElasticNetRegression = ElasticNetCV(cv=5)
ElasticNetRegression.fit(X_train, y_train)
ElasticNetRegression_pred = ElasticNetRegression.predict(X_test)



In [249]:
print("Explained variance score: 1 is perfect prediction: %0.2f") % (ElasticNetRegression.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, ElasticNetRegression_pred)

Explained variance score: 1 is perfect prediction: 0.89
RMSE: 0.02


In [253]:
write_to_file("MorganFingerprint_elastic_cv.csv", ElasticNetRegression_pred)

### BayesianRidge

In [254]:
from sklearn.linear_model import BayesianRidge
BayesianRidgeRegression = BayesianRidge()
BayesianRidgeRegression.fit(X_train, y_train)
BayesianRidgeRegression_pred = BayesianRidgeRegression.predict(X_test)

In [255]:
print("Explained variance score: 1 is perfect prediction: %0.2f") % (ElasticNetRegression.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, ElasticNetRegression_pred)

Explained variance score: 1 is perfect prediction: 0.89
RMSE: 0.02


In [256]:
write_to_file("MorganFingerprint_BayesianRidge.csv", ElasticNetRegression_pred)

# Misc functions

In [257]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")