In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
import math

In [6]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [7]:
#Only taking 100.000 rows
#df,_ = train_test_split(df_train, train_size=0.1)
print df.shape
df.head()

(100000, 258)


Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
474,c1ccc(o1)-c1sc(-c2sc(-c3scc4[se]ccc34)c3cc[SiH...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.78
86166,c1sc(-c2Cc(cc2)-c2sc(-c3scc4cc[se]c34)c3sccc23...,1,0,0,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1.4
305831,C1=Cc2ncc3c4sc(cc4cnc3c2C1)-c1cccnc1,1,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,2.8
698358,c1sc(-c2sc(-c3ccc(cn3)-c3ncncn3)c3nccnc23)c2sc...,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1.44
359588,[nH]1c2ccsc2c2c3nsnc3c3C=C([SiH2]c3c12)c1scc2C...,1,0,0,0,1,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1.7


In [8]:
#Dropping Output Variable "GAP"
X = df.drop(['gap'], axis=1)
X.shape

(100000, 257)

In [9]:
#Storing "GAP" Variable into y
y = df.gap.values
y.shape

(100000,)

# RDKIT - Fingerprints

In [2]:
#Some Helpful Links:
# From: http://www.rdkit.org/docs/Cookbook.html
# From https://github.com/rdkit/benchmarking_platform/blob/master/scoring/fingerprint_lib.py

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
import numpy

In [11]:
mols = []
for smile in df.smiles:
    mols.append(Chem.MolFromSmiles(smile))

In [3]:
#Function:    createFingerprint
#Parameter:   Fingerprint-Method as String and molsdf as Array
#Return:      Data frame the fingerprint as feature columns
def createFingerprint (method, mols):
    if method == "Morgan":
        # generate fingeprints: Morgan fingerprint with radius 2
        fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]

        # convert the RDKit explicit vectors into numpy arrays
        np_fps = []
        for fp in fps:
          arr = numpy.zeros((1,))
          DataStructs.ConvertToNumpyArray(fp, arr)
          np_fps.append(arr)

        #Create new DF with Morgan Fingerprints
        fingerprintsdf = pd.DataFrame(np_fps)
        return fingerprintsdf
    
    elif method == "GenMACCSKeys":
        # generate fingeprints: GenMACCSKeys
        fps = [MACCSkeys.GenMACCSKeys(m) for m in mols]

        # convert the RDKit explicit vectors into numpy arrays
        np_fps = []
        for fp in fps:
          arr = numpy.zeros((1,))
          DataStructs.ConvertToNumpyArray(fp, arr)
          np_fps.append(arr)

        #Create new DF with GenMACCSKeys
        fingerprintsdf = pd.DataFrame(np_fps)
        return fingerprintsdf

    elif method == "rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect":
        # generate fingeprints: rdMolDescriptors.GetTopologicalTorsionFingerprintAsIntVect
        fps = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=1024) for m in mols]

        # convert the RDKit explicit vectors into numpy arrays
        np_fps = []
        for fp in fps:
          arr = numpy.zeros((1,))
          DataStructs.ConvertToNumpyArray(fp, arr)
          np_fps.append(arr)

        #Create new DF with rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect
        fingerprintsdf = pd.DataFrame(np_fps)
        return fingerprintsdf
    
    elif method == "fpAvalon.GetAvalonFP":
        # generate fingeprints: fpAvalon.GetAvalonFP
        fps = [fpAvalon.GetAvalonFP(m, nBits=1024) for m in mols]

        # convert the RDKit explicit vectors into numpy arrays
        np_fps = []
        for fp in fps:
          arr = numpy.zeros((1,))
          DataStructs.ConvertToNumpyArray(fp, arr)
          np_fps.append(arr)

        #Create new DF with fpAvalon.GetAvalonFP
        fingerprintsdf = pd.DataFrame(np_fps)
        return fingerprintsdf
    
    elif method == "Chem.RDKFingerprint":
        # generate fingeprints: Chem.RDKFingerprint
        fps = [Chem.RDKFingerprint(m, maxPath=7, fpSize=1024, nBitsPerHash=2) for m in mols]

        # convert the RDKit explicit vectors into numpy arrays
        np_fps = []
        for fp in fps:
          arr = numpy.zeros((1,))
          DataStructs.ConvertToNumpyArray(fp, arr)
          np_fps.append(arr)

        #Create new DF with Chem.RDKFingerprint
        fingerprintsdf = pd.DataFrame(np_fps)
        return fingerprintsdf

## Morgan Fingerprint 

In [199]:
morganDF = createFingerprint("Morgan", mols)
#morganDF.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [229]:
#New Train/Test Sets
X_train, X_test, y_train, y_test = train_test_split(morganDF, y, test_size=0.2)

### Linear Regression

In [230]:
"""LR = LinearRegression()
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LR_pred)

write_to_file("Data/MorganFingerprint_LinearRegression.csv", LR_pred)"""

### Ridge Regression with CV (Best performing, thereby used for all other fingerprints at first)

In [236]:
#Conduct a Ridge Regression with 5-fold cross validation
from sklearn.linear_model import RidgeCV 
LR_ridge = RidgeCV(alphas=(0.001, 0.01, 0.1, 1.0, 10.0, 100.0), cv=5)
LR_ridge.fit(X_train, y_train)
LR_ridge_pred = LR_ridge.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR_ridge.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LR_ridge_pred)

write_to_file("Data/MorganFingerprint_RidgeRegression_cv.csv", LR_ridge_pred)

### Lasso CV

In [246]:
"""from sklearn.linear_model import LassoCV
LassoRegressionCV = LassoCV(cv=5)
LassoRegressionCV.fit(X_train, y_train)
LassoRegressionCV_pred = LassoRegressionCV.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (LassoRegressionCV.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LassoRegressionCV_pred)

write_to_file("Data/MorganFingerprint_Lasso_cv.csv", LassoRegressionCV_pred)"""

### ElasticNet CV

In [248]:
"""from sklearn.linear_model import ElasticNetCV
ElasticNetRegression = ElasticNetCV(cv=5)
ElasticNetRegression.fit(X_train, y_train)
ElasticNetRegression_pred = ElasticNetRegression.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (ElasticNetRegression.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, ElasticNetRegression_pred)

write_to_file("Data/MorganFingerprint_elastic_cv.csv", ElasticNetRegression_pred)"""



### BayesianRidge

In [254]:
"""from sklearn.linear_model import BayesianRidge
BayesianRidgeRegression = BayesianRidge()
BayesianRidgeRegression.fit(X_train, y_train)
BayesianRidgeRegression_pred = BayesianRidgeRegression.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (ElasticNetRegression.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, ElasticNetRegression_pred)

write_to_file("Data/MorganFingerprint_BayesianRidge.csv", ElasticNetRegression_pred)"""

## GenMACCSKeys

In [20]:
GenMACCSKeysDF = createFingerprint("GenMACCSKeys", mols)

#New Train/Test Sets
X_train, X_test, y_train, y_test = train_test_split(GenMACCSKeysDF, y, test_size=0.2)

#Conduct a Ridge Regression with 5-fold cross validation
from sklearn.linear_model import RidgeCV 
LR_ridge = RidgeCV(alphas=(0.001, 0.01, 0.1, 1.0, 10.0, 100.0), cv=5)
LR_ridge.fit(X_train, y_train)
LR_ridge_pred = LR_ridge.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR_ridge.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LR_ridge_pred)

write_to_file("Data/GenMACCSKeys_RidgeRegression_cv.csv", LR_ridge_pred)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,1,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,1,0


## GetHashedTopologicalTorsionFingerprintAsBitVect

In [29]:
GetHashedTopologicalTorsionFingerprintAsBitVectDF = createFingerprint("rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect", mols)

#New Train/Test Sets
X_train, X_test, y_train, y_test = train_test_split(GetHashedTopologicalTorsionFingerprintAsBitVectDF, y, test_size=0.2)

#Conduct a Ridge Regression with 5-fold cross validation
from sklearn.linear_model import RidgeCV 
LR_ridge = RidgeCV(alphas=(0.001, 0.01, 0.1, 1.0, 10.0, 100.0), cv=5)
LR_ridge.fit(X_train, y_train)
LR_ridge_pred = LR_ridge.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR_ridge.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LR_ridge_pred)

write_to_file("Data/GGetHashedTopologicalTorsionFingerprintAsBitVect_RidgeRegression_cv.csv", LR_ridge_pred)

Explained variance score: 1 is perfect prediction: 0.80
RMSE: 0.03


## fpAvalon.GetAvalonFP

In [30]:
GetAvalonFPDF = createFingerprint("fpAvalon.GetAvalonFP", mols)

#New Train/Test Sets
X_train, X_test, y_train, y_test = train_test_split(GetAvalonFPDF, y, test_size=0.2)

#Conduct a Ridge Regression with 5-fold cross validation
from sklearn.linear_model import RidgeCV 
LR_ridge = RidgeCV(alphas=(0.001, 0.01, 0.1, 1.0, 10.0, 100.0), cv=5)
LR_ridge.fit(X_train, y_train)
LR_ridge_pred = LR_ridge.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR_ridge.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LR_ridge_pred)

write_to_file("Data/GetAvalonFP_RidgeRegression_cv.csv", LR_ridge_pred)

Explained variance score: 1 is perfect prediction: 0.89
RMSE: 0.02


## Chem.RDKFingerprint

In [31]:
RDKFingerprintDF = createFingerprint("Chem.RDKFingerprint", mols)

#New Train/Test Sets
X_train, X_test, y_train, y_test = train_test_split(RDKFingerprintDF, y, test_size=0.2)

#Conduct a Ridge Regression with 5-fold cross validation
from sklearn.linear_model import RidgeCV 
LR_ridge = RidgeCV(alphas=(0.001, 0.01, 0.1, 1.0, 10.0, 100.0), cv=5)
LR_ridge.fit(X_train, y_train)
LR_ridge_pred = LR_ridge.predict(X_test)

print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR_ridge.score(X_train, y_train))
print("RMSE: %0.2f") % mean_squared_error(y_test, LR_ridge_pred)

write_to_file("Data/RDKFingerprint_RidgeRegression_cv.csv", LR_ridge_pred)

Explained variance score: 1 is perfect prediction: 0.80
RMSE: 0.03


# Testing for Submission
## Ridge Regression with best performing Fingerprint approach

In [None]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

mols = []
for smile in df_test.smiles:
    mols.append(Chem.MolFromSmiles(smile))

X_test = createFingerprint("Morgan", mols)
    
mols = []
for smile in df_train.smiles:
    mols.append(Chem.MolFromSmiles(smile))

X_train = createFingerprint("Morgan", mols)
y_train = df_train.gap.values



#Conduct a Ridge Regression with 5-fold cross validation
from sklearn.linear_model import RidgeCV 
LR_ridge = RidgeCV(alphas=(0.001, 0.01, 0.1, 1.0, 10.0, 100.0), cv=5)
LR_ridge.fit(X_train, y_train)
LR_ridge_pred = LR_ridge.predict(X_test)

#print("Explained variance score: 1 is perfect prediction: %0.2f") % (LR_ridge.score(X_train, y_train))
#print("RMSE: %0.2f") % mean_squared_error(y_test, LR_ridge_pred)

write_to_file("SubmissionSamples/MorganFingerprint_RidgeRegression_cv.csv", LR_ridge_pred)

# Misc functions

In [4]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")