In [13]:
# imports 
import os
import time
import datetime
import json
import gc

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn import metrics

from itertools import product

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [14]:
# Folder (relative to the working directory) from which train.csv and structures.csv are read.
dataDirectory = 'data'

In [19]:
# Load the coupling table and the per-atom structure table.
# os.path.join builds the paths portably instead of hard-coding a '/' separator.
train = pd.read_csv(os.path.join(dataDirectory, 'train.csv'))
structures = pd.read_csv(os.path.join(dataDirectory, 'structures.csv'))
train.shape

(4658147, 6)

In [20]:
# Keep a 10% random sample of the rows (no replacement, fixed seed) so that
# experiments run quickly; this trades model accuracy for speed.
# NOTE(review): the sample overwrites `train`, so re-running this cell without
# reloading the CSV first samples 10% of the already-reduced frame.
train = train.sample(frac=0.1, replace=False, random_state=2019)
train.shape

(465815, 6)

In [21]:
## Merge with structural features of atoms
def mergeAtomStructure(couplings, atoms, atomIdx):
    """Left-join the structure columns of one atom of each pair onto the coupling table.

    Parameters
    ----------
    couplings : DataFrame with 'molecule_name' and 'atom_index_<atomIdx>' columns.
    atoms : DataFrame with 'molecule_name', 'atom_index', 'atom', 'x', 'y', 'z' columns.
    atomIdx : 0 or 1; selects which atom of the pair is looked up and is used as
        the suffix of the added columns (atom_<atomIdx>, x_<atomIdx>, ...).

    Returns
    -------
    A new DataFrame; neither input is modified.
    """
    suffix = '_' + str(atomIdx)
    merged = pd.merge(couplings, atoms, how='left',
                      left_on=['molecule_name', 'atom_index' + suffix],
                      right_on=['molecule_name', 'atom_index'])
    # The lookup key 'atom_index' only repeats 'atom_index_<atomIdx>'. Dropping it
    # here keeps the second merge from producing atom_index_x / atom_index_y.
    merged = merged.drop('atom_index', axis=1)
    return merged.rename(columns=dict((c, c + suffix) for c in ('atom', 'x', 'y', 'z')))


trainMerged = mergeAtomStructure(train, structures, 0)
trainMerged = mergeAtomStructure(trainMerged, structures, 1)

# A left join must not add rows; extra rows would mean duplicated
# (molecule_name, atom_index) keys in `structures`.
assert len(trainMerged) == len(train), 'structure merge changed the row count'

trainMerged.head(3)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,1164923,dsgdb9nsd_039011,9,8,2JHC,1.27218,H,-0.583182,1.97175,0.803563,C,-0.017013,0.014571,0.025109
1,4441950,dsgdb9nsd_122989,15,16,3JHH,3.53842,H,-1.706262,-0.254513,-1.102151,H,-0.636574,-2.476063,-0.612832
2,4566146,dsgdb9nsd_127768,13,4,3JHC,7.92151,H,0.120346,-2.510468,3.880919,C,0.017184,-2.120399,0.606965


In [22]:
def EuclideanDistance(x,y,z):
    """Return the Euclidean norm sqrt(x^2 + y^2 + z^2) of the given components.

    Only element-wise arithmetic is used, so the arguments may be scalars,
    NumPy arrays or pandas Series; array-like inputs yield an element-wise
    result of the same kind.
    """
    return np.sqrt(x**2 + y**2 + z**2)


def vecDist(x, y, z):
    """Array version of EuclideanDistance that always returns a float ndarray.

    Replaces the former ``np.vectorize(EuclideanDistance)`` wrapper, which
    called the Python function once per element and raised ValueError on
    empty input.  Inputs are converted with ``np.asarray`` and broadcast by
    the arithmetic itself.
    """
    components = [np.asarray(v, dtype=float) for v in (x, y, z)]
    # np.asarray on the result keeps the "ndarray out" contract for scalar input.
    return np.asarray(EuclideanDistance(*components))

In [25]:
## relative position and euclidean distance features
# Absolute per-axis offsets between the two atoms of each pair.
trainMerged['RelPos_x'] = np.abs(trainMerged['x_0'] - trainMerged['x_1'])
trainMerged['RelPos_y'] = np.abs(trainMerged['y_0'] - trainMerged['y_1'])
trainMerged['RelPos_z'] = np.abs(trainMerged['z_0'] - trainMerged['z_1'])
# EuclideanDistance uses only element-wise arithmetic, so it takes whole columns
# at once; an np.vectorize wrapper would call it once per row in Python.
trainMerged['Euc_Dist'] = EuclideanDistance(trainMerged['RelPos_x'],
                                            trainMerged['RelPos_y'],
                                            trainMerged['RelPos_z'])

In [26]:
# Preview the first rows to check the RelPos_* and Euc_Dist columns.
trainMerged.head(3)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,RelPos_x,RelPos_y,RelPos_z,Euc_Dist
0,1164923,dsgdb9nsd_039011,9,8,2JHC,1.27218,H,-0.583182,1.97175,0.803563,C,-0.017013,0.014571,0.025109,0.566169,1.957179,0.778454,2.181075
1,4441950,dsgdb9nsd_122989,15,16,3JHH,3.53842,H,-1.706262,-0.254513,-1.102151,H,-0.636574,-2.476063,-0.612832,1.069689,2.221549,0.489319,2.513752
2,4566146,dsgdb9nsd_127768,13,4,3JHC,7.92151,H,0.120346,-2.510468,3.880919,C,0.017184,-2.120399,0.606965,0.103162,0.390069,3.273954,3.298722


In [27]:
labelencoder = LabelEncoder()
def labelEncodeCategoricalFeatures(DF):
    """Return a copy of DF with every object-dtype column label-encoded.

    The input frame is not modified.  Each object column is fitted and
    transformed on its own with the shared module-level ``labelencoder``, so
    after the call that encoder holds the fit of the last column processed.
    """
    encoded = DF.copy()
    objectCols = [name for name in encoded.columns
                  if encoded[name].dtype.name == 'object']
    for name in objectCols:
        encoded[name] = labelencoder.fit_transform(encoded[name])
    return encoded

In [28]:
# Model inputs: coupling type, per-axis offsets, distance and the two atom symbols.
# 'type' must stay the first column: the baseline/model functions read it from column 0.
X = trainMerged[['type','RelPos_x','RelPos_y','RelPos_z','Euc_Dist', 'atom_0','atom_1']]
# Regression target.
Y = trainMerged['scalar_coupling_constant']

In [29]:
# Label-encode the object (string) columns and keep the result as a plain array.
# NOTE(review): X is rebound from a DataFrame to an array here; re-running this
# cell on its own would hand that array to a function that iterates `.columns`.
X = labelEncodeCategoricalFeatures(X).values

In [30]:
# train test split (33% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=2019)
print("Train Test Split:")
# The split outputs already expose .shape; wrapping them in np.array only copied the data.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Train Test Split:
(312096, 7) (153719, 7) (312096,) (153719,)


In [31]:
def competetionMetric(types,y_test, yhat, floor=1e-9):
    """Metric given here: https://www.kaggle.com/c/champs-scalar-coupling/overview/evaluation

    Computes the mean, over the distinct values in ``types``, of
    log(max(MAE of that group, floor)).

    Parameters
    ----------
    types : 1-D array-like giving the group (coupling type) of every row.
    y_test : 1-D array-like of ground-truth values.
    yhat : 1-D array-like of predictions.
    floor : lower bound applied to each group's MAE before taking the log, so
        that a perfectly predicted group gives log(floor) instead of -inf.

    Returns
    -------
    The score as a float (NaN when the inputs are empty).

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    # Work on plain arrays so that the boolean masks are applied by position,
    # whatever index a pandas input carries.
    groups = np.asarray(types).ravel()
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_pred = np.asarray(yhat, dtype=float).ravel()
    if not (groups.shape[0] == y_true.shape[0] == y_pred.shape[0]):
        raise ValueError('types, y_test and yhat must have the same length, got %d, %d and %d'
                         % (groups.shape[0], y_true.shape[0], y_pred.shape[0]))

    absErr = np.abs(y_true - y_pred)
    logMaes = [np.log(np.maximum(absErr[groups == t].mean(), floor))
               for t in np.unique(groups)]
    if not logMaes:
        return float('nan')
    return np.mean(logMaes)

In [32]:
def evaluateModel(types,y_test, yhat):
    """Print R^2, overall MAE and the per-type competition metric.

    ``types`` is only used by the competition metric; the other two scores
    compare ``y_test`` with ``yhat`` directly.  Nothing is returned.
    """
    # Each score is evaluated lazily, right before its line is printed.
    reports = (
        ('Coefficient of determination: ', lambda: r2_score(y_test, yhat)),
        ('MAE: ', lambda: mean_absolute_error(y_test, yhat)),
        ('Competition Metric: ', lambda: competetionMetric(types, y_test, yhat)),
    )
    for label, score in reports:
        print(label, score())

In [33]:
def naiveBaseLine(X_train, X_test, y_train, y_test):
    """Evaluate a constant predictor that outputs the mean of ``y_train``.

    ``X_train`` is not used; column 0 of ``X_test`` supplies the types that
    are passed on to ``evaluateModel``.
    """
    constantPrediction = np.full(len(y_test), np.mean(y_train))
    evaluateModel(X_test[:, 0], y_test, constantPrediction)

In [34]:
def typeAwareBaseLine(X_train, X_test, y_train, y_test):
    """Uses the training mean of the corresponding type as prediction.

    The type of a row is read from column 0 of ``X_train`` / ``X_test``.
    A test row whose type never occurs in the training split is predicted
    with the overall training mean instead of raising KeyError.
    Results are printed by ``evaluateModel``; nothing is returned.
    """
    # Plain array so that targets pair up with X_train rows by position.
    targets = np.asarray(y_train, dtype=float)
    meanDict = pd.Series(targets).groupby(X_train[:, 0]).mean().to_dict()
    overallMean = targets.mean()

    types = X_test[:, 0]
    yhat = np.array([meanDict.get(t, overallMean) for t in types])
    evaluateModel(types, y_test, yhat)

In [35]:
def randomForestModel(X_train, X_test, y_train, y_test):
    """Fit one random forest regressor on the training split and evaluate it.

    The forest uses 100 trees of depth at most 10 and a fixed seed.  The
    scores on the test split are printed by ``evaluateModel``; nothing is
    returned.
    """
    forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=2019)
    predictions = forest.fit(X_train, y_train).predict(X_test)
    evaluateModel(X_test[:, 0], y_test, predictions)

In [36]:
def typeAwareRandomForestModel(X_train, X_test, y_train, y_test):
    """Fit one random forest regressor per type and evaluate the combination.

    Rows are grouped by the value in column 0.  Every group found in
    ``X_train`` gets its own forest (100 trees, depth at most 10, fixed seed),
    and each test row is predicted by the forest of its own type.  The scores
    are printed by ``evaluateModel``; nothing is returned.
    """
    trainTypes = X_train[:, 0]
    models = {}
    for t in set(trainTypes):
        rows = trainTypes == t
        forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=2019)
        models[t] = forest.fit(X_train[rows], y_train[rows])

    types = X_test[:, 0]
    yhat = np.zeros(X_test.shape[0])
    for t in set(types):
        rows = types == t
        # Fill only the positions that belong to this type.
        yhat[rows] = models[t].predict(X_test[rows])

    evaluateModel(types, y_test, yhat)

In [37]:
# Baseline 1: predict the overall training mean for every test row.
naiveBaseLine(X_train, X_test, y_train, y_test)

Coefficient of determination:  -8.45019949569e-08
MAE:  24.7044899378
Competition Metric:  3.00509813563


In [38]:
# Baseline 2: predict the training mean of each row's coupling type.
typeAwareBaseLine(X_train, X_test, y_train, y_test)

Coefficient of determination:  0.949197128083
MAE:  4.22328689846
Competition Metric:  1.23701125805


In [39]:
# Model 1: a single random forest trained on all types together.
randomForestModel(X_train, X_test, y_train, y_test)

Coefficient of determination:  0.986736212325
MAE:  2.51034462219
Competition Metric:  0.737599583038


In [40]:
# Model 2: one random forest per coupling type.
typeAwareRandomForestModel(X_train, X_test, y_train, y_test)

Coefficient of determination:  0.986891107317
MAE:  2.46392197367
Competition Metric:  0.713454918034
