In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv("../Data/Biometric Data Analysis/train.csv")

In [3]:
data_dst = data.filter(regex='_dst$', axis=1).replace(0, np.NaN)
data_dst = data_dst.interpolate(methods='linear', axis=1)
data_dst.fillna(-999, inplace=True)
data.update(data_dst)

In [4]:
data.rho.unique()
data.drop("id", axis=1, inplace=True)

In [5]:
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [6]:
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, :-4], data.iloc[:, -4:], test_size=0.2, shuffle=123)

In [7]:
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [8]:
y_pred = rf.predict(x_test)

In [10]:
mae = mean_absolute_error(y_pred, y_test)
print("MAE: ", mae)

MAE:  1.8109008374999986


In [11]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 650, 1100, 1550, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 35, 60, 85, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [12]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 146.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 316.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [13]:
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [34]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    mae = mean_absolute_error(predictions, test_labels)
    print('Model Performance')
    print('MAE:')
    print(mae)
    return mae

In [35]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(x_train, y_train)
base_accuracy = evaluate(base_model, x_test, y_test)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, x_train, y_train)

Model Performance
MAE:
1.895925500000001
Model Performance
MAE:
1.700113022473087


In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
# Create the parameter grid based on the results of random search 
param_grids = {
    'bootstrap': [True],
    'max_depth': [10,40,50],
    'max_features': [2,10,20],
    'min_samples_leaf': [0.001,0.04,1],
    'min_samples_split':[9,10,11],
    'n_estimators': [2000,2500,3000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grids, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 65.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 149.8min
[Parallel(n_jobs=-1)]: Done 729 out of 729 | elapsed: 200.0min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [54]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 2,
 'min_samples_leaf': 0.04,
 'min_samples_split': 9,
 'n_estimators': 2000}

In [80]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, x_train, y_train)

Model Performance
MAE:
1.7672923012002235


In [81]:
test = pd.read_csv("../Data/Biometric Data Analysis/test.csv")

In [82]:
ID = test.id.copy()
test.drop('id', axis=1, inplace=True)

In [83]:
test_dst = test.filter(regex='_dst$', axis=1).replace(0, np.NaN)
test_dst = test_dst.interpolate(methods='linear', axis=1)
test_dst.fillna(-999, inplace=True)
test.update(test_dst)

In [84]:
pred = best_grid.predict(test)

In [85]:
pred[:,0]

array([8.02081192, 8.00709991, 8.02947718, ..., 7.9985671 , 7.96958543,
       7.94574251])

In [86]:
submission = pd.DataFrame({'id':ID, 'hhb':pred[:,0], 'hbo2':pred[:,1], 'ca': pred[:,2], 'na': pred[:,3] })

In [87]:
submission

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.020812,3.987832,9.008492,3.056122
1,10001,8.007100,3.983241,9.092774,3.023701
2,10002,8.029477,4.000765,9.116253,3.026419
3,10003,7.950033,3.997329,9.088201,3.028708
4,10004,7.940633,4.047095,8.965579,3.064143
...,...,...,...,...,...
9995,19995,7.934492,4.034455,9.073284,3.042501
9996,19996,7.984757,4.001487,9.035846,3.006595
9997,19997,7.998567,3.990529,9.003110,3.031248
9998,19998,7.969585,4.008689,9.043946,3.054035


In [88]:

submission.to_csv("submission/best_grid.csv", index=False)

In [39]:
submission.head()

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.835473,4.485795,10.227639,2.759007
1,10001,6.017116,3.917904,8.52475,2.39406
2,10002,9.950055,5.21766,11.239717,3.120166
3,10003,8.282282,4.193402,9.230623,4.488401
4,10004,4.986564,3.290931,7.356716,3.101167


10000