In [1]:
import pandas as pd
import numpy as np
from functools import reduce
import sidetable
import matplotlib.pyplot as plt
import seaborn as sns

from numpy import mean
from numpy import std
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV

import warnings 

# Install a global "ignore" filter: every warning raised through the `warnings`
# module after this point is suppressed, including any raised by the libraries
# imported above.
warnings.filterwarnings('ignore')

In [2]:
# Load the merged training file into a DataFrame.
df_final = pd.read_csv(filepath_or_buffer="heart-rate-final.csv")

In [31]:
# Prepare the training data: separate identifiers, target and features.
# `df_final` itself is left untouched.

# Row identifiers; kept out of the feature matrix.
uuid = df_final['uuid'].copy()

# Heart rate is the target variable.
y_train = df_final['HR'].copy()

# Features are every remaining column once the identifier, the target and the
# datasetId column have been removed.
X_train = df_final.drop(columns=['uuid', 'HR', 'datasetId'])

# Both names refer to the same feature frame.
df_train = X_train

In [38]:
# Preparing the test dataset.
# Read the merged test file and discard its first column by position
# (presumably a saved index column -- TODO confirm against the CSV).
df_final_test = pd.read_csv("heart-rate-final-test.csv").pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)

# Row identifiers, needed later to label the predictions.
test_uuid = df_final_test['uuid'].copy()

# Features are every remaining column once the identifier and the datasetId
# column have been removed.
X_test = df_final_test.drop(columns=['uuid', 'datasetId'])

# Both names refer to the same feature frame.
df_test = X_test

------

In [36]:
#Begin the pipelining!
# Columns that are one-hot encoded.
categorical_columns = ['condition']

# Every numeric feature goes through the numerical pipeline.
# select_dtypes only inspects dtypes, whereas describe() computed full summary
# statistics just to obtain the column labels. The categorical columns are
# removed explicitly so that a numerically coded 'condition' can never be
# routed through both pipelines.
numerical_columns = (
    X_train.select_dtypes(include='number')
    .columns
    .drop(categorical_columns, errors='ignore')
)

# Numerical features: standardise, then apply a power transform.
pipeline_numerical = Pipeline([('scaler', StandardScaler()), ('power', PowerTransformer())])

# handle_unknown='ignore' encodes a category that was not present at fit time
# as an all-zero row instead of raising, so prediction (and cross-validation
# folds that miss a category) cannot crash on an unseen 'condition' value.
pipeline_categorical = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its pipeline; any column in neither group is
# passed through unchanged.
t = [('cat', pipeline_categorical, categorical_columns), ('num', pipeline_numerical, numerical_columns)]
ct = ColumnTransformer(transformers=t, remainder='passthrough')


In [37]:
%%time
#Fit the model!
model = Pipeline([('ct', ct),('lr', ExtraTreesRegressor())])

model.fit(X_train, y_train)

NameError: name 'X_test' is not defined

In [39]:
# Predict on the test set (used for the submission) and on the training set
# (used for the in-sample check below).
y_test_predict = model.predict(X_test)
y_train_predict = model.predict(X_train)

# NOTE: this is the error on the very rows the model was fitted on, so it is
# an optimistic figure and not an estimate of the error on unseen data. The
# label says so explicitly to avoid it being read as model quality.
train_mae = mean_absolute_error(y_train, y_train_predict)
print('Training MAE: ', '%0.8f' % train_mae)

Training Score:  0.00000000


In [40]:
#Generating the predictions and preparing the submission file!
# Rounding to the 6th digit since the HR was to the 6th digit in the original data.
y_test_predict = y_test_predict.round(6)

print(y_test_predict)

# Pair identifiers and predictions by position. pd.concat(axis=1) aligns on
# index labels instead, which would misalign rows or introduce NaNs as soon as
# test_uuid carries anything other than a default 0..n-1 index. Building the
# frame from raw arrays also raises immediately if the two lengths differ.
predicted_frame = pd.DataFrame({
    'uuid': test_uuid.to_numpy(),
    'HR': y_test_predict,
})

predicted_frame.to_csv('submission.csv', index=False)

[64.709432 73.777723 69.19798  ... 60.942905 77.409522 76.607111]


In [42]:
# Score of the fitted pipeline on the training data (displayed as the cell output).
model.score(X=X_train, y=y_train)

1.0

In [None]:
#Hyper parameter tuning
# Candidate forest sizes; the 'lr__' prefix addresses the regressor step of `model`.
param_grid = {
    'lr__n_estimators': [100, 200, 300]
}

# 2-fold cross-validated search scored by negated MAE (higher is better);
# refit=True refits the best configuration on the full training data.
grid = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=2, refit=True)
grid_result = grid.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Per-candidate mean and spread of the cross-validated score.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

# The loop variables deliberately avoid the name `mean`: it is the numpy
# function imported at the top of the notebook, and a loop variable of that
# name would overwrite it in the notebook namespace.
for mean_score, std_score, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, std_score, param))

In [None]:
# Predict on the test features with the fitted grid search and round to
# 6 decimals in one step.
hr_predict = grid.predict(X_test).round(6)

# Display the rounded predictions as the cell output.
hr_predict