In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.tree import _tree
from operator import itemgetter
from sklearn.ensemble import RandomForestRegressor 
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor 
# from sklearn.ensemble import GradientBoostingClassifier
import pickle

from google.cloud import bigquery
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_file("../../msds434-whoop-app-44384939c1f4.json")

In [12]:
project_id = 'msds434-whoop-app'
client = bigquery.Client(credentials=credentials, project=project_id)
query = client.query("select * from `msds434-whoop-app.whoopdataset.whoopmerge`")
df = query.to_dataframe()

In [207]:
# flags for key missing data
df['na_acr'] = df['acute_chronic_strain'].isna() + 0
df['na_workout'] = df['y_workout_strain'].isna() + 0

# imputation
workout_cols = ['acute_chronic_strain', 'workout_strain', 'workout_average_heart_rate','workout_max_heart_rate','workout_kilojoule','zone_one','zone_two','zone_three','zone_four','zone_five',
					 'y_workout_strain', 'y_workout_average_heart_rate','y_workout_max_heart_rate','y_workout_kilojoule','y_zone_one','y_zone_two','y_zone_three','y_zone_four','y_zone_five']

for i in df.columns:
	if df[i].dtype == 'int' or df[i].dtype == 'float':
		if i in workout_cols:
			df[i] = df[i].fillna(df[i].min())
		elif i.startswith("y_"):
			df[i] = df[i].fillna(df.groupby('day_of_week')[i].transform('median'))
		elif i.startswith("w_"):
			df[i] = df[i].fillna(df.groupby('day_of_week')[i].transform('median'))
		else:
			pass

In [208]:
same_day_sleep_cols = ['sleep_start_time','light_sleep_time', 'slow_wave_sleep_time','rem_sleep_time','sleep_cycle_count','disturbance_count','respiratory_rate']
yesterday_sleep_cols = ['y_total_sleep_time', 'y_light_sleep_time', 'y_slow_wave_sleep_time','y_rem_sleep_time','y_sleep_cycle_count','y_disturbance_count','y_respiratory_rate', 'y_sleep_performance_perc', 'y_sleep_consistency_perc','y_sleep_efficiency_perc']
yesterday_strain_cols = ['y_kilojoule','y_strain', 'y_avg_heart_rate','y_max_heart_rate']
yesterday_workout_cols = ['y_workout_start_time','y_workout_max_heart_rate', 'y_workout_max_heart_rate','y_workout_kilojoule','y_zone_one','y_zone_two','y_zone_thee','y_zone_four','y_zone_five']
weekly_avgs = ['acute_chronic_strain', 'w_strain','w_sleep_start_time_sd','w_slow_wave_sleep_time','w_light_sleep_time','w_rem_sleep_time','w_recovery_score','w_hrv_milli','w_resting_heart_rate']
df['recovery_score_bin'] = pd.cut(df['recovery_score'], bins=[-float('inf'), 33, 67, float('inf')], labels=['red', 'yellow', 'green'])

In [166]:
# HELPER FUNCTIONS
def getTreeVars( TREE, varNames ) :
   tree_ = TREE.tree_
   varName = [ varNames[i] if i != _tree.TREE_UNDEFINED else "undefined!" for i in tree_.feature ]

   nameSet = set()
   for i in tree_.feature :
       if i != _tree.TREE_UNDEFINED :
           nameSet.add( i )
   nameList = list( nameSet )
   parameter_list = list()
   for i in nameList :
       parameter_list.append( varNames[i] )
   return parameter_list

def getEnsembleTreeVars( ENSTREE, varNames ) :
   importance = ENSTREE.feature_importances_
   index = np.argsort(importance)
   theList = []
   for i in index :
       imp_val = importance[i]
       if imp_val > 0.01 :
           v = int( imp_val / np.max( ENSTREE.feature_importances_ ) * 100 )
           theList.append( ( varNames[i], v ) )
   theList = sorted(theList,key=itemgetter(1),reverse=True)
   return theList

## Model 1: predict next day's recovery

In [209]:
other_cols = ['recovery_score', 'week_of_year','day_of_week','na_acr','na_workout', 'sleep_start_time']
feats = list(set(other_cols + same_day_sleep_cols + yesterday_sleep_cols + yesterday_strain_cols + yesterday_workout_cols + weekly_avgs))
df_recovery_model = df[feats]

X = df_recovery_model.drop('recovery_score',axis = 1)
Y = df_recovery_model['recovery_score']

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size=0.8, test_size=0.2, random_state=2)

In [210]:
m01_recovery_tree = tree.DecisionTreeRegressor(max_depth=4)
m01_recovery_tree = m01_recovery_tree.fit(X_train, Y_train)

Y_pred_train = m01_recovery_tree.predict(X_train)
Y_pred_test = m01_recovery_tree.predict(X_test)

rmse_train = math.sqrt(metrics.mean_squared_error(Y_train, Y_pred_train))
rmse_test = math.sqrt(metrics.mean_squared_error(Y_test, Y_pred_test))

print(rmse_train)
print(rmse_test)
getTreeVars(m01_recovery_tree, X_train.columns.values)

15.248561916368933
18.458252859195152


['y_rem_sleep_time',
 'light_sleep_time',
 'w_hrv_milli',
 'rem_sleep_time',
 'y_disturbance_count',
 'w_resting_heart_rate',
 'y_sleep_efficiency_perc',
 'respiratory_rate',
 'y_sleep_consistency_perc',
 'w_light_sleep_time',
 'y_total_sleep_time',
 'sleep_start_time',
 'y_avg_heart_rate']

In [252]:
m01_recovery_gb = GradientBoostingRegressor(random_state=5,n_iter_no_change = 5, n_estimators=1000, learning_rate = 0.05,max_depth = 4, min_samples_leaf=5)
m01_recovery_gb = m01_recovery_gb.fit(X_train, Y_train)

Y_pred_train = m01_recovery_gb.predict(X_train)
Y_pred_test = m01_recovery_gb.predict(X_test)

rmse_train = math.sqrt(metrics.mean_squared_error(Y_train, Y_pred_train))
rmse_test = math.sqrt(metrics.mean_squared_error(Y_test, Y_pred_test))

print(rmse_train)
print(rmse_test)
getEnsembleTreeVars(m01_recovery_gb, X_train.columns.values)

14.135905169944515
17.380436605037946


[('w_resting_heart_rate', 100),
 ('respiratory_rate', 46),
 ('rem_sleep_time', 10),
 ('w_hrv_milli', 8),
 ('light_sleep_time', 8),
 ('y_strain', 7),
 ('w_light_sleep_time', 5),
 ('y_sleep_performance_perc', 5),
 ('w_recovery_score', 5),
 ('acute_chronic_strain', 4),
 ('y_disturbance_count', 4),
 ('y_respiratory_rate', 4),
 ('disturbance_count', 3),
 ('day_of_week', 3),
 ('y_avg_heart_rate', 3),
 ('sleep_cycle_count', 3),
 ('slow_wave_sleep_time', 3),
 ('y_sleep_cycle_count', 3),
 ('y_total_sleep_time', 2)]

## Model 2: workout performance

In [272]:
other_cols = ['workout_strain','recovery_score','hrv_milli', 'day_of_week', 'na_workout']
feats =  list(set(other_cols + same_day_sleep_cols + yesterday_sleep_cols + yesterday_strain_cols + yesterday_workout_cols + weekly_avgs))
df_workout_model = df[feats]
df_workout_model.loc[df_workout_model['na_workout'] == 0]

X = df_workout_model.drop(columns=['workout_strain','na_workout'],axis = 1)
Y = df_workout_model['workout_strain']

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size=0.8, test_size=0.2, random_state=2)

In [273]:
m01_workout_tree = tree.DecisionTreeRegressor(max_depth=2)
m01_workout_tree = m01_workout_tree.fit(X_train, Y_train)

Y_pred_train = m01_workout_tree.predict(X_train)
Y_pred_test = m01_workout_tree.predict(X_test)

rmse_train = math.sqrt(metrics.mean_squared_error(Y_train, Y_pred_train))
rmse_test = math.sqrt(metrics.mean_squared_error(Y_test, Y_pred_test))

print(rmse_train)
print(rmse_test)
getTreeVars(m01_workout_tree, X_train.columns.values)

3.431130521575869
3.3180467481756875


['w_hrv_milli', 'y_workout_kilojoule', 'y_strain']

In [274]:
m01_workout_gb = GradientBoostingRegressor(random_state=5,n_iter_no_change = 10, n_estimators=1000, learning_rate = 0.05,max_depth = 3, min_samples_leaf=3)
m01_workout_gb = m01_workout_gb.fit(X_train, Y_train)

Y_pred_train = m01_workout_gb.predict(X_train)
Y_pred_test = m01_workout_gb.predict(X_test)

rmse_train = math.sqrt(metrics.mean_squared_error(Y_train, Y_pred_train))
rmse_test = math.sqrt(metrics.mean_squared_error(Y_test, Y_pred_test))

print(rmse_train)
print(rmse_test)
getEnsembleTreeVars(m01_workout_gb, X_train.columns.values)

3.139026154470241
3.2712781100509396


[('y_strain', 100),
 ('y_workout_max_heart_rate', 28),
 ('w_hrv_milli', 20),
 ('y_zone_four', 17),
 ('y_workout_start_time', 16),
 ('y_workout_kilojoule', 12),
 ('y_max_heart_rate', 10),
 ('y_zone_thee', 6),
 ('hrv_milli', 5),
 ('sleep_start_time', 5),
 ('y_disturbance_count', 3),
 ('y_sleep_efficiency_perc', 2)]

## Model 3: HRV

In [277]:
other_cols = ['hrv_milli', 'week_of_year','day_of_week','na_acr','na_workout', 'sleep_start_time']
feats = list(set(other_cols + yesterday_sleep_cols + yesterday_strain_cols + yesterday_workout_cols + weekly_avgs))
df_hrv_model = df[feats]

X = df_hrv_model.drop('hrv_milli',axis = 1)
Y = df_hrv_model['hrv_milli']

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size=0.8, test_size=0.2, random_state=2)

In [278]:
m01_hrv_tree = tree.DecisionTreeRegressor(max_depth=2)
m01_hrv_tree = m01_hrv_tree.fit(X_train, Y_train)

Y_pred_train = m01_hrv_tree.predict(X_train)
Y_pred_test = m01_hrv_tree.predict(X_test)

rmse_train = math.sqrt(metrics.mean_squared_error(Y_train, Y_pred_train))
rmse_test = math.sqrt(metrics.mean_squared_error(Y_test, Y_pred_test))

print(rmse_train)
print(rmse_test)
getTreeVars(m01_hrv_tree, X_train.columns.values)

15.991946895991859
17.560529403666


['w_hrv_milli', 'w_resting_heart_rate']

In [284]:
m01_hrv_gb = GradientBoostingRegressor(random_state=5,n_iter_no_change = 10, n_estimators=1000, learning_rate = 0.05,max_depth = 2, min_samples_leaf=3)
m01_hrv_gb = m01_hrv_gb.fit(X_train, Y_train)

Y_pred_train = m01_hrv_gb.predict(X_train)
Y_pred_test = m01_hrv_gb.predict(X_test)

rmse_train = math.sqrt(metrics.mean_squared_error(Y_train, Y_pred_train))
rmse_test = math.sqrt(metrics.mean_squared_error(Y_test, Y_pred_test))

print(rmse_train)
print(rmse_test)
getEnsembleTreeVars(m01_hrv_gb, X_train.columns.values)

14.14521275480564
16.728784126352515


[('w_hrv_milli', 100),
 ('w_resting_heart_rate', 13),
 ('y_max_heart_rate', 4),
 ('y_avg_heart_rate', 2),
 ('y_respiratory_rate', 2),
 ('w_sleep_start_time_sd', 2),
 ('w_slow_wave_sleep_time', 1)]

In [287]:
import os

os.getcwd()

'c:\\Users\\BrettMele\\Documents\\NU-MSDS\\MSDS434\\MSDS-434-Final\\models'

In [289]:
pickle.dump(m01_hrv_gb, open("hrv_model.pkl", "wb"))
pickle.dump(m01_recovery_gb, open("recovery_model.pkl", "wb"))
pickle.dump(m01_workout_gb, open("strain_model.pkl", "wb"))