<a href="https://colab.research.google.com/github/pedroalmir/qol-poc-results/blob/main/02_QolMonitor_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# QoL Monitor - Modeling

To model the data with machine learning algorithms, we use the scikit-learn library because of its wide acceptance in the scientific community and the consistency of its results.

## Required Imports

In [None]:
import time

import gspread
import numpy as np
import pandas as pd
from google.auth import default
from google.colab import auth
from google.colab import drive
from oauth2client.client import GoogleCredentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeRegressor

## Google Authentication

In [None]:
# Sign the Colab user in. Presumably this has to run before the credential
# look-ups below so that application-default credentials are available.
auth.authenticate_user()
# PyDrive client authorised with the application-default credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, shadowing the `google.colab.drive` module
# imported at the top of the notebook. The GoogleDrive client is not referenced
# again in the visible cells -- confirm it is needed before renaming/removing it.
drive = GoogleDrive(gauth)
# `creds` is what gets handed to gspread.authorize() when the sheets are loaded.
creds, _ = default()

## Getting Processed Data

In [None]:
def getSheetAsDf(spreadsheetName, sheetName):
    """Load one worksheet of a Google spreadsheet into a DataFrame.

    The first row of the worksheet supplies the column labels and is removed
    from the data. The remaining rows keep their original positional labels,
    so the returned index starts at 1. No dtype conversion is applied: cells
    are kept exactly as ``get_all_values()`` returns them.

    Relies on the module-level gspread client ``gc`` having been created
    before the function is called.

    Parameters
    ----------
    spreadsheetName : str
        Title of the spreadsheet to open.
    sheetName : str
        Name of the worksheet inside that spreadsheet.

    Returns
    -------
    pandas.DataFrame
        Worksheet contents with the first row promoted to the header.
    """
    rows = gc.open(spreadsheetName).worksheet(sheetName).get_all_values()
    table = pd.DataFrame.from_records(rows)
    # Everything after the first row is data; the first row becomes the header.
    body = table.iloc[1:]
    body.columns = table.iloc[0]
    return body

In [None]:
# gspread client used by getSheetAsDf(); it must exist before the loads below.
gc = gspread.authorize(creds)

# Both domain datasets live in the same spreadsheet, one worksheet each.
df_physical, df_psychological = (
    getSheetAsDf('QoL Monitor - Preprocessed Data', worksheet_name)
    for worksheet_name in ('dataset_phy', 'dataset_psy')
)

## Separating predictors from targets

In [None]:
# For each domain, pull out the reference score (the value to be predicted)
# and keep every other column as a predictor.
physical_pred = df_physical["phy_ref_score"].copy()
physical = df_physical.drop(columns=["phy_ref_score"])

psychological_pred = df_psychological["psy_ref_score"].copy()
psychological = df_psychological.drop(columns=["psy_ref_score"])

# Sanity check: predictors and targets of a domain must have the same row count.
print('Shapes', *(part.shape for part in (physical, physical_pred, psychological, psychological_pred)))

Shapes (710, 86) (710,) (710, 86) (710,)


## Training regression models

In [None]:
def rmsle_cv(name, model, n_folds, X, y):
    """Cross-validate a regressor and print per-fold MAE, RMSE and R2 summaries.

    Despite the function name, no logarithmic error is computed: the reported
    metrics are mean absolute error, root mean squared error and R2.

    Parameters
    ----------
    name : str
        Label printed above the scores.
    model : estimator
        scikit-learn compatible regressor to evaluate.
    n_folds : int
        Number of K-fold splits.
    X : pandas.DataFrame
        Predictors; only ``X.values`` is handed to scikit-learn.
    y : array-like
        Target values aligned with the rows of ``X``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    start = time.time()
    # Pass the KFold *object* as ``cv``. Calling ``.get_n_splits()`` on it only
    # yields the integer ``n_folds``, which would silently drop ``shuffle`` and
    # ``random_state`` and fall back to unshuffled folds.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    # A single cross-validation pass scores every metric on the same fitted
    # models instead of refitting each fold once per metric.
    scores = cross_validate(
        model, X.values, y,
        scoring=("neg_mean_squared_error", "neg_mean_absolute_error", "r2"),
        cv=kf,
    )
    # The error scorers are negated (higher is better), so flip the sign back.
    rmse = np.sqrt(-scores["test_neg_mean_squared_error"])
    mae = -scores["test_neg_mean_absolute_error"]
    r2 = scores["test_r2"]
    end = time.time()
    print(name)
    print("\t|_ MAE scores  - Mean: {:.4f}, STD: {:.4f}".format(mae.mean(), mae.std()))
    print("\t|_ RMSE scores - Mean: {:.4f}, STD: {:.4f}".format(rmse.mean(), rmse.std()))
    print("\t|_ R2 scores   - Mean: {:.4f}, STD: {:.4f}".format(r2.mean(), r2.std()))
    print("\t|_ Exec. Time (in seconds):", end - start)

In [None]:
def getDataset(domain):
    """Return the ``(predictors, target)`` pair for a QoL domain.

    Parameters
    ----------
    domain : str
        Either ``'physical'`` or ``'psychological'``.

    Returns
    -------
    tuple
        The module-level predictor frame and target series of that domain.

    Raises
    ------
    KeyError
        If ``domain`` is not one of the two known names.
    """
    datasets = dict(
        physical=(physical, physical_pred),
        psychological=(psychological, psychological_pred),
    )
    return datasets[domain]

In [None]:
def modelingRegressors(domain, n_folds=10):
    """Cross-validate the four baseline regressors on one QoL domain.

    Evaluates, in order, a linear regression, a decision tree, a random forest
    and a gradient-boosting model with ``rmsle_cv``, which prints the scores.

    Parameters
    ----------
    domain : str
        Dataset key understood by ``getDataset`` (``'physical'`` or
        ``'psychological'``).
    n_folds : int, default 10
        Number of cross-validation folds used for every model.

    Returns
    -------
    None
    """
    X, y = getDataset(domain)
    print('Modeling regressors for', domain, 'domain')

    gboost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                       max_depth=4, max_features='sqrt',
                                       min_samples_leaf=15, min_samples_split=10,
                                       loss='huber', random_state=5)

    # The tree-based models are randomised; seeding them makes repeated runs of
    # the notebook report the same scores.
    candidates = [
        ("Linear Regression", LinearRegression()),
        ("Decision Tree Regressor", DecisionTreeRegressor(random_state=42)),
        ("Random Forest Regressor", RandomForestRegressor(random_state=42)),
        ("GBoost Regressor", gboost),
    ]
    for name, model in candidates:
        rmsle_cv(name, model, n_folds, X, y)

In [None]:
# Baseline comparison on the physical-domain data: four regressors, each
# cross-validated by modelingRegressors().
modelingRegressors('physical')

Modeling regressors for physical domain
Linear Regression
	|_ MAE scores  - Mean: 6.5866, STD: 1.7582
	|_ RMSE scores - Mean: 8.8457, STD: 2.9102
	|_ Exec. Time (in seconds): 0.6453731060028076
Decision Tree Regressor
	|_ MAE scores  - Mean: 6.1465, STD: 1.6188
	|_ RMSE scores - Mean: 9.3685, STD: 2.7071
	|_ Exec. Time (in seconds): 0.7077336311340332
Random Forest Regressor
	|_ MAE scores  - Mean: 4.9477, STD: 1.5283
	|_ RMSE scores - Mean: 7.2215, STD: 3.0008
	|_ Exec. Time (in seconds): 26.8660671710968
GBoost Regressor
	|_ MAE scores  - Mean: 4.9569, STD: 1.4472
	|_ RMSE scores - Mean: 6.9191, STD: 2.6899
	|_ Exec. Time (in seconds): 390.08537316322327


In [None]:
# Same baseline comparison, this time on the psychological-domain data.
modelingRegressors('psychological')

Modeling regressors for psychological domain
Linear Regression
	|_ MAE scores  - Mean: 8.1918, STD: 1.9133
	|_ RMSE scores - Mean: 10.6146, STD: 2.4728
	|_ Exec. Time (in seconds): 0.6437489986419678
Decision Tree Regressor
	|_ MAE scores  - Mean: 5.8000, STD: 1.7678
	|_ RMSE scores - Mean: 9.5880, STD: 2.3525
	|_ Exec. Time (in seconds): 0.758786678314209
Random Forest Regressor
	|_ MAE scores  - Mean: 4.6830, STD: 1.2204
	|_ RMSE scores - Mean: 6.8838, STD: 2.2436
	|_ Exec. Time (in seconds): 29.55673909187317
GBoost Regressor
	|_ MAE scores  - Mean: 4.9707, STD: 1.3524
	|_ RMSE scores - Mean: 7.0034, STD: 2.2327
	|_ Exec. Time (in seconds): 394.80068492889404


## Grid Search

In [None]:
def performGridSearch(domain, cv=5):
    """Grid-search random-forest hyperparameters for one QoL domain.

    Parameters
    ----------
    domain : str
        Dataset key understood by ``getDataset`` (``'physical'`` or
        ``'psychological'``).
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object. Scores are negated mean squared errors, so
        an RMSE is obtained with ``np.sqrt(-search.best_score_)``.
    """
    param_grid = [
        # 12 (3×4) combinations with the default bootstrap setting
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
        # plus 6 (2×3) combinations with bootstrap set to False
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]
    X, y = getDataset(domain)
    forest_reg = RandomForestRegressor(random_state=42)
    # (12 + 6) candidates are each trained once per fold: 18 * 5 = 90 rounds of
    # training with the default cv=5.
    grid_search = GridSearchCV(forest_reg, param_grid, cv=cv, scoring='neg_mean_squared_error', return_train_score=True)
    grid_search.fit(X, y)
    return grid_search

In [None]:
# Tune the random forest on the physical domain and report the winning setup.
grid_search_phy = performGridSearch('physical')
# The search is scored with negated MSE, so flip the sign before the square root.
best_rmse_phy = np.sqrt(-grid_search_phy.best_score_)
print("Best parameters for physical domain:", grid_search_phy.best_params_)
print("Best estimator for physical domain: ", grid_search_phy.best_estimator_)
print("Best RMSE score for physical domain:", best_rmse_phy)

Best parameters for physical domain: {'max_features': 6, 'n_estimators': 30}
Best estimator for physical domain:  RandomForestRegressor(max_features=6, n_estimators=30, random_state=42)
Best score for physical domain:      8.068110587411667


In [None]:
# Tune the random forest on the psychological domain and report the winning setup.
grid_search_psy = performGridSearch('psychological')
print("Best parameters for psychological domain:", grid_search_psy.best_params_)
# Show the estimator object itself on this line; best_params_ is already printed above.
print("Best estimator for psychological domain: ", grid_search_psy.best_estimator_)
# The search is scored with negated MSE, so flip the sign before the square root.
print("Best RMSE score for psychological domain:", np.sqrt(-grid_search_psy.best_score_))

Best parameters for psychological domain: {'max_features': 6, 'n_estimators': 30}
Best estimator for psychological domain:  {'max_features': 6, 'n_estimators': 30}
Best RMSE score for psychological domain: 8.240263852378


## Getting feature relevance

In [None]:
feature_importances_phy = grid_search_phy.best_estimator_.feature_importances_
# Label the importances with the columns the model was actually fitted on
# (`physical`, i.e. without the target). `df_physical.columns` still contains
# "phy_ref_score", which shifts every label after it unless it is the last column.
sorted(zip(feature_importances_phy, physical.columns), reverse=True)

[(0.06458776366275285, 'weight'),
 (0.05788934314164953, 'income'),
 (0.05537080661681995, 'height'),
 (0.05229968716362453, 'calories'),
 (0.05080180223044218, 'children'),
 (0.04959345298825555, 'personalization'),
 (0.04611030872585527, 'hr_avg'),
 (0.03962205208487731, 'maritalstatus_married'),
 (0.03759277292145109, 'remsleep'),
 (0.035390219905108565, 'maritalstatus_single'),
 (0.031462101998616133, 'specificage'),
 (0.030148283754507833, 'social'),
 (0.019180289041796673, 'communication'),
 (0.018676509211538996, 'blockedcalls'),
 (0.018510740966005145, 'healthfitness'),
 (0.01610508553971326, 'videoplayerseditors'),
 (0.015543538416792653, 'gender'),
 (0.01392731309792876, 'musicaudio'),
 (0.013925258248051249, 'other'),
 (0.013498992634696587, 'familyarr'),
 (0.012765907280436866, 'awakesleep'),
 (0.012658310778866718, 'hr_min'),
 (0.012623726095129996, 'walking'),
 (0.01256390687760054, 'profession_fulltimeworker'),
 (0.012328131302580687, 'differentlocations'),
 (0.012089983

In [None]:
feature_importances_psy = grid_search_psy.best_estimator_.feature_importances_
# Label the importances with the columns the model was actually fitted on
# (`psychological`, i.e. without the target). `df_psychological.columns` still
# contains "psy_ref_score", which shifts every label after it unless it is the
# last column.
sorted(zip(feature_importances_psy, psychological.columns), reverse=True)

[(0.0964047661279916, 'income'),
 (0.07198043929107388, 'height'),
 (0.04762693034606955, 'hr_avg'),
 (0.041649024706253514, 'personalization'),
 (0.04032325463449599, 'blockedcalls'),
 (0.03785569919272723, 'weight'),
 (0.03355596780636753, 'business'),
 (0.030983182795119187, 'calories'),
 (0.030072519898700502, 'communication'),
 (0.029087057686748987, 'hr_max'),
 (0.02712446936079872, 'social'),
 (0.02575325347820097, 'musicaudio'),
 (0.025676413038303852, 'specificage'),
 (0.02415977700658549, 'videoplayerseditors'),
 (0.022641374276148058, 'hr_min'),
 (0.019949798837992045, 'maritalstatus_married'),
 (0.019729550887396242, 'lifestyle'),
 (0.019108391157890994, 'remsleep'),
 (0.018731425978330402, 'maritalstatus_single'),
 (0.018058154241326032, 'familyarr'),
 (0.017885793391197814, 'awakesleep'),
 (0.01654334994260197, 'edulevel'),
 (0.016423328086180147, 'mood'),
 (0.016059282269957967, 'steps'),
 (0.01599013475554473, 'education'),
 (0.01591867784993645, 'whatsappnotification')