In [19]:
import pandas as pd
from joblib import load, dump
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Import Support Vector Regressor
from sklearn.svm import SVR

# Import Ridge Regression
from sklearn.linear_model import Ridge

# Bedtimes Model

### This model also predicts for sleep score but the goal here is to find the ideal sleep and wake time to maximize sleep score 

In [20]:
bedtime_model = load("bedtimes_model.joblib")
bedtime_df = load("DataFrame")
bedtime_model_performance = load("bedtimes_model_performance.joblib")
bedtime_best_params = load("bedtimes_model_best_params.joblib")

X_train_bedtime = load("X_train_data.joblib")
X_test_bedtime = load("X_test_data.joblib")
y_train_bedtime = load("y_train_data.joblib")
y_test_bedtime = load("y_test_data.joblib")

In [21]:
bedtime_df

Unnamed: 0,Start,End,Sleep quality,Time in bed,Activity (steps),None,Stressful day,Drank coffee,Drank tea,Ate late,Worked out
0,1377,450,100,512,0,1,0,0,0,0,0
1,1277,1293,3,16,0,0,1,0,0,0,0
2,1362,433,98,510,0,1,0,0,0,0,0
3,1351,363,65,452,0,1,0,0,0,0,0
4,1332,296,72,404,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
882,1314,422,91,548,56,1,0,0,0,0,0
883,1429,420,81,431,64,1,0,0,0,0,0
884,1284,380,71,536,3316,1,0,0,0,0,0
885,1296,410,80,553,6555,1,0,0,0,0,0


### Finding Feature Importance 

In [22]:
X = np.array(bedtime_df[['Start','End','Time in bed','Activity (steps)','Stressful day','Drank coffee','Drank tea','Worked out']])
y = bedtime_df[['Sleep quality']].values

In [23]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [24]:
model =  GradientBoostingRegressor()


scores = cross_val_score(model, X, y, cv=kf, scoring='r2')

print("Cross-validation scores:", scores)
print("Average score:",np.mean(scores))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validation scores: [0.63236525 0.66693158 0.37209605 0.40236447 0.6703828  0.70146142
 0.40583301 0.59810265 0.62215107 0.59049595]
Average score: 0.5662184239480268


In [25]:
kf = KFold(n_splits=10) 
scores = []
feature_importances = np.zeros(X.shape[1])

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = GradientBoostingRegressor()
    model.fit(X_train, y_train)

    scores.append(model.score(X_test, y_test))
    feature_importances += model.feature_importances_

# Averaging feature importances over all folds
feature_importances /= kf.get_n_splits()

print("Cross-validation scores:", scores)
print("Average score:", np.mean(scores))
print("Average feature importances:", feature_importances)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validation scores: [0.581641015831584, 0.6931580063954037, 0.5074119334983229, 0.5316749439292472, 0.028378919326938923, 0.46979642345518424, 0.6473202778882989, 0.641936360079205, 0.4489609635661548, 0.6092627789746317]
Average score: 0.5159541622944971
Average feature importances: [0.04794731 0.07151819 0.73683561 0.11192906 0.00168548 0.00953563
 0.01108312 0.00946559]


In [26]:
feature_names = bedtime_df[['Start','End','Time in bed','Activity (steps)','Stressful day','Drank coffee','Drank tea','Worked out']].columns

In [27]:
features_zip = list(zip(feature_importances, feature_names))

In [28]:
print(features_zip)

[(0.047947311942103533, 'Start'), (0.07151818973563392, 'End'), (0.736835612252958, 'Time in bed'), (0.11192905555072934, 'Activity (steps)'), (0.0016854846044626012, 'Stressful day'), (0.009535631512190498, 'Drank coffee'), (0.011083124322942752, 'Drank tea'), (0.00946559007897937, 'Worked out')]


In [29]:
sorted_importances = sorted(features_zip, key=lambda x: x[0], reverse=True)
features_ordered = []
# Print the sorted list
for importance, feature in sorted_importances:
    print(f"{importance}, {feature}")
    features_ordered.append(feature)

0.736835612252958, Time in bed
0.11192905555072934, Activity (steps)
0.07151818973563392, End
0.047947311942103533, Start
0.011083124322942752, Drank tea
0.009535631512190498, Drank coffee
0.00946559007897937, Worked out
0.0016854846044626012, Stressful day


In [30]:
print(features_ordered)

['Time in bed', 'Activity (steps)', 'End', 'Start', 'Drank tea', 'Drank coffee', 'Worked out', 'Stressful day']


In [31]:
main_features = features_ordered[:4]

In [32]:
main_features

['Time in bed', 'Activity (steps)', 'End', 'Start']

In [33]:
X_reduced = np.array(bedtime_df[main_features])


In [34]:
model =  GradientBoostingRegressor()


scores = cross_val_score(model, X_reduced, y, cv=kf, scoring='r2')

print("Cross-validation scores:", scores)
print("Average score:",np.mean(scores))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validation scores: [0.59788444 0.70555012 0.49179861 0.50571841 0.02635601 0.42481695
 0.60196577 0.68039823 0.43937431 0.62664898]
Average score: 0.5100511846075154


In [41]:
param_grid_gb = {
    'n_estimators': [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150], 
    'learning_rate': [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15],  # Finer steps around 0.1
    'max_depth': [2, 3, 4, 5, 6, 7],  # Expanded range with small increments
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],  # Finer steps starting from 2
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # Finer steps starting from 1
    'subsample': [0.8, 0.85, 0.9, 0.95, 1.0],  # Finer steps around 1.0
    'max_features': [None, 'sqrt', 'log2']
}

random_search = RandomizedSearchCV(model, param_distributions=param_grid_gb, 
                                   n_iter=300, cv=10, scoring='r2', 
                                   random_state=42, n_jobs=-1)
random_search.fit(X_reduced, y)

print(f"Best Parameters:", random_search.best_params_)
print(f"Best Score:", random_search.best_score_)

Best Parameters: {'subsample': 0.9, 'n_estimators': 60, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.12}
Best Score: 0.532601257730672


  y = column_or_1d(y, warn=True)


In [42]:
param_grid_gb = {
    'n_estimators': [60, 65, 70, 75, 80],               # Centered around 70
    'learning_rate': [0.08, 0.09, 0.1, 0.11, 0.12],     # Centered around 0.1
    'max_depth': [1, 2, 3, 4],                          # Centered around 2
    'min_samples_split': [2, 3, 4, 5, 6],               # Centered around 4
    'min_samples_leaf': [3, 4, 5, 6, 7],                # Centered around 5
    'max_features': [None, 'sqrt', 'log2'],             # Centered around None with additional options
    'subsample': [0.75, 0.8, 0.85, 0.9, 0.95]           # Centered around 0.8
}

random_search = RandomizedSearchCV(model, param_distributions=param_grid_gb, 
                                   n_iter=300, cv=10, scoring='r2', 
                                   random_state=42, n_jobs=-1)
random_search.fit(X_reduced, y)

print(f"Best Parameters:", random_search.best_params_)
print(f"Best Score:", random_search.best_score_)

Best Parameters: {'subsample': 0.9, 'n_estimators': 70, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.12}
Best Score: 0.5329784734476043


  y = column_or_1d(y, warn=True)


In [43]:
param_grid_gb = {
    'n_estimators': [70, 75, 80, 85, 90],               # Centered around 80
    'learning_rate': [0.06, 0.07, 0.08, 0.09, 0.1],     # Centered around 0.08
    'max_depth': [3, 4, 5, 6],                          # Centered around 4
    'min_samples_split': [2, 3, 4, 5],                  # Centered around 3
    'min_samples_leaf': [4, 5, 6, 7],                   # Centered around 5
    'max_features': ['sqrt'],                           # Specific to 'sqrt'
    'subsample': [0.75, 0.8, 0.85, 0.9]                 # Centered around 0.8
}


random_search = RandomizedSearchCV(model, param_distributions=param_grid_gb, 
                                   n_iter=300, cv=10, scoring='r2', 
                                   random_state=42, n_jobs=-1)
random_search.fit(X_reduced, y)

print(f"Best Parameters:", random_search.best_params_)
print(f"Best Score:", random_search.best_score_)

Best Parameters: {'subsample': 0.75, 'n_estimators': 75, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 4, 'learning_rate': 0.06}
Best Score: 0.5317795957859224


  y = column_or_1d(y, warn=True)


In [35]:
param_grid_gb = {
    'n_estimators': [60, 70, 80],                    # Reduced range
    'learning_rate': [0.08, 0.1, 0.12],              # Reduced range
    'max_depth': [2, 4, 6],                          # Reduced range
    'min_samples_split': [4, 6],                     # Reduced range
    'min_samples_leaf': [4, 6],                      # Reduced range
    'max_features': ['sqrt', None],                  # Reduced options
    'subsample': [0.75, 0.85, 0.95]                  # Reduced range
}



grid_search = GridSearchCV(model, param_grid=param_grid_gb, 
                           cv=10, scoring='r2', 
                           verbose=2, n_jobs=-1)

grid_search.fit(X_reduced, y)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 10 folds for each of 648 candidates, totalling 6480 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 6, 'min_samples_split': 6, 'n_estimators': 60, 'subsample': 0.75}
Best Score: 0.5344340155965726


  y = column_or_1d(y, warn=True)


In [36]:
best_params = grid_search.best_params_

In [51]:
model =  GradientBoostingRegressor(**best_params)


### Model Training


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=.2,random_state=1)
y_train = y_train.ravel()
model.fit(X_train, y_train)

y_preds = model.predict(X_test)

r2 = r2_score(y_test,y_preds)

In [75]:
print(r2)

0.44954619449143995
