In [634]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [635]:
# Read df_transformed.csv (path is relative to the notebook's working directory)
# into a dataframe and preview the first rows
csv_path = Path('./df_transformed.csv')
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Sleep quality,Time in bed,Activity (steps),Stressful day Total,Drank coffee Total,Drank tea Total,Ate late Total,Worked out Total,Fell asleep
0,65,452,0,0,0,0,0,0,2231
1,89,530,0,0,1,1,0,0,2238
2,100,512,0,0,0,0,0,0,2257
3,87,442,0,0,0,1,0,0,2132
4,93,483,0,1,1,1,0,0,13


In [636]:
# One-hot encode the five 0/1 flag columns: each one becomes a `<name>_0` /
# `<name>_1` pair, so 5 flags -> 10 indicator columns (14 columns in total).
# NOTE(review): within each pair `_0` is always 1 - `_1`, i.e. the two columns are
# perfectly collinear; consider `drop_first=True` before fitting linear models.
flag_columns = [
    "Stressful day Total",
    "Drank coffee Total",
    "Drank tea Total",
    "Ate late Total",
    "Worked out Total",
]
df_binary_encoded = pd.get_dummies(df, columns=flag_columns)
df_binary_encoded.head()


Unnamed: 0,Sleep quality,Time in bed,Activity (steps),Fell asleep,Stressful day Total_0,Stressful day Total_1,Drank coffee Total_0,Drank coffee Total_1,Drank tea Total_0,Drank tea Total_1,Ate late Total_0,Ate late Total_1,Worked out Total_0,Worked out Total_1
0,65,452,0,2231,1,0,1,0,1,0,1,0,1,0
1,89,530,0,2238,1,0,0,1,0,1,1,0,1,0
2,100,512,0,2257,1,0,1,0,1,0,1,0,1,0
3,87,442,0,2132,1,0,1,0,0,1,1,0,1,0
4,93,483,0,13,0,1,0,1,0,1,1,0,1,0


In [637]:
# Check the shape (rows, columns) of the dataframe after applying get_dummies
df_binary_encoded.shape

(887, 14)

In [638]:
# X: every column except the target. y: the "Sleep quality" target column.
# The target is selected by name (not by position) so that X and y stay consistent
# with each other even if the column order of df_binary_encoded changes.
TARGET_COLUMN = 'Sleep quality'

X = df_binary_encoded.drop(columns=TARGET_COLUMN)

# .copy() keeps y independent of the encoded dataframe
y = df_binary_encoded[TARGET_COLUMN].copy()

In [639]:
from sklearn.model_selection import train_test_split

# Split features and target into train and test parts. random_state pins the
# shuffle so the same rows land in each part on every run; no test_size is
# passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)


In [640]:
# from sklearn.preprocessing import StandardScaler
# data_scaler = StandardScaler()

# df_scaled = data_scaler.fit_transform(X_train)

# import numpy as np
# print(np.mean(df_scaled[:,0]))
# print(np.std(df_scaled[:,0]))

In [641]:
# Sanity check: preview the first five rows of the feature matrix
X.head()

Unnamed: 0,Time in bed,Activity (steps),Fell asleep,Stressful day Total_0,Stressful day Total_1,Drank coffee Total_0,Drank coffee Total_1,Drank tea Total_0,Drank tea Total_1,Ate late Total_0,Ate late Total_1,Worked out Total_0,Worked out Total_1
0,452,0,2231,1,0,1,0,1,0,1,0,1,0
1,530,0,2238,1,0,0,1,0,1,1,0,1,0
2,512,0,2257,1,0,1,0,1,0,1,0,1,0
3,442,0,2132,1,0,1,0,0,1,1,0,1,0
4,483,0,13,0,1,0,1,0,1,1,0,1,0


In [642]:
# Rebind the name `df` to a dataframe built from the feature matrix X
# (the "Sleep quality" target column is not part of it).
# NOTE(review): this overwrites the dataframe loaded from the CSV, so the earlier
# get_dummies cell cannot be re-run after this one without reloading the data.
df = pd.DataFrame(X)
df.head()

Unnamed: 0,Time in bed,Activity (steps),Fell asleep,Stressful day Total_0,Stressful day Total_1,Drank coffee Total_0,Drank coffee Total_1,Drank tea Total_0,Drank tea Total_1,Ate late Total_0,Ate late Total_1,Worked out Total_0,Worked out Total_1
0,452,0,2231,1,0,1,0,1,0,1,0,1,0
1,530,0,2238,1,0,0,1,0,1,1,0,1,0
2,512,0,2257,1,0,1,0,1,0,1,0,1,0
3,442,0,2132,1,0,1,0,0,1,1,0,1,0
4,483,0,13,0,1,0,1,0,1,1,0,1,0


In [643]:
# Verify y is the "Sleep quality" column that was dropped from X
y.head()

0     65
1     89
2    100
3     87
4     93
Name: Sleep quality, dtype: int64

In [644]:
# Check the shape of y: one-dimensional, a single target value per row
y.shape

(887,)

In [645]:
# Instantiate the LinearRegression model with default settings and echo its repr
linear = LinearRegression()
linear

LinearRegression()

In [646]:
# Fit the linear model on the training split only
linear.fit(X_train, y_train)

LinearRegression()

In [647]:
# Predict on the held-out test rows and show each prediction beside the true
# value (the frame is indexed by y_test's original row labels)
y_pred = linear.predict(X_test)
prediction_vs_actual = {"Prediction": y_pred, "Actual": y_test}
pd.DataFrame(prediction_vs_actual)

Unnamed: 0,Prediction,Actual
522,80.772393,79
314,79.054583,86
768,79.260579,73
320,71.616312,72
809,70.284922,74
...,...,...
35,76.282396,64
46,78.570687,80
255,77.138954,78
670,81.788098,96


In [648]:
# Score the model: R² of the linear fit on the TRAINING split.
# NOTE(review): this is an in-sample score; linear.score(X_test, y_test) would
# measure performance on the held-out rows.
linear.score(X_train, y_train)

0.603248893596652

In [649]:
# Instantiate the Lasso (L1-regularized linear regression) model with default
# hyperparameters and echo its repr
classo = Lasso()
classo

Lasso()

In [650]:
# Fit the Lasso model on the training split only
classo.fit(X_train, y_train)

Lasso()

In [651]:
# Predict on the held-out test rows with the Lasso model and show each
# prediction beside the true value.
# Note: y_pred is overwritten here; it now holds the Lasso predictions.
y_pred = classo.predict(X_test)
prediction_vs_actual = {"Prediction": y_pred, "Actual": y_test}
pd.DataFrame(prediction_vs_actual)

Unnamed: 0,Prediction,Actual
522,79.229796,79
314,78.748267,86
768,77.738577,73
320,69.543845,72
809,69.110962,74
...,...,...
35,75.095852,64
46,77.403848,80
255,75.095852,78
670,80.663491,96


In [652]:
# Score the model: R² of the Lasso fit on the TRAINING split.
# NOTE(review): this is an in-sample score; classo.score(X_test, y_test) would
# measure performance on the held-out rows.
classo.score(X_train, y_train)

0.5945859973126337

In [653]:
# .....................

In [654]:
# Instantiate the Ridge (L2-regularized linear regression) model with default
# hyperparameters and echo its repr
ridge = Ridge()
ridge

Ridge()

In [655]:
# Fit the Ridge model on the training split only
ridge.fit(X_train, y_train)

Ridge()

In [656]:
# Predict on the held-out test rows with the Ridge model and show each
# prediction beside the true value.
# Note: y_pred is overwritten here; it now holds the Ridge predictions.
y_pred = ridge.predict(X_test)
prediction_vs_actual = {"Prediction": y_pred, "Actual": y_test}
pd.DataFrame(prediction_vs_actual)

Unnamed: 0,Prediction,Actual
522,80.763934,79
314,79.058355,86
768,79.259823,73
320,71.608941,72
809,70.287187,74
...,...,...
35,76.276822,64
46,78.572625,80
255,77.131503,78
670,81.782649,96


In [657]:
# Score the model: R² of the Ridge fit on the TRAINING split.
# NOTE(review): this is an in-sample score; ridge.score(X_test, y_test) would
# measure performance on the held-out rows.
ridge.score(X_train, y_train)

0.6032485121634987

In [658]:
# ................

In [659]:
# Instantiate the DecisionTreeRegressor model.
# random_state is pinned (same seed as the train/test split) because the tree
# builder permutes candidate features at every split; without a seed, ties
# between equally good splits can be broken differently from run to run, so the
# fitted tree and its predictions would not be reproducible.
decision = DecisionTreeRegressor(random_state=1)

In [660]:
# Fit the decision tree on the training split only
decision.fit(X_train, y_train)

DecisionTreeRegressor()

In [661]:
# Predict on the held-out test rows with the decision tree and show each
# prediction beside the true value.
# Note: y_pred is overwritten here; it now holds the decision-tree predictions.
y_pred = decision.predict(X_test)
prediction_vs_actual = {"Prediction": y_pred, "Actual": y_test}
pd.DataFrame(prediction_vs_actual)

Unnamed: 0,Prediction,Actual
522,70.0,79
314,93.0,86
768,64.0,73
320,82.0,72
809,73.0,74
...,...,...
35,79.0,64
46,77.0,80
255,79.0,78
670,64.0,96


In [662]:
# Score the model: R² of the decision tree on the TRAINING split.
# NOTE(review): a training score of 1.0 means the tree reproduces the training
# targets exactly; it says nothing about held-out rows. Compare against
# decision.score(X_test, y_test) before ranking this model against the others.
decision.score(X_train, y_train)

1.0

In [663]:
#................

In [667]:
# Instantiate a multi-variable linear regression.
# Uses the `LinearRegression` class imported at the top of the notebook: no
# visible cell imports the `linear_model` module, so the module-qualified form
# `linear_model.LinearRegression()` fails with NameError on a fresh kernel.
model = LinearRegression()

In [668]:
# Fit `model` on the training split only
model.fit(X_train, y_train)

LinearRegression()

In [669]:
# Predict outcomes for the test split and print the shape of the prediction
# array (one prediction per test row).
# Note: y_pred is overwritten here; it now holds `model`'s predictions.
y_pred = model.predict(X_test)
print(y_pred.shape)

(222,)


In [670]:
# Fit the same regression two ways on the FULL dataset (X, y - not the
# train/test split): statsmodels for the inferential summary table, scikit-learn
# for the intercept and coefficients.

# with statsmodels
# sm.OLS does not add an intercept by itself, so a `const` column is added to X.
# Note: X is rebound here, so every later use of X includes the `const` column.
X = sm.add_constant(X) # adding a constant

# Note: `model` is rebound from the sklearn estimator to the statsmodels results.
model = sm.OLS(y, X).fit()
predictions = model.predict(X)

# with sklearn
# Uses the `LinearRegression` class imported at the top of the notebook: no
# visible cell imports the `linear_model` module, so the module-qualified form
# `linear_model.LinearRegression()` fails with NameError on a fresh kernel.
# X already carries the `const` column at this point; sklearn fits its own
# intercept, so the coefficient reported for `const` (the first one) is 0.
regr = LinearRegression()
regr.fit(X, y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

print_model = model.summary()
print(print_model)

Intercept: 
 16.321376454753022
Coefficients: 
 [ 0.00000000e+00  1.42124026e-01 -8.30921821e-04 -1.46657665e-03
 -6.13959299e-01  6.13959299e-01  3.35180366e-01 -3.35180366e-01
 -1.53005626e+00  1.53005626e+00 -7.68280325e-01  7.68280325e-01
  9.28714717e-04 -9.28714717e-04]
                            OLS Regression Results                            
Dep. Variable:          Sleep quality   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.549
Method:                 Least Squares   F-statistic:                     135.6
Date:                Tue, 04 Jan 2022   Prob (F-statistic):          9.38e-148
Time:                        23:21:34   Log-Likelihood:                -3328.5
No. Observations:                 887   AIC:                             6675.
Df Residuals:                     878   BIC:                             6718.
Df Model:                           8                                         
Covariance T

  x = pd.concat(x[::order], 1)


In [671]:
# Score the model: R² of `regr` on the full dataset it was fitted on
# (X here is the version that includes the `const` column).
# NOTE(review): in-sample score; not comparable to a held-out test score.
regr.score(X, y)

0.5527602473145632

In [672]:
#..................

In [673]:
# Persist the fitted LinearRegression (`linear`), which was picked as the best
# model. The figure it was picked on, 0.603248893596652, is its R² on the
# training split (a goodness-of-fit measure, not a classification accuracy).
# NOTE(review): confirm the choice with scores on X_test / y_test.
import os
from joblib import load, dump

dump(linear, "linear.joblib")

# Show the current working directory, i.e. where the relative path
# "linear.joblib" was written
os.getcwd()
# Reference - reload the saved model later with:
# model = load("linear.joblib")

'/Users/danielsquires/Desktop/Modeling/2nd_modeling_target_sleep_quality'