In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from statsmodels.formula.api import ols

In [2]:
# Read the previously cleaned table from its pickle and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from a trusted source.
bikes_df = pd.read_pickle('cleaned_data/bikes_df.pkl')
bikes_df.head(5)

Unnamed: 0_level_0,temp,hum,windspeed,casual,registered,cnt,season_fall,season_spring,season_summer,yr_2012,...,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Thu,weekday_Tue,weekday_Wed,workingday_working,weathersit_heavy_rain_snow,weathersit_light_rain_snow,weathersit_mist
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-01,0.24,0.81,0.0,3,13,16,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2011-01-01,0.22,0.8,0.0,8,32,40,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2011-01-01,0.22,0.8,0.0,5,27,32,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2011-01-01,0.24,0.75,0.0,3,10,13,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2011-01-01,0.24,0.75,0.0,0,1,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [12]:
# Define the problem: model each rider-count column from all remaining columns.
targets = bikes_df[['casual', 'registered', 'cnt']]
# Drop by explicit column labels. Passing the DataFrame itself as `columns`
# only worked because iterating a DataFrame happens to yield its column names.
# NOTE: despite its name, x_cols is the feature DataFrame, not a list of names.
x_cols = bikes_df.drop(columns=targets.columns)

In [13]:
# Hold out 25% of the rows for testing and train on the remaining 75%.
# The fixed random_state makes the shuffle, and hence the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_cols,
    targets,
    test_size=0.25,
    random_state=42,
)

In [19]:
# Pull one Series per target out of the training and test target frames.
y_train_casual = y_train['casual']
y_train_registered = y_train['registered']
y_train_cnt = y_train['cnt']

y_test_casual = y_test['casual']
y_test_registered = y_test['registered']
y_test_cnt = y_test['cnt']

In [21]:
# Fit one independent linear regression per target on the training features.
# Each fitted estimator is later queried through .score(...) and .predict(...).
reg_casual = LinearRegression()
reg_casual.fit(X_train, y_train_casual)

reg_registered = LinearRegression()
reg_registered.fit(X_train, y_train_registered)

reg_cnt = LinearRegression()
reg_cnt.fit(X_train, y_train_cnt)

In [28]:
# Training-set fit quality: LinearRegression.score reports R^2 on the given data.
train_r2_casual = reg_casual.score(X_train, y_train_casual)
train_r2_registered = reg_registered.score(X_train, y_train_registered)
train_r2_cnt = reg_cnt.score(X_train, y_train_cnt)

print('reg_casual:', train_r2_casual)
print('reg_registered:', train_r2_registered)
print('reg_cnt', train_r2_cnt)

reg_casual: 0.5820653809719947
reg_registered: 0.6792399241156122
reg_cnt 0.6820133682527982


In [30]:
# Predict every target on the held-out features, in the same
# casual / registered / cnt order used throughout the notebook.
y_pred_casual, y_pred_registered, y_pred_cnt = (
    fitted.predict(X_test)
    for fitted in (reg_casual, reg_registered, reg_cnt)
)


In [33]:
# Held-out fit quality: R^2 of the test predictions against the true test targets.
test_r2_casual = r2_score(y_test_casual, y_pred_casual)
test_r2_registered = r2_score(y_test_registered, y_pred_registered)
test_r2_cnt = r2_score(y_test_cnt, y_pred_cnt)

print('reg_casual_t:', test_r2_casual)
print('reg_registered_t:', test_r2_registered)
print('reg_cnt_t', test_r2_cnt)

reg_casual_t: 0.5801555683505369
reg_registered_t: 0.6763065722064621
reg_cnt_t 0.6799431104706595


In [35]:
# Build the right-hand side of the regression formula here so that this cell
# works on a fresh kernel; it previously displayed `predictors` before any
# cell had assigned it. Iterating the feature frame yields its column names,
# which are joined with '+' into a single string.
predictors = '+'.join(x_cols)
predictors

'temp+hum+windspeed+season_fall+season_spring+season_summer+yr_2012+hr_0+hr_1+hr_10+hr_11+hr_12+hr_13+hr_14+hr_15+hr_16+hr_17+hr_18+hr_19+hr_2+hr_20+hr_21+hr_22+hr_3+hr_4+hr_5+hr_6+hr_7+hr_8+hr_9+holiday_holiday+weekday_Fri+weekday_Mon+weekday_Sat+weekday_Thu+weekday_Tue+weekday_Wed+workingday_working+weathersit_heavy_rain_snow+weathersit_light_rain_snow+weathersit_mist'

In [36]:
# Formula right-hand side: every feature column name joined with '+'.
predictors = '+'.join(x_cols.columns)

# OLS for casual riders, fitted on the full table (not just the training split)
# so that the summary reports coefficient-level statistics.
# NOTE(review): the formula lists 41 terms but the summary shows Df Model = 40
# and a condition number around 2e15, which suggests one predictor is an exact
# linear combination of others. Confirm and consider dropping it.
model_casual = ols(
    formula='casual ~' + predictors,
    data=bikes_df,
).fit()
model_casual.summary()

0,1,2,3
Dep. Variable:,casual,R-squared:,0.582
Model:,OLS,Adj. R-squared:,0.581
Method:,Least Squares,F-statistic:,603.7
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,0.0
Time:,14:23:23,Log-Likelihood:,-84822.0
No. Observations:,17379,AIC:,169700.0
Df Residuals:,17338,BIC:,170000.0
Df Model:,40,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,11.7533,2.072,5.672,0.000,7.692,15.815
temp,89.6257,2.215,40.455,0.000,85.283,93.968
hum,-24.8225,1.695,-14.647,0.000,-28.144,-21.501
windspeed,-15.4857,2.134,-7.257,0.000,-19.668,-11.303
season_fall,-6.7409,0.928,-7.261,0.000,-8.560,-4.921
season_spring,-8.0464,0.760,-10.587,0.000,-9.536,-6.557
season_summer,4.4493,0.743,5.987,0.000,2.993,5.906
yr_2012,11.9398,0.489,24.432,0.000,10.982,12.898
hr_0,-4.1941,1.674,-2.505,0.012,-7.476,-0.912

0,1,2,3
Omnibus:,7699.183,Durbin-Watson:,0.255
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58480.077
Skew:,1.968,Prob(JB):,0.0
Kurtosis:,11.079,Cond. No.,2250000000000000.0


In [37]:
# OLS for registered riders on the same predictor set, fitted on the full table.
model_registered = ols(
    formula='registered ~' + predictors,
    data=bikes_df,
).fit()
model_registered.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.679
Model:,OLS,Adj. R-squared:,0.678
Method:,Least Squares,F-statistic:,916.2
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,0.0
Time:,14:24:00,Log-Likelihood:,-102030.0
No. Observations:,17379,AIC:,204100.0
Df Residuals:,17338,BIC:,204500.0
Df Model:,40,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.2611,5.576,-0.226,0.821,-12.190,9.668
temp,155.3717,5.962,26.062,0.000,143.686,167.057
hum,-43.9683,4.561,-9.641,0.000,-52.907,-35.029
windspeed,-18.4517,5.742,-3.213,0.001,-29.707,-7.196
season_fall,-31.6895,2.498,-12.685,0.000,-36.586,-26.793
season_spring,-57.7315,2.045,-28.228,0.000,-61.740,-53.723
season_summer,-27.3489,2.000,-13.676,0.000,-31.269,-23.429
yr_2012,73.5485,1.315,55.927,0.000,70.971,76.126
hr_0,-28.0962,4.506,-6.236,0.000,-36.928,-19.265

0,1,2,3
Omnibus:,1245.295,Durbin-Watson:,0.589
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3441.816
Skew:,0.397,Prob(JB):,0.0
Kurtosis:,5.03,Cond. No.,2250000000000000.0


In [38]:
# OLS for the cnt column on the same predictor set, fitted on the full table.
model_cnt = ols(
    formula='cnt ~' + predictors,
    data=bikes_df,
).fit()
model_cnt.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.682
Model:,OLS,Adj. R-squared:,0.681
Method:,Least Squares,F-statistic:,929.0
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,0.0
Time:,14:24:42,Log-Likelihood:,-105090.0
No. Observations:,17379,AIC:,210300.0
Df Residuals:,17338,BIC:,210600.0
Df Model:,40,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.4921,6.651,1.578,0.115,-2.544,23.528
temp,244.9973,7.111,34.454,0.000,231.059,258.935
hum,-68.7908,5.440,-12.646,0.000,-79.453,-58.129
windspeed,-33.9374,6.849,-4.955,0.000,-47.363,-20.512
season_fall,-38.4304,2.980,-12.898,0.000,-44.271,-32.590
season_spring,-65.7778,2.439,-26.965,0.000,-70.559,-60.996
season_summer,-22.8996,2.385,-9.601,0.000,-27.575,-18.224
yr_2012,85.4884,1.569,54.501,0.000,82.414,88.563
hr_0,-32.2903,5.374,-6.008,0.000,-42.824,-21.756

0,1,2,3
Omnibus:,1231.952,Durbin-Watson:,0.498
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2624.305
Skew:,0.472,Prob(JB):,0.0
Kurtosis:,4.653,Cond. No.,2250000000000000.0
