<img title="GitHub Octocat" src='./img/Octocat.jpg' style='height: 60px; padding-right: 15px' alt="Octocat" align="left"> This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes 
<br>MIT Licensed
<br>Author: Leandro Pessini

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Model and Evaluation
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import metrics


# statsmodels
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

# scipy
from scipy.stats import skew
from scipy.stats import boxcox
from scipy.stats import yeojohnson

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Record the interpreter version used to run this notebook.
from platform import python_version
print(f'Python version: {python_version()}')

In [None]:
# (Re)load the watermark IPython extension, then stamp the notebook with the
# author name (-a) and the versions of the imported packages (--iversions).
%reload_ext watermark
%watermark -a "Leandro Pessini" --iversions

In [None]:
# Load the processed hourly rentals table. Later cells read (at least) its
# temp, wdsp, rhum, rain and count columns.
hourly_rentals = pd.read_csv('../data/processed/hourly_rentals.csv')

In [None]:
# Build the design matrix X and the target y from the hourly rentals table,
# then hold out 30% of the rows for testing.
df = hourly_rentals.copy()  # fresh copy, so re-running this cell always starts from the loaded data
df = df.astype({'holiday': 'category',
                'working_day': 'category',
                'peak': 'category',
                'timesofday': 'category',
                'rainfall_intensity': 'category',
                'wind_bft': 'category',
                'wind_speed_group': 'category'})

# Engineered features. They only enter the model when listed in `predictors`.
df['humidity_norm'] = df['rhum']/100
df['prodTempWind'] = df['temp']*df['wdsp']
# NOTE(review): the name says "prod" but this is a sum (unlike prodTempWind) - confirm intent.
df['prodRainWind'] = df['rain']+df['wdsp']

# Alternative predictor set tried earlier:
# predictors = ['temp','wdsp','rhum','rain_type','holiday','season','peak','timesofday']
predictors = ['wind_speed_group','rainfall_intensity']

X = df[[c for c in df.columns if c in predictors]]
y = df.pop('count')  # pop() also removes the target column from df

# Split the selected predictors by dtype: numeric columns pass through unchanged,
# categorical columns are one-hot encoded.
num_vars = [n for n in df.select_dtypes(include=['number']).columns if n in predictors]
cat_vars = [c for c in df.select_dtypes(include=['category']).columns if c in predictors]

# drop_first=True drops one level per factor so the dummies are not collinear
# with the intercept.
# dtype=float: pandas >= 2.0 returns boolean dummies by default; mixed with the
# float intercept column they become an object array that statsmodels rejects.
dummies = pd.get_dummies(X[cat_vars], drop_first=True, dtype=float)
X = pd.concat([X[num_vars], dummies],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

## OLS Assumptions

In [None]:
# Fit an OLS model of log(count) on the training predictors.
# The log target is derived from the raw counts in `y` rather than from the
# current `y_train`, so re-running this cell never applies the log twice.
# NOTE(review): assumes every hourly count is > 0; log(0) would give -inf - confirm.
y_train = np.log(y.loc[X_train.index])
# statsmodels' OLS does not add an intercept on its own.
X_with_constant = sm.add_constant(X_train)
# cov_type='HC3' requests heteroskedasticity-robust standard errors.
model = sm.OLS(y_train, X_with_constant).fit(cov_type='HC3')
print(model.summary())

In [None]:
# Score the fitted model on the held-out split and collect both sets of residuals.
# The model was fitted on every column of X_train, so the test design matrix
# keeps all columns of X_test. Selecting a single 'temp_r' column raised a
# KeyError (no such column exists) and could not match the fitted coefficients.
# The log target is derived from the raw counts in `y`, so re-running this cell
# never applies the log twice.
y_test = np.log(y.loc[X_test.index])
# has_constant='add' always adds the intercept column, even if some predictor
# happens to be constant within the test split.
X_test_with_constant = sm.add_constant(X_test, has_constant='add')
y_pred = model.predict(X_test_with_constant)
residuals = y_test - y_pred   # out-of-sample residuals
ols_residuals = model.resid   # in-sample (training) residuals

### Assumption 1 - Linearity

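This assumption requires a linear relationship between the predictors and the response variable. It is checked below by plotting the model residuals against individual variables: if the relationship is linear, no systematic pattern should remain in the residuals.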

<!-- > "In statistics, a regression model is linear when all terms in the model are either the constant or a parameter multiplied by an independent variable." (Frost, 2020, p. 202)
Frost, J. (2020). Regression Analysis: An Intuitive Guide for Using and Interpreting Linear Models. Statistics By Jim Publishing. -->


In [None]:
# Influence diagnostics object for the fitted OLS model
influence = model.get_influence()
# Internally studentized (standardized) residuals of the training fit
standardized_residuals = influence.resid_studentized_internal
# Display the standardized residuals
print(standardized_residuals)

### Residuals x Temperature

In [None]:
# Residuals of the fitted model against temperature.
# The original call put the residuals on the x axis and the fitted values on
# the y axis while labelling them "Temperature" / "Residuals".
# Temperature is read from the source table by the residuals' row index, so the
# plot works whether or not `temp` is one of the model's predictors.
train_temp = hourly_rentals.loc[model.resid.index, 'temp']
fig, ax = plt.subplots(figsize=(16, 10))
sns.regplot(x=train_temp, y=model.resid, ax=ax, line_kws={'color': 'black', 'lw': 2, 'linestyle': '--'})
ax.set_title('Residuals vs. Temperature', fontsize=16)
ax.set(xlabel='Temperature', ylabel='Residuals')
plt.show()

### Residuals x Wind Speed

In [None]:
# Standardized residuals against wind speed.
# `wdsp` is only a column of X_train when it is listed in `predictors`, so it is
# read from the source table by the training rows' index instead (avoids a KeyError).
train_wind_speed = hourly_rentals.loc[X_train.index, 'wdsp']
fig, ax = plt.subplots(figsize=(16, 10))
sns.regplot(x=train_wind_speed, y=standardized_residuals, ax=ax, line_kws={'color': 'red'})
ax.set_title('Residuals vs. Wind Speed', fontsize=16)
ax.set(xlabel='Wind Speed', ylabel='Residuals')
plt.show()

In [None]:
# Standardized residuals against relative humidity.
# `rhum` is only a column of X_train when it is listed in `predictors`, so it is
# read from the source table by the training rows' index instead (avoids a KeyError).
train_humidity = hourly_rentals.loc[X_train.index, 'rhum']
fig, ax = plt.subplots(figsize=(16, 10))
sns.regplot(x=train_humidity, y=standardized_residuals, ax=ax, line_kws={'color': 'red'})
# Title and axis label previously said "Wind Speed" (copied from the cell above).
ax.set_title('Residuals vs. Relative Humidity', fontsize=16)
ax.set(xlabel='Relative Humidity', ylabel='Residuals')
plt.show()

In [None]:
# Yeo-Johnson power transform of the training target.
# NOTE(review): y_train was already log-transformed in the OLS cell above, so this
# transforms log(count) rather than the raw counts - confirm that is intended.
from sklearn.preprocessing import PowerTransformer
p = PowerTransformer(method = 'yeo-johnson')
# to_frame() turns the target Series into a one-column frame before fitting.
y_train_tranformed = p.fit_transform(y_train.to_frame())

In [None]:
# Variance inflation factor of every design-matrix column.
design = X_with_constant.values
vif = [variance_inflation_factor(design, col) for col in range(design.shape[1])]
# vif[0] is skipped so the remaining values line up with the columns of X.
pd.DataFrame({'vif': vif[1:]}, index=X.columns)

In [None]:
# 2x2 grid of density histograms (with KDE) for the weather variables and the
# rentals count; each legend shows the skewness of the plotted column.
fig = plt.figure(figsize=(16,12))
gs = fig.add_gridspec(2, 2)
ax0, ax1, ax2, ax3 = [fig.add_subplot(gs[row, col]) for row in range(2) for col in range(2)]

# (axis, column, x label, title) for each panel, in reading order.
panels = [
    (ax0, 'temp', 'Temperature', "Distribution - Temperature"),
    (ax1, 'rhum', 'Relative Humidity', "Distribution - Relative Humidity"),
    (ax2, 'wdsp', 'Wind Speed', "Distribution - Wind Speed"),
    (ax3, 'count', 'Count', "Distribution - Rentals Count"),
]

for panel_ax, column, x_label, panel_title in panels:
    column_skew = np.round(skew(hourly_rentals[column]),4)
    sns.histplot(hourly_rentals[column], ax=panel_ax, stat='density', kde=True,
                 label='Skew :{0}'.format(column_skew))
    panel_ax.set(xlabel=x_label, title=panel_title)
    panel_ax.legend()

plt.show()

In [None]:
# Refit OLS on a degree-3 polynomial expansion of the training predictors.
poly = PolynomialFeatures(degree = 3)
X_poly = poly.fit_transform(X_train)
X_poly_constant = sm.add_constant(X_poly)
lin2 = sm.OLS(y_train, X_poly_constant).fit()
# NOTE(review): linearity_test is not defined or imported in any cell above this
# one - make sure its definition runs first, otherwise this raises NameError.
linearity_test(lin2, y_train) 

In [None]:
# Box-Cox transformation of the rental counts and its effect on skewness.
# Box-Cox is only defined for strictly positive data. By the time this cell
# runs, `y_train` holds log(count) (zero for every hour with a single rental),
# so the raw counts of the training rows are taken from `y` instead.
# NOTE(review): assumes every hourly count is >= 1 - confirm.
raw_target = y.loc[y_train.index].astype(float)
transformed_target, lam = boxcox(raw_target)
fig,ax = plt.subplots(1,2,figsize=(13, 5))
# histplot replaces the deprecated distplot and matches the distribution plots
# drawn from hourly_rentals elsewhere in this notebook.
sns.histplot(raw_target, stat='density', kde=True, color='r', ax=ax[0],
             label='Original Skew :{0}'.format(np.round(skew(raw_target),4)))
sns.histplot(transformed_target, stat='density', kde=True, color='g', ax=ax[1],
             label='Transformed Skew:{0}'.format(np.round(skew(transformed_target),4)))
ax[0].set(title='Distribution of Target Variable', xlabel='ORIGINAL')
ax[1].set(title='After Transformation', xlabel='BOX-COX TRANSFORMED')
fig.legend()
plt.show()

In [None]:
# Yeo-Johnson transformation of the rental counts and its effect on skewness.
# By the time this cell runs, `y_train` holds log(count), so the "Original"
# panel is drawn from the raw counts of the training rows taken from `y`.
raw_target = y.loc[y_train.index].astype(float)
transformed_target, lam = yeojohnson(raw_target)
fig,ax = plt.subplots(1,2,figsize=(13, 5))
# histplot replaces the deprecated distplot and matches the distribution plots
# drawn from hourly_rentals elsewhere in this notebook.
sns.histplot(raw_target, stat='density', kde=True, color='r', ax=ax[0],
             label='Original Skew :{0}'.format(np.round(skew(raw_target),4)))
sns.histplot(transformed_target, stat='density', kde=True, color='g', ax=ax[1],
             label='Transformed Skew:{0}'.format(np.round(skew(transformed_target),4)))
ax[0].set(title='Distribution of Target Variable', xlabel='ORIGINAL')
# The axis label previously said BOX-COX although this cell applies Yeo-Johnson.
ax[1].set(title='After Transformation', xlabel='YEO-JOHNSON TRANSFORMED')
fig.legend()
plt.show()

In [None]:
# Pairwise correlations between the weather variables and the rentals count,
# shown as a lower-triangular heatmap (diagonal included).
corrMatt = hourly_rentals[['temp','wdsp','rhum', 'rain','count']].corr()
# Explicit boolean mask: True strictly above the diagonal, and seaborn hides the
# cells where the mask is True. The previous mask reused the correlation values
# and relied on their truthiness, so an exactly-zero correlation in the upper
# triangle would have been left visible.
mask = np.triu(np.ones_like(corrMatt, dtype=bool), k=1)
cmap = sns.diverging_palette(180, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corrMatt, mask=mask,vmax=.3, annot=True, ax=ax, cmap=cmap)
plt.show()

In [None]:
# Start from the training predictors and add a combined rain/wind feature.
hourly_data_temp = X_train.copy()
# `rain` and `wdsp` are only columns of X_train when they are listed in
# `predictors`. Otherwise fetch them from the source table (aligned on the row
# index) so the feature below - and the later drop of both columns - does not
# raise a KeyError.
for col in ('rain', 'wdsp'):
    if col not in hourly_data_temp.columns:
        hourly_data_temp[col] = hourly_rentals.loc[hourly_data_temp.index, col]
# NOTE(review): the name says "prod" but this is a sum - confirm which is intended.
hourly_data_temp['prodRainWind'] = hourly_data_temp['rain']+hourly_data_temp['wdsp']

In [None]:
# Preview the first two rows of the frame after adding prodRainWind.
hourly_data_temp.head(2)

In [None]:
# Drop the raw rain and wind-speed columns, leaving prodRainWind in their place.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second run no longer fails because the columns are gone.
hourly_data_temp = hourly_data_temp.drop(columns=['rain', 'wdsp'], errors='ignore')
hourly_data_temp.head(2)

In [None]:
# Refit OLS on the engineered feature frame.
# This rebinds `X_with_constant` and `model`; cells below use the new fit.
X_with_constant = sm.add_constant(hourly_data_temp)
model = sm.OLS(endog=y_train, exog=X_with_constant).fit()
print(model.summary())

In [None]:
# NOTE(review): linearity_test is not defined or imported in any cell above this
# one - make sure its definition runs first, otherwise this raises NameError.
linearity_test(model, y_train)

In [None]:
# Preview the design matrix (hourly_data_temp passed through add_constant) used for the refit.
X_with_constant.head()

In [None]:
# Variance inflation factors for the refitted design matrix, shown as one wide row.
design = X_with_constant.values
vif = [variance_inflation_factor(design, col) for col in range(design.shape[1])]
# vif[0] is skipped so the remaining values line up with hourly_data_temp's columns.
pd.DataFrame({'vif': vif[1:]}, index=hourly_data_temp.columns).T