In [95]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')


In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set()


In [97]:
# Read the dataset

bike= pd.read_csv("day.csv")


In [98]:
bike.head()


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,1,1,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,2,1,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,3,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,4,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,5,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [99]:
bike['dteday'].dtype


dtype('O')

In [100]:
bike['dteday'] =  pd.to_datetime(bike['dteday'],format='%d-%m-%Y')
bike['dteday'].dtype

dtype('<M8[ns]')

In [101]:
bike['year'] = pd.DatetimeIndex(bike['dteday']).year
bike['month'] = pd.DatetimeIndex(bike['dteday']).month


In [102]:
bike.head()


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,year,month
0,1,2018-01-01,1,0,1,0,1,1,2,14.110847,18.18125,80.5833,10.749882,331,654,985,2018,1
1,2,2018-01-02,1,0,1,0,2,1,2,14.902598,17.68695,69.6087,16.652113,131,670,801,2018,1
2,3,2018-01-03,1,0,1,0,3,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349,2018,1
3,4,2018-01-04,1,0,1,0,4,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562,2018,1
4,5,2018-01-05,1,0,1,0,5,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600,2018,1


In [103]:

bike.drop(['yr','mnth'],axis=1,inplace=True)


In [107]:
bike.drop('holiday',axis=1,inplace=True)


In [108]:
bike.columns

Index(['season', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum',
       'windspeed', 'cnt', 'year', 'month'],
      dtype='object')

In [109]:
bike.drop(['dteday','instant','casual','registered'],axis=1,inplace=True)


KeyError: "['dteday' 'instant' 'casual' 'registered'] not found in axis"

In [112]:
bike.columns

Index(['season', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
       'humidity', 'windspeed', 'count', 'year', 'month'],
      dtype='object')

In [113]:
# Renaming some columns for better understanding

bike.rename(columns={'hum':'humidity','cnt':'count'},inplace=True)

In [114]:
codes = {1:'spring',2:'summer',3:'fall',4:'winter'}
bike['season'] = bike['season'].map(codes)

In [115]:
codes = {1:'Clear',2:'Mist',3:'Light Snow',4:'Heavy Rain'}
bike['weathersit'] = bike['weathersit'].map(codes)


In [116]:
codes = {1:'working_day',0:'Holiday'}
bike['workingday'] = bike['workingday'].map(codes)


In [117]:
codes = {2019:1,2018:0}
bike['year'] = bike['year'].map(codes)

In [118]:

codes = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'June',7:'July',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
bike['month'] = bike['month'].map(codes)


In [119]:
codes = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
bike['weekday'] = bike['weekday'].map(codes)


In [120]:
bike.drop('atemp',axis=1,inplace=True)


In [121]:
bike.columns

Index(['season', 'weekday', 'workingday', 'weathersit', 'temp', 'humidity',
       'windspeed', 'count', 'year', 'month'],
      dtype='object')

In [122]:
seasons = pd.get_dummies(bike['season'],drop_first=True)

working_day = pd.get_dummies(bike['workingday'],drop_first=True)

weather= pd.get_dummies(bike['weathersit'],drop_first=True)

month= pd.get_dummies(bike['month'],drop_first=True)

week_day= pd.get_dummies(bike['weekday'],drop_first=True)


In [123]:
bike= pd.concat([bike,seasons,working_day,weather,month,week_day],axis=1)


In [124]:
# Dropping the categorical variables as they are already dummy-encoded.

bike.drop(['season','workingday','weathersit','weekday','month'],axis=1,inplace=True)


In [125]:
bike.columns

Index(['temp', 'humidity', 'windspeed', 'count', 'year', 'spring', 'summer',
       'winter', 'working_day', 'Light Snow', 'Mist', 'Aug', 'Dec', 'Feb',
       'Jan', 'July', 'June', 'Mar', 'May', 'Nov', 'Oct', 'Sep', 'Mon', 'Sat',
       'Sun', 'Thu', 'Tue', 'Wed'],
      dtype='object')

In [126]:
from sklearn.model_selection import train_test_split

np.random.seed(0)
df_train, df_test = train_test_split(bike, train_size = 0.7, test_size = 0.3, random_state = 100)


In [127]:
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()

In [128]:
# Apply scaler() to all the columns except the'dummy' variables.

num_vars=['temp','humidity','windspeed','count']

df_train[num_vars]= scaler.fit_transform(df_train[num_vars])


In [129]:
y_train = df_train.pop('count')
X_train = df_train


In [132]:
y_train

653    1.562685
576    1.399629
426   -0.215554
728   -1.379511
482   -0.136589
         ...   
526    1.116587
578    1.422703
53    -1.317467
350   -0.895981
79    -1.235426
Name: count, Length: 510, dtype: float64

In [133]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression


In [134]:
lm = LinearRegression()
lm.fit(X_train, y_train)

rfe = RFE(lm,10) # running RFE
rfe = rfe.fit(X_train, y_train)


In [135]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))


[('temp', True, 1),
 ('humidity', False, 10),
 ('windspeed', False, 8),
 ('year', True, 1),
 ('spring', True, 1),
 ('summer', False, 7),
 ('winter', False, 2),
 ('working_day', True, 1),
 ('Light Snow', True, 1),
 ('Mist', True, 1),
 ('Aug', False, 13),
 ('Dec', False, 4),
 ('Feb', False, 6),
 ('Jan', False, 5),
 ('July', True, 1),
 ('June', False, 17),
 ('Mar', False, 18),
 ('May', False, 9),
 ('Nov', False, 3),
 ('Oct', False, 15),
 ('Sep', True, 1),
 ('Mon', True, 1),
 ('Sat', False, 16),
 ('Sun', True, 1),
 ('Thu', False, 14),
 ('Tue', False, 12),
 ('Wed', False, 11)]

In [71]:
col = X_train.columns[rfe.support_]
col


Index(['temp', 'year', 'spring', 'working_day', 'Light Snow', 'Mist', 'July',
       'Sep', 'Mon', 'Sun'],
      dtype='object')

In [72]:
# Creating X_test dataframe with RFE selected variables
X_train_rfe = X_train[col]


In [73]:
import statsmodels.api as sm  
X_train_rfe = sm.add_constant(X_train_rfe)


In [74]:
lm = sm.OLS(y_train,X_train_rfe).fit()   # Running the linear model

In [75]:
lm.summary()


0,1,2,3
Dep. Variable:,count,R-squared:,0.816
Model:,OLS,Adj. R-squared:,0.812
Method:,Least Squares,F-statistic:,221.2
Date:,"Sat, 03 Oct 2020",Prob (F-statistic):,3.16e-176
Time:,11:01:07,Log-Likelihood:,-292.12
No. Observations:,510,AIC:,606.2
Df Residuals:,499,BIC:,652.8
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.5893,0.136,-4.340,0.000,-0.856,-0.323
temp,0.4029,0.027,14.765,0.000,0.349,0.457
year,1.0473,0.039,27.009,0.000,0.971,1.123
spring,-0.6760,0.057,-11.825,0.000,-0.788,-0.564
working_day,0.3710,0.134,2.777,0.006,0.109,0.633
Light Snow,-1.2971,0.116,-11.212,0.000,-1.524,-1.070
Mist,-0.3619,0.041,-8.739,0.000,-0.443,-0.281
July,-0.3136,0.082,-3.834,0.000,-0.474,-0.153
Sep,0.2874,0.074,3.884,0.000,0.142,0.433

0,1,2,3
Omnibus:,59.984,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,132.927
Skew:,-0.645,Prob(JB):,1.37e-29
Kurtosis:,5.143,Cond. No.,20.0


In [76]:
X_train1= X_train_rfe.drop('working_day',1)


In [77]:
X_train2= sm.add_constant(X_train1)
lm1 = sm.OLS(y_train,X_train2).fit() 


In [78]:
lm1.summary()


0,1,2,3
Dep. Variable:,count,R-squared:,0.813
Model:,OLS,Adj. R-squared:,0.81
Method:,Least Squares,F-statistic:,241.6
Date:,"Sat, 03 Oct 2020",Prob (F-statistic):,8.96e-176
Time:,11:01:18,Log-Likelihood:,-296.03
No. Observations:,510,AIC:,612.1
Df Residuals:,500,BIC:,654.4
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.2266,0.037,-6.058,0.000,-0.300,-0.153
temp,0.4038,0.027,14.701,0.000,0.350,0.458
year,1.0458,0.039,26.795,0.000,0.969,1.122
spring,-0.6851,0.057,-11.924,0.000,-0.798,-0.572
Light Snow,-1.2881,0.116,-11.064,0.000,-1.517,-1.059
Mist,-0.3589,0.042,-8.612,0.000,-0.441,-0.277
July,-0.3098,0.082,-3.763,0.000,-0.471,-0.148
Sep,0.2729,0.074,3.673,0.000,0.127,0.419
Mon,0.1012,0.057,1.771,0.077,-0.011,0.213

0,1,2,3
Omnibus:,63.627,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,141.16
Skew:,-0.68,Prob(JB):,2.23e-31
Kurtosis:,5.189,Cond. No.,7.58


In [80]:
X_train1= X_train1.drop('Sun',1)

In [81]:
X_train1.columns

Index(['const', 'temp', 'year', 'spring', 'Light Snow', 'Mist', 'July', 'Sep',
       'Mon'],
      dtype='object')

In [82]:
X_train2= sm.add_constant(X_train1)
lm1 = sm.OLS(y_train,X_train2).fit() 



In [83]:
lm1.summary()

0,1,2,3
Dep. Variable:,count,R-squared:,0.812
Model:,OLS,Adj. R-squared:,0.809
Method:,Least Squares,F-statistic:,271.0
Date:,"Sat, 03 Oct 2020",Prob (F-statistic):,1.5e-176
Time:,11:06:24,Log-Likelihood:,-297.1
No. Observations:,510,AIC:,612.2
Df Residuals:,501,BIC:,650.3
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.2152,0.037,-5.878,0.000,-0.287,-0.143
temp,0.4055,0.027,14.757,0.000,0.351,0.459
year,1.0448,0.039,26.745,0.000,0.968,1.122
spring,-0.6827,0.057,-11.874,0.000,-0.796,-0.570
Light Snow,-1.2814,0.116,-11.004,0.000,-1.510,-1.053
Mist,-0.3560,0.042,-8.542,0.000,-0.438,-0.274
July,-0.3105,0.082,-3.768,0.000,-0.472,-0.149
Sep,0.2736,0.074,3.678,0.000,0.127,0.420
Mon,0.0882,0.056,1.562,0.119,-0.023,0.199

0,1,2,3
Omnibus:,63.612,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,143.669
Skew:,-0.674,Prob(JB):,6.35e-32
Kurtosis:,5.223,Cond. No.,7.53


In [84]:
X_train1= X_train1.drop('Mon',1)

In [85]:
X_train1.columns

Index(['const', 'temp', 'year', 'spring', 'Light Snow', 'Mist', 'July', 'Sep'], dtype='object')

In [86]:
X_train2= sm.add_constant(X_train1)
lm1 = sm.OLS(y_train,X_train2).fit() 

In [87]:
lm1.summary()

0,1,2,3
Dep. Variable:,count,R-squared:,0.811
Model:,OLS,Adj. R-squared:,0.809
Method:,Least Squares,F-statistic:,308.5
Date:,"Sat, 03 Oct 2020",Prob (F-statistic):,2.7700000000000003e-177
Time:,11:07:06,Log-Likelihood:,-298.34
No. Observations:,510,AIC:,612.7
Df Residuals:,502,BIC:,646.6
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.2059,0.036,-5.693,0.000,-0.277,-0.135
temp,0.4055,0.028,14.739,0.000,0.351,0.460
year,1.0458,0.039,26.735,0.000,0.969,1.123
spring,-0.6811,0.058,-11.830,0.000,-0.794,-0.568
Light Snow,-1.2858,0.117,-11.029,0.000,-1.515,-1.057
Mist,-0.3513,0.042,-8.439,0.000,-0.433,-0.269
July,-0.3051,0.082,-3.701,0.000,-0.467,-0.143
Sep,0.2750,0.074,3.692,0.000,0.129,0.421

0,1,2,3
Omnibus:,59.714,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,140.528
Skew:,-0.622,Prob(JB):,3.0500000000000003e-31
Kurtosis:,5.251,Cond. No.,7.48


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif