# Bike Sharing prediction with Linear Regression

## Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV,ElasticNet , ElasticNetCV ,LinearRegression
from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport

In [2]:
df = pd.read_csv("day.csv")

In [3]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


### Checking rows and columns

In [4]:
df.shape

(731, 16)

### Checking for missing values

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


### Checking for any null values

In [6]:
df.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

### Converting dteday column to datetime type

In [7]:
df['dteday'] =  pd.to_datetime(df['dteday'],format='%Y-%m-%d')
df['dteday']

0     2011-01-01
1     2011-01-02
2     2011-01-03
3     2011-01-04
4     2011-01-05
         ...    
726   2012-12-27
727   2012-12-28
728   2012-12-29
729   2012-12-30
730   2012-12-31
Name: dteday, Length: 731, dtype: datetime64[ns]

### Creating a separate columns for year and month from dteday column

In [8]:
df['year'] = pd.DatetimeIndex(df['dteday']).year
df['month'] = pd.DatetimeIndex(df['dteday']).month

In [9]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,year,month
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985,2011,1
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801,2011,1
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,2011,1
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562,2011,1
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600,2011,1


### Dropping yr and mnth column as we extracted year and month from dteday column which is more accurate

In [10]:
df.drop(columns=['yr','mnth'],inplace = True)

In [11]:
df.head()

Unnamed: 0,instant,dteday,season,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,year,month
0,1,2011-01-01,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985,2011,1
1,2,2011-01-02,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801,2011,1
2,3,2011-01-03,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,2011,1
3,4,2011-01-04,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562,2011,1
4,5,2011-01-05,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600,2011,1


### Dropping the excess variable holiday as the workingday section covers sufficient data that is required.

In [12]:
df.drop(columns=['holiday'],inplace=True)

### Dropping the dteday as we have year and month, also we cannot work on non numerical columns and instant column is irrelevant column

In [13]:
df.drop(columns=['instant','dteday'],inplace=True)

In [14]:
df.head()

Unnamed: 0,season,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,year,month
0,1,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985,2011,1
1,1,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801,2011,1
2,1,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,2011,1
3,1,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562,2011,1
4,1,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600,2011,1


### Renaming columns 

In [15]:
df.rename(columns={'hum':'humidity','cnt':'count'},inplace=True)

In [16]:
df.head()

Unnamed: 0,season,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count,year,month
0,1,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985,2011,1
1,1,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801,2011,1
2,1,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,2011,1
3,1,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562,2011,1
4,1,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600,2011,1


### Converting to categorical columns

In [17]:
labels = {1:'winter',2:'spring',3:'summer',4:'fall'}
df['season'] = df['season'].map(labels)

In [18]:
labels = {1:'Clear',2:'Mist',3:'Light Snow',4:'Heavy Rain'}
df['weathersit'] = df['weathersit'].map(labels)

In [19]:
labels = {1:'Working_day',0:'Holiday'}
df['workingday'] = df['workingday'].map(labels)

### converting to numerical column

In [20]:
labels = {2012:1,2011:0}
df['year'] = df['year'].map(labels)

### Converting month into categorical feature

In [21]:
labels = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'June',7:'July',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
df['month'] = df['month'].map(labels)

### Converting weekday into categorical feature

In [22]:
labels = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df['weekday'] = df['weekday'].map(labels)

### Now the dataset should look more meaningful

In [23]:
df.head()

Unnamed: 0,season,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count,year,month
0,winter,Sun,Holiday,Mist,0.344167,0.363625,0.805833,0.160446,331,654,985,0,Jan
1,winter,Mon,Holiday,Mist,0.363478,0.353739,0.696087,0.248539,131,670,801,0,Jan
2,winter,Tue,Working_day,Clear,0.196364,0.189405,0.437273,0.248309,120,1229,1349,0,Jan
3,winter,Wed,Working_day,Clear,0.2,0.212122,0.590435,0.160296,108,1454,1562,0,Jan
4,winter,Thu,Working_day,Clear,0.226957,0.22927,0.436957,0.1869,82,1518,1600,0,Jan


### Visualizing and stats info

In [24]:
pf = ProfileReport(df)
pf.to_widgets()

Summarize dataset:   0%|          | 0/26 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### To avoid multicollinearity
### Dropping atemp, casual, registered

In [25]:
df.drop(columns=['atemp', 'casual', 'registered'],inplace = True)

In [26]:
df.head()

Unnamed: 0,season,weekday,workingday,weathersit,temp,humidity,windspeed,count,year,month
0,winter,Sun,Holiday,Mist,0.344167,0.805833,0.160446,985,0,Jan
1,winter,Mon,Holiday,Mist,0.363478,0.696087,0.248539,801,0,Jan
2,winter,Tue,Working_day,Clear,0.196364,0.437273,0.248309,1349,0,Jan
3,winter,Wed,Working_day,Clear,0.2,0.590435,0.160296,1562,0,Jan
4,winter,Thu,Working_day,Clear,0.226957,0.436957,0.1869,1600,0,Jan


### Creating dummy indicators for season, weekday, workingday, weathersit, month

In [27]:
seasons = pd.get_dummies(df['season'],drop_first=True)
week_day = pd.get_dummies(df['weekday'],drop_first=True)
working_day = pd.get_dummies(df['workingday'],drop_first=True)
weather = pd.get_dummies(df['weathersit'],drop_first=True)
month = pd.get_dummies(df['month'],drop_first=True)

In [28]:
df = pd.concat([df,seasons,working_day,weather,month,week_day],axis=1)
df.head()

Unnamed: 0,season,weekday,workingday,weathersit,temp,humidity,windspeed,count,year,month,...,May,Nov,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed
0,winter,Sun,Holiday,Mist,0.344167,0.805833,0.160446,985,0,Jan,...,0,0,0,0,0,0,1,0,0,0
1,winter,Mon,Holiday,Mist,0.363478,0.696087,0.248539,801,0,Jan,...,0,0,0,0,1,0,0,0,0,0
2,winter,Tue,Working_day,Clear,0.196364,0.437273,0.248309,1349,0,Jan,...,0,0,0,0,0,0,0,0,1,0
3,winter,Wed,Working_day,Clear,0.2,0.590435,0.160296,1562,0,Jan,...,0,0,0,0,0,0,0,0,0,1
4,winter,Thu,Working_day,Clear,0.226957,0.436957,0.1869,1600,0,Jan,...,0,0,0,0,0,0,0,1,0,0


### Dealing with categorical columns(season, weekday, workingday, weathersit, month) as we have dummy 
### indicators for each of these variables

In [29]:
df.drop(columns=['season', 'weekday', 'workingday', 'weathersit', 'month'],inplace=True)

In [30]:
df.head()

Unnamed: 0,temp,humidity,windspeed,count,year,spring,summer,winter,Working_day,Light Snow,...,May,Nov,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed
0,0.344167,0.805833,0.160446,985,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.363478,0.696087,0.248539,801,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.196364,0.437273,0.248309,1349,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0.2,0.590435,0.160296,1562,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0.226957,0.436957,0.1869,1600,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


### Normalization applied other than the dummy variables

In [31]:
x = ['temp','humidity','windspeed','count']
x

['temp', 'humidity', 'windspeed', 'count']

### Normalization as the columns temp, humidity and windspeed are already normalized between 0 and 1
### And count is not normalized, hence we normalize it along with the other columns

In [32]:
normalized = MinMaxScaler()
df[x]= normalized.fit_transform(df[x])

### Now the data is on the same Scale

In [33]:
df.head()

Unnamed: 0,temp,humidity,windspeed,count,year,spring,summer,winter,Working_day,Light Snow,...,May,Nov,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed
0,0.35517,0.82862,0.284606,0.110792,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.379232,0.715771,0.466215,0.089623,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.171,0.449638,0.46574,0.152669,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0.17553,0.607131,0.284297,0.177174,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0.20912,0.449313,0.339143,0.181546,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


### Dropping target variable

In [34]:
x = df.drop(columns=['count'])
x

Unnamed: 0,temp,humidity,windspeed,year,spring,summer,winter,Working_day,Light Snow,Mist,...,May,Nov,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed
0,0.355170,0.828620,0.284606,0,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,0.379232,0.715771,0.466215,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.171000,0.449638,0.465740,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0.175530,0.607131,0.284297,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.209120,0.449313,0.339143,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,0.243025,0.671380,0.675656,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
727,0.241986,0.606684,0.274350,1,0,0,1,1,0,1,...,0,0,0,0,0,1,0,0,0,0
728,0.241986,0.774208,0.210260,1,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
729,0.245101,0.497001,0.676936,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [35]:
y = df['count']
y

0      0.110792
1      0.089623
2      0.152669
3      0.177174
4      0.181546
         ...   
726    0.240681
727    0.353543
728    0.151749
729    0.204096
730    0.311436
Name: count, Length: 731, dtype: float64

In [36]:
arr = x.values
arr

array([[0.3551696 , 0.82862005, 0.2846062 , ..., 0.        , 0.        ,
        0.        ],
       [0.37923205, 0.71577069, 0.46621455, ..., 0.        , 0.        ,
        0.        ],
       [0.1709998 , 0.44963805, 0.4657404 , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.24198597, 0.77420771, 0.21026043, ..., 0.        , 0.        ,
        0.        ],
       [0.2451011 , 0.49700051, 0.67693615, ..., 0.        , 0.        ,
        0.        ],
       [0.19525913, 0.59383033, 0.27306151, ..., 0.        , 1.        ,
        0.        ]])

### Feature Selection using Variance Inflation Factor

In [37]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()

In [38]:
vif_df['vif'] = [variance_inflation_factor(arr,i) for i in range(arr.shape[1])]

In [39]:
vif_df['feature'] = x.columns

In [40]:
vif_df

Unnamed: 0,vif,feature
0,39.626155,temp
1,39.601188,humidity
2,6.433388,windspeed
3,2.086738,year
4,9.358149,spring
5,10.486915,summer
6,8.625784,winter
7,20.475972,Working_day
8,1.342106,Light Snow
9,2.429744,Mist


In [41]:
x = df.drop(columns=['temp','humidity','Working_day','summer','count'])
x

Unnamed: 0,windspeed,year,spring,winter,Light Snow,Mist,Aug,Dec,Feb,Jan,...,May,Nov,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed
0,0.284606,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,0.466215,0,0,1,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.465740,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0.284297,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0.339143,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,0.675656,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
727,0.274350,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
728,0.210260,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
729,0.676936,1,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [42]:
arr = x.values
arr

array([[0.2846062 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.46621455, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.4657404 , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.21026043, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.67693615, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.27306151, 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ]])

In [43]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()
vif_df['vif'] = [variance_inflation_factor(arr,i) for i in range(arr.shape[1])]
vif_df['feature'] = x.columns
vif_df

Unnamed: 0,vif,feature
0,5.607441,windspeed
1,1.980372,year
2,4.127332,spring
3,7.64334,winter
4,1.111839,Light Snow
5,1.594072,Mist
6,1.778474,Aug
7,1.870574,Dec
8,3.438366,Feb
9,3.653071,Jan


### Splitting the dataset into training and testing sets.

In [44]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.20,random_state=100)

In [45]:
lr = LinearRegression()

In [46]:
lr.fit(x_train,y_train)

LinearRegression()

In [47]:
arr[0]

array([0.2846062, 0.       , 0.       , 1.       , 0.       , 1.       ,
       0.       , 0.       , 0.       , 1.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 1.       , 0.       , 0.       , 0.       ])

### Trying to predict the value of y 

In [48]:
lr.predict([arr[0]])

array([0.10122911])

### Actual Y value
### y_pred = 0.101 and y_actual = 0.110

In [49]:
y[0] 

0.11079153244362633

### Using R Square metrics to find out the accuracy.

In [50]:
score = lr.score(x_test,y_test)

In [51]:
print("Score : {:.2f} %".format(score*100))

Score : 75.28 %


### Comparing models

In [53]:
df.columns

Index(['temp', 'humidity', 'windspeed', 'count', 'year', 'spring', 'summer',
       'winter', 'Working_day', 'Light Snow', 'Mist', 'Aug', 'Dec', 'Feb',
       'Jan', 'July', 'June', 'Mar', 'May', 'Nov', 'Oct', 'Sep', 'Mon', 'Sat',
       'Sun', 'Thu', 'Tue', 'Wed'],
      dtype='object')

In [54]:
df.rename(columns={'Light Snow':'Light_snow'},inplace=True)

In [55]:
df.head()

Unnamed: 0,temp,humidity,windspeed,count,year,spring,summer,winter,Working_day,Light_snow,...,May,Nov,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed
0,0.35517,0.82862,0.284606,0.110792,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.379232,0.715771,0.466215,0.089623,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.171,0.449638,0.46574,0.152669,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0.17553,0.607131,0.284297,0.177174,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0.20912,0.449313,0.339143,0.181546,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


### Ordinal Least Square Method (OLS) 

In [56]:
import statsmodels.formula.api as smf
lm2 = smf.ols(formula='count~temp+humidity+windspeed+year+spring+summer+winter+Working_day+Light_snow+Mist+Aug+Dec+Feb+Jan+July+June+Mar+May+Nov+Oct+Sep+Mon+Sat+Sun+Thu+Tue+Wed',data=df).fit()
lm2.summary()

0,1,2,3
Dep. Variable:,count,R-squared:,0.848
Model:,OLS,Adj. R-squared:,0.842
Method:,Least Squares,F-statistic:,145.3
Date:,"Thu, 04 Nov 2021",Prob (F-statistic):,1.49e-266
Time:,17:49:08,Log-Likelihood:,749.27
No. Observations:,731,AIC:,-1443.0
Df Residuals:,703,BIC:,-1314.0
Df Model:,27,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4071,0.045,9.100,0.000,0.319,0.495
temp,0.4143,0.038,10.896,0.000,0.340,0.489
humidity,-0.1699,0.033,-5.196,0.000,-0.234,-0.106
windspeed,-0.1633,0.023,-7.202,0.000,-0.208,-0.119
year,0.2322,0.007,34.660,0.000,0.219,0.245
spring,-0.0793,0.024,-3.248,0.001,-0.127,-0.031
summer,-0.0859,0.022,-3.901,0.000,-0.129,-0.043
winter,-0.1817,0.021,-8.722,0.000,-0.223,-0.141
Working_day,0.0694,0.021,3.352,0.001,0.029,0.110

0,1,2,3
Omnibus:,123.801,Durbin-Watson:,1.2
Prob(Omnibus):,0.0,Jarque-Bera (JB):,340.628
Skew:,-0.852,Prob(JB):,1.08e-74
Kurtosis:,5.877,Cond. No.,41.2


### we need to remove p > 0.05 as those doesnt contribute more and also has no significance
### Aug, Feb, Jan, July, June, Mar, May, Oct, Mon, Sat, Thu, Tue, Wed

In [57]:
df1 = df.drop(columns= ['Aug','Feb','Jan','July','June','Mar','May','Oct','Mon','Sat','Thu','Tue','Wed']).copy()
df1

Unnamed: 0,temp,humidity,windspeed,count,year,spring,summer,winter,Working_day,Light_snow,Mist,Dec,Nov,Sep,Sun
0,0.355170,0.828620,0.284606,0.110792,0,0,0,1,0,0,1,0,0,0,1
1,0.379232,0.715771,0.466215,0.089623,0,0,0,1,0,0,1,0,0,0,0
2,0.171000,0.449638,0.465740,0.152669,0,0,0,1,1,0,0,0,0,0,0
3,0.175530,0.607131,0.284297,0.177174,0,0,0,1,1,0,0,0,0,0,0
4,0.209120,0.449313,0.339143,0.181546,0,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,0.243025,0.671380,0.675656,0.240681,1,0,0,1,1,0,1,1,0,0,0
727,0.241986,0.606684,0.274350,0.353543,1,0,0,1,1,0,1,1,0,0,0
728,0.241986,0.774208,0.210260,0.151749,1,0,0,1,0,0,1,1,0,0,1
729,0.245101,0.497001,0.676936,0.204096,1,0,0,1,0,0,0,1,0,0,0


In [58]:
df1.columns

Index(['temp', 'humidity', 'windspeed', 'count', 'year', 'spring', 'summer',
       'winter', 'Working_day', 'Light_snow', 'Mist', 'Dec', 'Nov', 'Sep',
       'Sun'],
      dtype='object')

In [59]:
import statsmodels.formula.api as smf
lm2 = smf.ols(formula='count~temp+humidity+windspeed+year+spring+summer+winter+Working_day+Light_snow+Mist+Dec+Nov+Sep+Sun',data=df1).fit()
lm2.summary()

0,1,2,3
Dep. Variable:,count,R-squared:,0.84
Model:,OLS,Adj. R-squared:,0.837
Method:,Least Squares,F-statistic:,268.3
Date:,"Thu, 04 Nov 2021",Prob (F-statistic):,1.6300000000000002e-273
Time:,17:49:12,Log-Likelihood:,730.17
No. Observations:,731,AIC:,-1430.0
Df Residuals:,716,BIC:,-1361.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4020,0.028,14.205,0.000,0.346,0.458
temp,0.4423,0.029,15.476,0.000,0.386,0.498
humidity,-0.1644,0.032,-5.149,0.000,-0.227,-0.102
windspeed,-0.1585,0.023,-6.965,0.000,-0.203,-0.114
year,0.2317,0.007,34.215,0.000,0.218,0.245
spring,-0.0664,0.012,-5.493,0.000,-0.090,-0.043
summer,-0.1133,0.014,-8.014,0.000,-0.141,-0.086
winter,-0.2019,0.013,-15.501,0.000,-0.227,-0.176
Working_day,0.0477,0.009,5.278,0.000,0.030,0.065

0,1,2,3
Omnibus:,113.241,Durbin-Watson:,1.164
Prob(Omnibus):,0.0,Jarque-Bera (JB):,281.884
Skew:,-0.815,Prob(JB):,6.1599999999999994e-62
Kurtosis:,5.568,Cond. No.,21.9


In [60]:
score = lm2.rsquared
print("R Square Score : {:.2f} %".format(score*100))
score = lm2.rsquared_adj
print("Adjusted R Square Score : {:.2f} %".format(score*100))

R Square Score : 83.99 %
Adjusted R Square Score : 83.68 %


In [61]:
df1.head()

Unnamed: 0,temp,humidity,windspeed,count,year,spring,summer,winter,Working_day,Light_snow,Mist,Dec,Nov,Sep,Sun
0,0.35517,0.82862,0.284606,0.110792,0,0,0,1,0,0,1,0,0,0,1
1,0.379232,0.715771,0.466215,0.089623,0,0,0,1,0,0,1,0,0,0,0
2,0.171,0.449638,0.46574,0.152669,0,0,0,1,1,0,0,0,0,0,0
3,0.17553,0.607131,0.284297,0.177174,0,0,0,1,1,0,0,0,0,0,0
4,0.20912,0.449313,0.339143,0.181546,0,0,0,1,1,0,0,0,0,0,0


In [62]:
pf1 = ProfileReport(df1)
pf1.to_widgets()

Summarize dataset:   0%|          | 0/28 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### Temp and count has high corelation

In [63]:
x1 = df1.drop(columns=['count'])
x1

Unnamed: 0,temp,humidity,windspeed,year,spring,summer,winter,Working_day,Light_snow,Mist,Dec,Nov,Sep,Sun
0,0.355170,0.828620,0.284606,0,0,0,1,0,0,1,0,0,0,1
1,0.379232,0.715771,0.466215,0,0,0,1,0,0,1,0,0,0,0
2,0.171000,0.449638,0.465740,0,0,0,1,1,0,0,0,0,0,0
3,0.175530,0.607131,0.284297,0,0,0,1,1,0,0,0,0,0,0
4,0.209120,0.449313,0.339143,0,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,0.243025,0.671380,0.675656,1,0,0,1,1,0,1,1,0,0,0
727,0.241986,0.606684,0.274350,1,0,0,1,1,0,1,1,0,0,0
728,0.241986,0.774208,0.210260,1,0,0,1,0,0,1,1,0,0,1
729,0.245101,0.497001,0.676936,1,0,0,1,0,0,0,1,0,0,0


In [64]:
y1 = df1['count']
y1

0      0.110792
1      0.089623
2      0.152669
3      0.177174
4      0.181546
         ...   
726    0.240681
727    0.353543
728    0.151749
729    0.204096
730    0.311436
Name: count, Length: 731, dtype: float64

In [65]:
arr1 = x1.values
arr1

array([[0.3551696 , 0.82862005, 0.2846062 , ..., 0.        , 0.        ,
        1.        ],
       [0.37923205, 0.71577069, 0.46621455, ..., 0.        , 0.        ,
        0.        ],
       [0.1709998 , 0.44963805, 0.4657404 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.24198597, 0.77420771, 0.21026043, ..., 0.        , 0.        ,
        1.        ],
       [0.2451011 , 0.49700051, 0.67693615, ..., 0.        , 0.        ,
        0.        ],
       [0.19525913, 0.59383033, 0.27306151, ..., 0.        , 0.        ,
        0.        ]])

In [66]:
x_train, x_test, y_train, y_test = train_test_split(arr1,y1,test_size=0.20,random_state=100)

### Checking whether our model has consistency or not
### Regularization and cross validation

In [67]:
lassocv = LassoCV(alphas=None,cv=50,max_iter=200000,normalize=True)
lassocv.fit(x_train,y_train)

LassoCV(cv=50, max_iter=200000, normalize=True)

In [68]:
lassocv.alpha_

5.94032207717815e-06

In [69]:
lasso = Lasso(alpha=lassocv.alpha_)
lasso.fit(x_train,y_train)

Lasso(alpha=5.94032207717815e-06)

In [70]:
score = lasso.score(x_test,y_test)
print("Lasso Score : {:.2f} %".format(score*100))

Lasso Score : 78.50 %


In [71]:
ridgecv = RidgeCV(alphas=np.random.uniform(0,10,50),cv = 10 , normalize=True)
ridgecv.fit(x_train,y_train)

RidgeCV(alphas=array([6.1826447 , 6.82301164, 3.251469  , 4.26034477, 9.63628454,
       8.9081008 , 6.35995852, 3.23274121, 7.00307435, 5.31919539,
       7.08400884, 8.66696997, 1.36568887, 5.64283504, 0.24948312,
       4.50973683, 1.64935289, 0.49418116, 1.14848407, 3.98661939,
       0.63664103, 1.96445663, 3.08656749, 5.92421942, 6.30069747,
       0.84409234, 1.29569883, 3.97766383, 2.01516456, 5.63959242,
       7.39089079, 5.68737152, 4.50452416, 9.12513873, 3.35713583,
       0.83526769, 2.51039865, 0.24590512, 4.97973356, 6.85373448,
       7.11072621, 5.15725045, 2.82770932, 7.17321284, 1.88227332,
       2.41966093, 8.07988389, 7.01130004, 8.3125676 , 7.71546657]),
        cv=10, normalize=True)

In [72]:
ridgecv.alpha_

0.24590511614445365

In [73]:
ridge_lr = Ridge(alpha=ridgecv.alpha_)
ridge_lr.fit(x_train,y_train)

Ridge(alpha=0.24590511614445365)

In [74]:
score = ridge_lr.score(x_test,y_test)
print("Lasso Score : {:.2f} %".format(score*100))

Lasso Score : 78.70 %


In [75]:
elastic= ElasticNetCV(alphas=None, cv = 10)
elastic.fit(x_train,y_train)

ElasticNetCV(cv=10)

In [76]:
elastic.alpha_

0.0001273884940194303

In [77]:
elastic.l1_ratio_

0.5

In [78]:
elastic_lr = ElasticNet(alpha=elastic.alpha_ , l1_ratio=elastic.l1_ratio_)

In [79]:
elastic_lr.fit(x_train,y_train)

ElasticNet(alpha=0.0001273884940194303)

In [80]:
score = elastic_lr.score(x_test,y_test)
print("Lasso Score : {:.2f} %".format(score*100))

Lasso Score : 78.63 %


### Optimization using Stochastic Gradient Descent (SGD).

In [81]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

sgdr = SGDRegressor(random_state=1, penalty=None)
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter': [10000, 20000, 30000, 40000]}

gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(x_train, y_train)

results = pd.DataFrame.from_dict(gd_sr.cv_results_)
print("Cross-validation results:\n", results)


Cross-validation results:
     mean_fit_time  std_fit_time  mean_score_time  std_score_time param_eta0  \
0        0.004586  2.720663e-03         0.000199        0.000399     0.0001   
1        0.001995  2.336015e-07         0.000798        0.000399     0.0001   
2        0.002397  4.933608e-04         0.000599        0.000489     0.0001   
3        0.002789  3.974562e-04         0.000399        0.000489     0.0001   
4        0.001596  4.884420e-04         0.000399        0.000489      0.001   
5        0.000997  1.784161e-07         0.000399        0.000489      0.001   
6        0.001197  3.990650e-04         0.000399        0.000489      0.001   
7        0.001595  4.885193e-04         0.000399        0.000489      0.001   
8        0.000997  2.780415e-07         0.000598        0.000488       0.01   
9        0.000997  2.780415e-07         0.000199        0.000399       0.01   
10       0.000997  2.780415e-07         0.000199        0.000399       0.01   
11       0.000997  1.1680

### Mean cross-validated score of the best_estimator

In [84]:
best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_  
print("Best result: ", best_result)

best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

Best parameters:  {'eta0': 0.1, 'max_iter': 10000}
Best result:  0.8164602099875449
Intercept:  [0.28671983]


In [85]:
print(pd.DataFrame(zip(x1.columns, best_model.coef_), columns=['Features', 'Coefficients']).sort_values(by=['Coefficients'], ascending=False))

       Features  Coefficients
0          temp      0.352194
3          year      0.239839
12          Sep      0.071406
13          Sun      0.063826
7   Working_day      0.050994
1      humidity      0.024662
4        spring     -0.022411
11          Nov     -0.034685
10          Dec     -0.037041
5        summer     -0.053152
9          Mist     -0.088130
2     windspeed     -0.117133
6        winter     -0.190485
8    Light_snow     -0.258561
