In [None]:
# Import necessary Python libraries
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import statsmodels.api as sm
import plotly.express as px

### Step 1:  EDA Process starts -  Inspecting the Dataframe for understanding provided data

In [None]:
# Load the daily bike-sharing data into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so other readers can re-run it.
DATA_PATH = "E:/IIITB_Upgrad_AI_ML_Course/BikeLinearRegression/day.csv"
df = pd.read_csv(DATA_PATH)
df.head(5)

In [None]:
#Checking size of the data

df.shape

In [None]:
#Checking data types of columns and null value analysis if any

df.info()

##### Observation : All data points are non-null and therefore do not require any imputation.

In [None]:
# Creating a real copy of the original dataframe for the duplicate check.
# A plain assignment (df_dup_check = df) would only alias the same object,
# so dropping duplicates in place would silently modify `df` as well.
df_dup_check = df.copy()

# Dropping fully duplicated rows from the copy only; if the resulting shape
# equals df.shape, the dataset contains no duplicate rows.
df_dup_check = df_dup_check.drop_duplicates(subset=None)
df_dup_check.shape

##### Observation: The shape after dropping duplicates is the same as that of the original dataframe. Hence we can conclude that there were no duplicate rows in the dataset.

In [None]:
#Checking the spread of numerical columns

df.describe()

#### Performing numerical and categorical analysis on the data

In [None]:
# Pair plot for the numerical variables.
# The column list is called `numeric_cols` rather than `vars`, so the Python
# builtin vars() is not shadowed for the rest of the kernel session.

numeric_cols = ["cnt", "temp", "atemp", "hum", "windspeed", "casual", "registered"]
sns.pairplot(df[numeric_cols])
plt.show()

In [None]:
# Box plots of the target 'cnt' against each categorical variable,
# laid out left-to-right, top-to-bottom on a 3x3 grid (six panels used).

categorical_cols = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'weathersit']

plt.figure(figsize=(20, 15))
for panel_no, cat_col in enumerate(categorical_cols, start=1):
    plt.subplot(3, 3, panel_no)
    sns.boxplot(x=cat_col, y='cnt', data=df)
plt.show()

#### Observations from EDA of categorical and numerical variables:
###### Season - We can notice a positive trend in the number of customers in 2 - Summer, 3 - Fall and 4 - Winter seasons
###### Year - The overall business shows an increasing trend in its user base year on year
###### Month - Similar to the season trend, there is a positive trend in the months of summer, fall and winter.
###### Holiday : On holidays, the users show a wider spread in the counts. On normal days, there are more users than on holidays
###### Weekday : Weekdays or weekends do not show any specific trend here.
###### Weathersit : Clearer weather shows a positive trend in the number of bike users
- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds

In [None]:
# Visualizing correlations among the variables using a heatmap.
# Only numeric columns are correlated: at this point `df` still contains the
# text column 'dteday', and DataFrame.corr() raises on non-numeric columns
# in recent pandas versions (older versions silently skipped them).

plt.figure(figsize=(20, 12))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='viridis')
plt.show()

#### EDA Conclusions:

###### Based on the high level analysis of the data and the data dictionary, the following variables can be removed for further analysis:
###### instant: It is only an index value
###### dteday: This has the date, Since we already have separate columns for 'year' & 'month' we could live without this column
###### casual : Count of bike booked by different categories of customers. 

###### From the pairplot as well as the correlation heatmap, we can infer that the total bike rental value is 'cnt' = 'casual' + 'registered'. Since our objective is to predict the total count of bikes and not the count for a specific category, neither column will be used as a raw predictor: 'casual' is dropped now, and 'registered' is dropped later, after it has been binned.

In [None]:
# Remove the row index, the raw date and the casual-user count identified above
unwanted_cols = ['instant', 'dteday', 'casual']
df.drop(columns=unwanted_cols, inplace=True)
df.shape

In [None]:
# Correlation heatmap of the remaining columns after the drop above

corr_matrix = df.corr()
plt.figure(figsize=(20, 12))
sns.heatmap(corr_matrix, annot=True, cmap='viridis')
plt.show()

#### Observations:
###### Before continuing with statistical significance testing, the top 3 independent variables most strongly correlated with bike demand ('cnt') are:

##### 1st : temp/atemp (+0.63)
##### 2nd : yr (+0.57)
##### 3rd : season (+0.4)

### Step 2 : Data Preparation 

#### Convert categorical variables into dummy variables

#### List of categorical variables:

#### season : 1:spring, 2:summer, 3:fall, 4:winter
#### yr: 0: 2018, 1:2019
#### months : 1 to 12
#### holiday : 0 and 1
#### workingday : 0 and 1
#### weekday : 0 to 6
#### weathersit : 1, 2, 3 and 4


### Observation:
#### From the list of columns we do not need to convert yr, holiday and workingday since they are already in a binary format.

#### Converting season into dummy variables

In [None]:
# Replace the numeric season codes with readable labels.
# Note: re-running this cell on already-mapped data yields NaN, because the
# labels are not keys of the mapping.

season_labels = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
df['season'] = df['season'].map(season_labels)
df['season'].value_counts()

In [None]:
# Getting dummy variables for season.
# drop_first=True drops the first level, so k categories give k-1 dummies.
# dtype=int keeps the dummies numeric: recent pandas versions return bool
# columns by default, which statsmodels OLS and the VIF calculation cannot
# combine with the float features (the design matrix becomes object dtype).

season = pd.get_dummies(df.season, drop_first=True, dtype=int)

In [None]:
# Checking dummy variable for season

season.describe()

In [None]:
# Append the season dummy columns to the working dataframe (column-wise)

df = pd.concat([df, season], axis="columns")
df.head()

#### Converting weathersit into dummy variables

In [None]:
# checking weathersit column codes with their descriptions

df.weathersit.value_counts()

#### We do not have any data points for type 4 weather. We can create only two dummy variables.

In [None]:
# Replace the numeric weathersit codes with short labels.
# NOTE(review): code 3 is described in the notes above as light snow / light
# rain; the label 'cloudy' is kept because it becomes a dummy column name.

weather_labels = {1: 'clear', 2: 'misty', 3: 'cloudy'}
df['weathersit'] = df['weathersit'].map(weather_labels)

In [None]:
# Checking dummy variable for weathersit

df.weathersit.value_counts()

In [None]:
# Getting dummy variables for weathersit.
# drop_first=True drops the first level, so k categories give k-1 dummies.
# dtype=int keeps the dummies numeric: recent pandas versions return bool
# columns by default, which statsmodels OLS and the VIF calculation cannot
# combine with the float features (the design matrix becomes object dtype).

weathersit = pd.get_dummies(df.weathersit, drop_first=True, dtype=int)

In [None]:
# Checking dummy variable for weathersit

weathersit.describe()

In [None]:
# Append the weathersit dummy columns to the working dataframe (column-wise)

df = pd.concat([df, weathersit], axis="columns")
df.head()

##### Converting month into dummy variables

In [None]:
# Replace the month numbers (1-12) with three-letter month labels

month_labels = {
    1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun',
    7: 'jul', 8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec',
}
df['mnth'] = df['mnth'].map(month_labels)

In [None]:
df.mnth.value_counts()

In [None]:
# Getting dummy variables for month.
# drop_first=True drops the first level, so 12 months give 11 dummies.
# dtype=int keeps the dummies numeric: recent pandas versions return bool
# columns by default, which statsmodels OLS and the VIF calculation cannot
# combine with the float features (the design matrix becomes object dtype).

mnth = pd.get_dummies(df.mnth, drop_first=True, dtype=int)
mnth.head()

In [None]:
# Append the month dummy columns to the working dataframe (column-wise)

df = pd.concat([df, mnth], axis="columns")
df.head()

#### Converting weekday into dummy variables

In [None]:
# Replace the weekday codes (0-6) with day labels.
# NOTE(review): this assumes code 0 is Monday; confirm against the data
# dictionary, since the labels become dummy column names used later.

weekday_labels = {0: 'mon', 1: 'tues', 2: 'wed', 3: 'thurs', 4: 'fri', 5: 'sat', 6: 'sun'}
df['weekday'] = df['weekday'].map(weekday_labels)

In [None]:
df.weekday.value_counts()

In [None]:
# Getting dummy variables for weekday.
# We do not need all 7 days: drop_first=True drops one, leaving 6 dummies.
# dtype=int keeps the dummies numeric: recent pandas versions return bool
# columns by default, which statsmodels OLS and the VIF calculation cannot
# combine with the float features (the design matrix becomes object dtype).

weekday = pd.get_dummies(df.weekday, drop_first=True, dtype=int)
weekday.head()

In [None]:
# Append the weekday dummy columns to the working dataframe (column-wise)
df = pd.concat([df, weekday], axis="columns")
df.head()

#### We have now created dummy variables for all categorical variables in the data. Let's drop the original categorical columns from the dataset.

In [None]:
# Dropping the original season, weathersit, mnth and weekday columns,
# which are now represented by their dummy variables
encoded_cols = ['season', 'weathersit', 'mnth', 'weekday']
df.drop(encoded_cols, axis=1, inplace=True)

#### We will convert registered users into categorical variables by performing binning.


In [None]:
# Histogram (15 bins) of the registered-user counts

plt.hist(df['registered'], bins=15)
plt.show()

In [None]:
#Checking quantiles of registered variables in 5 bins

df.registered.quantile([0.2, 0.4, 0.6, 0.8, 1])

In [None]:
# Label each day by the quintile of its registered-user count.
# `bins` holds the five labels, ordered from the lowest to the highest quintile.

bins = ['very low', 'low', 'medium', 'high', 'very high']
quintile_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
df['registered_bin'] = pd.qcut(df['registered'], q=quintile_edges, labels=bins)

In [None]:
# Checking the spread of data in the bins

df['registered_bin'].value_counts()

In [None]:
# Getting dummy variables for registered_bin.
# drop_first=True drops the first level, so the 5 bins give 4 dummies.
# dtype=int keeps the dummies numeric: recent pandas versions return bool
# columns by default, which statsmodels OLS and the VIF calculation cannot
# combine with the float features (the design matrix becomes object dtype).

registered_bin = pd.get_dummies(df.registered_bin, drop_first=True, dtype=int)
registered_bin.head()

In [None]:
# Append the registered-bin dummy columns to the working dataframe (column-wise)

df = pd.concat([df, registered_bin], axis="columns")
df.head()

##### Dropping the redundant 'atemp' column and the 'registered' / 'registered_bin' columns from the dataset.

In [None]:
# 'atemp' is dropped because the similar variable 'temp' is kept.
# 'registered' and 'registered_bin' are dropped because the bin dummies have
# already been added and 'cnt' is the only target.

redundant_cols = ['atemp', 'registered', 'registered_bin']
df.drop(columns=redundant_cols, inplace=True)

In [None]:
# Checking the fields in the dataset after data preparation

df.head()

#### Step 3: Splitting the data into test and train datasets

In [None]:
np.random.seed(0)

In [None]:
# 70/30 train/test split; random_state is fixed so the split is reproducible
df_train, df_test = train_test_split(df, train_size=0.7, test_size=0.3, random_state=50)

In [None]:
# Checking the shape of train dataset

df_train.shape

In [None]:
df_train.describe()

In [None]:
# Checking the shape of test  dataset

df_test.shape

In [None]:
df_test.describe()

### Observation:
####  Based on the 70% - 30% split between train and test dataset we have 510 rows in train dataset and 219 in test dataset

#### Step 4: Rescaling the feature variables

#### From the data, we can see that temp, hum and windspeed have larger, decimal values compared to the other features. We can normalize these variables using the MinMax method so that all values lie within the range 0 to 1.

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

In [None]:
# Applying fit_transform to normalize the temp, hum and windspeed features
# and the target cnt; the scaler learns its min/max from the training data here

rescalevar=['temp','hum','windspeed','cnt']
df_train[rescalevar]=scaler.fit_transform(df_train[rescalevar])

In [None]:
# Checking minimum and maximum values of the normalized variables

df_train.describe()

#### Step 5: Checking linearity and correlation in the train dataset after rescaling and dummy field conversion

In [None]:
# Correlation heatmap of the scaled training data (features and target)

train_corr = df_train.corr()

plt.figure(figsize=(30, 20))
sns.heatmap(train_corr, annot=True, cmap="YlGnBu", annot_kws={"fontsize": 14})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()

#### Observation:
##### We can see that temperature is more strongly correlated with the user count than any other variable, with a coefficient of 0.64. We will first try a regression model using temp as the only predictor. We will not use the registered-bin variables yet; they are kept aside as an experimental addition.

#### Step 6: Building our MLR model
##### Model 1: Using a single selected variable - temp

In [None]:
# import statmodels for our MLR

import statsmodels.api as sm

In [None]:
# Creating our feature and target variable datasets in y_train and X_train.
# pop() removes 'cnt' from df_train in place, so this cell cannot be re-run
# without re-creating df_train; X_train is then a slice of what remains.

y_train=df_train.pop('cnt')
X_train=df_train[:]

In [None]:
# Adding a constant (intercept) column next to the single predictor 'temp'

X_train_lm=sm.add_constant(X_train['temp'])

In [None]:
# Fitting an ordinary least squares model of cnt on temp (Model 1)

lr1 = sm.OLS(y_train, X_train_lm).fit()

In [None]:
#Checking the summary

lr1.summary()

#### Observation:We have R squared value of 0.41 with just the temperature variable. Meaning 41% of the variance is explained by temperature feature. P-value of the feature is also 0. So, temperature is statistically significant here.

#### For our fitted line, the intercept (beta 0) is 0.1668 and the slope for temp (beta 1) is 0.6209

In [None]:
# Plotting our regression line through the target variable.
# The line is drawn from the fitted model lr1 rather than hand-copied
# coefficients, so it stays correct if the data or the split changes.

temp_values = X_train_lm.iloc[:, 1]  # column 0 is the constant, column 1 is temp
plt.scatter(temp_values, y_train)
plt.plot(temp_values, lr1.predict(X_train_lm), 'r')
plt.show()

#### Step 7 : Using RFE to select variables

In [None]:
# importing libraries for RFE
from sklearn.feature_selection import RFE

##### Since we would be using registered bins as an experimental feature addition, we will exclude those features for RFE selection

In [None]:
# Remove the registered-bin dummies so RFE only sees the remaining features
X_train.drop(columns=['low','medium','high','very high'], inplace=True)

In [None]:
# Recursive feature elimination keeping the 15 highest-ranked predictors.
# The estimator is passed in unfitted: RFE fits its own clone of it, so a
# separate lm.fit() beforehand would be wasted work.

lm = LinearRegression()
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)  # fitting the selector on our train dataset

In [None]:
# Result of Recursive elimination of variables and their rankings

list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Storing the names of the columns that RFE selected (support_ is True)

rfe_vars=X_train.columns[rfe.support_]
rfe_vars

In [None]:
# checking which columns have been eleminated 
X_train.columns[~rfe.support_]

##### Model 2: Building the model using RFE selected variables.

In [None]:
# Creating a train dataset restricted to the RFE-selected columns

X_train_rfe=X_train[rfe_vars]
X_train_rfe.head()

In [None]:
# Adding a constant (intercept) column to X_train_rfe for statsmodels OLS

X_train_rfe=sm.add_constant(X_train_rfe)
X_train_rfe.head()

In [None]:
# Fitting an OLS model on the RFE-selected features (Model 2)

lr2 = sm.OLS(y_train, X_train_rfe).fit()

In [None]:
#Checking the summary

lr2.summary()

#### Observation: From the p-values, we can see that we have variables with high p - values or variables which are not significant.

#### Calculating VIF for model 2

In [None]:
# Variance inflation factors for the Model 2 predictors
from statsmodels.stats.outliers_influence import variance_inflation_factor

# The intercept column is removed before the VIFs are computed
X_train_rfe = X_train_rfe.drop(columns=['const'])

X = X_train_rfe
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

##### Dropping insignificant variables and running the model.
##### As we can see, hum has a high p-value. Such variables are insignificant and should be dropped.

##### We will drop one variable at a time and re-check the impact on the remaining variables. Since hum has the highest p-value and a VIF > 5, we will drop this variable first.

##### Model 3: Building the model after dropping hum.

In [None]:
# Model 3: refit without the 'hum' predictor
X_train_rfe.drop(columns=['hum'], inplace=True)

# Put the intercept column back (it was removed for the VIF calculation).
# X_train_lm3 keeps the Model 3 design matrix for the residual diagnostics
# performed later in the notebook.
X_train_rfe = sm.add_constant(X_train_rfe)
X_train_lm3 = sm.add_constant(X_train_rfe)

# Fit the model and display its summary
lr3 = sm.OLS(endog=y_train, exog=X_train_rfe).fit()
lr3.summary()

#### Recalculating VIF for model 3

In [None]:
# Variance inflation factors for the Model 3 predictors.
# The intercept column is removed before the VIFs are computed.
X_train_rfe = X_train_rfe.drop(columns=['const'])

X = X_train_rfe
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

##### Observation: We have all of our variables within VIF 5 and almost zero p-values coefficients.

##### At this stage, we have an adjusted R-squared value of 0.84, which means our model explains 84% of the variance in the train data.
##### Let us try to manually add some significant variables to check if it improves our model.

In [None]:
#listing columns which have been used in lr3

X_train_rfe.columns

In [None]:
# listing columns which can be explored and added to the model

X_train.columns.difference(X_train_rfe.columns)

#### June has a co-relation of 0.22 with the count variable. Let's try adding june and recalculating the model.

#### Model 4: Building the model after adding june.

In [None]:
# Model 4: Model 3 predictors plus the 'jun' dummy
trial_feature = 'jun'
X_train_rfe[trial_feature] = X_train[trial_feature]

# Make sure the intercept column is present
X_train_rfe = sm.add_constant(X_train_rfe)

# Fit the model and display its summary
lr4 = sm.OLS(endog=y_train, exog=X_train_rfe).fit()
lr4.summary()

#### The p-value increased after adding june. We should drop it.

In [None]:
# Remove the trial feature again; the 'const' column stays in place
X_train_rfe.drop(columns='jun', inplace=True)

#### Feb has a negative co-relation of 0.27 with the count variable. Let's try adding february and recalculating the model.

#### Model 5: Building the model after adding feb.

In [None]:
# Model 5: Model 3 predictors plus the 'feb' dummy
trial_feature = 'feb'
X_train_rfe[trial_feature] = X_train[trial_feature]

# Make sure the intercept column is present
X_train_rfe = sm.add_constant(X_train_rfe)

# Fit the model and display its summary
lr5 = sm.OLS(endog=y_train, exog=X_train_rfe).fit()
lr5.summary()

### The p-value increased after adding feb. We should drop it.

In [None]:
# Remove the trial feature again; the 'const' column stays in place
X_train_rfe.drop(columns='feb', inplace=True)

#### Model 6: Building the model after adding march.

In [None]:
# Model 6: Model 3 predictors plus the 'mar' dummy
trial_feature = 'mar'
X_train_rfe[trial_feature] = X_train[trial_feature]

# Make sure the intercept column is present
X_train_rfe = sm.add_constant(X_train_rfe)

# Fit the model and display its summary
lr6 = sm.OLS(endog=y_train, exog=X_train_rfe).fit()
lr6.summary()

### The p-value increased after adding mar. We should drop it.

In [None]:
# Remove the trial feature again; the 'const' column stays in place
X_train_rfe.drop(columns='mar', inplace=True)

#### Model 7: Building the model after adding July.

In [None]:
# Model 7: Model 3 predictors plus the 'jul' dummy
trial_feature = 'jul'
X_train_rfe[trial_feature] = X_train[trial_feature]

# Make sure the intercept column is present
X_train_rfe = sm.add_constant(X_train_rfe)

# Fit the model and display its summary
lr7 = sm.OLS(endog=y_train, exog=X_train_rfe).fit()
lr7.summary()

### The p-value increased after adding july. We should drop it.

In [None]:
# Remove the trial feature again; the 'const' column stays in place
X_train_rfe.drop(columns='jul', inplace=True)

#### Model 8: Building the model after adding wednesday.

In [None]:
# Model 8: Model 3 predictors plus the 'wed' (Wednesday) dummy
trial_feature = 'wed'
X_train_rfe[trial_feature] = X_train[trial_feature]

# Make sure the intercept column is present
X_train_rfe = sm.add_constant(X_train_rfe)

# Fit the model and display its summary
lr8 = sm.OLS(endog=y_train, exog=X_train_rfe).fit()
lr8.summary()

### The p-value increased after adding wednesday. We should drop it.

In [None]:
# Remove the trial feature again; the 'const' column stays in place
X_train_rfe.drop(columns='wed', inplace=True)

#### Model 9: Building the model after adding workingday.

In [None]:
# Model 9: Model 3 predictors plus 'workingday'
trial_feature = 'workingday'
X_train_rfe[trial_feature] = X_train[trial_feature]

# Make sure the intercept column is present
X_train_rfe = sm.add_constant(X_train_rfe)

# Fit the model and display its summary
lr9 = sm.OLS(endog=y_train, exog=X_train_rfe).fit()
lr9.summary()


#### The p-value increased after adding working day We should drop it.

In [None]:
# Remove the trial feature again; the 'const' column stays in place
X_train_rfe.drop(columns='workingday', inplace=True)

#### Model 10: Building the model after adding thursday .

In [None]:
# Model 10: Model 3 predictors plus the 'thurs' (Thursday) dummy
trial_feature = 'thurs'
X_train_rfe[trial_feature] = X_train[trial_feature]

# Make sure the intercept column is present
X_train_rfe = sm.add_constant(X_train_rfe)

# Fit the model and display its summary
lr10 = sm.OLS(endog=y_train, exog=X_train_rfe).fit()
lr10.summary()

### The p-value increased after adding thursday. We should drop it.

In [None]:
# Remove the trial feature again; the 'const' column stays in place
X_train_rfe.drop(columns='thurs', inplace=True)

#### Observation: After evaluating various models, we can consider Model 3 as best fit with adjusted R square value ~84%

#### Recalculating VIF for model 3

In [None]:
# Variance inflation factors for the final predictor set.
# The intercept column is removed before the VIFs are computed.
X_train_rfe = X_train_rfe.drop(columns=['const'])

X = X_train_rfe
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

#### We have VIFs of all feature variables below 5, so there is no multicollinearity.

In [None]:
# Printing the feature variables used.

X_train_rfe.columns

In [None]:
# Adding the constant

X_train_rfe = sm.add_constant(X_train_rfe)

In [None]:
# Building the final model on the retained predictors plus the constant.
# Note: `lm` was bound to a scikit-learn LinearRegression earlier in the
# notebook; from here on it refers to this statsmodels OLS result.

lm = sm.OLS(y_train,X_train_rfe).fit()   # Running the linear model

In [None]:
# Printing the summary

lm.summary()

In [None]:
# In-sample predictions of the final model on the training design matrix
y_train_pred = lm.predict(X_train_rfe)

In [None]:
# Checking the parameters obtained
lr3.params

#### Observation: This model looks good, as there seems to be VERY LOW Multicollinearity between the predictors and the p-values for all the predictors seems to be significant. For now, we will consider this as our final model (unless the Test data metrics are not significantly close to this number)

#### Step 8: Final Model Validation

#### Hypothesis Testing :
#### Hypothesis Testing States that
#### H0:B1=B2=...=Bn=0
#### H1: at least one Bi!=0

#### Observation: From the lr3 model summary, it is evident that all our coefficients are not equal to zero, which means we REJECT the NULL HYPOTHESIS

#### Model Validation: Validating Linear Relationship

In [None]:
# Component-plus-residual (CCPR) plot for 'temp' in model lr3
sm.graphics.plot_ccpr(lr3, 'temp')
plt.show()

In [None]:
# Component-plus-residual (CCPR) plot for 'windspeed' in model lr3
sm.graphics.plot_ccpr(lr3, 'windspeed')
plt.show()

##### Observation: The above plots represents the relationship between the model and the predictor variables. 
##### As we can see, linearity is well preserved

#### Model Validation: Homoscedasticity

In [None]:
# Residuals of model lr3 on the training data, plotted against the observed
# (scaled) count. A horizontal reference line at zero is drawn with axhline
# instead of plotting (y_train - y_train) against y_train.
y_train_pred = lr3.predict(X_train_lm3)
residual = y_train - y_train_pred
plt.title("Homoscedasticity")
plt.scatter(y_train, residual)
plt.axhline(0, color='r', linestyle='-')
plt.xlabel('Count')
plt.ylabel('Residual')
plt.show()

#### Observation: There is no visible pattern in the residual values, thus homoscedasticity is well preserved

#### Model Validation: Heteroskedasticity

In [None]:
# Residuals against fitted values for model lr3.
# A single axes is created and used directly; calling plt.subplot(1, 2, 1)
# after plt.subplots() would add a second axes on top of the first one.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_train_pred, (y_train - y_train_pred))
ax.set_title("Heteroskedasticity")
ax.set_xlabel("Fitted values")
ax.set_ylabel("Residuals")
plt.show()

##### No Heteroskedasticity.
##### From the scatter plot, we do not see a funnel like pattern and most of the points are centered around zero. So we do not have any heteroskedasticity.

#### Model Validation: Independence of residuals
##### Autocorrelation refers to the fact that observations’ errors are correlated. To verify that the residuals are not auto-correlated, we can use the Durbin-Watson test. The test statistic takes values between 0 and 4. The closer it is to 2, the less auto-correlation there is between successive residuals.

##### 0 – 2: positive auto-correlation
##### 2 – 4: negative auto-correlation

In [None]:
# Durbin-Watson statistic of the training residuals, rounded to 4 decimals
dw_stat = sm.stats.stattools.durbin_watson(y_train - y_train_pred)
print('The Durbin-Watson value for Final Model lr 3 is', round(dw_stat, 4))

##### Observation: There is almost no autocorrelation

##### Model Validation: Residuals must be normally distributed

In [None]:
# Training residuals of the final model
res = y_train - y_train_pred

# Plot the histogram of the error terms with a density curve.
# sns.histplot replaces the deprecated sns.distplot; stat="density" with
# kde=True reproduces distplot's normalised histogram plus KDE overlay.
fig = plt.figure()
sns.histplot(res, bins=20, kde=True, stat="density")
fig.suptitle('Error Terms')
plt.xlabel('Errors')
plt.show()

In [None]:
# Q-Q plot of the standardised (fit=True) training residuals against the 45-degree line
sm.qqplot((y_train - y_train_pred), fit=True, line='45')
plt.show()

#### Observation : Based on the histogram, we can conclude that error terms are following a normal distribution

#### Step 9 : Making Predictions on Final Model

In [None]:
#Printing the variables to be scaled

rescalevar

In [None]:
# Applying scaling on test data.
# Only transform() is used: the scaler keeps the min/max learned from the
# training set, so test rows are scaled exactly like the training rows and no
# test-set statistics leak into preprocessing. Re-fitting here (fit_transform)
# would rescale the test data with its own min/max.

df_test[rescalevar] = scaler.transform(df_test[rescalevar])

In [None]:
df_test.describe()

In [None]:
# Splitting target and feature variables.
# pop() removes 'cnt' from df_test in place; X_test is a slice of what remains.

y_test = df_test.pop('cnt')
X_test = df_test[:]

In [None]:
#Printing feature variables

X_train_rfe.columns

In [None]:
# Dropping the constant column in place so that X_train_rfe.columns lists only
# the predictors; it is used next to select the matching test columns

X_train_rfe.drop(columns='const', inplace=True)

In [None]:
# Build the test design matrix: keep only the predictors used by the final
# model, then add the intercept column expected by the statsmodels result.

final_features = X_train_rfe.columns
X_test_new = sm.add_constant(X_test[final_features])

In [None]:
# Making predictions

y_pred = lm.predict(X_test_new)

#### Step 10: Model Evaluation

In [None]:
# Scatter of observed vs. predicted (scaled) counts on the test set
fig = plt.figure()
fig.suptitle('y_test vs y_pred', fontsize=20)   # Plot heading
plt.scatter(y_test, y_pred)
plt.xlabel('y_test', fontsize=18)               # X-label
plt.ylabel('y_pred', fontsize=18)               # Y-label

In [None]:
from sklearn.metrics import r2_score

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the (scaled) train and test targets, in that order
train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(train_mse)
print(test_mse)

##### We have a Mean Squared Error close to 0 on the training dataset. Note that the target was scaled to the range 0–1, so a small MSE indicates small errors on that scale rather than a perfect fit.
##### On the test dataset, the MSE is 0.01, also close to zero and similar to the training value, which suggests that the model performs similarly on unseen data.

In [None]:
# R-squared on the train and test sets, in that order
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(train_r2)
print(test_r2)

#### We have a R-squared value of 84.26 % on train data and 81.48% on test data.


### Conclusions 

##### 1. We had a dataset with the target and feature variables, of which 510 records were used for training and 219 for testing.
##### 2. Performed EDA on the dataset to check for significant inferences and identify variables for data preparation. Used scatter plots for numerical and boxplots for categorical variables.
##### 3. Prepared the data by converting categorical variables into dummy variables.
#####  - season
#####  - weathersit
#####  - month
#####  - weekday
#####  - registered
##### 4. Dropped irrelevant and categorical variables from the data.
##### - season
##### - weathersit
##### - month
##### - weekday
##### - instant
##### - dteday
##### - atemp
##### - registered
##### - casual
##### 5. Split the data into test and train datasets in a 70:30 ratio.
###### 6. Rescaled numerical variables using MinMax method.
###### 7.Plotted a heatmap to check linearity among all the variables and identified temp to be the most significant feature.
###### 8.Built a model using only temp feature with 41% adjusted R-Square.
###### 9. Adopted RFE for feature selection and built 2 further models (Models 2 and 3) to increase the adjusted R-squared to 84.3%.
##### 10. Manually identified other feature variables and built 7 other models (Models 4 to 10), and observed that most of the newly added features have p-values > 0.05, i.e. they are insignificant.
##### 11. Performed residual analysis to confirm that the assumptions about the residuals hold true.
##### 12. Made predictions on the test data.
##### 13. Evaluated the model on test data with an R-squared of 81.5 %.