In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the Seoul bike-sharing CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/datasets/SeoulBikeData.csv')

In [None]:
# Preview the first three rows of the raw data.
df.head(n=3)

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(Celsius),Humidity(percent),Wind speed (metre per second),Visibility (10m),Dew point temperature(Celsius),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,1/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,1/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,1/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes


In [None]:
## PREPROCESSING
# Count the missing values in every column.

df.isna().sum()

Unnamed: 0,0
Date,0
Rented Bike Count,0
Hour,0
Temperature(Celsius),0
Humidity(percent),0
Wind speed (metre per second),0
Visibility (10m),0
Dew point temperature(Celsius),0
Solar Radiation (MJ/m2),0
Rainfall(mm),0


In [None]:
from sklearn.preprocessing import LabelEncoder


In [None]:
# Show the column labels of df.
df.columns

Index(['Date', 'Rented Bike Count', 'Hour', 'Temperature(Celsius)',
       'Humidity(percent)', 'Wind speed (metre per second)',
       'Visibility (10m)', 'Dew point temperature(Celsius)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')

In [None]:
# Label-encode the categorical columns and record the mapping used for each one.

categorical_col = ['Seasons', 'Holiday', 'Functioning Day']
# d maps column name -> {original label: integer code}. Keeping one mapping per
# column preserves which column a code belongs to and prevents a label that
# occurs in two columns from silently overwriting the other column's entry.
d = {}
for c in categorical_col:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c])
    classes = le.classes_         # unique labels, in the order of their integer codes
    print('classes', classes)
    map_d = dict(zip(classes, range(len(classes))))    # label -> code
    print("mapping:", map_d)
    d[c] = map_d


classes ['Autumn' 'Spring' 'Summer' 'Winter']
mapping: {'Autumn': 0, 'Spring': 1, 'Summer': 2, 'Winter': 3}
classes ['Holiday' 'No Holiday']
mapping: {'Holiday': 0, 'No Holiday': 1}
classes ['No' 'Yes']
mapping: {'No': 0, 'Yes': 1}


In [None]:
# Display the label -> integer code mappings collected by the encoding loop.
d

{'Autumn': 0,
 'Spring': 1,
 'Summer': 2,
 'Winter': 3,
 'Holiday': 0,
 'No Holiday': 1,
 'No': 0,
 'Yes': 1}

In [None]:
# Column dtypes
# The 'Date' column is stored with dtype object (i.e. as strings).

# Feature engineering:
# 'Date' has to be converted with pd.to_datetime() so that it is treated as a
# proper datetime column rather than a string column.
# Without this conversion, components such as month, year, or dayofweek
# cannot be extracted through the .dt accessor.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Date                            8760 non-null   object 
 1   Rented Bike Count               8760 non-null   int64  
 2   Hour                            8760 non-null   int64  
 3   Temperature(Celsius)            8760 non-null   float64
 4   Humidity(percent)               8760 non-null   int64  
 5   Wind speed (metre per second)   8760 non-null   float64
 6   Visibility (10m)                8760 non-null   int64  
 7   Dew point temperature(Celsius)  8760 non-null   float64
 8   Solar Radiation (MJ/m2)         8760 non-null   float64
 9   Rainfall(mm)                    8760 non-null   float64
 10  Snowfall (cm)                   8760 non-null   float64
 11  Seasons                         8760 non-null   int64  
 12  Holiday                         87

In [None]:
# Preview the first five rows after label encoding.
df.head(n=5)

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(Celsius),Humidity(percent),Wind speed (metre per second),Visibility (10m),Dew point temperature(Celsius),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,1/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,3,1,1
1,1/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,3,1,1
2,1/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,3,1,1
3,1/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,3,1,1
4,1/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,3,1,1


In [None]:
# The raw dates are day/month/year strings; the format is passed explicitly so
# they are not misread as month/day/year.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Derive the calendar features from the parsed dates.
for date_part in ('month', 'year', 'dayofweek'):
    df[date_part] = getattr(df['Date'].dt, date_part)

In [None]:
# Check that the month / year / dayofweek columns were added.
df.head(n=3)

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(Celsius),Humidity(percent),Wind speed (metre per second),Visibility (10m),Dew point temperature(Celsius),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day,month,year,dayofweek
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,3,1,1,12,2017,4
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,3,1,1,12,2017,4
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,3,1,1,12,2017,4


In [None]:
# The Date column is no longer needed now that month, year and dayofweek have
# been extracted from it.
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run: a
# second execution is a no-op instead of raising KeyError for the missing column.

df = df.drop(columns=['Date'], errors='ignore')
df.head(3)

Unnamed: 0,Rented Bike Count,Hour,Temperature(Celsius),Humidity(percent),Wind speed (metre per second),Visibility (10m),Dew point temperature(Celsius),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day,month,year,dayofweek
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,3,1,1,12,2017,4
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,3,1,1,12,2017,4
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,3,1,1,12,2017,4


In [None]:
# STANDARDISATION (step 1): separate the features (x) from the label
# (y = rented bike count).
# The target is removed by name rather than by position, so the features can
# never include it even if the column order of the input file changes.

x = df.drop(columns=['Rented Bike Count'])
y = df['Rented Bike Count']

x.shape, y.shape

((8760, 15), (8760,))

In [None]:
# Standardise every feature column so that all values are on a similar scale.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# fit_transform returns a NumPy array; wrap it straight back into a DataFrame.
X_ft = pd.DataFrame(ss.fit_transform(x))
X_ft.shape


(8760, 15)

In [None]:
# MODEL SELECTION : SPLIT TO TRAIN AND TEST (8760 ---> 80 percent train and 20 percent test)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the unscaled features first. X_ft was standardised with the mean and
# variance of ALL rows, so using it here would let test-set statistics leak
# into training. The same random_state and row count give the same row split.
x_train_raw, x_test_raw, ytrain, ytest = train_test_split(x, y, test_size = 0.20, random_state= 90)

# Fit the scaler on the training rows only, then apply it to both partitions.
split_scaler = StandardScaler().fit(x_train_raw)
xtrain = pd.DataFrame(split_scaler.transform(x_train_raw), index=x_train_raw.index)
xtest = pd.DataFrame(split_scaler.transform(x_test_raw), index=x_test_raw.index)

In [None]:
# Shapes of the four partitions: (train X, test X, train y, test y).
tuple(part.shape for part in (xtrain, xtest, ytrain, ytest))

((7008, 15), (1752, 15), (7008,), (1752,))

In [None]:
# Modelling --> this is a regression problem (the target is a count)
# 1) Linear regression, fitted on the training partition

from sklearn.linear_model import LinearRegression
lr_m1 = LinearRegression()
lr_m1.fit(X=xtrain, y=ytrain)

In [None]:
# MODEL EVALUATION ---> MSE, MAE and R2 score (RMSE is derived from MSE with np.sqrt)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Predict on the held-out test features so the linear model can be evaluated.
pred_m1 = lr_m1.predict(X=xtest)
pred_m1

array([1377.86351345,  249.41994394, -461.06154982, ...,   92.42071973,
        -20.69839795,  669.9284258 ])

In [None]:
# (MSE, MAE, R2) of the linear model on the test set.
tuple(metric(ytest, pred_m1) for metric in (mean_squared_error, mean_absolute_error, r2_score))

(198917.9874127755, 329.574431613704, 0.5432335424551055)

In [None]:
# RMSE is obtained by taking np.sqrt of the MSE.
# R2 score --> share of the target's variance explained by the model (higher is better).

(
    np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred_m1)),
    mean_absolute_error(y_true=ytest, y_pred=pred_m1),
    r2_score(y_true=ytest, y_pred=pred_m1),
)

(np.float64(446.00222803566294), 329.574431613704, 0.5432335424551055)

In [None]:
# 2) Decision tree regressor

# Model import and fit on the training partition.
# random_state is fixed so the fitted tree, and therefore the reported metrics,
# are reproducible across kernel restarts (same seed as the train/test split).

from sklearn.tree import DecisionTreeRegressor
dt_m2 = DecisionTreeRegressor(random_state=90)
dt_m2.fit(xtrain,ytrain)

In [None]:
# Decision tree predictions for the held-out test features.

pred_m2 = dt_m2.predict(X=xtest)
pred_m2

array([752.,  93.,   0., ...,  18., 433., 522.])

In [None]:
# (MSE, MAE, R2) of the decision tree on the test set.

tuple(metric(ytest, pred_m2) for metric in (mean_squared_error, mean_absolute_error, r2_score))


(60379.52739726027, 136.02625570776254, 0.8613531978872706)

In [None]:
# (RMSE, MAE, R2) of the decision tree on the test set.
(
    np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred_m2)),
    mean_absolute_error(y_true=ytest, y_pred=pred_m2),
    r2_score(y_true=ytest, y_pred=pred_m2),
)

(np.float64(245.72246009931666), 136.02625570776254, 0.8613531978872706)

In [None]:
# 3) Random forest regressor

# Model import and fit on the training partition.
# The forest is built from random bootstrap samples and feature subsets, so
# random_state is fixed to make the reported metrics reproducible across
# kernel restarts (same seed as the train/test split).

from sklearn.ensemble import RandomForestRegressor
rf_m3 = RandomForestRegressor(random_state=90)
rf_m3.fit(xtrain,ytrain)

In [None]:
# Random forest predictions for the held-out test features.

pred_m3 = rf_m3.predict(X=xtest)
pred_m3

array([827.17, 264.18,  22.58, ...,  24.1 , 446.55, 507.16])

In [None]:
# (MSE, MAE, R2) of the random forest on the test set.

tuple(metric(ytest, pred_m3) for metric in (mean_squared_error, mean_absolute_error, r2_score))

(27152.20791461187, 96.35889269406393, 0.937651602125132)

In [None]:
# (RMSE, MAE, R2) of the random forest on the test set.
(
    np.sqrt(mean_squared_error(y_true=ytest, y_pred=pred_m3)),
    mean_absolute_error(y_true=ytest, y_pred=pred_m3),
    r2_score(y_true=ytest, y_pred=pred_m3),
)

(np.float64(164.77927028182845), 96.35889269406393, 0.937651602125132)

In [None]:

# Side-by-side summary: RMSE, MAE and R2 for each model on the same test set.
for heading, preds in (
    ('Model 1 result', pred_m1),
    ('Model 2 result', pred_m2),
    ('Model 3 result', pred_m3),
):
    print(heading)
    print(np.sqrt(mean_squared_error(ytest, preds)), mean_absolute_error(ytest, preds), r2_score(ytest, preds))

Model 1 result
446.00222803566294 329.574431613704 0.5432335424551055
Model 2 result
245.72246009931666 136.02625570776254 0.8613531978872706
Model 3 result
164.77927028182845 96.35889269406393 0.937651602125132


In [None]:
# Model 3 (random forest) has the lowest RMSE and MAE and the highest R2 score of the three models.