In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
# import missingno as msno
import pickle
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import os
from sklearn.linear_model import  Lasso


In [None]:
from google.colab import drive #comment out this line if you are using Colab
drive.mount("/content/gdrive/") 

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
import os
# main_path = "/content/gdrive/My Drive/ML_Project_data/Data_2/"
main_path = "/content/gdrive/My Drive/Colab Notebooks/Intro to DS/"
os.listdir(main_path)

['Group_19_first_model_ash.ipynb', 'Group_19_data_cleaned.csv']

In [None]:
warnings.filterwarnings("ignore")

In [None]:
# importing the csv into DataFrame using Pandas
df = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Intro to DS//Group_19_data_cleaned.csv",encoding= 'unicode_escape',parse_dates=[0])
df.head(2)

Unnamed: 0.1,Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(Â°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(Â°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,2017-01-12,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,0,1
1,1,2017-01-12,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,0,1


In [None]:
df = df.drop("Unnamed: 0",axis=1)

## <b> Data Preparation - Feature Engineering </font>

In [None]:
df.columns = [i.split("(")[0].strip().title().replace(" ","_") for i in [*df.columns]]

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   8760 non-null   object 
 1   Rented_Bike_Count      8760 non-null   int64  
 2   Hour                   8760 non-null   int64  
 3   Temperature            8760 non-null   float64
 4   Humidity               8760 non-null   int64  
 5   Wind_Speed             8760 non-null   float64
 6   Visibility             8760 non-null   int64  
 7   Dew_Point_Temperature  8760 non-null   float64
 8   Solar_Radiation        8760 non-null   float64
 9   Rainfall               8760 non-null   float64
 10  Snowfall               8760 non-null   float64
 11  Seasons                8760 non-null   object 
 12  Holiday                8760 non-null   int64  
 13  Functioning_Day        8760 non-null   int64  
dtypes: float64(6), int64(6), object(2)
memory usage: 958.2+ 

In [None]:
df.Date = pd.to_datetime(df.Date, format="%Y/%m/%d")

In [None]:
df.insert(1,"Day", df.Date.dt.day)
df.insert(2, "Month", df.Date.dt.month)
df.insert(3, "Year", df.Date.dt.year)

In [None]:
df.insert(3, 'WeekDay',df["Date"].dt.day_name())

In [None]:
df.Year = df.Year.map({2017:0, 2018:1})

In [None]:
df.Seasons = df.Seasons.map({"Winter":1, 'Autumn':2, 'Spring':3, 'Summer':4})

In [None]:
df.insert(2, 'label_day_night', df['Hour'].apply(lambda x : 0 if (x<7) else( 1)))

In [None]:
df["Working_Day"] = 1
df.loc[(df.WeekDay=="Saturday")|(df.WeekDay=="Sunday"), "Working_Day"] = 0

In [None]:
df.Humidity[df.Humidity==0] = df.Humidity.mean()

In [None]:
categoryVariableList = ["WeekDay"]
for var in categoryVariableList:
    df[var] = df[var].astype("category")

In [None]:
for col in categoryVariableList:
    df_dummies_col = pd.get_dummies(df[col])
    
    df = pd.concat([df, df_dummies_col],axis=1)
    
    df.drop(col, axis=1, inplace=True)

In [None]:
df.columns

Index(['Date', 'Day', 'label_day_night', 'Month', 'Year', 'Rented_Bike_Count',
       'Hour', 'Temperature', 'Humidity', 'Wind_Speed', 'Visibility',
       'Dew_Point_Temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall',
       'Seasons', 'Holiday', 'Functioning_Day', 'Working_Day', 'Friday',
       'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'],
      dtype='object')

In [None]:
columns = df.columns.tolist()
columns.append(columns.pop(5))
df = df[columns]

In [None]:
df.drop("Date",axis=1, inplace=True)

In [None]:
numerical_feature = ["Temperature","Humidity","Rainfall",
                     "Visibility","Dew_Point_Temperature",
                     'Wind_Speed','Solar_Radiation',"Snowfall"]

scale = StandardScaler()
df[numerical_feature] = scale.fit_transform(df[numerical_feature])

In [None]:
X = df.drop(['Rented_Bike_Count'],axis=1)
y = df['Rented_Bike_Count']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2,random_state =2)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,test_size = 0.25,random_state =2)

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
gamma_range = np.logspace(-1, 1, 3)
params = {'C':[0.01,1,10,25,50, 60],'epsilon':[0.1,0.2,0.5,1], "gamma": gamma_range.tolist()+['scale', 'auto']}


In [None]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

##### <font color="blue"> SVM with linear kernel</font>

In [None]:
from sklearn.svm import SVR
svr_linear = SVR(kernel='linear',gamma='scale', C=1.0, epsilon=0.1)
svr_linear.fit(X_train, y_train) 

SVR(kernel='linear')

In [None]:
svr_linear_score = svr_linear.score(X_val,y_val)
print('The score on validation set for SVM with linear kernel is : {:.2f}%'.format(svr_linear_score) )

The score on validation set for SVM with linear kernel is : 0.44%


##### <font color="blue"> Hyperparameter tuning for SVM with linear kernel

In [None]:
grid_linear = GridSearchCV(svr_linear,param_grid=params,cv=kfold,n_jobs=-1,scoring='r2',verbose=1,return_train_score=True)

In [None]:
grid_linear.fit(X_train,y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=SVR(kernel='linear'), n_jobs=-1,
             param_grid={'C': [0.01, 1, 10, 25, 50, 60],
                         'epsilon': [0.1, 0.2, 0.5, 1],
                         'gamma': [0.1, 1.0, 10.0, 'scale', 'auto']},
             return_train_score=True, scoring='r2', verbose=1)

In [None]:
grid_linear.best_params_

{'C': 60, 'epsilon': 0.2, 'gamma': 0.1}

In [None]:
grid_linear.best_estimator_

SVR(C=60, epsilon=0.2, gamma=0.1, kernel='linear')

##### <font color="blue"> SVM with linear kernel using best parameters

In [None]:
svr_best1=SVR(kernel='linear',gamma=0.1, C=60.0, epsilon=0.1)
svr_best1.fit(X_train, y_train)


SVR(C=60.0, gamma=0.1, kernel='linear')

In [None]:
svr_best1_score = svr_best1.score(X_test,y_test)
print('The score on test set for SVM WITH linear kernel is : {:.2f}%'.format(svr_best1_score) )

The score on test set for SVM WITH linear kernel is : 0.55%


In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('MAE fortest set with hyperparameter tuning for SVM with linear kernel is : {:.2f}'.format((mean_absolute_error(y_test,svr_best1.predict(X_test)))))
print('RMSE fortest set with hyperparameter tuning for SVM with linear kernel is : {:.2f}'.format(np.sqrt(mean_squared_error(y_test,svr_best1.predict(X_test)))))

MAE fortest set with hyperparameter tuning for SVM with linear kernel is : 302.04
RMSE fortest set with hyperparameter tuning for SVM with linear kernel is : 416.98


##### <font color="red"> SVM without linear kernel


In [None]:
svr_rbf = SVR(kernel='rbf',gamma='scale', C=1.0, epsilon=0.1)
svr_rbf.fit(X_train, y_train) 

SVR()

In [None]:
svr_rbf_score = svr_rbf.score(X_val,y_val)

In [None]:
print('The score on validation set for SVM WITHOUT linear kernel is : {:.2f}%'.format(svr_rbf_score) )

The score on validation set for SVM WITHOUT linear kernel is : 0.08%


##### <font color="red">Hyperparameter tuning for SVM without linear kernel

In [None]:
grid_wlinear = GridSearchCV(svr_rbf,param_grid=params,cv=kfold,n_jobs=-1,scoring='r2',verbose=1,return_train_score=True)

In [None]:
grid_wlinear.fit(X_train,y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=SVR(), n_jobs=-1,
             param_grid={'C': [0.01, 1, 10, 25, 50, 60],
                         'epsilon': [0.1, 0.2, 0.5, 1],
                         'gamma': [0.1, 1.0, 10.0, 'scale', 'auto']},
             return_train_score=True, scoring='r2', verbose=1)

In [None]:
grid_wlinear.best_estimator_

SVR(C=60, epsilon=1, gamma='auto')

In [None]:
grid_wlinear.best_params_

{'C': 60, 'epsilon': 1, 'gamma': 'auto'}

##### <font color="red"> SVM without linear kernel using best parameters

In [None]:
svr_best2=SVR(kernel='rbf',gamma='auto', C=60.0, epsilon=1)
svr_best2.fit(X_train, y_train)


SVR(C=60.0, epsilon=1, gamma='auto')

In [None]:
svr_best2_score = svr_best2.score(X_test,y_test)
print('The score on test set with hyperparameter tuning for SVM WITHOUT linear kernel is : {:.2f}%'.format(svr_best2_score) )

The score on test set with hyperparameter tuning for SVM WITHOUT linear kernel is : 0.71%


In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('MAE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel is : {:.2f}'.format((mean_absolute_error(y_test,svr_best2.predict(X_test)))))
print('RMSE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel is : {:.2f}'.format(np.sqrt(mean_squared_error(y_test,svr_best2.predict(X_test)))))

MAE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel is : 208.28
RMSE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel is : 337.67


In [None]:
svm_with_linear_filename = '/content/gdrive/My Drive/Colab Notebooks/Intro to DS/models/Group_19_newmodels_svm_with_linear.sav'
pickle.dump(svr_best1, open(svm_with_linear_filename, 'wb'))

In [None]:
svm_without_linear_filename = '/content/gdrive/My Drive/Colab Notebooks/Intro to DS/models/Group_19_newmodels_svm_without_linear.sav'
pickle.dump(svr_best2, open(svm_without_linear_filename, 'wb'))

##### <b><font color='Red'>The best model is SVM without linear kernel (Radial Basis Function kernel) with  hyperparameters as gamma ='auto', C='60' and epsilon as 1. The accuracy with test dataset is 0.71% using Lasso as variable selection method.</font>