In [189]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
# import missingno as msno
import pickle
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import os
from sklearn.linear_model import  Lasso


In [190]:
from google.colab import drive #comment out this line if you are using Colab
drive.mount("/content/gdrive/") 

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [191]:
import os
# main_path = "/content/gdrive/My Drive/ML_Project_data/Data_2/"
main_path = "/content/gdrive/My Drive/Colab Notebooks/Intro to DS/"
os.listdir(main_path)

['Group_19_data_cleaned.csv',
 'models',
 'Group_19_first_variable_selection1.ipynb',
 'Group_19_Bi_directional_elimination.ipynb',
 'Group_19_first_model_ash.ipynb',
 'Group_19_presentation_of_the_test_results.ipynb']

In [192]:
warnings.filterwarnings("ignore")

In [193]:
# importing the csv into DataFrame using Pandas
df = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Intro to DS//Group_19_data_cleaned.csv",encoding= 'unicode_escape',parse_dates=[0])
df.head(2)

Unnamed: 0.1,Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(Â°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(Â°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,2017-01-12,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,0,1
1,1,2017-01-12,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,0,1


In [194]:
df = df.drop("Unnamed: 0",axis=1)

## <b> Data Preparation - Feature Engineering </font>

In [195]:
df.columns = [i.split("(")[0].strip().title().replace(" ","_") for i in [*df.columns]]

In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   8760 non-null   object 
 1   Rented_Bike_Count      8760 non-null   int64  
 2   Hour                   8760 non-null   int64  
 3   Temperature            8760 non-null   float64
 4   Humidity               8760 non-null   int64  
 5   Wind_Speed             8760 non-null   float64
 6   Visibility             8760 non-null   int64  
 7   Dew_Point_Temperature  8760 non-null   float64
 8   Solar_Radiation        8760 non-null   float64
 9   Rainfall               8760 non-null   float64
 10  Snowfall               8760 non-null   float64
 11  Seasons                8760 non-null   object 
 12  Holiday                8760 non-null   int64  
 13  Functioning_Day        8760 non-null   int64  
dtypes: float64(6), int64(6), object(2)
memory usage: 958.2+ 

In [197]:
df.Date = pd.to_datetime(df.Date, format="%Y/%m/%d")

In [198]:
df.insert(1,"Day", df.Date.dt.day)
df.insert(2, "Month", df.Date.dt.month)
df.insert(3, "Year", df.Date.dt.year)

In [199]:
df.insert(3, 'WeekDay',df["Date"].dt.day_name())

In [200]:
df.Year = df.Year.map({2017:0, 2018:1})

In [201]:
df.Seasons = df.Seasons.map({"Winter":1, 'Autumn':2, 'Spring':3, 'Summer':4})

In [202]:
df.insert(2, 'label_day_night', df['Hour'].apply(lambda x : 0 if (x<7) else( 1)))

In [203]:
df["Working_Day"] = 1
df.loc[(df.WeekDay=="Saturday")|(df.WeekDay=="Sunday"), "Working_Day"] = 0

In [204]:
df.Humidity[df.Humidity==0] = df.Humidity.mean()

In [205]:
categoryVariableList = ["WeekDay"]
for var in categoryVariableList:
    df[var] = df[var].astype("category")

In [206]:
for col in categoryVariableList:
    df_dummies_col = pd.get_dummies(df[col])
    
    df = pd.concat([df, df_dummies_col],axis=1)
    
    df.drop(col, axis=1, inplace=True)

In [207]:
df.columns

Index(['Date', 'Day', 'label_day_night', 'Month', 'Year', 'Rented_Bike_Count',
       'Hour', 'Temperature', 'Humidity', 'Wind_Speed', 'Visibility',
       'Dew_Point_Temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall',
       'Seasons', 'Holiday', 'Functioning_Day', 'Working_Day', 'Friday',
       'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'],
      dtype='object')

In [208]:
columns = df.columns.tolist()
columns.append(columns.pop(5))
df = df[columns]

In [209]:
df.drop("Date",axis=1, inplace=True)

In [210]:
numerical_feature = ["Temperature","Humidity","Rainfall",
                     "Visibility","Dew_Point_Temperature",
                     'Wind_Speed','Solar_Radiation',"Snowfall"]

scale = StandardScaler()
df[numerical_feature] = scale.fit_transform(df[numerical_feature])

In [211]:
X = df.drop(['Rented_Bike_Count'],axis=1)
y = df['Rented_Bike_Count']

In [212]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2,random_state =2)

In [213]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,test_size = 0.25,random_state =2)

In [214]:
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [215]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

##### <font color="blue"> SVM with linear kernel using best parameters

In [216]:
svr_best1=SVR(kernel='linear',gamma=0.1, C=60.0, epsilon=0.1)
svr_best1.fit(X_train, y_train)


SVR(C=60.0, gamma=0.1, kernel='linear')

In [217]:
svr_best1_score = svr_best1.score(X_test,y_test)
print('The score on test set for SVM WITH linear kernel is : {:.2f}%'.format(svr_best1_score) )

The score on test set for SVM WITH linear kernel is : 0.55%


In [218]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('MAE fortest set with hyperparameter tuning for SVM with linear kernel is : {:.2f}'.format((mean_absolute_error(y_test,svr_best1.predict(X_test)))))
print('RMSE fortest set with hyperparameter tuning for SVM with linear kernel is : {:.2f}'.format((mean_squared_error(y_test,svr_best1.predict(X_test),squared=False))))

MAE fortest set with hyperparameter tuning for SVM with linear kernel is : 302.04
RMSE fortest set with hyperparameter tuning for SVM with linear kernel is : 416.98


In [219]:
model_comparison = {}

model_comparison['SVM with linear kernel']=[round(svr_best1_score,2),round(mean_absolute_error(y_test,svr_best1.predict(X_test)),2),round(mean_squared_error(y_test,svr_best1.predict(X_test),squared=False),2)]

##### <font color="red"> SVM without linear kernel using best parameters

In [220]:
svr_best2=SVR(kernel='rbf',gamma='auto', C=60.0, epsilon=1)
svr_best2.fit(X_train, y_train)


SVR(C=60.0, epsilon=1, gamma='auto')

In [221]:
svr_best2_score = svr_best2.score(X_test,y_test)
print('The score on test set with hyperparameter tuning for SVM WITHOUT linear kernel is : {:.2f}%'.format(svr_best2_score) )

The score on test set with hyperparameter tuning for SVM WITHOUT linear kernel is : 0.71%


In [222]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('MAE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel is : {:.2f}'.format((mean_absolute_error(y_test,svr_best2.predict(X_test)))))
print('RMSE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel is : {:.2f}'.format((mean_squared_error(y_test,svr_best2.predict(X_test),squared=False))))

MAE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel is : 208.28
RMSE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel is : 337.67


In [223]:

model_comparison['SVM without linear kernel']=[round(svr_best2_score,2),round(mean_absolute_error(y_test,svr_best2.predict(X_test)),2),round(mean_squared_error(y_test,svr_best2.predict(X_test),squared=False),2)]

## <B>LASSO

In [224]:
useful_features = ['Functioning_Day',
 'Holiday',
 'Hour',
 'Humidity',
 'Monday',
 'Month',
 'Rainfall',
 'Solar_Radiation',
 'Sunday',
 'Temperature',
 'Thursday',
 'Visibility',
 'Year',
 'label_day_night']

In [225]:
X_train, X_test, y_train, y_test = train_test_split(df[useful_features],
                                                  df.Rented_Bike_Count,
                                                  test_size=0.2,
                                                  random_state=2)

In [226]:
svr_best_lasso=SVR(kernel='rbf',gamma='auto', C=60.0, epsilon=1)
svr_best_lasso.fit(X_train, y_train)


SVR(C=60.0, epsilon=1, gamma='auto')

In [227]:
svr_best_lasso_score = svr_best_lasso.score(X_test,y_test)
print('The score on test set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : {:.2f}%'.format(svr_best_lasso_score) )


The score on test set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : 0.76%


In [228]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('MAE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : {:.2f}'.format((mean_absolute_error(y_test,svr_best_lasso.predict(X_test)))))
print('RMSE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : {:.2f}'.format(np.sqrt(mean_squared_error(y_test,svr_best_lasso.predict(X_test)))))

MAE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : 192.92
RMSE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : 303.22


In [229]:
model_comparison['SVM without linear kernel with LASSO ']=[round(svr_best_lasso_score,2),round(mean_absolute_error(y_test,svr_best_lasso.predict(X_test)),2),round(mean_squared_error(y_test,svr_best_lasso.predict(X_test),squared=False),2)]

##<B> BI DIRECTIONAL

In [230]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2,random_state =2)

In [231]:
X_trains_new=X_train[['Hour', 'Temperature']]
sfs_svm=SVR(kernel='rbf',gamma='scale', C=60.0, epsilon=1)
sfs_fit=sfs_svm.fit(X_trains_new, y_train)

In [232]:

svr_best_bidirec_score = sfs_fit.score(X_test[['Hour', 'Temperature']],y_test)
print('The score on test set with SFS for SVM WITHOUT linear kernel WITH BIDIRECTIONALis : {:.2f}%'.format(svr_sfs_score) )

The score on test set with SFS for SVM WITHOUT linear kernel WITH BIDIRECTIONALis : 0.50%


In [233]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('MAE for test set with hyperparameter tuning for SVM WITHOUT linear kernel with BIDIRECTIONAL variable selection is : {:.2f}'.format((mean_absolute_error(y_test,sfs_fit.predict(X_test[['Hour', 'Temperature']])))))
print('RMSE for test set with hyperparameter tuning for SVM WITHOUT linear kernel with BIDIRECTIONAL variable selection is : {:.2f}'.format(np.sqrt(mean_squared_error(y_test,sfs_fit.predict(X_test[['Hour', 'Temperature']])))))

MAE for test set with hyperparameter tuning for SVM WITHOUT linear kernel with BIDIRECTIONAL variable selection is : 301.89
RMSE for test set with hyperparameter tuning for SVM WITHOUT linear kernel with BIDIRECTIONAL variable selection is : 438.72


In [234]:
model_comparison['SVM without linear kernel with Bi directional elimination ']=[round(svr_best_bidirec_score,2),round(mean_absolute_error(y_test,sfs_fit.predict(X_test[['Hour', 'Temperature']])),2),round(mean_squared_error(y_test,sfs_fit.predict(X_test[['Hour', 'Temperature']]),squared=False),2)]

In [235]:
model_comparison

{'SVM with linear kernel': [0.55, 302.04, 416.98],
 'SVM without linear kernel': [0.71, 208.28, 337.67],
 'SVM without linear kernel with LASSO ': [0.76, 192.92, 303.22],
 'SVM without linear kernel with Bi directional elimination ': [0.5,
  301.89,
  438.72]}

In [238]:
Model_com_df=pd.DataFrame(model_comparison).T
Model_com_df.columns=['Score','MAE','RMSE']
Model_com_df=Model_com_df.sort_values(by='Score',ascending=False)
Model_com_df.style.format("{:.2%}").background_gradient(cmap='RdYlBu_r')

Unnamed: 0,Score,MAE,RMSE
SVM without linear kernel with LASSO,76.00%,19292.00%,30322.00%
SVM without linear kernel,71.00%,20828.00%,33767.00%
SVM with linear kernel,55.00%,30204.00%,41698.00%
SVM without linear kernel with Bi directional elimination,50.00%,30189.00%,43872.00%
