In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
# import missingno as msno
import pickle
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import  Lasso


In [100]:
warnings.filterwarnings("ignore")

In [101]:
from google.colab import drive #comment out this line if you are using Colab
drive.mount("/content/gdrive/") 

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [102]:
import os
main_path = "/content/gdrive/My Drive/Colab Notebooks/Intro to DS/"
os.listdir(main_path)

['Group_19_first_variable_selection1.ipynb',
 'Group_19_data_cleaned.csv',
 'Group_19_first_model.ipynb',
 'Group_19_Bi_directional_elimination.ipynb',
 'models',
 'Group_19_presentation_of_the_test_results.ipynb']

In [103]:
# importing the csv into DataFrame using Pandas
df = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Intro to DS//Group_19_data_cleaned.csv",encoding= 'unicode_escape',parse_dates=[0])
df.head(2)

Unnamed: 0.1,Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(Â°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(Â°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,2017-01-12,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,0,1
1,1,2017-01-12,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,0,1


In [104]:
df = df.drop("Unnamed: 0",axis=1)

## <b> Data Preparation - Feature Engineering </font>

In [105]:
df.columns = [i.split("(")[0].strip().title().replace(" ","_") for i in [*df.columns]]

In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   8760 non-null   object 
 1   Rented_Bike_Count      8760 non-null   int64  
 2   Hour                   8760 non-null   int64  
 3   Temperature            8760 non-null   float64
 4   Humidity               8760 non-null   int64  
 5   Wind_Speed             8760 non-null   float64
 6   Visibility             8760 non-null   int64  
 7   Dew_Point_Temperature  8760 non-null   float64
 8   Solar_Radiation        8760 non-null   float64
 9   Rainfall               8760 non-null   float64
 10  Snowfall               8760 non-null   float64
 11  Seasons                8760 non-null   object 
 12  Holiday                8760 non-null   int64  
 13  Functioning_Day        8760 non-null   int64  
dtypes: float64(6), int64(6), object(2)
memory usage: 958.2+ 

In [107]:
df.Date = pd.to_datetime(df.Date, format="%Y/%m/%d")

In [108]:
df.insert(1,"Day", df.Date.dt.day)
df.insert(2, "Month", df.Date.dt.month)
df.insert(3, "Year", df.Date.dt.year)

In [109]:
df.insert(3, 'WeekDay',df["Date"].dt.day_name())

In [110]:
df.Year = df.Year.map({2017:0, 2018:1})

In [111]:
df.Seasons = df.Seasons.map({"Winter":1, 'Autumn':2, 'Spring':3, 'Summer':4})

In [112]:
df.insert(2, 'label_day_night', df['Hour'].apply(lambda x : 0 if (x<7) else( 1)))

In [113]:
df["Working_Day"] = 1
df.loc[(df.WeekDay=="Saturday")|(df.WeekDay=="Sunday"), "Working_Day"] = 0

In [114]:
df.Humidity[df.Humidity==0] = df.Humidity.mean()

In [115]:
categoryVariableList = ["WeekDay"]
for var in categoryVariableList:
    df[var] = df[var].astype("category")

In [116]:
for col in categoryVariableList:
    df_dummies_col = pd.get_dummies(df[col])
    
    df = pd.concat([df, df_dummies_col],axis=1)
    
    df.drop(col, axis=1, inplace=True)

In [117]:
df.columns

Index(['Date', 'Day', 'label_day_night', 'Month', 'Year', 'Rented_Bike_Count',
       'Hour', 'Temperature', 'Humidity', 'Wind_Speed', 'Visibility',
       'Dew_Point_Temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall',
       'Seasons', 'Holiday', 'Functioning_Day', 'Working_Day', 'Friday',
       'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'],
      dtype='object')

In [118]:
columns = df.columns.tolist()
columns.append(columns.pop(5))
df = df[columns]

In [119]:
df.drop("Date",axis=1, inplace=True)

In [120]:
numerical_feature = ["Temperature","Humidity","Rainfall",
                     "Visibility","Dew_Point_Temperature",
                     'Wind_Speed','Solar_Radiation',"Snowfall"]

scale = StandardScaler()
df[numerical_feature] = scale.fit_transform(df[numerical_feature])

## <b> Variable Selection - Lasso
   

In [121]:
lr = Lasso(alpha=0.1, normalize=True)
lr.fit(df.drop(['Rented_Bike_Count'], axis=1), df.Rented_Bike_Count)
[*zip(df.columns[:-1], lr.coef_)]

[('Day', 0.0),
 ('label_day_night', 320.1623306749111),
 ('Month', 9.786319088275862),
 ('Year', 96.21144806941491),
 ('Hour', 9.968478947345982),
 ('Temperature', 353.06605121333814),
 ('Humidity', -132.9704421240069),
 ('Wind_Speed', 0.0),
 ('Visibility', 16.946707033845104),
 ('Dew_Point_Temperature', 0.0),
 ('Solar_Radiation', -90.1679043726255),
 ('Rainfall', -62.56075942985916),
 ('Snowfall', -0.0),
 ('Seasons', 0.0),
 ('Holiday', -77.62332531151428),
 ('Functioning_Day', 755.9649521061426),
 ('Working_Day', 0.0),
 ('Friday', 0.0),
 ('Monday', 0.0),
 ('Saturday', 0.0),
 ('Sunday', -43.83774858337618),
 ('Thursday', -0.0),
 ('Tuesday', -0.0),
 ('Wednesday', 0.0)]

In [122]:
useful_features = set(df.columns).difference(["Day","Wind_Speed","Snowfall","Seasons","Working_Day",
                                                    "Dew_Point_Temperature","Friday","Tuesday","Wednesday","Thursday"
                                                    "Monday","Saturday","Rented_Bike_Count"])

In [123]:
useful_features

{'Functioning_Day',
 'Holiday',
 'Hour',
 'Humidity',
 'Monday',
 'Month',
 'Rainfall',
 'Solar_Radiation',
 'Sunday',
 'Temperature',
 'Thursday',
 'Visibility',
 'Year',
 'label_day_night'}

In [124]:
X_train, X_test, y_train, y_test = train_test_split(df[useful_features],
                                                  df.Rented_Bike_Count,
                                                  test_size=0.2,
                                                  random_state=2)

In [125]:
# X_train, X_val, y_train, y_val = train_test_split(df[useful_features],
#                                                   df.Rented_Bike_Count,
#                                                   test_size=0.2,
#                                                   random_state=2)

##### <font color="red"> SVM without linear kernel using best parameters and variable selection

In [126]:
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [127]:
svr_best_lasso=SVR(kernel='rbf',gamma='auto', C=60.0, epsilon=1)
svr_best_lasso.fit(X_train, y_train)


SVR(C=60.0, epsilon=1, gamma='auto')

In [128]:
svr_best_lasso_score = svr_best_lasso.score(X_test,y_test)
print('The score on test set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : {:.2f}%'.format(svr_best_lasso_score) )


The score on test set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : 0.76%


In [129]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('MAE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : {:.2f}'.format((mean_absolute_error(y_test,svr_best_lasso.predict(X_test)))))
print('RMSE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : {:.2f}'.format(np.sqrt(mean_squared_error(y_test,svr_best_lasso.predict(X_test)))))

MAE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : 192.92
RMSE fortest set with hyperparameter tuning for SVM WITHOUT linear kernel with LASSO variable selection is : 303.22


In [130]:
Group_19_first_variable_selection_filename = '/content/gdrive/My Drive/Colab Notebooks/Intro to DS/models/Group_19_first_variable_selection.sav'
pickle.dump(svr_best_lasso, open(Group_19_first_variable_selection_filename, 'wb'))