# Importing libraries

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score
import warnings
pd.set_option("display.max_columns", 100)

from datetime import datetime

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()


# for Q-Q plots
import scipy.stats as stats
from sklearn.base import clone
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Reading the data

In [2]:
data=pd.read_csv('/kaggle/input/seoul-bike-rental-ai-pro-iti/train.csv')


In [3]:
data.head()

In [4]:
data.info()

In [5]:
# Keep the original column names (used later to rename columns by position)
# and print each one next to its positional index.
col_names = data.columns.tolist()
for position, column in enumerate(col_names):
    print(position, " :", column)

In [6]:
# Map the original headers (looked up by position in `col_names`) to the
# shorter names used throughout the rest of the notebook.
renamed_columns = {
    col_names[4]: "Temperature(C)",
    col_names[5]: "humidity(%)",
    col_names[6]: "Wind Speed(m/s)",
    col_names[7]: "Visibility(10m)",
    col_names[8]: "Dew Temperature(C)",
    col_names[2]: "Number of rented bikes",
}
data = data.rename(columns=renamed_columns)

In [7]:
data.columns

# Checking Null Values

In [8]:
data.isna().sum().plot(kind="bar")
plt.show()

In [9]:
data.isna().sum()

# Checking the type of Data

In [10]:

data.info()

# Converting Date and splitting it

In [11]:
def convert_date(df):
    """Parse the 'Date' column and add calendar features to `df` in place.

    'Date' is parsed with the day/month/year format "%d/%m/%Y". The columns
    added, in order, are: month, day, year, dayofweek, month_start (boolean)
    and woy (ISO week of year as int).

    Returns the same DataFrame object that was passed in.
    """
    df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y")
    parts = df["Date"].dt

    derived = {
        "month": parts.month,
        "day": parts.day,
        "year": parts.year,
        "dayofweek": parts.dayofweek,
        "month_start": parts.is_month_start,
        "woy": parts.isocalendar().week.astype(int),
    }
    # Assign one by one so the new columns keep the order listed above.
    for name, values in derived.items():
        df[name] = values

    return df

In [12]:
data=convert_date(data)

In [13]:
data.head()

# Statistical description of the features

In [14]:
data.describe().T

## Visualizations

## showing distribution of rented bikes

In [15]:
# 2 x 3 grid of box plots: the target on its own, then split by each
# categorical / time feature.
fig, ax = plt.subplots(2, 3, figsize=(25, 20))

sns.boxplot(y=data["Number of rented bikes"], ax=ax[0][0])
ax[0][0].set_title("Box plot on Count ")

split_panels = [
    ((0, 1), "Seasons", "Box plot on Count Across Seasons"),
    ((0, 2), "Hour", "Box plot on Count Across hour"),
    ((1, 0), "Holiday", "Box plot on Count Across Holiday"),
    ((1, 1), "Functioning Day", "Box plot on Count Across Functioning day"),
    ((1, 2), "dayofweek", "Box plot on Count Across Day of week"),
]
for (row, col), feature, title in split_panels:
    sns.boxplot(x=feature, y="Number of rented bikes", data=data, ax=ax[row][col])
    ax[row][col].set_title(title)

# Correlations between variables

In [16]:
# Pearson correlation heatmap.
# `data` still holds text columns (Seasons, Holiday, Functioning Day) and the
# datetime 'Date' column at this point. Correlation is only defined for
# numeric/boolean columns, so select them explicitly instead of relying on
# pandas silently dropping the rest (newer pandas raises instead).
numeric_data = data.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(20,20))
sns.heatmap(numeric_data.corr("pearson"),
            vmin=-1, vmax=1,
            cmap='coolwarm',
            annot=True,
            square=True)

#  Sorted correlation of variables to the target

In [17]:
# Correlation of every numeric/boolean column with the target, strongest
# positive first. Non-numeric columns are excluded explicitly because
# `DataFrame.corr` cannot handle them.
target_corr = (
    data.select_dtypes(include=["number", "bool"])
    .corr()["Number of rented bikes"]
    .sort_values(ascending=False)
)
df1Corr = target_corr.to_frame('Correlation to the target')
df1Corr.style.background_gradient(cmap=sns.light_palette("red", as_cmap=True))

# Plotting variables to the target

In [18]:
data.plot(x='Date',y='Number of rented bikes',kind='kde');

# Seasons with the most rentals

In [19]:
# Total rentals per season, largest first.
# Select the target column *before* aggregating so the sum is not attempted
# on the text and datetime columns of `data`.
df1VizSeasons = (
    data.groupby('Seasons')['Number of rented bikes']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
df1VizSeasons.style.background_gradient(cmap=sns.light_palette("red", as_cmap=True))

# Checking the link to solar radiation

In [20]:
# Total solar radiation per season, largest first.
# Select the column before aggregating so the sum is not attempted on the
# text and datetime columns of `data`.
df1VizSolarRadiation = (
    data.groupby('Seasons')['Solar Radiation (MJ/m2)']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
df1VizSolarRadiation.style.background_gradient(cmap=sns.light_palette("red", as_cmap=True))

In [21]:
# Share of total rentals per season.
# The target column is selected before summing so only that column is aggregated.
plt.figure(figsize=(9,8))
data.groupby('Seasons')['Number of rented bikes'].sum().plot.pie();
plt.title("Number of rented bikes share per season");

# Rented bikes vs Solar radiation

In [22]:
# Histogram of the mean rentals observed at each distinct solar-radiation value.
# Select the target before averaging: a mean over the text columns is undefined.
data.groupby('Solar Radiation (MJ/m2)')['Number of rented bikes'].mean().hist()

# Checking rents per hours

In [23]:
# Total rentals for each hour of the day (only the target column is summed).
data.groupby('Hour')['Number of rented bikes'].sum().plot.bar()

# Rented bikes vs Days of the week

In [24]:
# Total rentals per day of week, largest first.
# The target column is selected before summing so only that column is aggregated.
df1VizDays = (
    data.groupby('dayofweek')['Number of rented bikes']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
df1VizDays.style.background_gradient(cmap=sns.light_palette("red", as_cmap=True))

Days are encoded from 0 (Monday) to 6 (Sunday).

# Temperature vs rented bikes

In [25]:
# Mean rentals at each distinct temperature value.
# Select the target before averaging: a mean over the text columns is undefined.
data.groupby('Temperature(C)')['Number of rented bikes'].mean().plot()

# Rain vs rented bikes

In [26]:
# Histogram of the mean rentals observed at each distinct rainfall value
# (same column-first form as the snowfall cell below).
data.groupby('Rainfall(mm)')['Number of rented bikes'].mean().hist();

# Snowfall vs rented bikes

In [27]:
data.groupby('Snowfall (cm)')['Number of rented bikes'].mean().hist();

# Visualizing rents with years, months and hours

In [28]:
#plt.figure(figsize=(14,14))
sns.barplot(data=data, x='year', y=data['Number of rented bikes']);

In [29]:
sns.barplot(data=data, x='month', y=data['Number of rented bikes'], hue='year');


In [30]:
plt.figure(figsize=(14,14))
sns.pointplot(data=data, x='Hour', y=data['Number of rented bikes'], hue='dayofweek');

# Dropping not important Columns

In [31]:
def drop_cols(df):
    """Return a new DataFrame without the 'ID' and 'Date' columns.

    The input frame is left unchanged; a KeyError is raised by pandas if
    either column is missing.
    """
    return df.drop(columns=['ID', 'Date'])

In [32]:
data=drop_cols(data)

In [33]:
data.info()

# Check the outliers of Data

In [34]:
features_plot=['Number of rented bikes',  'Temperature(C)', 'humidity(%)',
       'Wind Speed(m/s)', 'Visibility(10m)', 'Dew Temperature(C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)']

In [35]:
data[features_plot].describe()

In [36]:
def my_plot(feature):
    """Show a histogram of one column of the global `data` frame, titled with the column name."""
    values = data[feature]
    plt.hist(values)
    plt.title(feature)
    plt.show()


# One histogram per numeric feature selected above.
for feature_name in features_plot:
    my_plot(feature_name)

In [37]:
def diagnostic_plots(df, variable):
    """Show a histogram, a normal Q-Q plot and a box plot of one column side by side.

    Parameters
    ----------
    df : DataFrame holding the column to inspect.
    variable : name of the column in `df`.
    """
    column = df[variable]

    # One row, three panels, 16 x 4 inches.
    fig, (hist_ax, qq_ax, box_ax) = plt.subplots(1, 3, figsize=(16, 4))

    # Panel 1: histogram with automatic bin selection.
    sns.histplot(column, bins='auto', ax=hist_ax)
    hist_ax.set_title('Histogram')

    # Panel 2: Q-Q plot against the normal distribution.
    stats.probplot(column, dist="norm", plot=qq_ax)
    qq_ax.set_ylabel('Variable quantiles')

    # Panel 3: box plot.
    sns.boxplot(y=column, ax=box_ax)
    box_ax.set_title('Boxplot')

    plt.show()

In [38]:
# Average values across each of the categorical columns
fig = plt.figure(figsize=(15, 12))

# (grouping column, panel title) for each of the three panels of the 2 x 2 grid.
category_panels = [
    ('Seasons', 'Average bike rentals across Seasons'),
    ('Functioning Day', 'Average bike rentals across Working Day'),
    ('Holiday', 'Average bike rentals across Holiday'),
]
for slot, (category, title) in enumerate(category_panels, start=1):
    axes = fig.add_subplot(2, 2, slot)
    group_means = data.groupby([category])['Number of rented bikes'].mean().reset_index()
    sns.barplot(data=group_means, x=category, y='Number of rented bikes', ax=axes)
    axes.set(xlabel=category, ylabel='Number of rented bikes', title=title)
plt.show()

# Hourly count based on functioning day

In [39]:
# seaborn boxplots across hours
f, axes = plt.subplots(1, 1, figsize=(15, 6))
# Pin the hue order to ('No', 'Yes'): the legend below is relabelled by
# position, so without a fixed order the two labels could end up swapped
# depending on which value appears first in the data.
sns.boxplot(data=data, y='Number of rented bikes', x='Hour', hue='Functioning Day',
            hue_order=['No', 'Yes'], ax=axes)
handles, _ = axes.get_legend_handles_labels()
axes.legend(handles, ['Not a Functioning Day', 'Functioning Day'])
axes.set(title='Hourly Count based on Functioning day or not')

plt.show()

# Plotting average bike count for each hour as a function of various categories.

In [40]:

# Plots of average count across hour in a day for various categories

f, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 18))

# Panel 1: mean rentals per hour, split by functioning / non-functioning day.
group_work_hour = pd.DataFrame(data.groupby(['Functioning Day', 'Hour'])['Number of rented bikes'].mean()).reset_index()
# hue_order is pinned so the positional legend labels below always match
# their series ('No' first, 'Yes' second).
sns.pointplot(data=group_work_hour, x='Hour', y='Number of rented bikes', hue='Functioning Day',
              hue_order=['No', 'Yes'], ax=axes[0])
handles, _ = axes[0].get_legend_handles_labels()
axes[0].legend(handles, ['Not a Functioning Day', 'Functioning Day'])
# The y axis shows the mean rental count, not the functioning-day flag.
axes[0].set(xlabel='Hour in the day', ylabel='Number of rented bikes', title='Average Bike Rentals by the day if Working day or Not')

# Panel 2: mean rentals per hour for each weekday.
# Group on 'dayofweek' (0 = Monday ... 6 = Sunday); the 'day' column is the
# day of the month (1-31) and does not match the 0-6 hue order or the title.
hue_order= [0,1,2,3,4,5,6]
group_day_hour = pd.DataFrame(data.groupby(['dayofweek', 'Hour'])['Number of rented bikes'].mean()).reset_index()
sns.pointplot(data=group_day_hour, x='Hour', y='Number of rented bikes', hue='dayofweek', ax=axes[1], hue_order=hue_order)
axes[1].set(xlabel='Hour in the day', ylabel='Number of rented bikes', title='Average Bike Rentals by the day across Weekdays')


plt.show()

# Monthly distributions

In [41]:
# Average Monthly Count Distribution plot
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 6))
group_month = pd.DataFrame(data.groupby(['month', 'Functioning Day'])['Number of rented bikes'].mean()).reset_index()
# Pin the hue order to ('No', 'Yes') so the positional legend labels below
# cannot be attached to the wrong bars.
sns.barplot(data=group_month, x='month', y='Number of rented bikes', hue='Functioning Day',
            hue_order=['No', 'Yes'], ax=axes)
axes.set(xlabel='Month', ylabel='Number of rented bikes', title='Average bike rentals per Month')
handles, _ = axes.get_legend_handles_labels()
axes.legend(handles, ['Not a Functioning Day', 'Functioning Day'])
plt.show()

# Rented bikes vs Temperature, Humidity and Windspeed

In [42]:
# Regression Plots with respect to Temperature, Humidity and Windspeed
fig = plt.figure(figsize=(18, 8))

# (feature, line colour, title) per panel; None keeps seaborn's default colour.
reg_panels = [
    ('Temperature(C)', None, 'Reg Plot for Temperature vs. Number of rented bikes'),
    ('humidity(%)', 'r', 'Reg Plot for Humidity vs. Number of rented bikes'),
    ('Wind Speed(m/s)', 'g', 'Reg Plot for Windspeed vs. Number of rented bikes'),
]
for slot, (feature, colour, title) in enumerate(reg_panels, start=1):
    axes = fig.add_subplot(1, 3, slot)
    sns.regplot(data=data, x=feature, y='Number of rented bikes', ax=axes, color=colour)
    axes.set(title=title)
plt.show()

# Showing the distribution of numerical variables

# Number of rented bikes

In [43]:
diagnostic_plots(data, 'Number of rented bikes')

# Temperatures

In [44]:
#showing distribution for temperature
diagnostic_plots(data, 'Temperature(C)')

# Humidity

In [45]:
diagnostic_plots(data, 'humidity(%)')

# Wind Speed

In [46]:
diagnostic_plots(data, 'Wind Speed(m/s)')

# Visibility

In [47]:
diagnostic_plots(data, 'Visibility(10m)')

# Dew Temperature

In [48]:
diagnostic_plots(data, 'Dew Temperature(C)')

# Solar Radiation

In [49]:
diagnostic_plots(data, 'Solar Radiation (MJ/m2)')

In [50]:
diagnostic_plots(data,'Rainfall(mm)')

# Snowfall

In [51]:
diagnostic_plots(data,'Snowfall (cm)')

In [52]:
data[data['Snowfall (cm)']>0].shape

In [53]:
data.shape   # the data size is much larger than number of outliers in data

## Handling Outliers

In [54]:
def find_skewed_boundaries(df, variable, distance):
    """Compute IQR-based outlier limits for a skewed column.

    Parameters
    ----------
    df : DataFrame holding the column.
    variable : name of the column in `df`.
    distance : multiple of the inter-quartile range added beyond the
        quartiles (e.g. 1.5 or 3).

    Returns
    -------
    (upper_boundary, lower_boundary) : Q3 + distance * IQR and
        Q1 - distance * IQR, in that order.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)

    margin = (third_quartile - first_quartile) * distance

    return third_quartile + margin, first_quartile - margin

In [55]:
# Find the upper and lower limits for wind speed (quartiles -/+ 1.5 x IQR).
# The bare tuple on the last line displays both limits as the cell output.
RM_upper_limit, RM_lower_limit = find_skewed_boundaries(data, 'Wind Speed(m/s)', 1.5)
RM_upper_limit, RM_lower_limit

In [56]:
# Cap wind speed at the limits computed above: values beyond a limit are
# replaced by that limit, everything else is kept as is.
wind_speed = data['Wind Speed(m/s)']
floored_wind_speed = np.where(wind_speed < RM_lower_limit, RM_lower_limit, wind_speed)
data['Wind Speed(m/s)'] = np.where(wind_speed > RM_upper_limit, RM_upper_limit, floored_wind_speed)

In [57]:
diagnostic_plots(data, 'Wind Speed(m/s)')

In [58]:
# Find the upper and lower limits for solar radiation (quartiles -/+ 1.5 x IQR).
# This overwrites the RM_* limits previously computed for wind speed.
RM_upper_limit, RM_lower_limit = find_skewed_boundaries(data, 'Solar Radiation (MJ/m2)', 1.5)
RM_upper_limit, RM_lower_limit

In [59]:
 # replace the outliers by the maximum and minimum limit

solar_radiation = data['Solar Radiation (MJ/m2)']
# Apply the lower limit first, then let the upper limit take precedence.
floored_solar_radiation = np.where(solar_radiation < RM_lower_limit, RM_lower_limit, solar_radiation)
data['Solar Radiation (MJ/m2)'] = np.where(solar_radiation > RM_upper_limit, RM_upper_limit, floored_solar_radiation)

In [60]:
diagnostic_plots(data, 'Solar Radiation (MJ/m2)')

In [61]:
# Find the upper and lower limits for snowfall, using the wider 3 x IQR distance.
# This overwrites the RM_* limits previously computed for solar radiation.
RM_upper_limit, RM_lower_limit = find_skewed_boundaries(data, 'Snowfall (cm)', 3)
RM_upper_limit, RM_lower_limit

In [62]:
# Cap snowfall at the 3 x IQR limits computed in the previous cell.
# Guard against a degenerate IQR: when Q1 == Q3 both limits collapse to the
# same value and capping would overwrite *every* row with that one constant,
# wiping out the feature. In that case the column is left untouched.
# NOTE(review): snowfall is presumably zero for most rows, which would make
# the IQR zero here - confirm from the limits displayed above.
if RM_upper_limit > RM_lower_limit:
    data['Snowfall (cm)']= np.where(data['Snowfall (cm)'] > RM_upper_limit, RM_upper_limit,
                           np.where(data['Snowfall (cm)'] < RM_lower_limit, RM_lower_limit,data['Snowfall (cm)']))

# showing correlation between variables

In [63]:
# Correlation heatmap after outlier capping.
# Only numeric/boolean columns are correlated; `data` still contains text
# columns (Seasons, Holiday, Functioning Day) that `corr` cannot handle.
plt.figure(figsize=(20,20))
correlation=data.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(correlation,annot=True)

In [64]:
# data.drop("Dew Temperature(C)",axis=1,inplace=True)

In [65]:
data.columns

# Applying additional feature engineering to the data: demand level, ideal, sticky

In [66]:
def additional(df):
    """Add engineered weather, comfort and cyclical-time features to `df` in place.

    Columns added:
      - ideal: 1 when temperature > 27 and wind speed < 30, else 0.
      - sticky: 1 on a functioning day with humidity >= 60, else 0.
      - solar_visibilty: 1 when solar radiation < 0.2 and visibility > 1600, else 0.
      - windchill, newweather: formulas combining temperature, wind and humidity.
      - hour_sin / hour_cos, month_sin / month_cos: cyclical encodings.
      - log_wind, log_hum: log1p of wind speed and humidity.

    Fixes relative to the previous version:
      - 'sticky' compared 'Functioning Day' with 1, but this function runs
        before that column is converted from 'Yes'/'No' to 1/0, so the flag
        was always 0. Both representations are accepted now.
      - The hour encoding divided by 23, which maps hour 23 onto the same
        point as hour 0. Hours 0-23 form a cycle of length 24.
      - The flags are computed with vectorised comparisons instead of a
        row-wise `apply` that indexed a tuple with a numpy boolean.

    Returns the same DataFrame object that was passed in.
    """
    temperature = df['Temperature(C)']
    wind = df['Wind Speed(m/s)']
    humidity = df['humidity(%)']

    df['ideal'] = ((temperature > 27) & (wind < 30)).astype(int)

    # Accept the raw text label as well as the encoded value.
    is_functioning = df['Functioning Day'].isin(['Yes', 1])
    df['sticky'] = (is_functioning & (humidity >= 60)).astype(int)

    df['solar_visibilty'] = (
        (df['Solar Radiation (MJ/m2)'] < 0.2) & (df['Visibility(10m)'] > 1600)
    ).astype(int)

    # The wind-chill formula takes wind speed in km/h (m/s * 3.6) raised to 0.16.
    wind_term = (wind * 3.6) ** 0.16
    df["windchill"] = 13.12 + 0.6215 * temperature - 11.37 * wind_term + 0.3965 * temperature * wind_term
    df['newweather'] = 17.3 + temperature - 0.11 * humidity + 0.34 * wind

    hour_angle = 2 * np.pi * df['Hour'] / 24.0
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    month_angle = 2 * np.pi * df['month'] / 12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    df['log_wind'] = np.log1p(wind)
    df['log_hum'] = np.log1p(humidity)

    return df

In [67]:
# Demand level by hour of day: 'No' on non-functioning days; otherwise
# 'High' at commute hours, 'Medium' at midday and 'Low' at night.
cond1 = data['Functioning Day']== 'Yes'
cond2 = data['Hour'].isin([7,8,9,17,18,19])
cond3 = data['Hour'].isin([10,11,12,13,14,15,16])
cond4 = data['Hour'].isin([0,1,2,3,4,5,6,20,21,22,23])

conditions =[~cond1,cond1&cond2,cond1&cond3,cond1&cond4]
vals = ['No','High','Medium','Low']
# np.select's implicit default is the integer 0, which cannot be combined
# with string choices on recent NumPy (TypeError). The conditions already
# cover every hour 0-23, so the default is only a fallback; passing it as the
# string '0' keeps the result a pure string column.
data['Map demand'] = np.select(conditions,vals,default='0')


In [68]:
data=additional(data)

In [69]:
data.head()

# Showing the distribution of categorical variables

In [70]:
# Names of all object-dtype (text) columns, as a plain list.
is_text_column = data.dtypes == 'object'
cat_features = data.columns[is_text_column].tolist()

In [71]:
sns.countplot(x=cat_features[0],data=data)
plt.show()

In [72]:
sns.countplot(x=cat_features[1],data=data)
plt.show()

In [73]:
sns.countplot(x=cat_features[2],data=data)
plt.show()

In [74]:
def count_category(name):
    """Print a column name, its value counts in the global `data` frame, and a separator line."""
    separator = "-------------------------------------------------"
    counts = data[name].value_counts()
    print(name, counts, separator, sep="\n")


for category_column in cat_features:
    count_category(category_column)

# EDA

In [75]:
count_bikes_by_hour = data.groupby("Hour")["Number of rented bikes"].sum()
print(count_bikes_by_hour)

In [76]:
count_bikes_by_hour.plot(kind="bar",
title="count bikes by hour")
plt.savefig('count bikes by hr')
plt.show()

In [77]:
count_bikes_by_Holiday = data.groupby("Holiday")["Number of rented bikes"].sum()
print(count_bikes_by_Holiday)

In [78]:
data.groupby("Holiday")["Number of rented bikes"].sum().plot(kind="bar")
plt.savefig("count bikes by holiday")
plt.show()

In [79]:
count_bikes_by_funcday = data.groupby("Functioning Day")["Number of rented bikes"].sum()
print(count_bikes_by_funcday)

In [80]:
data.groupby("Functioning Day")["Number of rented bikes"].sum().plot(kind="bar")
plt.savefig("count_bikes_by_funcday")
plt.show()

In [81]:
count_bikes_by_Season = data.groupby("Seasons")["Number of rented bikes"].sum()
print(count_bikes_by_Season)

In [82]:
count_bikes_by_Season.plot(kind="bar")

## conclusion

Bike rentals peak at 8 am and 6 pm, most probably the times when people travel to and from work.

Seasons ranked by number of rented bikes (descending order):

1)Autumn

2)Summer

3)Spring,

4)Winter

Most bikes are rented on functioning days and on days that are not holidays.

# Rolling Mean with window of 3

In [83]:
def rolling_mean(df, cols, window=3):
    """Add a trailing rolling-mean column for each column in `cols`.

    For every name in `cols`, a column '<name>_rolled' is written to `df`
    (in place) holding the mean over the current and preceding rows within
    `window`. The first `window - 1` rows are NaN.

    Returns the same DataFrame object that was passed in.
    """
    for source in cols:
        smoothed = df[source].rolling(window=window).mean()
        df[source + '_rolled'] = smoothed
    return df


In [84]:
data =  rolling_mean(data,['Temperature(C)','Wind Speed(m/s)','Rainfall(mm)',
                           'Snowfall (cm)','humidity(%)','Visibility(10m)'])


In [85]:
def shifting(df, cols, periods=-1, suffix='_shifted'):
    """Add shifted copies of the given columns to `df` in place.

    Parameters
    ----------
    df : DataFrame to extend.
    cols : iterable of column names to shift.
    periods : number of rows to shift by, passed to `Series.shift`. The
        default of -1 pulls each value from the next row, so the last row
        becomes NaN.
    suffix : text appended to each source column name to name the new column.

    The defaults reproduce the original behaviour ('<col>_shifted', one row
    ahead); `periods` and `suffix` allow other offsets without another
    copy of this function.

    Returns the same DataFrame object that was passed in.
    """
    for col in cols:
        df[col + suffix] = df[col].shift(periods=periods)
    return df
data =  shifting(data,['Temperature(C)','Wind Speed(m/s)','Rainfall(mm)', 'Snowfall (cm)','humidity(%)','Visibility(10m)'])
def shifting_1(df, cols):
    """Add '<col>_shifted1' columns holding each column's value from two rows ahead.

    `df` is modified in place and returned; the last two rows of every new
    column are NaN.
    """
    rows_ahead = -2
    for source in cols:
        df[source + '_shifted1'] = df[source].shift(periods=rows_ahead)
    return df
data =  shifting_1(data,['Temperature(C)','Wind Speed(m/s)','Rainfall(mm)', 'Snowfall (cm)'])

def shifting_2(df, cols):
    """Add '<col>_shifted2' columns holding each column's value from three rows ahead.

    `df` is modified in place and returned; the last three rows of every new
    column are NaN.
    """
    rows_ahead = -3
    for source in cols:
        df[source + '_shifted2'] = df[source].shift(periods=rows_ahead)
    return df
data =  shifting_2(data,['Temperature(C)','Wind Speed(m/s)','Rainfall(mm)', 'Snowfall (cm)'])

In [86]:
# Fill the NaNs introduced by the rolling/shifted features with each column's mean.
# numeric_only=True keeps the mean from being attempted on the text columns
# (Seasons, Holiday, Functioning Day, Map demand) that `data` still contains.
data=data.fillna(data.mean(numeric_only=True))

In [87]:
data.head()

# Converting Categorical Variables

In [88]:
def conv_cat_features(df):
    """Encode the categorical columns and return a fully numeric frame.

    'Holiday' ("No Holiday" -> 1, "Holiday" -> 0) and 'Functioning Day'
    ("Yes" -> 1, "No" -> 0) are encoded in `df` itself, so the caller's frame
    is updated as before. Every remaining text column is one-hot encoded with
    the first level dropped, and that encoded copy is returned.

    The binary columns are written back by assignment rather than with
    `df[col].replace(..., inplace=True)`: an in-place call on a column
    selection is not guaranteed to reach `df` (it does not under pandas
    copy-on-write). `infer_objects` makes the resulting integer dtype
    explicit so `get_dummies` does not treat the encoded columns as text.
    """
    binary_encodings = {
        'Holiday': {"No Holiday": 1, "Holiday": 0},
        'Functioning Day': {"Yes": 1, "No": 0},
    }
    for column, mapping in binary_encodings.items():
        df[column] = df[column].replace(mapping).infer_objects()

    data_cat_transformed = pd.get_dummies(df, drop_first=True)
    return data_cat_transformed

In [89]:
data_cat_transformed=conv_cat_features(data)
# Print the number of features after one-hot encoding
print("{} total features after one-hot encoding.".format(len(data_cat_transformed.columns)))
print(data_cat_transformed.columns)#

# Splitting the data into features and Target Label

In [90]:
# Split the data into features and target label
X = data_cat_transformed.drop('Number of rented bikes', axis=1)
y = data_cat_transformed['Number of rented bikes']
def custom_train_valid_split(data, cutoff_day=15):
    """Split a frame on its 'day' column.

    Rows with day <= `cutoff_day` go to the training part, rows with
    day > `cutoff_day` to the validation part.

    Returns (train, valid).
    """
    day_of_month = data['day']
    train_part = data[day_of_month.le(cutoff_day)]
    valid_part = data[day_of_month.gt(cutoff_day)]
    return train_part, valid_part

# Single train/validation fold: first half of each month vs. the rest,
# stored as one (train index, validation index) pair.
train, valid = custom_train_valid_split(data, cutoff_day=15)
train_idx, valid_idx = train.index, valid.index
myCViterator = [(train_idx, valid_idx)]

# Create Training Model

## Define Loss Function

In [91]:
def RMSLE(y_train,y_pred):
    """Return the root mean squared logarithmic error between targets and predictions."""
    squared_log_error = mean_squared_log_error(y_train, y_pred)
    return np.sqrt(squared_log_error)

In [92]:
def train_predict(regressor):
    '''
    Fit a regressor on the full training set and predict train and test targets.

    Reads the notebook-level globals `X`, `y` and `test_cat_transformed`
    (the latter is assigned in a later cell, so it must exist before this
    function is called).

    inputs:
       - regressor: the learning algorithm to be trained and predicted on

    returns:
       - results: dict with 'train_time' and 'pred_time' (seconds, 2 decimals),
         'RMSLE_train' (4 decimals), 'Rsquared_train' and 'MSE_train', all
         computed on the training data
       - y_pred_test: predictions for `test_cat_transformed`
    '''
    # Time the fit on the training data.
    fit_started = time()
    regressor = regressor.fit(X, y)
    fit_seconds = time() - fit_started

    # Time both prediction passes (train and test) together.
    predict_started = time()
    y_pred_train = regressor.predict(X)
    y_pred_test = regressor.predict(test_cat_transformed)
    predict_seconds = time() - predict_started

    results = {
        'train_time': round(fit_seconds, 2),
        'pred_time': round(predict_seconds, 2),
        'RMSLE_train': round(RMSLE(y, y_pred_train), 4),
        'Rsquared_train': r2_score(y, y_pred_train),
        'MSE_train': mean_squared_error(y, y_pred_train),
    }
    return results, y_pred_test

# Test Data

In [93]:
test_df=pd.read_csv('/kaggle/input/seoul-bike-rental-ai-pro-iti/test.csv')
test_df_copy=test_df.copy()

In [94]:
test_df.describe()

In [95]:
test_df.head()

In [96]:
test_df.info()

In [97]:
test_df.columns

In [98]:
# Apply the same header mapping as for the training data (keys come from the
# training frame's `col_names`).
test_renamed_columns = {
    col_names[4]: "Temperature(C)",
    col_names[5]: "humidity(%)",
    col_names[6]: "Wind Speed(m/s)",
    col_names[7]: "Visibility(10m)",
    col_names[8]: "Dew Temperature(C)",
    col_names[2]: "Number of rented bikes",
}
test_df = test_df.rename(columns=test_renamed_columns)

In [99]:
test_df=convert_date(test_df)

In [100]:
# Demand level by hour of day for the test set: 'No' on non-functioning
# days; otherwise 'High' at commute hours, 'Medium' at midday, 'Low' at night.
cond1 = test_df['Functioning Day']=='Yes'
cond2 = test_df['Hour'].isin([7,8,9,17,18,19])
cond3 = test_df['Hour'].isin([10,11,12,13,14,15,16])
cond4 = test_df['Hour'].isin([0,1,2,3,4,5,6,20,21,22,23])
conditions =[~cond1,cond1&cond2,cond1&cond3,cond1&cond4]
vals = ['No','High','Medium','Low']
# np.select's implicit default is the integer 0, which cannot be combined
# with string choices on recent NumPy (TypeError). The conditions already
# cover every hour 0-23, so the default is only a fallback; passing it as the
# string '0' keeps the result a pure string column.
test_df['Map demand'] = np.select(conditions,vals,default='0')

In [101]:
# Reuse the helper applied to the training data so both frames always drop
# the same columns ('ID' and 'Date').
test_df=drop_cols(test_df)

In [102]:
test_df=additional(test_df)

In [103]:
# Same rolling-mean and look-ahead features as for the training data.
test_df =  rolling_mean(test_df,['Temperature(C)','Wind Speed(m/s)','Rainfall(mm)', 'Snowfall (cm)','humidity(%)','Visibility(10m)'])
test_df =  shifting(test_df,['Temperature(C)','Wind Speed(m/s)','Rainfall(mm)', 'Snowfall (cm)','humidity(%)','Visibility(10m)'])
test_df =  shifting_1(test_df,['Temperature(C)','Wind Speed(m/s)','Rainfall(mm)', 'Snowfall (cm)'])
test_df =  shifting_2(test_df,['Temperature(C)','Wind Speed(m/s)','Rainfall(mm)', 'Snowfall (cm)'])
# Fill the NaNs created above with column means; numeric_only=True keeps the
# mean from being attempted on the text columns still present in `test_df`.
test_df=test_df.fillna(test_df.mean(numeric_only=True))

In [104]:
test_cat_transformed=conv_cat_features(test_df)

# Model Tuning

In [105]:
# parameters={
#  "learning_rate"    :np.arange(0.04,0.2,0.01)   ,                             # [0.05,0.06,0.065,0.07,0.075,0.08,0.085,0.09,0.10,0.125,0.15,0.2] ,
#   "max_depth"        :np.arange(4,16,1) ,                    #[4,5,6,7,8,9,10]
#   "min_child_weight" : np.arange(11,17, 1),  #[1,2,3,4,5,6,7,8,9,10],
#   "gamma"            : np.arange(2,10,0.5)  ,               #    [0.5,1,2,3,5,7,6,8,9,10],
#   "n_estimators"     :np.arange(2500,3500, 50),      # [500,700,800,900,1000,1200,1300,1400,1500,1600,1700,1800,1900,2000,2200,2500,2750],
#   "subsample"        :np.arange(0.7, 1, 0.02) ,
#   "colsample_bytree" :np.arange(0.4,1,0.05), # [0.7,0.75,0.8,0.85,0.9,0.95,1],
#   "tree_method"    : ["gpu_hist"]
#      }

In [106]:
# def model_tuning(reg, parameters):    

#     # Evaluate using neg_mean_squared_log_error 
#     #scorer = make_scorer(RMSLE,greater_is_better=False)

#     # Perform grid search on the regressor and tune parameters, using scorer evaluation
#     grid_obj = RandomizedSearchCV(estimator=reg, param_distributions=parameters,n_iter=300, 
#                                   scoring="neg_mean_squared_log_error",n_jobs=-1,cv=5,verbose=3)

#     # Fit the grid search object to the training data and find the optimal parameters
#     grid_fit = grid_obj.fit(X, y)

#     # Get the best estimator
#     best_clf = grid_fit.best_estimator_
#     print(f"best mean cross validation score: {grid_fit.best_score_}")
#     print(f"best parameters: {grid_fit.best_params_}")
    
#     # Make predictions using the optimized and the best model
#     y_best_pred = best_clf.predict(test_cat_transformed)
    
#     return best_clf,grid_fit, y_best_pred

In [107]:
# clf, best_clf,grid_fit, y_best_pred=model_tuning(regressor, parameters)

In [108]:
# best_clf

In [109]:
# grid_fit

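> After running hyperparameter tuning jobs on the XGBoost and CatBoost models, the optimum parameters reached were:
> - XGBoost: {subsample: 0.83, n_estimators: 3000, min_child_weight: 16, max_depth: 5, learning_rate: 0.055, gamma: 1, colsample_bytree: 0.5}
> - CatBoost: {iterations: 4000, l2_leaf_reg: 1, learning_rate: 0.054}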

In [110]:
X_train=X.copy()
X_test=test_cat_transformed.copy()

In [111]:
print(X_train.shape)
print(X_test.shape)

In [112]:
X_train_cat = X_train.copy()
X_test_cat = X_test.copy()

In [113]:
# Features left out of the XGBoost inputs; the same list is removed from the
# train and test matrices so their columns stay aligned.
xgb_dropped_features = [
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'woy', 'windchill',
    'newweather', 'Temperature(C)_rolled', 'Dew Temperature(C)', 'Visibility(10m)_rolled',
    'humidity(%)', 'Visibility(10m)_shifted', 'Wind Speed(m/s)_rolled', 'Seasons_Spring',
    'Seasons_Summer', 'Seasons_Winter', 'log_hum',
]
X_train_xgb = X_train.drop(columns=xgb_dropped_features)
X_test_xgb = X_test.drop(columns=xgb_dropped_features)

In [114]:
# Baseline XGBoost: Poisson objective only, all other settings at their defaults.
xgb_v = XGBRegressor(
   objective='count:poisson',random_state=42)

# Tuned XGBoost.
xgb= XGBRegressor(
    objective='count:poisson',random_state=42,
   subsample= 0.83, n_estimators= 3000,
   min_child_weight= 16, max_depth= 5,
   learning_rate= 0.055, gamma= 1, colsample_bytree= 0.5)

# Baseline CatBoost: Poisson objective only, mirroring `xgb_v`. It was
# previously configured identically to the tuned model, so the
# "vanilla vs. tuned" comparison trained the same model twice.
catboost_v=CatBoostRegressor(objective='Poisson', silent = True)
# Tuned CatBoost (the model used for the final predictions).
catboosttrial=CatBoostRegressor(objective='Poisson',iterations=4000,l2_leaf_reg=1,learning_rate=0.054,silent=True)
xgb_models={
    "XGB_vanella":xgb_v,
    "XGB_mod" : xgb }
cat_models ={
    "Catbbost_mod" :catboosttrial,
    "Catboost_Vanilla" :catboost_v}

In [115]:
columns = ['Classifier', 'RMSLE']

# Fit each CatBoost model and record its RMSLE on the data it was trained on
# (an in-sample score, not a validation score).
# Rows are collected in a list and turned into a DataFrame once:
# `DataFrame.append` no longer exists in pandas 2.x, and growing a frame row
# by row copies it on every iteration.
cat_score_rows = []
for name, model in cat_models.items():
    model.fit(X_train_cat, y)
    Y_predicted = model.predict(X_train_cat)
    rmsle = np.sqrt(mean_squared_log_error(y, Y_predicted))
    cat_score_rows.append([name, rmsle])
df_scores_cat = pd.DataFrame(cat_score_rows, columns=columns)

In [116]:
columns = ['Classifier', 'RMSLE']

# Fit each XGBoost model and record its RMSLE on the data it was trained on
# (an in-sample score, not a validation score).
# Rows are collected in a list and turned into a DataFrame once:
# `DataFrame.append` no longer exists in pandas 2.x, and growing a frame row
# by row copies it on every iteration.
xgb_score_rows = []
for name, model in xgb_models.items():
    model.fit(X_train_xgb, y)
    Y_predicted = model.predict(X_train_xgb)
    rmsle = np.sqrt(mean_squared_log_error(y, Y_predicted))
    xgb_score_rows.append([name, rmsle])
df_scores_xgb = pd.DataFrame(xgb_score_rows, columns=columns)

In [117]:
df_scores_cat.sort_values(by=['RMSLE'])

In [118]:
df_scores_xgb.sort_values(by=['RMSLE'])

# Plot Feature importance from Xgboostregressor without any hyperparameter tuning

In [119]:
from xgboost import plot_importance, plot_tree
# Create the sized figure and hand its axes to plot_importance. Without `ax`
# the function draws on a new default-sized figure and the 15 x 12 figure
# created here is left empty.
fig, ax = plt.subplots(figsize=(15,12))
plot_importance(xgb_v, ax=ax, height=0.9,max_num_features = 25)
plt.show()

# Plot Feature importance from Xgboostregressor with best parameters after hyperparameter tuning

In [120]:
# Create the sized figure and hand its axes to plot_importance. Without `ax`
# the function draws on a new default-sized figure and the 15 x 12 figure
# created here is left empty.
fig, ax = plt.subplots(figsize=(15,12))
plot_importance(xgb, ax=ax, height=0.9,max_num_features = 25)
plt.show()

# Getting the results on Test Data

In [121]:
y_cat_pred=catboosttrial.predict(X_test_cat)

In [122]:
y_xgb_pred=xgb.predict(X_test_xgb)

In [123]:
# Store each model's test predictions on the untouched copy of the test frame.
# astype('int') truncates the fractional part of each prediction (it does not round).
test_df_copy['yxgb']=y_xgb_pred.astype('int')
test_df_copy['ycat']=y_cat_pred.astype('int')

In [124]:
# Weighted blend of the two models' predictions, truncated to an integer count.
xgb_weight, cat_weight = 0.64, 0.36
yavg = xgb_weight*test_df_copy['yxgb'] + cat_weight*test_df_copy['ycat']
test_df_copy['y'] = yavg.astype('int')

In [125]:
test_df_copy[test_df_copy["Functioning Day"]=='No']

In [126]:
test_df_copy[['ID','y']].to_csv("submission.csv",index=False)