## Bike rental demand prediction using Machine Learning

### Preprocessing: 
To make the data suitable for machine learning, we apply some preprocessing: handling missing data, transforming some columns, and so on.

* Use one-hot encoding (e.g. `pd.get_dummies()`) to convert ordinal, binary and all other categorical columns to numeric columns
* Data transformation (optional): standardization, normalization, log or sqrt transforms, especially if you are using distance-based algorithms such as KNN, or neural networks.


### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)
# Default figure size (width, height) in inches for every plot.
plt.rcParams.update({'figure.figsize': (12.0, 5.0)})

In [2]:
# Load the training data; the first row of the file holds the column names.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0
# (it now raises TypeError). `on_bad_lines='skip'` is the replacement and
# likewise drops malformed rows instead of aborting the read.
df = pd.read_csv('../data/train.csv', header=0, on_bad_lines='skip')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
# Number of distinct values per column; the bare expression is shown as the
# cell output.
df.nunique()

datetime      10886
season            4
holiday           2
workingday        2
weather           4
temp             49
atemp            60
humidity         89
windspeed        28
casual          309
registered      731
count           822
dtype: int64

In [4]:
def preprocessing(df):
    """Turn the raw bike-rental frame into a model-ready feature matrix.

    Steps, applied in order:

    1. parse ``datetime``, use it as the index and derive ``month``,
       ``day``, ``weekday`` and ``hour``;
    2. add ``day_typ`` ('weekend' / 'holiday' / 'workday') and ``mtemp``
       (row mean of ``temp`` and ``atemp``);
    3. min-max scale ``mtemp``, ``humidity`` and ``windspeed``;
    4. one-hot encode ``weekday``, ``season``, ``day_typ``, ``workingday``
       and ``weather``;
    5. drop the raw columns superseded by the derived ones, plus
       ``casual`` and ``registered``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``datetime``, ``season``, ``holiday``, ``workingday``,
        ``weather``, ``temp``, ``atemp``, ``humidity`` and ``windspeed``.
        ``casual``, ``registered`` and ``count`` are used when present and
        may be absent (e.g. a frame without targets).

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``datetime`` with columns ``hour``,
        ``humidity``, ``windspeed``, ``count`` (if supplied), ``mtemp`` and
        the one-hot columns. The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    scaled_features = ['mtemp', 'humidity', 'windspeed']
    cat_features = ['weekday', 'season', 'day_typ', 'workingday', 'weather']
    # Target-side columns: kept in their usual position when present, but
    # not required.
    target_cols = ['casual', 'registered', 'count']

    def parse_datetime(df):
        """Index by timestamp and add month/day/weekday/hour columns."""
        # assign() returns a new frame, so the caller's DataFrame is untouched.
        df = df.assign(datetime=pd.to_datetime(df['datetime']))
        stamp = df['datetime'].dt
        df['month'] = stamp.month
        df['day'] = stamp.day
        df['weekday'] = stamp.dayofweek
        df['hour'] = stamp.hour

        ordered = ['month', 'day', 'weekday', 'hour', 'season', 'holiday',
                   'workingday', 'weather', 'temp', 'atemp', 'humidity',
                   'windspeed']
        ordered += [col for col in target_cols if col in df.columns]
        # copy() so the later column assignments never write into a slice.
        return df.set_index('datetime')[ordered].copy()

    def feature_transformation(df):
        """Add the day_typ label and the mean temperature mtemp."""
        no_holiday = df['holiday'] == 0
        df.loc[no_holiday & (df['workingday'] == 0), 'day_typ'] = 'weekend'
        df.loc[df['holiday'] == 1, 'day_typ'] = 'holiday'
        df.loc[no_holiday & (df['workingday'] == 1), 'day_typ'] = 'workday'

        # Vectorized row mean of the two temperature readings.
        df['mtemp'] = df[['temp', 'atemp']].mean(axis=1)
        return df

    def normalize(df):
        """Min-max scale the numeric features to the [0, 1] range."""
        for col in scaled_features:
            low = df[col].min()
            span = df[col].max() - low
            # A constant column has zero range; divide by 1 so it maps to
            # 0.0 instead of becoming NaN through 0/0.
            df[col] = (df[col] - low) / (span if span != 0 else 1)
        return df

    def feature_encoding(df):
        """Replace each categorical column with its one-hot columns."""
        # A single get_dummies call appends the indicator columns in
        # cat_features order and removes the encoded source columns. Unlike
        # repeated index joins it cannot multiply rows when timestamps repeat.
        return pd.get_dummies(df, columns=cat_features)

    def unwanted_cols(df):
        """Drop raw columns that are not used for modeling."""
        superseded = ['month', 'day', 'holiday', 'temp', 'atemp']
        # casual and registered are removed whenever they were supplied.
        extra = [col for col in ('casual', 'registered') if col in df.columns]
        return df.drop(columns=superseded + extra)

    for step in (parse_datetime, feature_transformation, normalize,
                 feature_encoding, unwanted_cols):
        df = step(df)
    return df

In [5]:
# Rebind df to the preprocessed frame; the bare name on the last line makes
# the notebook display it.
df = preprocessing(df)
df

Unnamed: 0_level_0,hour,humidity,windspeed,count,mtemp,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,...,season_4,day_typ_holiday,day_typ_weekend,day_typ_workday,workingday_0,workingday_1,weather_1,weather_2,weather_3,weather_4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-01 00:00:00,0,0.81,0.000000,16,0.272181,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
2011-01-01 01:00:00,1,0.80,0.000000,40,0.253199,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
2011-01-01 02:00:00,2,0.80,0.000000,32,0.253199,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
2011-01-01 03:00:00,3,0.75,0.000000,13,0.272181,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
2011-01-01 04:00:00,4,0.75,0.000000,1,0.272181,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,19,0.50,0.456213,336,0.404818,0,0,1,0,0,...,1,0,0,1,0,1,1,0,0,0
2012-12-19 20:00:00,20,0.57,0.263195,241,0.367694,0,0,1,0,0,...,1,0,0,1,0,1,1,0,0,0
2012-12-19 21:00:00,21,0.61,0.263195,168,0.339641,0,0,1,0,0,...,1,0,0,1,0,1,1,0,0,0
2012-12-19 22:00:00,22,0.61,0.105325,129,0.357842,0,0,1,0,0,...,1,0,0,1,0,1,1,0,0,0


In [6]:
# Show the full list of column names of the preprocessed frame.
df.columns.values

array(['hour', 'humidity', 'windspeed', 'count', 'mtemp', 'weekday_0',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'season_1', 'season_2', 'season_3', 'season_4',
       'day_typ_holiday', 'day_typ_weekend', 'day_typ_workday',
       'workingday_0', 'workingday_1', 'weather_1', 'weather_2',
       'weather_3', 'weather_4'], dtype=object)