## Introduction to the Data Set

In [120]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
bike_rentals = pd.read_csv('bike_rental_hour.csv')

In [3]:
bike_rentals.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [4]:
bike_rentals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [6]:
bike_rentals['cnt'].value_counts()

5      260
6      236
4      231
3      224
2      208
      ... 
725      1
709      1
661      1
629      1
887      1
Name: cnt, Length: 869, dtype: int64

In [12]:
bike_rentals.corr()['cnt'].abs().sort_values()

weekday       0.026900
workingday    0.030284
holiday       0.030927
windspeed     0.093234
mnth          0.120638
weathersit    0.142426
season        0.178056
yr            0.250495
instant       0.278379
hum           0.322911
hr            0.394071
atemp         0.400929
temp          0.404772
casual        0.694564
registered    0.972151
cnt           1.000000
Name: cnt, dtype: float64

## Calculating Features

In [16]:
bike_rentals.hr.value_counts(bins=[0,6,12,18,24])

(-0.001, 6.0]    5001
(12.0, 18.0]     4375
(6.0, 12.0]      4363
(18.0, 24.0]     3640
Name: hr, dtype: int64

In [28]:
def hour_transform(x):
    if x >= 6 and x < 12:
        return 1
    elif x >= 12 and x < 18:
        return 2
    elif x >= 18 and x < 24:
        return 3
    else:
        return 4

bike_rentals['time_label'] = bike_rentals['hr'].apply(lambda x: hour_transform(x))

In [29]:
bike_rentals.sample(5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,time_label
8595,8596,2011-12-29,1,0,12,22,0,4,1,1,0.28,0.2879,0.65,0.1343,7,48,55,3
13689,13690,2012-07-29,3,1,7,14,0,0,0,1,0.8,0.7273,0.46,0.194,231,291,522,2
3624,3625,2011-06-04,2,0,6,22,0,6,0,1,0.64,0.6212,0.57,0.2239,49,141,190,3
15805,15806,2012-10-25,4,1,10,18,0,4,1,1,0.56,0.5303,0.83,0.1045,77,732,809,3
14719,14720,2012-09-10,3,1,9,12,0,1,1,1,0.64,0.6212,0.44,0.3881,76,256,332,2


## Splitting the Data Into Train and Test Sets

In [30]:
bike_rentals.corr()['cnt'].abs().sort_values()

weekday       0.026900
workingday    0.030284
holiday       0.030927
windspeed     0.093234
mnth          0.120638
weathersit    0.142426
season        0.178056
yr            0.250495
instant       0.278379
hum           0.322911
time_label    0.378318
hr            0.394071
atemp         0.400929
temp          0.404772
casual        0.694564
registered    0.972151
cnt           1.000000
Name: cnt, dtype: float64

In [55]:
bike_rentals.groupby(['weekday','time_label'])['cnt'].mean()

weekday  time_label
0        1             144.071656
         2             356.604762
         3             158.300000
         4              47.491857
1        1             211.293269
         2             265.839744
         3             237.283654
         4              15.991763
2        1             231.661765
         2             258.791332
         3             253.227564
         4              13.621212
3        1             237.123397
         2             252.945513
         3             255.048544
         4              15.804598
4        1             238.836570
         2             264.400000
         3             261.889423
         4              17.154351
5        1             235.245192
         2             297.391026
         3             228.089744
         4              21.295935
6        1             160.093651
         2             370.844444
         3             187.394231
         4              42.009554
Name: cnt, dtype: float64

In [56]:
bike_rentals.groupby('season')['cnt'].mean()

season
1    111.114569
2    208.344069
3    236.016237
4    198.868856
Name: cnt, dtype: float64

In [46]:
bike_rentals.groupby('weathersit')['cnt'].sum()

weathersit
1    2338173
2     795952
3     158331
4        223
Name: cnt, dtype: int64

In [73]:
cat_features = ['weekday', 'holiday', 'weathersit', 'time_label']

num_features = ['atemp', 'hum']

data = bike_rentals[cat_features+num_features].copy()

In [75]:
 for cat_feature in cat_features:
    data = pd.concat([data, pd.get_dummies(data[cat_feature], prefix=cat_feature)], axis=1)
data

Unnamed: 0,weekday,holiday,weathersit,time_label,atemp,hum,weekday_0,weekday_1,weekday_2,weekday_3,...,holiday_0,holiday_1,weathersit_1,weathersit_2,weathersit_3,weathersit_4,time_label_1,time_label_2,time_label_3,time_label_4
0,6,0,1,4,0.2879,0.81,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
1,6,0,1,4,0.2727,0.80,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
2,6,0,1,4,0.2727,0.80,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
3,6,0,1,4,0.2879,0.75,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
4,6,0,1,4,0.2879,0.75,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,0,2,3,0.2576,0.60,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
17375,1,0,2,3,0.2576,0.60,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
17376,1,0,1,3,0.2576,0.60,0,1,0,0,...,1,0,1,0,0,0,0,0,1,0
17377,1,0,1,3,0.2727,0.56,0,1,0,0,...,1,0,1,0,0,0,0,0,1,0


In [77]:
data = data.drop(cat_features, axis=1)

In [78]:
target = bike_rentals['cnt']

In [89]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

## Applying Linear Regression

In [121]:
lr = LinearRegression()
lr.fit(X_train, y_train)

preds = lr.predict(X_test)
print(mean_absolute_error(y_test, preds))
print(mean_squared_error(y_test, preds))

102.20771001150747
20652.174626006905


## Applying Decision Trees

In [122]:
dt = DecisionTreeRegressor(max_depth=5)

dt.fit(X_train, y_train)

preds = dt.predict(X_test)
print(mean_absolute_error(y_test, preds))
print(mean_squared_error(y_test, preds))

93.93313432570368
19487.9475712944


## Applying Random Forests

In [123]:
rf = RandomForestRegressor(n_estimators=100, max_depth=5, n_jobs=4)

rf.fit(X_train, y_train)

preds = rf.predict(X_test)
print(mean_absolute_error(y_test, preds))
print(mean_squared_error(y_test, preds))

92.547142614442
18853.651885259744


In [101]:
rf = RandomForestRegressor(n_estimators=150, min_samples_leaf=2, n_jobs=4)

rf.fit(X_train, y_train)

preds = rf.predict(X_test)
print(mean_absolute_error(y_test, preds))

85.47301599714558


In [107]:
rf = RandomForestRegressor(n_estimators=150, min_samples_leaf=4, n_jobs=4)

rf.fit(X_train, y_train)

preds = rf.predict(X_test)
print(mean_absolute_error(y_test, preds))

84.69018881307676


In [115]:
rf = RandomForestRegressor(n_estimators=150, min_samples_leaf=2, min_samples_split=13, n_jobs=4)

rf.fit(X_train, y_train)

preds = rf.predict(X_test)
print(mean_absolute_error(y_test, preds))

84.6700606328947
