In [1]:
import datetime
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
import seaborn as sns

from functools import partial
from hyperopt import hp
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

%matplotlib inline

In [2]:
train_df = pd.read_csv('../data/raw/train.csv', parse_dates=['datetime'])
test_df = pd.read_csv('../data/raw/test.csv', parse_dates=['datetime'])

In [3]:
print('train df:')
print('shape:', train_df.shape)
print('columns:', list(train_df.columns))
print('\ntest df:')
print('shape:', test_df.shape)
print('columns:', list(test_df.columns))

train df:
shape: (10886, 12)
columns: ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']

test df:
shape: (6493, 9)
columns: ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed']


**Data Fields**
* datetime - hourly date + timestamp  
* season - 
    * 1 = spring, 
    * 2 = summer, 
    * 3 = fall, 
    * 4 = winter 
* holiday - whether the day is considered a holiday
* workingday - whether the day is neither a weekend nor holiday
* weather - 
    * 1: Clear, Few clouds, Partly cloudy, Partly cloudy 
    * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
    * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
    * 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 
* temp - temperature in Celsius
* atemp - "feels like" temperature in Celsius
* humidity - relative humidity
* windspeed - wind speed
* casual - number of non-registered user rentals initiated
* registered - number of registered user rentals initiated
* count - number of total rentals

In [4]:
# add missing columns in the test dataset
test_df['casual'] = np.NaN
test_df['registered'] = np.NaN
test_df['count'] = np.NaN

In [5]:
train_df.sample(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
8906,2012-08-13 11:00:00,3,0,1,1,31.16,34.09,40,6.0032,94,151,245
7125,2012-04-15 06:00:00,2,0,0,1,20.5,24.24,63,8.9981,5,11,16
9743,2012-10-10 08:00:00,4,0,1,1,20.5,24.24,77,11.0014,33,806,839


In [6]:
test_df.sample(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
3213,2011-12-31 14:00:00,1,0,0,1,18.86,22.725,51,22.0028,,,
2115,2011-08-31 16:00:00,3,0,1,1,30.34,33.335,42,8.9981,,,
4654,2012-06-22 18:00:00,3,0,1,2,32.8,37.88,55,12.998,,,


#### Feature Engineering

In [7]:
# The distribution of values for the 'count' column is highly 
# right skewed, so we need to transform data to get better results.
log_target_variable = True

if log_target_variable == True:
    train_df['count'] = train_df['count'].apply(lambda x:np.log(x))
    test_df['count'] = test_df['count'].apply(lambda x:np.log(x))
#sns.distplot(train_df['count'])
#plt.show()

In [8]:
train_df['year'] = train_df['datetime'].dt.year
train_df['month'] = train_df['datetime'].dt.month
train_df['day'] = train_df['datetime'].dt.day
train_df['hour'] = train_df['datetime'].dt.hour
train_df['dayofweek'] = train_df['datetime'].dt.dayofweek
train_df['mnth+day'] = train_df['datetime'].apply(lambda x:str(x.month) + '_' + str(x.day))

test_df['year'] = test_df['datetime'].dt.year
test_df['month'] = test_df['datetime'].dt.month
test_df['day'] = test_df['datetime'].dt.day
test_df['hour'] = test_df['datetime'].dt.hour
test_df['dayofweek'] = test_df['datetime'].dt.dayofweek
test_df['mnth+day'] = test_df['datetime'].apply(lambda x:str(x.month) + '_' + str(x.day))

#### Feature Selection

In [9]:
# TODO: here we could automatically select best features using i.e. random forest

In [10]:
# reference https://www.kaggle.com/miteshyadav/comprehensive-eda-with-xgboost-top-10-percentile
#features_to_drop = ['datetime','temp','windspeed','casual','registered','mnth+day','day']
features_to_drop = ['datetime']

train_df = train_df.drop(features_to_drop, axis=1)
test_df = test_df.drop(features_to_drop, axis=1)


In [11]:
# Encode categorical columns
def encode_cat_columns(df):
    weather_df = pd.get_dummies(df['weather'], prefix='w',drop_first=True)
    #year_df    = pd.get_dummies(df['year'], prefix='y',drop_first=True)
    #month_df   = pd.get_dummies(df['month'], prefix='m',drop_first=True)
    #hour_df    = pd.get_dummies(df['hour'], prefix='h',drop_first=True)
    season_df  = pd.get_dummies(df['season'], prefix='s',drop_first=True)

    df = df.join(weather_df)
    #df = df.join(year_df)
    #df = df.join(month_df)                     
    #df = df.join(hour_df)
    df = df.join(season_df)
    
    df.drop(['weather'], axis=1, inplace=True)
    #df.drop(['year'], axis=1, inplace=True)
    #df.drop(['month'], axis=1, inplace=True)
    #df.drop(['hour'], axis=1, inplace=True)
    df.drop(['season'], axis=1, inplace=True)
    
    return df

#train_df = encode_cat_columns(train_df)
#test_df = encode_cat_columns(test_df)

#### Modelling

In [12]:
X = train_df.iloc[:, train_df.columns!='count'].values
y = train_df['count'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
# scoring rule
def RMSLE(y_pred, y_test): 
    y_pred = y_pred.clip(0)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_test))**2))

In [14]:
# create suggested model (https://www.kaggle.com/miteshyadav/comprehensive-eda-with-xgboost-top-10-percentile)
# without hypertuning parameters
model = XGBRegressor(max_depth=8,min_child_weight=6,gamma=0.4,colsample_bytree=0.6,subsample=0.6)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("RMSLE: {}".format(RMSLE(y_pred, y_test)))

RMSLE: 0.028691949270849972


In [15]:
if log_target_variable == True:
    y_pred = np.exp(y_pred)
    y_test = np.exp(y_test)

# compare prediction with real values
def compare_y_and_y_pred(y_pred, y_test):
    d = {'y_pred' : y_pred,
         'y' : y_test}
    print(pd.DataFrame(data=d))

compare_y_and_y_pred(y_pred, y_test)

          y_pred      y
0     246.517365  244.0
1     250.575638  239.0
2     239.655457  229.0
3     481.374603  467.0
4     361.733154  335.0
5      42.564625   40.0
6     333.194641  329.0
7       2.065551    2.0
8     150.984131  141.0
9     422.621033  391.0
10    107.416206  104.0
11    132.745529  126.0
12    188.483566  202.0
13    137.803726  149.0
14    178.265030  187.0
15    259.390930  283.0
16    425.138245  413.0
17    109.909256  133.0
18     73.249954   68.0
19      4.283044    4.0
20     17.647654   20.0
21    459.413879  450.0
22    133.338806  136.0
23     21.653208   20.0
24     30.331596   31.0
25    110.520302  106.0
26     55.952267   49.0
27    294.698334  290.0
28    138.760880  148.0
29    134.716553  121.0
...          ...    ...
2148   88.565781   84.0
2149  293.942017  297.0
2150    7.589005    8.0
2151  369.855774  362.0
2152  182.335556  180.0
2153   18.574974   17.0
2154   33.353996   36.0
2155  227.456284  224.0
2156    3.065707    3.0
2157  289.739166

In [16]:
# feature importance
ft_cols = train_df.columns[train_df.columns!='count']
feat_importance_df = pd.DataFrame(model.feature_importances_, columns=['weights'], index=ft_cols) \
                           .sort_values(['weights'], ascending=[0])
feat_importance_df

Unnamed: 0,weights
registered,0.337742
casual,0.194303
hour,0.186165
workingday,0.04883
year,0.035605
month,0.031536
temp,0.030519
atemp,0.023398
dayofweek,0.023398
humidity,0.02238


In [17]:
print ("Export the model to model_regression.pkl")
f = open('../output/model_regression.pkl', 'wb')
pickle.dump(model, f)
f.close()

Export the model to model_regression.pkl
