In [171]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
%matplotlib inline

In [122]:
# List the contents of the data directory that the cells below read from.
!ls 1.collect_data/data_files/

15min.pkl                      2018.pkl
2012.csv.zip                   2019.csv.zip
2012.pkl                       2019.pkl
2013.csv.zip                   48hrs.pkl
2013.pkl                       5min.2019.pkl
2014.csv.zip                   5min.pkl
2014.pkl                       Annual_Parking_Study_Data.csv
2015.csv.zip                   Blockface.csv
2015.pkl                       fixed.zip
2016.csv.zip                   kaggle_usholidays.csv
2016.pkl                       make_five_min_freq.py
2017.csv.zip                   [34mtest[m[m
2017.pkl                       transpose_and_trim_five_min.py
2018.csv.zip                   [34mweather[m[m


In [123]:
# Load the pickled 15-minute table and drop every row that contains a missing value.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
df = pd.read_pickle('1.collect_data/data_files/15min.pkl').dropna()

In [124]:
# Preview the first rows of the freshly loaded frame (index and columns).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PaidOccupancy,ParkingSpaceCount,PercentOccupied
SourceElementKey,OccupancyDateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,2012-01-03 09:00:00,0.133333,7.0,0.019048
1001,2012-01-03 09:15:00,1.0,7.0,0.142857
1001,2012-01-03 09:30:00,1.0,7.0,0.142857
1001,2012-01-03 09:45:00,1.8,7.0,0.257143
1001,2012-01-03 10:00:00,1.933333,7.0,0.27619


In [125]:
# Turn the index levels into ordinary columns so they can be used as features.
# Run this cell once per load: resetting an already-flat frame would add a
# spurious "index" column.
df = df.reset_index()

In [126]:
# Preview the frame again now that the index levels are regular columns.
df.head()

Unnamed: 0,SourceElementKey,OccupancyDateTime,PaidOccupancy,ParkingSpaceCount,PercentOccupied
0,1001,2012-01-03 09:00:00,0.133333,7.0,0.019048
1,1001,2012-01-03 09:15:00,1.0,7.0,0.142857
2,1001,2012-01-03 09:30:00,1.0,7.0,0.142857
3,1001,2012-01-03 09:45:00,1.8,7.0,0.257143
4,1001,2012-01-03 10:00:00,1.933333,7.0,0.27619


In [127]:
# Sanity check: day-of-week values of the last five timestamps (pandas uses Monday=0).
df.tail().OccupancyDateTime.dt.dayofweek

144176748    4
144176749    4
144176750    4
144176751    4
144176752    4
Name: OccupancyDateTime, dtype: int64

In [128]:
# Derive calendar features from the observation timestamp, one column per
# datetime component (column order matches the list below).
timestamp_parts = df['OccupancyDateTime'].dt
for column_name, dt_attribute in (
        ('Month', 'month'),
        ('Year', 'year'),
        ('Hour', 'hour'),
        ('DayOfWeek', 'dayofweek'),
        ('Day', 'day')):
    df[column_name] = getattr(timestamp_parts, dt_attribute)

In [129]:
# Constant column; it is one of the two inputs of the baseline model below.
df['Dummy'] = 0

In [130]:
# Preview the frame with the calendar features and the Dummy column added.
df.head()

Unnamed: 0,SourceElementKey,OccupancyDateTime,PaidOccupancy,ParkingSpaceCount,PercentOccupied,Month,Year,Hour,DayOfWeek,Day,Dummy
0,1001,2012-01-03 09:00:00,0.133333,7.0,0.019048,1,2012,9,1,3,0
1,1001,2012-01-03 09:15:00,1.0,7.0,0.142857,1,2012,9,1,3,0
2,1001,2012-01-03 09:30:00,1.0,7.0,0.142857,1,2012,9,1,3,0
3,1001,2012-01-03 09:45:00,1.8,7.0,0.257143,1,2012,9,1,3,0
4,1001,2012-01-03 10:00:00,1.933333,7.0,0.27619,1,2012,10,1,3,0


In [131]:
# y = df['PercentOccupied'].values
# X = df[['Dummy', 'SourceElementKey']]

In [132]:
# Hold out a random 30% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): rows are time-ordered observations, so a random split puts
# neighbouring timestamps of the same block face in both sets -- confirm this
# is intended rather than a split by date.
df_train, df_test = train_test_split(
    df, test_size=0.3, random_state=42)

In [133]:
# Inputs of the baseline model: the constant Dummy column plus the block-face key.
baseline_columns = ['Dummy', 'SourceElementKey']

In [137]:
# Wrap both splits as LightGBM datasets with PercentOccupied as the target.
# The test dataset references the training one.
# NOTE(review): lgb_test is not passed to lgb.train in the cells shown here.
lgb_train = lgb.Dataset(data=df_train[baseline_columns], label=df_train['PercentOccupied'])
lgb_test = lgb.Dataset(data=df_test[baseline_columns], label=df_test['PercentOccupied'], reference=lgb_train)

In [138]:
# Every entry is commented out, so this is an empty dict and LightGBM trains
# with its default settings. The same dict is passed to both lgb.train calls.
params = {
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': {'mape'},
#     'num_leaves': 31,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'verbose': 0
}

In [139]:
# Fit the baseline booster on the training split.
print('Starting training...')
gbm = lgb.train(params=params, train_set=lgb_train)

# Persist the fitted booster as a text file next to the notebook.
print('Saving model...')
gbm.save_model('baseline_model.txt')

Starting training...
Saving model...


<lightgbm.basic.Booster at 0x118ac7d68>

In [140]:
# Score the held-out rows with the baseline model, using the same feature
# columns it was trained on. The previous argument, `X_test`, is never defined
# in this notebook, so the cell only worked with leftover kernel state.
y_pred = gbm.predict(df_test[baseline_columns], num_iteration=gbm.best_iteration)

In [141]:
def smape_error(forecast, actual):
    """Symmetric mean absolute percentage error, on a 0-100 scale.

    Computes ``100 / n * sum(|forecast - actual| / (|forecast| + |actual|))``.
    The denominator is not halved, so the result lies between 0 (perfect)
    and 100 (every pair disagrees maximally).

    Parameters
    ----------
    forecast, actual : array-like of numbers
        Predicted and observed values, compared position by position.
        pandas objects are converted to plain arrays first, so values are
        never re-aligned by index label.

    Returns
    -------
    float
        The SMAPE in percent; ``nan`` for empty input.

    Notes
    -----
    A pair where forecast and actual are both zero is a perfect prediction
    and contributes 0 rather than the NaN that 0/0 would produce.
    NaN values in the inputs propagate to the result.
    """
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)

    numerator = np.absolute(forecast - actual)
    denominator = np.absolute(forecast) + np.absolute(actual)

    num_samples = numerator.size
    if num_samples == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return float('nan')

    # Divide only where the denominator is non-zero; the remaining (0/0)
    # positions keep the 0 they were initialised with.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(denominator),
                      where=denominator != 0)

    return 100 / num_samples * np.sum(ratio)

In [142]:
# Evaluate against the held-out targets of the same split the predictions
# were made on. The previous argument, `y_test`, is never defined in this
# notebook.
baseline_smape = smape_error(y_pred, df_test['PercentOccupied'])
print('smape: %f' % baseline_smape)

smape: 38.330624


# Simple Model

In [143]:
# Sanity check: day-of-month values of the last five timestamps.
df.tail().OccupancyDateTime.dt.day

144176748    30
144176749    30
144176750    30
144176751    30
144176752    30
Name: OccupancyDateTime, dtype: int64

In [144]:
# Inputs of the simple model: the block-face key plus the calendar features derived from the timestamp.
simple_columns = ['SourceElementKey', 'Year', 'Month', 'Day', 'Hour', 'DayOfWeek']

In [145]:
# Same split as the baseline, wrapped as LightGBM datasets with the simple feature set.
# NOTE(review): simple_test is not passed to lgb.train in the cells shown here.
simple_train = lgb.Dataset(data=df_train[simple_columns], label=df_train['PercentOccupied'])
simple_test = lgb.Dataset(data=df_test[simple_columns], label=df_test['PercentOccupied'], reference=simple_train)

In [146]:
# Fit the simple-feature booster on the training split.
print('Starting training...')
gbm_simple = lgb.train(params=params, train_set=simple_train)

# Persist the fitted booster as a text file next to the notebook.
print('Saving model...')
gbm_simple.save_model('simple_ml_model.txt')

Starting training...
Saving model...


<lightgbm.basic.Booster at 0x1161b9860>

In [147]:
# Score the held-out rows with the simple model. It must be given the same
# six feature columns it was trained on; the previous argument, `X_test`, is
# never defined in this notebook.
y_pred = gbm_simple.predict(df_test[simple_columns], num_iteration=gbm_simple.best_iteration)

In [149]:
# Evaluate against the held-out targets of the same split the predictions
# were made on. The previous argument, `y_test`, is never defined in this
# notebook.
simple_smape = smape_error(y_pred, df_test['PercentOccupied'])
print('smape: %f' % simple_smape)

smape: 39.558633


# Baseline Timeseries

In [162]:
# shift values by one
pred_values = pd.concat([pd.Series([0]), df['PercentOccupied']])
timeseries_smape = smape_error(pred_values.values[:-1], df['PercentOccupied'])
print('smape: %f' % timeseries_smape)

smape: 12.955439


In [170]:
# shift values by four (i.e. 1 hour)
pred_values = pd.concat([pd.Series([0,0,0,0]), df['PaidOccupancy']])
timeseries_smape = smape_error(pred_values.values[:-4], df['PaidOccupancy'])
print('smape: %f' % timeseries_smape)

smape: 28.535425


In [164]:
def using_Grouper(df, freq='1D'):
    """Sum a MultiIndexed frame into fixed time buckets per first-level key.

    Rows are grouped by the values of the first index level together with
    the last index level bucketed at ``freq``; every column is then summed
    within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex whose last level is datetime-like.
    freq : str, default '1D'
        Bucket width handed to ``pd.Grouper``. The default keeps the
        original daily aggregation.

    Returns
    -------
    pandas.DataFrame
        The summed columns, indexed by first-level value and bucket label.

    Notes
    -----
    Every column is summed, so columns that are not additive over time
    end up holding per-bucket totals rather than averages.
    """
    first_level = df.index.get_level_values(0)
    time_bucket = pd.Grouper(freq=freq, level=-1)
    return df.groupby([first_level, time_bucket]).sum()

In [167]:
# Read the pickle again rather than reusing `df`: using_Grouper groups on the
# index levels, and `df` has had its index turned into columns above.
# The freshly loaded frame has incomplete rows dropped and is then summed per day.
daily_data = using_Grouper(pd.read_pickle('1.collect_data/data_files/15min.pkl').dropna())


In [168]:
# Preview the daily aggregates; every column is a per-day sum.
daily_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PaidOccupancy,ParkingSpaceCount,PercentOccupied
SourceElementKey,OccupancyDateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,2012-01-03,65.019349,227.468172,9.382775
1001,2012-01-04,54.146468,253.898471,7.871416
1001,2012-01-05,66.146468,253.898471,9.585702
1001,2012-01-06,75.914688,254.290896,10.970685
1001,2012-01-07,64.4,259.0,9.2


In [169]:
# shift values by one
pred_values = pd.concat([pd.Series([0]), daily_data['PaidOccupancy']])
timeseries_smape = smape_error(pred_values.values[:-1], daily_data['PaidOccupancy'])
print('smape: %f' % timeseries_smape)

smape: 19.771590
