In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from datetime import datetime
import warnings
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import json
from keras.regularizers import l2
import lightgbm as lgb
import pickle
warnings.filterwarnings('ignore')

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

Using TensorFlow backend.


## Reading

In [2]:
# reading data
c2g_data = pd.read_csv('data/interpol/c2g_interpol_demand.csv', index_col=0)

In [3]:
c2g_data.columns

Index(['tempC', 'precipMM', 'FeelsLikeC', 'uvIndex', 'visibility',
       'windspeedMiles', 'Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower', 'Monday', 'Tuesday', 'Wednesday', 'Thursday

In [4]:
c2g_data.drop(columns = ['hour_0', 'hour_1', 'hour_2', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       'hour_23', 'interpolate'], inplace=True)

In [5]:
c2g_data.index = pd.to_datetime(c2g_data.index)

TIME_FRAME_START = "2016-12-13 15:00:00"
TIME_FRAME_FINISH = "2017-02-25 17:00:00"

c2g_data = c2g_data.loc[TIME_FRAME_START:TIME_FRAME_FINISH]

## Multivar

In [6]:
PAST_LAGS = 24
FUTURE_LAGS = 12
TRAIN_VAL_SPLIT = 0.6
VAL_TEST_SPLIT = 0.8

In [7]:
def undo_one_hot(df, new_col_name, columns = []):
    new_df = df.copy().drop(columns=columns)
    
    def get_cat(row):
        for c in df.loc[:, columns].columns:
            if row[c]==1:
                return c
            
    new_df[new_col_name] = pd.Categorical(df.apply(get_cat, axis=1))
    
    return new_df

In [8]:
c2g_data = undo_one_hot(c2g_data, 'Weather', columns=['Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower'])
c2g_data = undo_one_hot(c2g_data, 'Weekday', columns=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday', 'Saturday', 'Sunday'])
c2g_data.head()

Unnamed: 0,tempC,precipMM,FeelsLikeC,uvIndex,visibility,windspeedMiles,travels,Weather,Weekday
2016-12-13 15:00:00,-2,0.0,-2,1,10,5,120.0,Sunny,Tuesday
2016-12-13 16:00:00,-2,0.0,-3,1,10,6,156.0,Sunny,Tuesday
2016-12-13 17:00:00,-2,0.0,-4,1,10,6,167.0,Sunny,Tuesday
2016-12-13 18:00:00,-3,0.0,-4,1,10,6,154.0,Sunny,Tuesday
2016-12-13 19:00:00,-3,0.0,-5,1,10,7,174.0,Clear,Tuesday


In [9]:
def gen_supervised_learning(df, past_lags, future_lags):
    X = df.copy()
    for lag in range(1, past_lags):
        X = X.join(df.shift(lag), rsuffix=f'_t-{lag}')
    
    y = pd.DataFrame(df.travels.copy())
    for lag in range(1, future_lags):
        y = y.join(df.travels.shift(lag), rsuffix=f'_t-{lag}')
        
    X = X.shift(future_lags)
    
    return X.iloc[past_lags + future_lags:], y.iloc[past_lags + future_lags:]

In [10]:
X, y = gen_supervised_learning(c2g_data, PAST_LAGS, 1)

In [11]:
def splitter(data, ratio):
    size = len(data)
    data_train = data.iloc[:int(ratio[0]*size)]
    data_val = data.iloc[int(ratio[0]*size):int(ratio[1]*size)]
    data_test = data.iloc[int(ratio[1]*size):]
    return data_train, data_val, data_test

In [12]:
X_train, X_val, X_test = splitter(X, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])
y_train, y_val, y_test = splitter(y, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])

In [13]:
train_data = lgb.Dataset(X_train, label=y_train)

In [14]:
validation_data = train_data.create_valid(X_val, label=y_val)

In [15]:
basic_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['mae', 'rmse'],
    'verbose': 0
}

param_grid = {
    "num_leaves": [32, 64, 128],
    "max_depth": [6, 8, 10],
    'learning_rate': [0.005, 0.001],
    "num_iterations": [15000],
    "n_estimators": [100, 200, 300],
    'feature_fraction': [0.9],
    'bagging_fraction': [0.7],
    'bagging_freq': [10],
}

fit_params = {
    'eval_set':[(X_val, y_val)],
    'eval_metric':'mae',
    "early_stopping_rounds":1000
}

In [16]:
gbm = lgb.LGBMRegressor(**basic_params)

In [17]:
grid = GridSearchCV(gbm, param_grid, verbose=1, cv=3, n_jobs=6)

In [18]:
# Uncomment to run
grid.fit(X_train, y_train, **fit_params)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  4.5min
[Parallel(n_jobs=6)]: Done 162 out of 162 | elapsed: 33.4min finished


[1]	valid_0's l1: 70.4295	valid_0's rmse: 87.1797
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's l1: 70.3677	valid_0's rmse: 87.1032
[3]	valid_0's l1: 70.3059	valid_0's rmse: 87.0268
[4]	valid_0's l1: 70.2446	valid_0's rmse: 86.9503
[5]	valid_0's l1: 70.1821	valid_0's rmse: 86.8729
[6]	valid_0's l1: 70.1204	valid_0's rmse: 86.7968
[7]	valid_0's l1: 70.0589	valid_0's rmse: 86.7207
[8]	valid_0's l1: 69.9974	valid_0's rmse: 86.6447
[9]	valid_0's l1: 69.9378	valid_0's rmse: 86.5707
[10]	valid_0's l1: 69.8768	valid_0's rmse: 86.4947
[11]	valid_0's l1: 69.8148	valid_0's rmse: 86.4179
[12]	valid_0's l1: 69.753	valid_0's rmse: 86.3412
[13]	valid_0's l1: 69.6911	valid_0's rmse: 86.2645
[14]	valid_0's l1: 69.6294	valid_0's rmse: 86.1881
[15]	valid_0's l1: 69.5696	valid_0's rmse: 86.1132
[16]	valid_0's l1: 69.5077	valid_0's rmse: 86.0368
[17]	valid_0's l1: 69.447	valid_0's rmse: 85.9613
[18]	valid_0's l1: 69.3854	valid_0's rmse: 85.8851
[19]	valid_0's l1: 69.3243	val

GridSearchCV(cv=3, error_score=nan,
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, metric=['mae', 'rmse'],
                                     min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective='regression', random_state=None,
                                     reg_alpha=...
                                     subsample_freq=0, task='train',
                                     verbose=0),
             iid='deprecated', n_jobs=6,
             param_grid={'bagging_fraction': [0.7], 'bagging_freq': [10],
                         'feature_fraction': [0.9],
                         'learning

In [19]:
best_model = grid.best_estimator_
best_model

LGBMRegressor(bagging_fraction=0.7, bagging_freq=10, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
              importance_type='split', learning_rate=0.001, max_depth=10,
              metric=['mae', 'rmse'], min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
              n_jobs=-1, num_iterations=15000, num_leaves=32,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, task='train',
              verbose=0)

In [20]:
pickle.dump(best_model, open(r'models/c2g_model_light_gbm.sav', 'wb'))

In [21]:
grid.best_estimator_.predict(X_test)

array([136.5666317 , 110.53513387,  78.78730978,  47.14971617,
        31.18066809,  12.55124885,  17.23714109,  35.28024357,
        50.09170371,  86.7018892 , 119.77998246, 114.51036939,
       147.00568743, 158.67704264, 160.21301471, 180.41487444,
       184.97657921, 189.45843604, 223.13561286, 209.84910745,
       242.9087319 , 246.26704181, 214.58612344, 191.0775396 ,
       138.26328818, 126.66478872,  94.31166643,  65.9368535 ,
        33.71480305,  14.684157  ,  13.73219256,  19.9880131 ,
        41.1955799 ,  81.84011739, 119.26537675, 131.84878621,
       149.97932135, 169.29319363, 170.98094083, 172.17209376,
       183.24880063, 197.50503623, 226.57487978, 230.21083812,
       228.50997021, 213.78709713, 190.00149863, 131.08172429,
       133.0374962 ,  85.88071022,  62.95028118,  46.34655521,
        15.96368822,  10.85871606,  14.62876932,  24.42072242,
        54.7276928 ,  92.56238057, 132.05319987, 136.02510298,
       125.34373976, 139.68132972, 155.14941595, 164.73

## Univar

In [22]:
unic2g_data = pd.DataFrame(c2g_data.travels)
unic2g_data.head()

Unnamed: 0,travels
2016-12-13 15:00:00,120.0
2016-12-13 16:00:00,156.0
2016-12-13 17:00:00,167.0
2016-12-13 18:00:00,154.0
2016-12-13 19:00:00,174.0


In [23]:
X, y = gen_supervised_learning(unic2g_data, PAST_LAGS, 1)

In [24]:
X_train, X_val, X_test = splitter(X, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])
y_train, y_val, y_test = splitter(y, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])

In [25]:
param_grid = {
    "num_leaves": [32, 64, 128],
    "max_depth": [3, 4, 6, 8],
    'learning_rate': [0.005, 0.001],
    "num_iterations": [15000],
    "n_estimators": [40, 70, 100, 200],
    'feature_fraction': [0.9],
    'bagging_fraction': [0.7],
    'bagging_freq': [10],
}

fit_params = {
    'eval_set':[(X_val, y_val)],
    'eval_metric':'mae',
    "early_stopping_rounds":1000
}

In [26]:
grid = GridSearchCV(gbm, param_grid, verbose=1, cv=3, n_jobs=6)

In [27]:
grid.fit(X_train, y_train, **fit_params)

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  4.2min
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed: 27.7min
[Parallel(n_jobs=6)]: Done 288 out of 288 | elapsed: 53.7min finished


[1]	valid_0's l1: 70.4281	valid_0's rmse: 87.1786
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's l1: 70.3665	valid_0's rmse: 87.1023
[3]	valid_0's l1: 70.3051	valid_0's rmse: 87.0258
[4]	valid_0's l1: 70.2436	valid_0's rmse: 86.9495
[5]	valid_0's l1: 70.1843	valid_0's rmse: 86.8753
[6]	valid_0's l1: 70.1252	valid_0's rmse: 86.8015
[7]	valid_0's l1: 70.0635	valid_0's rmse: 86.725
[8]	valid_0's l1: 70.0016	valid_0's rmse: 86.649
[9]	valid_0's l1: 69.9404	valid_0's rmse: 86.5733
[10]	valid_0's l1: 69.8794	valid_0's rmse: 86.4973
[11]	valid_0's l1: 69.8181	valid_0's rmse: 86.4209
[12]	valid_0's l1: 69.7582	valid_0's rmse: 86.3459
[13]	valid_0's l1: 69.6969	valid_0's rmse: 86.2696
[14]	valid_0's l1: 69.6357	valid_0's rmse: 86.1933
[15]	valid_0's l1: 69.5746	valid_0's rmse: 86.1172
[16]	valid_0's l1: 69.5142	valid_0's rmse: 86.0419
[17]	valid_0's l1: 69.4532	valid_0's rmse: 85.966
[18]	valid_0's l1: 69.3937	valid_0's rmse: 85.8914
[19]	valid_0's l1: 69.3328	vali

GridSearchCV(cv=3, error_score=nan,
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, metric=['mae', 'rmse'],
                                     min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective='regression', random_state=None,
                                     reg_alpha=...
                                     subsample_freq=0, task='train',
                                     verbose=0),
             iid='deprecated', n_jobs=6,
             param_grid={'bagging_fraction': [0.7], 'bagging_freq': [10],
                         'feature_fraction': [0.9],
                         'learning

In [28]:
best_model = grid.best_estimator_
best_model

LGBMRegressor(bagging_fraction=0.7, bagging_freq=10, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
              importance_type='split', learning_rate=0.001, max_depth=8,
              metric=['mae', 'rmse'], min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=40,
              n_jobs=-1, num_iterations=15000, num_leaves=32,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, task='train',
              verbose=0)

In [29]:
pickle.dump(best_model, open(r'models/unic2g_model_light_gbm.sav', 'wb'))

In [32]:
used_param_dict = {
    "TIME_FRAME_START":TIME_FRAME_START,
    "TIME_FRAME_FINISH":TIME_FRAME_FINISH,
    "TRAIN_VAL_SPLIT":TRAIN_VAL_SPLIT,
    "VAL_TEST_SPLIT":VAL_TEST_SPLIT,
    "PAST_LAGS":PAST_LAGS,
    "FUTURE_LAGS":FUTURE_LAGS
}

In [33]:
json_param = json.dumps(used_param_dict, indent = 4) 
with open(f'models/LightGBM_Data_Parameters.json', "w") as outfile: 
    outfile.write(json_param)