In [5]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from datetime import datetime
import warnings
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import json
from keras.regularizers import l2
import lightgbm as lgb
import pickle
warnings.filterwarnings('ignore')

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

Using TensorFlow backend.


## Reading

In [6]:
# reading data
c2g_data = pd.read_csv('data/interpol/c2g_interpol_demand.csv', index_col=0)

In [7]:
c2g_data.columns

Index(['tempC', 'precipMM', 'FeelsLikeC', 'uvIndex', 'visibility',
       'windspeedMiles', 'Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower', 'Monday', 'Tuesday', 'Wednesday', 'Thursday

In [8]:
c2g_data.drop(columns = ['hour_0', 'hour_1', 'hour_2', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       'hour_23', 'interpolate'], inplace=True)

In [9]:
c2g_data.index = pd.to_datetime(c2g_data.index)

TIME_FRAME_START = "2016-12-13 15:00:00"
TIME_FRAME_FINISH = "2017-02-25 17:00:00"

c2g_data = c2g_data.loc[TIME_FRAME_START:TIME_FRAME_FINISH]

## Multivar

In [10]:
PAST_LAGS = 24
FUTURE_LAGS = 12
TRAIN_VAL_SPLIT = 0.8
VAL_TEST_SPLIT = 0.9

In [6]:
def undo_one_hot(df, new_col_name, columns = []):
    new_df = df.copy().drop(columns=columns)
    
    def get_cat(row):
        for c in df.loc[:, columns].columns:
            if row[c]==1:
                return c
            
    new_df[new_col_name] = pd.Categorical(df.apply(get_cat, axis=1))
    
    return new_df

In [7]:
c2g_data = undo_one_hot(c2g_data, 'Weather', columns=['Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower'])
c2g_data = undo_one_hot(c2g_data, 'Weekday', columns=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday', 'Saturday', 'Sunday'])
c2g_data.head()

Unnamed: 0,tempC,precipMM,FeelsLikeC,uvIndex,visibility,windspeedMiles,travels,Weather,Weekday
2016-12-13 15:00:00,-2,0.0,-2,1,10,5,120.0,Sunny,Tuesday
2016-12-13 16:00:00,-2,0.0,-3,1,10,6,156.0,Sunny,Tuesday
2016-12-13 17:00:00,-2,0.0,-4,1,10,6,167.0,Sunny,Tuesday
2016-12-13 18:00:00,-3,0.0,-4,1,10,6,154.0,Sunny,Tuesday
2016-12-13 19:00:00,-3,0.0,-5,1,10,7,174.0,Clear,Tuesday


In [8]:
def gen_supervised_learning(df, past_lags, future_lags):
    X = df.copy()
    for lag in range(1, past_lags):
        X = X.join(df.shift(lag), rsuffix=f'_t-{lag}')
    
    y = pd.DataFrame(df.travels.copy())
    for lag in range(1, future_lags):
        y = y.join(df.travels.shift(lag), rsuffix=f'_t-{lag}')
        
    X = X.shift(future_lags)
    
    return X.iloc[past_lags + future_lags:], y.iloc[past_lags + future_lags:]

In [9]:
X, y = gen_supervised_learning(c2g_data, PAST_LAGS, 1)

In [10]:
def splitter(data, ratio):
    size = len(data)
    data_train = data.iloc[:int(ratio[0]*size)]
    data_val = data.iloc[int(ratio[0]*size):int(ratio[1]*size)]
    data_test = data.iloc[int(ratio[1]*size):]
    return data_train, data_val, data_test

In [11]:
X_train, X_val, X_test = splitter(X, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])
y_train, y_val, y_test = splitter(y, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])

In [12]:
train_data = lgb.Dataset(X_train, label=y_train)

In [13]:
validation_data = train_data.create_valid(X_val, label=y_val)

In [14]:
basic_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['mae', 'rmse'],
    'verbose': 0
}

param_grid = {
    "num_leaves": [32, 64, 128],
    "max_depth": [6, 8, 10],
    'learning_rate': [0.005, 0.001],
    "num_iterations": [15000],
    "n_estimators": [100, 200, 300],
    'feature_fraction': [0.9],
    'bagging_fraction': [0.7],
    'bagging_freq': [10],
}

fit_params = {
    'eval_set':[(X_val, y_val)],
    'eval_metric':'mae',
    "early_stopping_rounds":1000
}

In [15]:
gbm = lgb.LGBMRegressor(**basic_params)

In [16]:
grid = GridSearchCV(gbm, param_grid, verbose=1, cv=3, n_jobs=6)

In [20]:
# Uncomment to run
grid.fit(X_train, y_train, **fit_params)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  5.8min
[Parallel(n_jobs=6)]: Done 162 out of 162 | elapsed: 78.4min finished


[1]	valid_0's l1: 74.6394	valid_0's rmse: 91.2462
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's l1: 74.2996	valid_0's rmse: 90.8474
[3]	valid_0's l1: 73.9615	valid_0's rmse: 90.4509
[4]	valid_0's l1: 73.6262	valid_0's rmse: 90.0604
[5]	valid_0's l1: 73.2876	valid_0's rmse: 89.662
[6]	valid_0's l1: 72.9551	valid_0's rmse: 89.2721
[7]	valid_0's l1: 72.6191	valid_0's rmse: 88.877
[8]	valid_0's l1: 72.2879	valid_0's rmse: 88.4896
[9]	valid_0's l1: 71.9852	valid_0's rmse: 88.1296
[10]	valid_0's l1: 71.6626	valid_0's rmse: 87.7544
[11]	valid_0's l1: 71.3368	valid_0's rmse: 87.3733
[12]	valid_0's l1: 71.0154	valid_0's rmse: 86.9957
[13]	valid_0's l1: 70.6958	valid_0's rmse: 86.62
[14]	valid_0's l1: 70.3803	valid_0's rmse: 86.2482
[15]	valid_0's l1: 70.0976	valid_0's rmse: 85.9087
[16]	valid_0's l1: 69.7914	valid_0's rmse: 85.5432
[17]	valid_0's l1: 69.4854	valid_0's rmse: 85.1781
[18]	valid_0's l1: 69.1795	valid_0's rmse: 84.812
[19]	valid_0's l1: 68.871	valid_0

GridSearchCV(cv=3, error_score=nan,
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, metric=['mae', 'rmse'],
                                     min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective='regression', random_state=None,
                                     reg_alpha=...
                                     subsample_freq=0, task='train',
                                     verbose=0),
             iid='deprecated', n_jobs=6,
             param_grid={'bagging_fraction': [0.7], 'bagging_freq': [10],
                         'feature_fraction': [0.9],
                         'learning

In [21]:
best_model = grid.best_estimator_
best_model

LGBMRegressor(bagging_fraction=0.7, bagging_freq=10, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
              importance_type='split', learning_rate=0.005, max_depth=8,
              metric=['mae', 'rmse'], min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
              n_jobs=-1, num_iterations=15000, num_leaves=32,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, task='train',
              verbose=0)

In [22]:
pickle.dump(best_model, open(r'models/c2g_model_light_gbm.sav', 'wb'))

In [23]:
grid.best_estimator_.predict(X_test)

array([ 44.64200036,  70.11765434, 113.3609176 , 176.28877166,
       220.77933532, 188.15507475, 207.03265264, 219.9274343 ,
       232.17771556, 238.18515723, 248.61008964, 260.53438436,
       314.96982224, 327.09297559, 317.38872353, 267.64953187,
       234.04108623, 202.38947597, 181.65679798, 136.93269171,
       106.82176768,  63.60184391,  40.51306912,  24.55255807,
        39.81372288,  52.61763454,  98.83507832, 112.3140385 ,
       172.75513986, 219.80002451, 224.9532166 , 227.21368659,
       218.74037272, 213.64536553, 226.34428123, 235.97471765,
       264.58773774, 281.02447611, 274.04008446, 186.43021476,
       151.90219467, 163.9085138 ,  86.78377503,  62.77977994,
        32.48856838,  19.87040326,  12.02083236,  26.56779083,
        61.15425343, 113.41060229, 221.43124817, 225.06814885,
       199.52701581, 175.29816129, 165.81472075, 148.54342268,
       185.57657023, 184.51538995, 223.22850305, 292.76007218,
       317.33293291, 339.68864875, 270.79868212, 208.10

## Univar

In [24]:
unic2g_data = pd.DataFrame(c2g_data.travels)
unic2g_data.head()

Unnamed: 0,travels
2016-12-13 15:00:00,120.0
2016-12-13 16:00:00,156.0
2016-12-13 17:00:00,167.0
2016-12-13 18:00:00,154.0
2016-12-13 19:00:00,174.0


In [25]:
X, y = gen_supervised_learning(unic2g_data, PAST_LAGS, 1)

In [26]:
X_train, X_val, X_test = splitter(X, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])
y_train, y_val, y_test = splitter(y, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])

In [27]:
param_grid = {
    "num_leaves": [32, 64, 128],
    "max_depth": [3, 4, 6, 8],
    'learning_rate': [0.005, 0.001],
    "num_iterations": [15000],
    "n_estimators": [40, 70, 100, 200],
    'feature_fraction': [0.9],
    'bagging_fraction': [0.7],
    'bagging_freq': [10],
}

fit_params = {
    'eval_set':[(X_val, y_val)],
    'eval_metric':'mae',
    "early_stopping_rounds":1000
}

In [28]:
grid = GridSearchCV(gbm, param_grid, verbose=1, cv=3, n_jobs=6)

In [29]:
grid.fit(X_train, y_train, **fit_params)

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  5.9min
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed: 41.7min
[Parallel(n_jobs=6)]: Done 288 out of 288 | elapsed: 73.5min finished


[1]	valid_0's l1: 74.9098	valid_0's rmse: 91.5614
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's l1: 74.8412	valid_0's rmse: 91.481
[3]	valid_0's l1: 74.7745	valid_0's rmse: 91.4027
[4]	valid_0's l1: 74.7061	valid_0's rmse: 91.3225
[5]	valid_0's l1: 74.6423	valid_0's rmse: 91.2463
[6]	valid_0's l1: 74.5788	valid_0's rmse: 91.1702
[7]	valid_0's l1: 74.5108	valid_0's rmse: 91.0889
[8]	valid_0's l1: 74.443	valid_0's rmse: 91.0094
[9]	valid_0's l1: 74.3749	valid_0's rmse: 90.9295
[10]	valid_0's l1: 74.3074	valid_0's rmse: 90.8504
[11]	valid_0's l1: 74.2422	valid_0's rmse: 90.7722
[12]	valid_0's l1: 74.183	valid_0's rmse: 90.6991
[13]	valid_0's l1: 74.1184	valid_0's rmse: 90.6213
[14]	valid_0's l1: 74.0533	valid_0's rmse: 90.5434
[15]	valid_0's l1: 73.9877	valid_0's rmse: 90.4652
[16]	valid_0's l1: 73.9228	valid_0's rmse: 90.3875
[17]	valid_0's l1: 73.858	valid_0's rmse: 90.3098
[18]	valid_0's l1: 73.798	valid_0's rmse: 90.2362
[19]	valid_0's l1: 73.7338	valid_

GridSearchCV(cv=3, error_score=nan,
             estimator=LGBMRegressor(bagging_fraction=0.7, bagging_freq=10,
                                     boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0, feature_fraction=0.9,
                                     importance_type='split',
                                     learning_rate=0.005, max_depth=8,
                                     metric=['mae', 'rmse'],
                                     min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1,
                                     num_it...
                                     subsample_freq=0, task='train',
                                     verbose=0),
             iid='deprecated', n_jobs=6,
             param_grid={'bagging_fraction': [0.7], 'bagging_freq': [10],
                         'feature_fraction': [0.9

In [30]:
best_model = grid.best_estimator_
best_model

LGBMRegressor(bagging_fraction=0.7, bagging_freq=10, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
              importance_type='split', learning_rate=0.001, max_depth=8,
              metric=['mae', 'rmse'], min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=40,
              n_jobs=-1, num_iterations=15000, num_leaves=32,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, task='train',
              verbose=0)

In [31]:
pickle.dump(best_model, open(r'models/unic2g_model_light_gbm.sav', 'wb'))

In [32]:
def predict_n(model, data, n=12):
    all_results = []
    for index, row in data.iterrows():
        values = pd.DataFrame(row.copy()).T
        result = []
        for _ in range(n):
            prediction = model.predict(values)
            values.iloc[:, :-1] = values.iloc[:, 1:].values
            values.iloc[:, -1] = prediction
            result.append(prediction)
        all_results.append(result)
    return np.array(all_results)

In [33]:
predict_n(gbm, X_test)

array([[[ 41.25869913],
        [ 27.75776614],
        [ 26.19459019],
        ...,
        [275.37662009],
        [295.61867098],
        [324.69160698]],

       [[ 65.56863037],
        [ 41.60374523],
        [ 28.30258646],
        ...,
        [249.73125367],
        [275.18758251],
        [297.58644015]],

       [[107.37699367],
        [ 59.51211761],
        [ 39.48526548],
        ...,
        [205.46305084],
        [249.43686241],
        [274.3162683 ]],

       ...,

       [[211.34764011],
        [227.53724285],
        [178.83331133],
        ...,
        [ 80.50911977],
        [134.14855103],
        [188.001062  ]],

       [[213.68975689],
        [219.48217147],
        [230.58842049],
        ...,
        [ 46.18937452],
        [ 78.03660122],
        [134.20168986]],

       [[221.43144067],
        [217.30864152],
        [220.73204795],
        ...,
        [ 25.32867525],
        [ 46.12639073],
        [ 78.66042307]]])

In [11]:
used_param_dict = {
    "TIME_FRAME_START":TIME_FRAME_START,
    "TIME_FRAME_FINISH":TIME_FRAME_FINISH,
    "TRAIN_VAL_SPLIT":TRAIN_VAL_SPLIT,
    "VAL_TEST_SPLIT":VAL_TEST_SPLIT,
    "PAST_LAGS":PAST_LAGS,
    "FUTURE_LAGS":FUTURE_LAGS
}

In [14]:
json_param = json.dumps(used_param_dict, indent = 4) 
with open(f'models/LightGBM_Data_Parameters.json', "w") as outfile: 
    outfile.write(json_param)