In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from datetime import datetime
import warnings
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import json
from keras.regularizers import l2
import catboost
import pickle
warnings.filterwarnings('ignore')

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

Using TensorFlow backend.


## Reading

In [2]:
# reading data
c2g_data = pd.read_csv('data/interpol/c2g_interpol_demand.csv', index_col=0)

In [3]:
c2g_data.columns

Index(['tempC', 'precipMM', 'FeelsLikeC', 'uvIndex', 'visibility',
       'windspeedMiles', 'Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower', 'Monday', 'Tuesday', 'Wednesday', 'Thursday

In [4]:
c2g_data.drop(columns = ['hour_0', 'hour_1', 'hour_2', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       'hour_23', 'interpolate'], inplace=True)

In [5]:
c2g_data.index = pd.to_datetime(c2g_data.index)

TIME_FRAME_START = "2016-12-13 15:00:00"
TIME_FRAME_FINISH = "2017-02-25 17:00:00"

c2g_data = c2g_data.loc[TIME_FRAME_START:TIME_FRAME_FINISH]

## Multivar

In [6]:
PAST_LAGS = 24
FUTURE_LAGS = 12
TRAIN_VAL_SPLIT = 0.6
VAL_TEST_SPLIT = 0.8

In [7]:
def undo_one_hot(df, new_col_name, columns = []):
    new_df = df.copy().drop(columns=columns)
    
    def get_cat(row):
        for c in df.loc[:, columns].columns:
            if row[c]==1:
                return c
            
    new_df[new_col_name] = pd.Categorical(df.apply(get_cat, axis=1))
    
    return new_df

In [8]:
c2g_data = undo_one_hot(c2g_data, 'Weather', columns=['Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower'])
c2g_data = undo_one_hot(c2g_data, 'Weekday', columns=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday', 'Saturday', 'Sunday'])
c2g_data.head()

Unnamed: 0,tempC,precipMM,FeelsLikeC,uvIndex,visibility,windspeedMiles,travels,Weather,Weekday
2016-12-13 15:00:00,-2,0.0,-2,1,10,5,120.0,Sunny,Tuesday
2016-12-13 16:00:00,-2,0.0,-3,1,10,6,156.0,Sunny,Tuesday
2016-12-13 17:00:00,-2,0.0,-4,1,10,6,167.0,Sunny,Tuesday
2016-12-13 18:00:00,-3,0.0,-4,1,10,6,154.0,Sunny,Tuesday
2016-12-13 19:00:00,-3,0.0,-5,1,10,7,174.0,Clear,Tuesday


In [9]:
def gen_supervised_learning(df, past_lags, future_lags):
    X = df.copy()
    for lag in range(1, past_lags):
        X = X.join(df.shift(lag), rsuffix=f'_t-{lag}')
    
    y = pd.DataFrame(df.travels.copy())
    for lag in range(1, future_lags):
        y = y.join(df.travels.shift(lag), rsuffix=f'_t-{lag}')
        
    X = X.shift(future_lags)
    
    return X.iloc[past_lags + future_lags:], y.iloc[past_lags + future_lags:]

In [10]:
X, y = gen_supervised_learning(c2g_data, PAST_LAGS, 1)

In [11]:
X.columns

Index(['tempC', 'precipMM', 'FeelsLikeC', 'uvIndex', 'visibility',
       'windspeedMiles', 'travels', 'Weather', 'Weekday', 'tempC_t-1',
       ...
       'Weekday_t-22', 'tempC_t-23', 'precipMM_t-23', 'FeelsLikeC_t-23',
       'uvIndex_t-23', 'visibility_t-23', 'windspeedMiles_t-23',
       'travels_t-23', 'Weather_t-23', 'Weekday_t-23'],
      dtype='object', length=216)

In [11]:
def splitter(data, ratio):
    size = len(data)
    data_train = data.iloc[:int(ratio[0]*size)]
    data_val = data.iloc[int(ratio[0]*size):int(ratio[1]*size)]
    data_test = data.iloc[int(ratio[1]*size):]
    return data_train, data_val, data_test

In [12]:
X_train, X_val, X_test = splitter(X, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])
y_train, y_val, y_test = splitter(y, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])

In [13]:
cat_cols = list(X_train.select_dtypes(include='category').columns)
cat_cols

['Weather',
 'Weekday',
 'Weather_t-1',
 'Weekday_t-1',
 'Weather_t-2',
 'Weekday_t-2',
 'Weather_t-3',
 'Weekday_t-3',
 'Weather_t-4',
 'Weekday_t-4',
 'Weather_t-5',
 'Weekday_t-5',
 'Weather_t-6',
 'Weekday_t-6',
 'Weather_t-7',
 'Weekday_t-7',
 'Weather_t-8',
 'Weekday_t-8',
 'Weather_t-9',
 'Weekday_t-9',
 'Weather_t-10',
 'Weekday_t-10',
 'Weather_t-11',
 'Weekday_t-11',
 'Weather_t-12',
 'Weekday_t-12',
 'Weather_t-13',
 'Weekday_t-13',
 'Weather_t-14',
 'Weekday_t-14',
 'Weather_t-15',
 'Weekday_t-15',
 'Weather_t-16',
 'Weekday_t-16',
 'Weather_t-17',
 'Weekday_t-17',
 'Weather_t-18',
 'Weekday_t-18',
 'Weather_t-19',
 'Weekday_t-19',
 'Weather_t-20',
 'Weekday_t-20',
 'Weather_t-21',
 'Weekday_t-21',
 'Weather_t-22',
 'Weekday_t-22',
 'Weather_t-23',
 'Weekday_t-23']

In [14]:
train_pool = catboost.Pool(X_train, y_train, cat_cols)
val_pool = catboost.Pool(X_val, y_val, cat_cols)
test_pool = catboost.Pool(X_test, y_test, cat_cols)

In [15]:
cat = catboost.CatBoostRegressor(iterations=2, 
                          depth=2, 
                          learning_rate=1, 
                          loss_function='RMSE')

In [16]:
basic_params = {
    'has_time': True,
    'loss_function': 'MAE',
    'eval_metric':'MAE',
}

param_grid = {
    "num_leaves": [31, 63, 127],
    "max_depth": [3, 4 , 5, 6],
    'learning_rate': [0.005, 0.001, 0.03],
    "iterations": [15000],
}

fit_params = {
    'eval_set':[(X_val, y_val)],
    "early_stopping_rounds":1000,
    'cat_features': cat_cols
}

In [17]:
cat = catboost.CatBoostRegressor(**basic_params)

In [18]:
grid = GridSearchCV(cat, param_grid, cv=3, n_jobs=6)

In [19]:
grid.fit(X_train, y_train, **fit_params)

0:	learn: 73.5513155	test: 70.2378338	best: 70.2378338 (0)	total: 126ms	remaining: 31m 22s
1:	learn: 73.2603652	test: 69.9714359	best: 69.9714359 (1)	total: 179ms	remaining: 22m 22s
2:	learn: 72.9537791	test: 69.6902289	best: 69.6902289 (2)	total: 237ms	remaining: 19m 44s
3:	learn: 72.6426003	test: 69.3910852	best: 69.3910852 (3)	total: 290ms	remaining: 18m 7s
4:	learn: 72.3242828	test: 69.0905502	best: 69.0905502 (4)	total: 346ms	remaining: 17m 18s
5:	learn: 72.0242572	test: 68.8195884	best: 68.8195884 (5)	total: 397ms	remaining: 16m 31s
6:	learn: 71.7263630	test: 68.5382963	best: 68.5382963 (6)	total: 452ms	remaining: 16m 8s
7:	learn: 71.4102539	test: 68.2545255	best: 68.2545255 (7)	total: 498ms	remaining: 15m 33s
8:	learn: 71.1019479	test: 67.9612038	best: 67.9612038 (8)	total: 549ms	remaining: 15m 14s
9:	learn: 70.7818791	test: 67.6637451	best: 67.6637451 (9)	total: 593ms	remaining: 14m 48s
10:	learn: 70.5166960	test: 67.4036786	best: 67.4036786 (10)	total: 641ms	remaining: 14m 34s

GridSearchCV(cv=3, error_score=nan,
             estimator=<catboost.core.CatBoostRegressor object at 0x00000274CBA474C8>,
             iid='deprecated', n_jobs=6,
             param_grid={'iterations': [15000],
                         'learning_rate': [0.005, 0.001, 0.03],
                         'max_depth': [3, 4, 5, 6],
                         'num_leaves': [31, 63, 127]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [20]:
grid.best_estimator_

<catboost.core.CatBoostRegressor at 0x274cbae5c88>

In [21]:
best_model = grid.best_estimator_

In [22]:
pickle.dump(best_model, open(r'models/c2g_model_catboost.sav', 'wb'))

## Univar

In [23]:
unic2g_data = pd.DataFrame(c2g_data.travels)
unic2g_data.head()

Unnamed: 0,travels
2016-12-13 15:00:00,120.0
2016-12-13 16:00:00,156.0
2016-12-13 17:00:00,167.0
2016-12-13 18:00:00,154.0
2016-12-13 19:00:00,174.0


In [24]:
X, y = gen_supervised_learning(unic2g_data, PAST_LAGS, 1)

In [25]:
X_train, X_val, X_test = splitter(X, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])
y_train, y_val, y_test = splitter(y, [TRAIN_VAL_SPLIT, VAL_TEST_SPLIT])

In [26]:
param_grid = {
    "num_leaves": [15, 20, 26 ,31,],
    "max_depth": [3, 4],
    'learning_rate': [0.005, 0.001, 0.03],
    "iterations": [15000],
}

fit_params = {
    'eval_set':[(X_val, y_val)],
    "early_stopping_rounds":1000,
}

In [27]:
grid = GridSearchCV(cat, param_grid, verbose=1, cv=3, n_jobs=6)

In [28]:
grid.fit(X_train, y_train, **fit_params)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done  72 out of  72 | elapsed:  6.6min finished


0:	learn: 73.7760066	test: 70.4195090	best: 70.4195090 (0)	total: 2.25ms	remaining: 33.7s
1:	learn: 73.7223546	test: 70.3709105	best: 70.3709105 (1)	total: 3.39ms	remaining: 25.4s
2:	learn: 73.6580209	test: 70.3107597	best: 70.3107597 (2)	total: 4.39ms	remaining: 21.9s
3:	learn: 73.5956599	test: 70.2523635	best: 70.2523635 (3)	total: 5.4ms	remaining: 20.2s
4:	learn: 73.5318394	test: 70.1935753	best: 70.1935753 (4)	total: 6.42ms	remaining: 19.3s
5:	learn: 73.4739235	test: 70.1393462	best: 70.1393462 (5)	total: 7.61ms	remaining: 19s
6:	learn: 73.4151431	test: 70.0856051	best: 70.0856051 (6)	total: 8.66ms	remaining: 18.6s
7:	learn: 73.3522509	test: 70.0276047	best: 70.0276047 (7)	total: 9.68ms	remaining: 18.1s
8:	learn: 73.2871827	test: 69.9677083	best: 69.9677083 (8)	total: 10.7ms	remaining: 17.8s
9:	learn: 73.2231202	test: 69.9078533	best: 69.9078533 (9)	total: 11.8ms	remaining: 17.6s
10:	learn: 73.1644014	test: 69.8536277	best: 69.8536277 (10)	total: 12.9ms	remaining: 17.5s
11:	learn: 

GridSearchCV(cv=3, error_score=nan,
             estimator=<catboost.core.CatBoostRegressor object at 0x00000274CBA474C8>,
             iid='deprecated', n_jobs=6,
             param_grid={'iterations': [15000],
                         'learning_rate': [0.005, 0.001, 0.03],
                         'max_depth': [3, 4], 'num_leaves': [15, 20, 26, 31]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [29]:
grid.best_estimator_.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'MAE',
 'iterations': 15000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Exact',
 'od_pval': 0,
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'od_type': 'Iter',
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'subsample': 0.800000011920929,
 'use_best_model': True,
 'od_wait': 1000,
 'random_seed': 0,
 'depth': 3,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'MAE',
 'learning_rate': 0.0010000000474974513,
 'score_function': 'Cosine',
 'task_type': 'CPU',
 'leaf_estimation_iterations': 1,
 'bootstrap_typ

In [30]:
best_model = grid.best_estimator_

In [31]:
pickle.dump(best_model, open(r'models/unic2g_model_catboost.sav', 'wb'))

In [32]:
used_param_dict = {
    "TIME_FRAME_START":TIME_FRAME_START,
    "TIME_FRAME_FINISH":TIME_FRAME_FINISH,
    "TRAIN_VAL_SPLIT":TRAIN_VAL_SPLIT,
    "VAL_TEST_SPLIT":VAL_TEST_SPLIT,
    "PAST_LAGS":PAST_LAGS,
    "FUTURE_LAGS":FUTURE_LAGS
}

In [33]:
json_param = json.dumps(used_param_dict, indent = 4) 
with open(f'models/Catboost_Data_Parameters.json', "w") as outfile: 
    outfile.write(json_param)