In [33]:
import pickle
import json
import pandas as pd
import numpy as np
import catboost
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Reading data and Preparing it

In [13]:
# reading data
c2g_data = pd.read_csv('data/interpol/c2g_interpol_demand.csv', index_col=0)

def read_pickle(path):
    infile = open(path,'rb')
    file = pickle.load(infile)
    infile.close()
    return file

lgb_multimodel = read_pickle(r'models/c2g_model_light_gbm.sav')
lgb_unimodel = read_pickle(r'models/unic2g_model_light_gbm.sav')
cat_multimodel = read_pickle(r'models/c2g_model_catboost.sav')
cat_unimodel = read_pickle(r'models/unic2g_model_catboost.sav')

with open(r'models/LightGBM_Data_Parameters.json') as f:
    data_lgb_parameters = json.load(f)
    
with open(r'models/Catboost_Data_Parameters.json') as f:
    data_cat_parameters = json.load(f)
    
if(data_cat_parameters == data_lgb_parameters):
    params = data_cat_parameters
else:
    print("The parameters on the data must be the same")
    print(data_cat_parameters)
    print(data_lgb_parameters)
    params = {}

In [14]:
c2g_data.columns

Index(['tempC', 'precipMM', 'FeelsLikeC', 'uvIndex', 'visibility',
       'windspeedMiles', 'Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower', 'Monday', 'Tuesday', 'Wednesday', 'Thursday

In [15]:
c2g_data.drop(columns = ['hour_0', 'hour_1', 'hour_2', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       'hour_23', 'interpolate'], inplace=True)

In [17]:
params

{'TIME_FRAME_START': '2016-12-13 15:00:00',
 'TIME_FRAME_FINISH': '2017-02-25 17:00:00',
 'TRAIN_VAL_SPLIT': 0.8,
 'VAL_TEST_SPLIT': 0.9,
 'PAST_LAGS': 24,
 'FUTURE_LAGS': 12}

In [18]:
c2g_data.index = pd.to_datetime(c2g_data.index)

c2g_data = c2g_data.loc[params["TIME_FRAME_START"]:params["TIME_FRAME_FINISH"]]

In [20]:
def undo_one_hot(df, new_col_name, columns = []):
    new_df = df.copy().drop(columns=columns)
    
    def get_cat(row):
        for c in df.loc[:, columns].columns:
            if row[c]==1:
                return c
            
    new_df[new_col_name] = pd.Categorical(df.apply(get_cat, axis=1))
    
    return new_df

In [21]:
c2g_data = undo_one_hot(c2g_data, 'Weather', columns=['Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower'])
c2g_data = undo_one_hot(c2g_data, 'Weekday', columns=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday', 'Saturday', 'Sunday'])
c2g_data.head()

Unnamed: 0,tempC,precipMM,FeelsLikeC,uvIndex,visibility,windspeedMiles,travels,Weather,Weekday
2016-12-13 15:00:00,-2,0.0,-2,1,10,5,120.0,Sunny,Tuesday
2016-12-13 16:00:00,-2,0.0,-3,1,10,6,156.0,Sunny,Tuesday
2016-12-13 17:00:00,-2,0.0,-4,1,10,6,167.0,Sunny,Tuesday
2016-12-13 18:00:00,-3,0.0,-4,1,10,6,154.0,Sunny,Tuesday
2016-12-13 19:00:00,-3,0.0,-5,1,10,7,174.0,Clear,Tuesday


In [22]:
unic2g_data = pd.DataFrame(c2g_data.travels)
unic2g_data.head()

Unnamed: 0,travels
2016-12-13 15:00:00,120.0
2016-12-13 16:00:00,156.0
2016-12-13 17:00:00,167.0
2016-12-13 18:00:00,154.0
2016-12-13 19:00:00,174.0


In [23]:
def gen_supervised_learning(df, past_lags, future_lags):
    X = df.copy()
    for lag in range(1, past_lags):
        X = X.join(df.shift(lag), rsuffix=f'_t-{lag}')
    
    y = pd.DataFrame(df.travels.copy())
    for lag in range(1, future_lags):
        y = y.join(df.travels.shift(lag), rsuffix=f'_t-{lag}')
        
    X = X.shift(future_lags)
    
    return X.iloc[past_lags + future_lags:], y.iloc[past_lags + future_lags:]

In [24]:
X, y = gen_supervised_learning(c2g_data, params['PAST_LAGS'], 1)

In [25]:
X_uni, y_uni = gen_supervised_learning(unic2g_data, params['PAST_LAGS'], 1)

In [26]:
X_12fut, y_12fut = gen_supervised_learning(unic2g_data, params['PAST_LAGS'], params['FUTURE_LAGS'])

In [27]:
def splitter(data, ratio):
    size = len(data)
    data_train = data.iloc[:int(ratio[0]*size)]
    data_val = data.iloc[int(ratio[0]*size):int(ratio[1]*size)]
    data_test = data.iloc[int(ratio[1]*size):]
    return data_train, data_val, data_test

In [28]:
X_train, X_val, X_test = splitter(X, [params['TRAIN_VAL_SPLIT'], params['VAL_TEST_SPLIT']])
y_train, y_val, y_test = splitter(y, [params['TRAIN_VAL_SPLIT'], params['VAL_TEST_SPLIT']])
X_uni_train, X_uni_val, X_uni_test = splitter(X_uni, [params['TRAIN_VAL_SPLIT'], params['VAL_TEST_SPLIT']])
y_uni_train, y_uni_val, y_uni_test = splitter(y_uni, [params['TRAIN_VAL_SPLIT'], params['VAL_TEST_SPLIT']])
X_12fut_train, X_12fut_val, X_12fut_test = splitter(X_12fut, [params['TRAIN_VAL_SPLIT'], params['VAL_TEST_SPLIT']])
y_12fut_train, y_12fut_val, y_12fut_test = splitter(y_12fut, [params['TRAIN_VAL_SPLIT'], params['VAL_TEST_SPLIT']])

## Evaluations

In [64]:
def eval_model(y, y_hat, is_y_real=[], are_all_real=True):
    if not are_all_real:
        filtered_y = [[n for n, real in zip(case, real_list) if real != 1] for case, real_list in zip(y, is_y_real)]
        filtered_y_hat = [[n for n, real in zip(case, real_list) if real != 1] for case, real_list in zip(y_hat, is_y_real)]

        filtered_y = [n for n in filtered_y if n]
        filtered_y_hat = [n for n in filtered_y_hat if n]
        
    else:
        filtered_y = y
        filtered_y_hat = y_hat
    
    n = 0
    for item in filtered_y:
        n += len(item)
    
    evaluation = {}
    evaluation["RMSE"] = np.sqrt(mean_squared_error(filtered_y, filtered_y_hat))
    evaluation["MAE"] = mean_absolute_error(filtered_y, filtered_y_hat)
    evaluation["R2"] = r2_score(filtered_y, filtered_y_hat)

    return evaluation

### Univariable Single-Step Evaluation

In [36]:
y_hat_test_lgb = lgb_unimodel.predict(X_uni_test)
y_hat_test_cat = cat_unimodel.predict(X_uni_test)

unilgb_eval = eval_model(y_test, y_hat_test_lgb)
unicat_eval = eval_model(y_test, y_hat_test_cat)

eval_uni_ss = pd.DataFrame([unilgb_eval, unicat_eval], index=['LightGBM', 'CatBoost'])
eval_uni_ss

Unnamed: 0,RMSE,MAE,R2
LightGBM,26.616146,18.860227,0.940307
CatBoost,29.087239,20.985718,0.928709


### Multivariable Single-Step Evaluation

In [40]:
y_hat_test_lgb = lgb_multimodel.predict(X_test)
y_hat_test_cat = cat_multimodel.predict(X_test)

lgb_eval = eval_model(y_test, y_hat_test_lgb)
cat_eval = eval_model(y_test, y_hat_test_cat)

eval_multi_ss = pd.DataFrame([lgb_eval, cat_eval], index=['LightGBM', 'CatBoost'])
eval_multi_ss

Unnamed: 0,RMSE,MAE,R2
LightGBM,27.949841,20.457185,0.934175
CatBoost,31.695423,23.449754,0.915351


### Univariable Multi-Step Evaluation

In [53]:
def predict_n(model, data, n):
    all_results = []
    for index, row in data.iterrows():
        values = pd.DataFrame(row.copy()).T
        result = []
        for _ in range(n):
            prediction = model.predict(values)[0]
            values.iloc[:, :-1] = values.iloc[:, 1:].values
            values.iloc[:, -1] = prediction
            result.append(prediction)
        all_results.append(result)
    return np.array(all_results)

In [65]:
y_hat_test_lgb = predict_n(lgb_unimodel, X_12fut_test, params['FUTURE_LAGS'])
y_hat_test_cat = predict_n(cat_unimodel, X_12fut_test, params['FUTURE_LAGS'])


unilgb_eval = eval_model(y_12fut_test, y_hat_test_lgb)
unicat_eval = eval_model(y_12fut_test, y_hat_test_cat)


eval_uni_ms = pd.DataFrame([unilgb_eval, unicat_eval], index=['LightGBM', 'CatBoost'])
eval_uni_ms

Unnamed: 0,RMSE,MAE,R2
LightGBM,173.942857,150.584352,-1.493902
CatBoost,171.728653,149.938302,-1.431051
