# LightGBM

Script com foco no processo de forecast com previsão multi-step. <br>Método: direct prediction (ver https://machinelearningmastery.com/multi-step-time-series-forecasting/)


In [1]:
#https://www.youtube.com/watch?v=fG8H-0rb0mY
#https://machinelearningmastery.com/light-gradient-boosted-machine-lightgbm-ensemble/

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import json

import lightgbm as lgb
#lgb.__version__

import warnings
warnings.filterwarnings("ignore")

In [2]:
plt.style.use('fivethirtyeight') # estilo dos gráficos
rcParams['figure.figsize'] = 15, 5 # tamanho das figuras

In [3]:
def load_data():
    """
    Função para ler e transformar os dados já presentes no diretório especificado
    """
    path = "../data/daily_load.csv"
    df_load = pd.read_csv(path, parse_dates = ["date"])
    df_load2 = df_load[df_load["id_reg"] == "S"]           # região sul
    df_load3 = df_load2[df_load2["date"] <= '2022-05-31']  # data de corte
    df_load4 = df_load3[["date", "load_mwmed"]].set_index("date")
    return df_load4

def train_test_split(data, n_test):
    """
    Função para partir or dados em treino e teste
    """
    if isinstance(data, pd.DataFrame):
        train, test = data.iloc[:-n_test, :], data.iloc[-n_test:, :]
    elif isinstance(data, np.ndarray):
        train, test = data[:-n_test, :], data[-n_test:, :]
    return train, test

# https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

# transform a time series dataset into a supervised learning dataset
def series_to_supervised(data, n_in = 1, n_out = 1, dropnan = True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis = 1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace = True)
    return agg

def lightgbm_forecast(train, testX):
	# transform list into array
	train = np.asarray(train)
	# split into input and output columns
	trainX, trainy = train[:, :-1], train[:, -1]
	# fit model
	model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
	model.fit(trainX, trainy)
	# make a one-step prediction
	yhat = model.predict([testX])
	return yhat[0]

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with training dataset
    history = [x for x in train]
    # step over each time-step in the test set
    for i in range(len(test)):
        # split test row into input and output columns
        testX, testy = test[i, :-1], test[i, -1]
        # fit model on history and make a prediction
        yhat = lightgbm_forecast(history, testX)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        history.append(test[i])
        # summarize progress
        print('>expected = %.1f, predicted = %.1f' % (testy, yhat))
    # estimate prediction error
    mae = mean_absolute_error(test[:, -1], predictions)
    mape = mean_absolute_percentage_error(test[:, -1], predictions)
    rmse = np.sqrt(mean_squared_error(test[:, -1], predictions))    
    return mae, mape, rmse, test[:, -1], predictions

def get_measures(forecast, test):
    """
    Função para obter medidas de acurária a partir dos dados de projeção e teste
    """
    #forecast.reset_index(drop = True, inplace = True)
    #test.reset_index(drop = True, inplace = True)
    #errors = [(test.iloc[i] - forecast.iloc[i])**2 for i in range(len(test))]
    if isinstance(forecast, pd.Series) and isinstance(test, pd.Series):
        errors = [(test.iloc[i] - forecast.iloc[i])**2 for i in range(len(test))]
    # else:
    #     errors = [(test.iloc[i][0] - forecast.iloc[i])**2 for i in range(len(test))]
    mae = mean_absolute_error(test, forecast)
    mse = mean_squared_error(test, forecast)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(test, forecast)
    # smape
    a = np.reshape(test.values, (-1,))
    b = np.reshape(forecast.values, (-1,))
    smape = np.mean(100*2.0 * np.abs(a - b) / (np.abs(a) + np.abs(b))).item()
    # dicionário com as medidas de erro
    measures = { "erro": sum(errors),
                 "mae": mae,
                 "mse": mse,
                 "rmse": rmse,
                 "mape": mape,
                 "smape": smape
                }
    # arredondamento
    # for key, item in measures.items():
    #     measures[key] = round(measures[key], 2)
    return measures


# MANUALMENTE

In [7]:
df = load_data()
df.interpolate(method = "linear", inplace = True)
values = df.values.tolist()
lag = 10 # número de variáveis explicativas
n_test = outs = 5 # tamanho da partição de teste (= número de variáveis explicadas)
data = series_to_supervised(values, n_in = lag, n_out = outs, dropnan=False)

In [8]:
#n_test = 31
train, test = train_test_split(data, n_test)
train.dropna(inplace = True)

In [9]:
train

Unnamed: 0,var1(t-10),var1(t-9),var1(t-8),var1(t-7),var1(t-6),var1(t-5),var1(t-4),var1(t-3),var1(t-2),var1(t-1),var1(t),var1(t+1),var1(t+2),var1(t+3),var1(t+4)
10,4800.650000,4899.800000,6261.554167,6733.741667,6961.170833,7110.362500,7105.354167,6307.487500,5523.620833,7111.320833,7435.058333,7425.491667,7505.575000,7532.275000,6435.912500
11,4899.800000,6261.554167,6733.741667,6961.170833,7110.362500,7105.354167,6307.487500,5523.620833,7111.320833,7435.058333,7425.491667,7505.575000,7532.275000,6435.912500,5621.175000
12,6261.554167,6733.741667,6961.170833,7110.362500,7105.354167,6307.487500,5523.620833,7111.320833,7435.058333,7425.491667,7505.575000,7532.275000,6435.912500,5621.175000,7234.966667
13,6733.741667,6961.170833,7110.362500,7105.354167,6307.487500,5523.620833,7111.320833,7435.058333,7425.491667,7505.575000,7532.275000,6435.912500,5621.175000,7234.966667,7517.372917
14,6961.170833,7110.362500,7105.354167,6307.487500,5523.620833,7111.320833,7435.058333,7425.491667,7505.575000,7532.275000,6435.912500,5621.175000,7234.966667,7517.372917,7391.795833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8178,11361.001881,11305.707062,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708
8179,11305.707062,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708,12520.803833
8180,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708,12520.803833,10525.490875
8181,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708,12520.803833,10525.490875,9074.211250


In [43]:
# OK
folds = 5
h = 10
rows = df.shape[0]
for fold in range(folds,0,-1):
    #print(fold)
    #print(df.index[rows-(fold*h)-1])
    slide = rows-(fold*h)#-1
    train = df.iloc[:slide]
    #print(range_.tail())
    tr_init = df.index[0].date()
    tr_end = train.index[-1].date()
    te_init = df.index[train.shape[0]].date()
    te_end = df.index[train.shape[0]+(h-1)].date()
    print(f"training set {fold}: {tr_init} to {tr_end}\n  test set {fold}: {te_init} to {te_end}")

training set 5: 2000-01-01 to 2022-04-11
  test set 5: 2022-04-12 to 2022-04-21
training set 4: 2000-01-01 to 2022-04-21
  test set 4: 2022-04-22 to 2022-05-01
training set 3: 2000-01-01 to 2022-05-01
  test set 3: 2022-05-02 to 2022-05-11
training set 2: 2000-01-01 to 2022-05-11
  test set 2: 2022-05-12 to 2022-05-21
training set 1: 2000-01-01 to 2022-05-21
  test set 1: 2022-05-22 to 2022-05-31


In [52]:
# OK

df = load_data()
df.interpolate(method = "linear", inplace = True)

folds = 5 #partições
horz = 15 #horizonte de predição
rows = df.shape[0]
walkin = dict()
for fold in range(folds,0,-1):
    slide = rows-(fold*horz)#-1
    train = df.iloc[:slide]

    values = df.values.tolist()
    lag = 60 # número de variáveis explicativas
    n_test = outs = horz # tamanho da partição de teste (= número de variáveis explicadas)
    data = series_to_supervised(values, n_in = lag, n_out = outs, dropnan=False)

    train, test = train_test_split(data, n_test)
    train.dropna(inplace = True)

    response_vars = data.columns[-(outs):]
    predictions = list()
    print(f"predicting for cv {fold}...")
    for h, response in enumerate(response_vars):
        cols = [x for x in data.columns[:lag]]
        varname = response
        cols.append(varname)
        data_ = train[cols]
        nrows = data_.shape[0]
        data_ = data_.iloc[:nrows-h, :] 
        data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
        model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
        model.fit(data_X, data_y)
        #teste = train.loc[:, :"var1(t-1)"].iloc[-1,:] # t + 3 (observado = 12.054,20)
        testX, testy = test.reset_index(drop=True).loc[0, :"var1(t-1)"], test.reset_index(drop=True).loc[0, varname]
        pred = model.predict([testX])[0]
        print(f"    Predicting {varname}\n  > expected: {testy}, predicted: {pred}")
        predictions.append(pred)
    walkin[f"cv_{fold}"] = predictions

predicting for cv 5...
    Predicting var1(t)
  > expected: 11964.909375, predicted: 11975.427756245648
    Predicting var1(t+1)
  > expected: 12269.051375, predicted: 11662.180552659058
    Predicting var1(t+2)
  > expected: 12021.41545833, predicted: 11648.877597875558
    Predicting var1(t+3)
  > expected: 11802.52645833, predicted: 11479.928545194285
    Predicting var1(t+4)
  > expected: 10256.970375, predicted: 9807.548458155632
    Predicting var1(t+5)
  > expected: 8938.579125, predicted: 8618.267974395803
    Predicting var1(t+6)
  > expected: 11713.10433333, predicted: 10855.438173683217
    Predicting var1(t+7)
  > expected: 12054.19504167, predicted: 11393.44424901352
    Predicting var1(t+8)
  > expected: 12186.721375, predicted: 11548.779196824977
    Predicting var1(t+9)
  > expected: 12482.52370833, predicted: 10964.932685504376
    Predicting var1(t+10)
  > expected: 12520.80383333, predicted: 11741.521500258614
    Predicting var1(t+11)
  > expected: 10525.490875, pre