# LightGBM

Script com foco no processo de forecast com previsão multi-step. <br>Método: direct prediction (ver https://machinelearningmastery.com/multi-step-time-series-forecasting/)


In [1]:
#https://www.youtube.com/watch?v=fG8H-0rb0mY
#https://machinelearningmastery.com/light-gradient-boosted-machine-lightgbm-ensemble/

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

import lightgbm as lgb
#lgb.__version__

In [2]:
plt.style.use('fivethirtyeight') # estilo dos gráficos
rcParams['figure.figsize'] = 15, 5 # tamanho das figuras

In [3]:
def load_data():
    """
    Função para ler e transformar os dados já presentes no diretório especificado
    """
    path = "../data/daily_load.csv"
    df_load = pd.read_csv(path, parse_dates = ["date"])
    df_load2 = df_load[df_load["id_reg"] == "S"]           # região sul
    df_load3 = df_load2[df_load2["date"] <= '2022-05-31']  # data de corte
    df_load4 = df_load3[["date", "load_mwmed"]].set_index("date")
    return df_load4

def train_test_split(data, n_test):
    """
    Função para partir or dados em treino e teste
    """
    if isinstance(data, pd.DataFrame):
        train, test = data.iloc[:-n_test, :], data.iloc[-n_test:, :]
    elif isinstance(data, np.ndarray):
        train, test = data[:-n_test, :], data[-n_test:, :]
    return train, test

# https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

# transform a time series dataset into a supervised learning dataset
def series_to_supervised(data, n_in = 1, n_out = 1, dropnan = True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis = 1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace = True)
    return agg

def lightgbm_forecast(train, testX):
	# transform list into array
	train = np.asarray(train)
	# split into input and output columns
	trainX, trainy = train[:, :-1], train[:, -1]
	# fit model
	model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
	model.fit(trainX, trainy)
	# make a one-step prediction
	yhat = model.predict([testX])
	return yhat[0]

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with training dataset
    history = [x for x in train]
    # step over each time-step in the test set
    for i in range(len(test)):
        # split test row into input and output columns
        testX, testy = test[i, :-1], test[i, -1]
        # fit model on history and make a prediction
        yhat = lightgbm_forecast(history, testX)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        history.append(test[i])
        # summarize progress
        print('>expected = %.1f, predicted = %.1f' % (testy, yhat))
    # estimate prediction error
    mae = mean_absolute_error(test[:, -1], predictions)
    mape = mean_absolute_percentage_error(test[:, -1], predictions)
    rmse = np.sqrt(mean_squared_error(test[:, -1], predictions))    
    return mae, mape, rmse, test[:, -1], predictions

def get_measures(forecast, test):
    """
    Função para obter medidas de acurária a partir dos dados de projeção e teste
    """
    #forecast.reset_index(drop = True, inplace = True)
    #test.reset_index(drop = True, inplace = True)
    #errors = [(test.iloc[i] - forecast.iloc[i])**2 for i in range(len(test))]
    if isinstance(forecast, pd.Series) and isinstance(test, pd.Series):
        errors = [(test.iloc[i] - forecast.iloc[i])**2 for i in range(len(test))]
    # else:
    #     errors = [(test.iloc[i][0] - forecast.iloc[i])**2 for i in range(len(test))]
    mae = mean_absolute_error(test, forecast)
    mse = mean_squared_error(test, forecast)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(test, forecast)
    # smape
    a = np.reshape(test.values, (-1,))
    b = np.reshape(forecast.values, (-1,))
    smape = np.mean(100*2.0 * np.abs(a - b) / (np.abs(a) + np.abs(b))).item()
    # dicionário com as medidas de erro
    measures = { "erro": sum(errors),
                 "mae": mae,
                 "mse": mse,
                 "rmse": rmse,
                 "mape": mape,
                 "smape": smape
                }
    # arredondamento
    # for key, item in measures.items():
    #     measures[key] = round(measures[key], 2)
    return measures


In [4]:
df = load_data()
df = df[df.index <= '2022-04-30']
values = df.values.tolist()
lag = 60 
outs = 5
data = series_to_supervised(values, n_in = lag, n_out = outs, dropnan=False)
data.tail(5)

Unnamed: 0,var1(t-60),var1(t-59),var1(t-58),var1(t-57),var1(t-56),var1(t-55),var1(t-54),var1(t-53),var1(t-52),var1(t-51),...,var1(t-5),var1(t-4),var1(t-3),var1(t-2),var1(t-1),var1(t),var1(t+1),var1(t+2),var1(t+3),var1(t+4)
8152,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,11716.236167,...,11325.166083,11635.152167,9745.995625,8676.435833,11917.185167,12640.413333,13018.60175,13048.245458,12261.400625,10444.075958
8153,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,11716.236167,14608.583292,...,11635.152167,9745.995625,8676.435833,11917.185167,12640.413333,13018.60175,13048.245458,12261.400625,10444.075958,
8154,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,11716.236167,14608.583292,14834.671083,...,9745.995625,8676.435833,11917.185167,12640.413333,13018.60175,13048.245458,12261.400625,10444.075958,,
8155,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,11716.236167,14608.583292,14834.671083,14860.906667,...,8676.435833,11917.185167,12640.413333,13018.60175,13048.245458,12261.400625,10444.075958,,,
8156,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,11716.236167,14608.583292,14834.671083,14860.906667,14877.019417,...,11917.185167,12640.413333,13018.60175,13048.245458,12261.400625,10444.075958,,,,


In [7]:
# EXEMPLO 1: t+3
# ÚLTIMO VALOR EM var1(t+h) DEVE SER O MESMO PARA TODAS ESTIMAÇÕES
# PARA VISUALIZAR MELHOR, OLHAR EXCEL "multistep" NA PASTA "DATA"
response_vars = data.columns[-outs:]
cols = [x for x in data.columns[:lag]]
cols.append("var1(t+3)")
data_ = data[cols]
data_.dropna(inplace = True) # retirar "na" só da coluna de t+h que está se estimando 
data_.tail(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_.dropna(inplace = True) # retirar "na" só da coluna de t+h que está se estimando


Unnamed: 0,var1(t-60),var1(t-59),var1(t-58),var1(t-57),var1(t-56),var1(t-55),var1(t-54),var1(t-53),var1(t-52),var1(t-51),...,var1(t-9),var1(t-8),var1(t-7),var1(t-6),var1(t-5),var1(t-4),var1(t-3),var1(t-2),var1(t-1),var1(t+3)
8149,15088.1805,14903.538667,15048.168208,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,...,11381.112708,8310.664667,8604.080542,8044.328167,11133.275,11832.275,11905.543417,11325.166083,11635.152167,12640.413333
8150,14903.538667,15048.168208,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,...,8310.664667,8604.080542,8044.328167,11133.275,11832.275,11905.543417,11325.166083,11635.152167,9745.995625,13018.60175
8151,15048.168208,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,...,8604.080542,8044.328167,11133.275,11832.275,11905.543417,11325.166083,11635.152167,9745.995625,8676.435833,13048.245458
8152,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,11716.236167,...,8044.328167,11133.275,11832.275,11905.543417,11325.166083,11635.152167,9745.995625,8676.435833,11917.185167,12261.400625
8153,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,11716.236167,14608.583292,...,11133.275,11832.275,11905.543417,11325.166083,11635.152167,9745.995625,8676.435833,11917.185167,12640.413333,10444.075958


In [14]:
# EXEMPLO 2: t+4- PARA VISUALIZAR MELHOR, OLHAR EXCEL "multistep" NA PASTA "DATA"
# ÚLTIMO VALOR EM var1(t+h) DEVE SER O MESMO PARA TODAS ESTIMAÇÕES
# PARA VISUALIZAR MELHOR, OLHAR EXCEL "multistep" NA PASTA "DATA"
response_vars = data.columns[-outs:]
cols = [x for x in data.columns[:lag]]
cols.append("var1(t+4)")
data_ = data[cols]
data_.dropna(inplace = True) # retirar "na" só da coluna de t+h que está se estimando 
data_.tail(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_.dropna(inplace = True) # retirar "na" só da coluna de t+h que está se estimando


Unnamed: 0,var1(t-60),var1(t-59),var1(t-58),var1(t-57),var1(t-56),var1(t-55),var1(t-54),var1(t-53),var1(t-52),var1(t-51),...,var1(t-9),var1(t-8),var1(t-7),var1(t-6),var1(t-5),var1(t-4),var1(t-3),var1(t-2),var1(t-1),var1(t+4)
8148,14721.577833,15088.1805,14903.538667,15048.168208,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,...,12080.641,11381.112708,8310.664667,8604.080542,8044.328167,11133.275,11832.275,11905.543417,11325.166083,12640.413333
8149,15088.1805,14903.538667,15048.168208,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,...,11381.112708,8310.664667,8604.080542,8044.328167,11133.275,11832.275,11905.543417,11325.166083,11635.152167,13018.60175
8150,14903.538667,15048.168208,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,...,8310.664667,8604.080542,8044.328167,11133.275,11832.275,11905.543417,11325.166083,11635.152167,9745.995625,13048.245458
8151,15048.168208,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,...,8604.080542,8044.328167,11133.275,11832.275,11905.543417,11325.166083,11635.152167,9745.995625,8676.435833,12261.400625
8152,15101.339458,13219.668167,11890.958833,14121.6205,13533.691167,14360.921333,14522.45575,14725.69375,13204.445458,11716.236167,...,8044.328167,11133.275,11832.275,11905.543417,11325.166083,11635.152167,9745.995625,8676.435833,11917.185167,10444.075958


In [9]:
# DATA DA ÚLTIMA LINHA EM "data"
df[df.load_mwmed == data["var1(t)"].iloc[-1]]

Unnamed: 0_level_0,load_mwmed
date,Unnamed: 1_level_1
2022-04-30,10444.075958


In [49]:
# DIRECT PREDICTION V1: ESTÁ CONSIDERANDO OS DADOS DE TESTE T + H, QUE NÃO ESTARÃO DISPONÍVEIS PARA FORECAST

# lista com as variáveis resposta
response_vars = data.columns[-outs:]

predictions = list()
# estimate models for every t + h
for response in response_vars:
    cols = [x for x in data.columns[:lag]] # features names. resets every iteration
    cols.append(response)
    data_ = data[cols]
    X, y = data_.iloc[:, :-1], data_.iloc[:, -1]    
    model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
    model.fit(X, y)
    yhat = model.predict([data_.iloc[-1, :-1].values])[0]
    yobs = y.iloc[-1] 
    print(f"> expected: {yobs}, predicted: {yhat}")
    predictions.append(yhat)

> expected: 10444.07595833, predicted: 10423.300175959055
> expected: 8794.163125, predicted: 8776.545853490734
> expected: 11643.82154167, predicted: 11660.587035833762
> expected: 11941.66258333, predicted: 11960.412635456514
> expected: 11643.86191667, predicted: 11659.804973910308
> expected: 11442.40841667, predicted: 11488.4804629239
> expected: 11156.521, predicted: 11161.205861225117
> expected: 9333.92008333, predicted: 9376.5656479672
> expected: 7742.3945, predicted: 7785.903522979227
> expected: 10808.54866667, predicted: 10821.415832572027
> expected: 11654.62308112, predicted: 11695.157084430864
> expected: 11608.52466322, predicted: 11599.402289821399
> expected: 11361.00188129, predicted: 11396.672755952975
> expected: 11305.70706206, predicted: 11283.752949852169
> expected: 10060.54978729, predicted: 10028.956778135229
> expected: 8994.85674774, predicted: 8974.21034020558
> expected: 11424.08754167, predicted: 11377.893634138589
> expected: 11964.909375, predicted: 1

In [92]:
# DIRECT PREDICTION V2. Problema: valor observado está vazio para t + h para h > 0 em 30/04/2022 em diante

# lista com as variáveis resposta
response_vars = data.columns[-outs:]

nrows = data.shape[0]

predictions = list()
# estimate models for every t + h
nrows = data.shape[0]
i = 0
for response in response_vars:
    cols = [x for x in data.columns[:lag]] # features names. resets every iteration
    cols.append(response)
    data_ = data[cols]
    nrows -= i
    #print(nrows)
    X, y = data_.iloc[:nrows, :-1], data_.iloc[:nrows, -1]  
    #print(y.shape)  
    #print(y)
    model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
    model.fit(X, y)
    yhat = model.predict([data_.iloc[-1, :-1].values])[0]
    yobs = data["var1(t)"].iloc[i - 1] # !!!!!!!!    VER AQUI: t + h = i - 1 ... (load_mwmed de 30/04/2022 em diante) !!!!!!!!!!!!! 
    print(f"> expected: {yobs}, predicted: {yhat}")
    predictions.append(yhat)
    i += 1

> expected: 10444.07595833, predicted: 10431.985618371944
> expected: 4800.65, predicted: 8708.132769921122
> expected: 4899.8, predicted: 11971.069168795277
> expected: 6261.55416667, predicted: 11728.386400318697
> expected: 6733.74166667, predicted: 12139.055844264678
