# LightGBM

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
from load import load_data
from load import series_to_supervised
from load import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from load import get_measures
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

rcParams['figure.figsize'] = 15, 5 # tamanho das figuras

In [18]:
def lightgbm_forecast(train, testX):
	# transform list into array
	train = np.asarray(train)
	# split into input and output columns
	trainX, trainy = train[:, :-1], train[:, -1]
	# fit model
	model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
	model.fit(trainX, trainy)
	# make a one-step prediction
	yhat = model.predict([testX])
	return yhat[0]

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test):
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with training dataset
    history = [x for x in train]
    # step over each time-step in the test set
    for i in range(len(test)):
        # split test row into input and output columns
        testX, testy = test[i, :-1], test[i, -1]
        # fit model on history and make a prediction
        yhat = lightgbm_forecast(history, testX)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        history.append(test[i])
        # summarize progress
        print('>expected = %.1f, predicted = %.1f' % (testy, yhat))
    # estimate prediction error
    mae = mean_absolute_error(test[:, -1], predictions)
    mape = mean_absolute_percentage_error(test[:, -1], predictions)
    rmse = np.sqrt(mean_squared_error(test[:, -1], predictions))    
    return mae, mape, rmse, test[:, -1], predictions

def multi_step_forecast(data, lag, n):
    n_test = outs = n
    #data = series_to_supervised(values, n_in = lag, n_out = outs, dropnan=False)
    train, test = train_test_split(data, n_test)
    train.dropna(inplace = True)
    response_vars = data.columns[-(outs):]
    predictions = list()
    for h, response in enumerate(response_vars):
        cols = [x for x in data.columns[:lag]]
        cols.append(response)
        data_ = train[cols]
        nrows = data_.shape[0]
        data_ = data_.iloc[:nrows-h, :] 
        data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
        model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
        model.fit(data_X, data_y)
        testX, testy = test.reset_index(drop=True).loc[0, :"var1(t-1)"], test.reset_index(drop=True).loc[0, response]
        pred = model.predict([testX])[0]
        print(f"Predicting {response}\n  > expected: {testy}, predicted: {pred}")
        predictions.append(pred)
    measures = get_measures(pd.Series(predictions), test["var1(t)"])
    df_measures = pd.DataFrame([measures])
    return predictions, df_measures

In [26]:
df_load = load_data()
df_weather = pd.read_csv("../data/weather_daily_data.csv", parse_dates=["DATA"])
# gets the same period for both dataframes
df_weather = df_weather[df_weather.DATA.isin(df_load.index)]
df_load = df_load[df_weather.DATA.min():df_weather.DATA.max()] 

df_load_2 = df_load.reset_index()
df_merged = pd.merge(df_weather, df_load_2, left_on = "DATA", right_on = "date", how = "outer")
df_merged.drop("date", axis = 1, inplace = True)

df_merged.dropna(how = "all", inplace = True)
df_merged.sort_values(by = "DATA", inplace = True)
df_merged.load_mwmed = df_merged.load_mwmed.interpolate(method="linear")

df_load_3 = df_merged.load_mwmed
values = df_load_3.values.tolist()


In [30]:
data1 = series_to_supervised(values, n_in = lag, n_out=10, dropnan=False)
data1

Unnamed: 0,var1(t-15),var1(t-14),var1(t-13),var1(t-12),var1(t-11),var1(t-10),var1(t-9),var1(t-8),var1(t-7),var1(t-6),...,var1(t),var1(t+1),var1(t+2),var1(t+3),var1(t+4),var1(t+5),var1(t+6),var1(t+7),var1(t+8),var1(t+9)
0,,,,,,,,,,,...,7388.445000,6226.141250,8212.231667,8597.089583,8639.442083,8530.962083,8611.037083,7707.624167,6268.828333,8068.169583
1,,,,,,,,,,,...,6226.141250,8212.231667,8597.089583,8639.442083,8530.962083,8611.037083,7707.624167,6268.828333,8068.169583,8367.525833
2,,,,,,,,,,,...,8212.231667,8597.089583,8639.442083,8530.962083,8611.037083,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167
3,,,,,,,,,,,...,8597.089583,8639.442083,8530.962083,8611.037083,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917
4,,,,,,,,,,,...,8639.442083,8530.962083,8611.037083,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5900,11361.001881,11305.707062,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,...,12520.803833,10525.490875,9074.211250,11648.709583,12162.756792,,,,,
5901,11305.707062,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,...,10525.490875,9074.211250,11648.709583,12162.756792,,,,,,
5902,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,...,9074.211250,11648.709583,12162.756792,,,,,,,
5903,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,...,11648.709583,12162.756792,,,,,,,,


In [31]:
# Define the number of lag observations as input (X)
lag = 15 #ou 60
data1 = series_to_supervised(values, n_in = lag, n_out=10)
data2 = pd.DataFrame()
for col in df_weather.columns:
    if col == "DATA":
        continue
    else:
        values = df_weather[col].values.tolist()
        df_ = series_to_supervised(values, n_in = lag)
        df_.drop("var1(t)", axis = 1, inplace = True) # the response variable is the load dataframe
        df_.columns = [f"{x}_{col}" for x in df_.columns]
        data2 = pd.concat([data2, df_], axis = 1)
data2


Unnamed: 0,var1(t-15)_PRECIPITACAO_PR,var1(t-14)_PRECIPITACAO_PR,var1(t-13)_PRECIPITACAO_PR,var1(t-12)_PRECIPITACAO_PR,var1(t-11)_PRECIPITACAO_PR,var1(t-10)_PRECIPITACAO_PR,var1(t-9)_PRECIPITACAO_PR,var1(t-8)_PRECIPITACAO_PR,var1(t-7)_PRECIPITACAO_PR,var1(t-6)_PRECIPITACAO_PR,...,var1(t-10)_VELOCIDADE_VENTO_SC,var1(t-9)_VELOCIDADE_VENTO_SC,var1(t-8)_VELOCIDADE_VENTO_SC,var1(t-7)_VELOCIDADE_VENTO_SC,var1(t-6)_VELOCIDADE_VENTO_SC,var1(t-5)_VELOCIDADE_VENTO_SC,var1(t-4)_VELOCIDADE_VENTO_SC,var1(t-3)_VELOCIDADE_VENTO_SC,var1(t-2)_VELOCIDADE_VENTO_SC,var1(t-1)_VELOCIDADE_VENTO_SC
15,0.0,0.0,0.4,1.2,0.0,1.8,0.6,0.6,0.0,1.0,...,1.326087,1.225000,1.704167,1.412500,1.225000,1.175000,1.536364,1.808333,1.766667,1.720833
16,0.0,0.4,1.2,0.0,1.8,0.6,0.6,0.0,1.0,0.0,...,1.225000,1.704167,1.412500,1.225000,1.175000,1.536364,1.808333,1.766667,1.720833,1.172727
17,0.4,1.2,0.0,1.8,0.6,0.6,0.0,1.0,0.0,0.2,...,1.704167,1.412500,1.225000,1.175000,1.536364,1.808333,1.766667,1.720833,1.172727,3.950000
18,1.2,0.0,1.8,0.6,0.6,0.0,1.0,0.0,0.2,0.0,...,1.412500,1.225000,1.175000,1.536364,1.808333,1.766667,1.720833,1.172727,3.950000,1.879167
19,0.0,1.8,0.6,0.6,0.0,1.0,0.0,0.2,0.0,0.0,...,1.225000,1.175000,1.536364,1.808333,1.766667,1.720833,1.172727,3.950000,1.879167,1.408333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5900,5.2,2.2,73.2,15.0,1.6,0.2,0.2,1.4,0.4,0.2,...,4.184062,4.678684,2.647895,1.358047,1.514144,1.512718,1.861596,2.597170,2.802064,1.975399
5901,2.2,73.2,15.0,1.6,0.2,0.2,1.4,0.4,0.2,0.0,...,4.678684,2.647895,1.358047,1.514144,1.512718,1.861596,2.597170,2.802064,1.975399,2.324194
5902,73.2,15.0,1.6,0.2,0.2,1.4,0.4,0.2,0.0,0.0,...,2.647895,1.358047,1.514144,1.512718,1.861596,2.597170,2.802064,1.975399,2.324194,2.068101
5903,15.0,1.6,0.2,0.2,1.4,0.4,0.2,0.0,0.0,0.0,...,1.358047,1.514144,1.512718,1.861596,2.597170,2.802064,1.975399,2.324194,2.068101,2.785901


In [32]:
df_weather_load = pd.concat([data2, data1], axis = 1)

In [33]:
df_weather_load.tail(15)

Unnamed: 0,var1(t-15)_PRECIPITACAO_PR,var1(t-14)_PRECIPITACAO_PR,var1(t-13)_PRECIPITACAO_PR,var1(t-12)_PRECIPITACAO_PR,var1(t-11)_PRECIPITACAO_PR,var1(t-10)_PRECIPITACAO_PR,var1(t-9)_PRECIPITACAO_PR,var1(t-8)_PRECIPITACAO_PR,var1(t-7)_PRECIPITACAO_PR,var1(t-6)_PRECIPITACAO_PR,...,var1(t),var1(t+1),var1(t+2),var1(t+3),var1(t+4),var1(t+5),var1(t+6),var1(t+7),var1(t+8),var1(t+9)
5890,62.6,272.6,125.6,8.8,0.4,0.8,0.0,0.2,51.4,28.8,...,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708
5891,272.6,125.6,8.8,0.4,0.8,0.0,0.2,51.4,28.8,5.2,...,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708,12520.803833
5892,125.6,8.8,0.4,0.8,0.0,0.2,51.4,28.8,5.2,2.2,...,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708,12520.803833,10525.490875
5893,8.8,0.4,0.8,0.0,0.2,51.4,28.8,5.2,2.2,73.2,...,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708,12520.803833,10525.490875,9074.21125
5894,0.4,0.8,0.0,0.2,51.4,28.8,5.2,2.2,73.2,15.0,...,10256.970375,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708,12520.803833,10525.490875,9074.21125,11648.709583
5895,0.8,0.0,0.2,51.4,28.8,5.2,2.2,73.2,15.0,1.6,...,8938.579125,11713.104333,12054.195042,12186.721375,12482.523708,12520.803833,10525.490875,9074.21125,11648.709583,12162.756792
5896,0.0,0.2,51.4,28.8,5.2,2.2,73.2,15.0,1.6,0.2,...,,,,,,,,,,
5897,0.2,51.4,28.8,5.2,2.2,73.2,15.0,1.6,0.2,0.2,...,,,,,,,,,,
5898,51.4,28.8,5.2,2.2,73.2,15.0,1.6,0.2,0.2,1.4,...,,,,,,,,,,
5899,28.8,5.2,2.2,73.2,15.0,1.6,0.2,0.2,1.4,0.4,...,,,,,,,,,,


In [35]:
lags = 60
h = 10
pred, measures = multi_step_forecast(df_weather_load, lags, h)

ValueError: Number of features of the model must match the input. Model n_features_ is 60 and input n_features is 285