# LightGBM

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
from load import load_data
from load import series_to_supervised
from load import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from load import get_measures
import lightgbm as lgb

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / LightGBM warnings that may point at real problems.
warnings.filterwarnings("ignore")

rcParams['figure.figsize'] = 15, 5 # default figure size (width, height)

In [2]:
def lightgbm_forecast(train, testX):
    """Fit a LightGBM regressor on ``train`` and return a one-step forecast.

    ``train`` is any sequence of rows convertible to a 2-D array; in every
    row the last column is the target and the preceding columns are the
    inputs. ``testX`` is a single row of inputs. A new model is fitted on
    each call, and the first element of its prediction is returned.
    """
    history = np.asarray(train)
    # Everything but the last column feeds the model; the last column is the target.
    features = history[:, :-1]
    target = history[:, -1]
    regressor = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
    regressor.fit(features, target)
    # predict() expects a 2-D input, hence the single row wrapped in a list.
    forecast = regressor.predict([testX])
    return forecast[0]

# Walk-forward validation for univariate data.
def walk_forward_validation(data, n_test):
    """Evaluate one-step LightGBM forecasts with an expanding history.

    ``data`` is split by ``train_test_split(data, n_test)``. Each test row is
    forecast by a model fitted on the current history, and the true row is
    then appended to the history before the next forecast.

    Returns the tuple ``(mae, mape, rmse, actual, predictions)`` where
    ``actual`` is the last column of the test set.
    """
    train, test = train_test_split(data, n_test)
    history = list(train)
    predictions = []
    for row in test:
        # Last column is the observed value; the rest are the model inputs.
        inputs, expected = row[:-1], row[-1]
        forecast = lightgbm_forecast(history, inputs)
        predictions.append(forecast)
        # The observation becomes available for the next fit.
        history.append(row)
        print('>expected = %.1f, predicted = %.1f' % (expected, forecast))
    actual = test[:, -1]
    mae = mean_absolute_error(actual, predictions)
    mape = mean_absolute_percentage_error(actual, predictions)
    rmse = np.sqrt(mean_squared_error(actual, predictions))
    return mae, mape, rmse, actual, predictions

def multi_step_forecast(data, lag, n):
    """Direct multi-step forecast: fit one LightGBM model per horizon.

    Parameters
    ----------
    data : pandas.DataFrame
        Supervised-learning frame. Its first ``lag`` columns are used as
        model inputs and its last ``n`` columns as targets, one per
        horizon. It must contain a ``"var1(t)"`` column, which is passed
        to ``get_measures`` as the observed series.
    lag : int
        Number of leading columns of ``data`` used as inputs.
    n : int
        Size of the hold-out set passed to ``train_test_split`` and number
        of target columns (horizons).

    Returns
    -------
    tuple
        ``(predictions, df_measures)``: the list of forecasts (one per
        horizon, all made from the first test row) and a one-row DataFrame
        built from the result of ``get_measures``.
    """
    train, test = train_test_split(data, n)
    # Non-mutating dropna: the split result may share data with the caller's
    # frame, so it is not modified in place.
    train = train.dropna()
    response_vars = data.columns[-n:]
    feature_cols = list(data.columns[:lag])
    # Every horizon is forecast from the same (first) test row. It is selected
    # once, with exactly the columns the models are trained on, and kept as a
    # one-row DataFrame so fit and predict see the same kind of input.
    testX = test.iloc[[0]][feature_cols]
    predictions = []
    for h, response in enumerate(response_vars):
        data_ = train[feature_cols + [response]]
        # Drop the last h training rows for the h-th target column.
        # NOTE(review): presumably this keeps targets that fall inside the
        # hold-out period out of training -- confirm.
        data_ = data_.iloc[:data_.shape[0] - h, :]
        data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
        model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
        model.fit(data_X, data_y)
        testy = test[response].iloc[0]
        pred = model.predict(testX)[0]
        print(f"Predicting {response}\n  > expected: {testy}, predicted: {pred}")
        predictions.append(pred)
    # NOTE(review): the two series carry different indexes (0..n-1 vs. the
    # test rows' labels); verify that get_measures compares them by position.
    measures = get_measures(pd.Series(predictions), test["var1(t)"])
    df_measures = pd.DataFrame([measures])
    return predictions, df_measures

In [24]:
# Read the load series and the daily weather observations.
df_load = load_data()
df_weather = pd.read_csv("../data/weather_daily_data.csv", parse_dates=["DATA"])

# Restrict both frames to the same period: keep weather rows whose date is in
# the load index, then slice the load frame to the weather date range.
in_load_index = df_weather["DATA"].isin(df_load.index)
df_weather = df_weather[in_load_index]
first_day, last_day = df_weather["DATA"].min(), df_weather["DATA"].max()
df_load = df_load[first_day:last_day]

# Outer-join weather and load on the date and keep a single date column.
df_load_2 = df_load.reset_index()
df_merged = df_weather.merge(df_load_2, how="outer", left_on="DATA", right_on="date")
df_merged = df_merged.drop(columns="date")

# Remove fully empty rows, order chronologically, and fill gaps in the load
# series by linear interpolation.
df_merged = df_merged.dropna(how="all").sort_values(by="DATA")
df_merged["load_mwmed"] = df_merged["load_mwmed"].interpolate(method="linear")

Unnamed: 0,var1(t-15),var1(t-14),var1(t-13),var1(t-12),var1(t-11),var1(t-10),var1(t-9),var1(t-8),var1(t-7),var1(t-6),...,var1(t+5),var1(t+6),var1(t+7),var1(t+8),var1(t+9),var1(t+10),var1(t+11),var1(t+12),var1(t+13),var1(t+14)
0,,,,,,,,,,,...,8530.962083,8611.037083,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083
1,,,,,,,,,,,...,8611.037083,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083,5596.388333
2,,,,,,,,,,,...,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083,5596.388333,7749.104167
3,,,,,,,,,,,...,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083,5596.388333,7749.104167,8171.389583
4,,,,,,,,,,,...,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083,5596.388333,7749.104167,8171.389583,9031.222917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5900,11361.001881,11305.707062,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,...,,,,,,,,,,
5901,11305.707062,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,...,,,,,,,,,,
5902,10060.549787,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,...,,,,,,,,,,
5903,8994.856748,11424.087542,11964.909375,12269.051375,12021.415458,11802.526458,10256.970375,8938.579125,11713.104333,12054.195042,...,,,,,,,,,,


In [52]:
# Forecast configuration used by the following cells:
#   lag    -> number of lagged observations per variable (n_in of series_to_supervised)
#   outs   -> number of target columns / forecast horizons (n_out of series_to_supervised)
#   n_test -> number of rows held out by train_test_split
#lag = 15
lag = 60
outs = n_test = 15

In [53]:

# Turn the interpolated load series into a supervised-learning frame with
# `lag` input columns and `outs` target columns. dropnan=False keeps the
# rows that contain NaN (see the first and last rows of the output below).
df_load_3 = df_merged.load_mwmed
values = df_load_3.values.tolist()
data1 = series_to_supervised(values, n_in = lag, n_out=outs, dropnan=False)
data1

Unnamed: 0,var1(t-60),var1(t-59),var1(t-58),var1(t-57),var1(t-56),var1(t-55),var1(t-54),var1(t-53),var1(t-52),var1(t-51),...,var1(t+5),var1(t+6),var1(t+7),var1(t+8),var1(t+9),var1(t+10),var1(t+11),var1(t+12),var1(t+13),var1(t+14)
0,,,,,,,,,,,...,8530.962083,8611.037083,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083
1,,,,,,,,,,,...,8611.037083,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083,5596.388333
2,,,,,,,,,,,...,7707.624167,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083,5596.388333,7749.104167
3,,,,,,,,,,,...,6268.828333,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083,5596.388333,7749.104167,8171.389583
4,,,,,,,,,,,...,8068.169583,8367.525833,8541.254167,8388.782917,6175.557500,6404.927083,5596.388333,7749.104167,8171.389583,9031.222917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5900,12049.438875,12864.373792,12801.830000,11890.737708,11762.593583,10305.348750,9318.594333,12069.142875,11989.101125,12151.257583,...,,,,,,,,,,
5901,12864.373792,12801.830000,11890.737708,11762.593583,10305.348750,9318.594333,12069.142875,11989.101125,12151.257583,12512.483583,...,,,,,,,,,,
5902,12801.830000,11890.737708,11762.593583,10305.348750,9318.594333,12069.142875,11989.101125,12151.257583,12512.483583,12271.662917,...,,,,,,,,,,
5903,11890.737708,11762.593583,10305.348750,9318.594333,12069.142875,11989.101125,12151.257583,12512.483583,12271.662917,10395.100000,...,,,,,,,,,,


In [54]:
# Build the lagged input features (X) for every weather variable.
# `lag`, the number of lagged observations per variable, is set in an earlier cell.
lagged_frames = []
for col in df_weather.columns:
    if col == "DATA":
        continue  # the date column is not a feature
    values = df_weather[col].values.tolist()
    df_ = series_to_supervised(values, n_in = lag, dropnan=False)
    # The response variable comes from the load dataframe, so this
    # variable's own "var1(t)" column is dropped.
    df_ = df_.drop(columns="var1(t)")
    # Suffix each lag column with the variable name to keep the names unique.
    df_.columns = [f"{x}_{col}" for x in df_.columns]
    lagged_frames.append(df_)
# Concatenate once: growing the frame inside the loop copies every
# previously accumulated column again on each iteration.
data2 = pd.concat(lagged_frames, axis = 1) if lagged_frames else pd.DataFrame()
data2

Unnamed: 0,var1(t-60)_PRECIPITACAO_PR,var1(t-59)_PRECIPITACAO_PR,var1(t-58)_PRECIPITACAO_PR,var1(t-57)_PRECIPITACAO_PR,var1(t-56)_PRECIPITACAO_PR,var1(t-55)_PRECIPITACAO_PR,var1(t-54)_PRECIPITACAO_PR,var1(t-53)_PRECIPITACAO_PR,var1(t-52)_PRECIPITACAO_PR,var1(t-51)_PRECIPITACAO_PR,...,var1(t-10)_VELOCIDADE_VENTO_SC,var1(t-9)_VELOCIDADE_VENTO_SC,var1(t-8)_VELOCIDADE_VENTO_SC,var1(t-7)_VELOCIDADE_VENTO_SC,var1(t-6)_VELOCIDADE_VENTO_SC,var1(t-5)_VELOCIDADE_VENTO_SC,var1(t-4)_VELOCIDADE_VENTO_SC,var1(t-3)_VELOCIDADE_VENTO_SC,var1(t-2)_VELOCIDADE_VENTO_SC,var1(t-1)_VELOCIDADE_VENTO_SC
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,0.830682
2,,,,,,,,,,,...,,,,,,,,,0.830682,0.636364
3,,,,,,,,,,,...,,,,,,,,0.830682,0.636364,0.950000
4,,,,,,,,,,,...,,,,,,,0.830682,0.636364,0.950000,1.659091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5900,0.0,0.0,72.6,61.6,69.4,5.0,2.2,213.4,3.8,2.4,...,4.184062,4.678684,2.647895,1.358047,1.514144,1.512718,1.861596,2.597170,2.802064,1.975399
5901,0.0,72.6,61.6,69.4,5.0,2.2,213.4,3.8,2.4,25.0,...,4.678684,2.647895,1.358047,1.514144,1.512718,1.861596,2.597170,2.802064,1.975399,2.324194
5902,72.6,61.6,69.4,5.0,2.2,213.4,3.8,2.4,25.0,129.2,...,2.647895,1.358047,1.514144,1.512718,1.861596,2.597170,2.802064,1.975399,2.324194,2.068101
5903,61.6,69.4,5.0,2.2,213.4,3.8,2.4,25.0,129.2,18.0,...,1.358047,1.514144,1.512718,1.861596,2.597170,2.802064,1.975399,2.324194,2.068101,2.785901


In [55]:
# Column-wise join: weather lag features first, then the load lags and targets.
# pd.concat aligns the two frames on their index.
df_weather_load = pd.concat([data2, data1], axis = 1)
#df_weather_load.loc[:, ["var1(t-1)_PRECIPITACAO_PR", "var1(t)"]].tail(15)

In [None]:
# WITHOUT A FUNCTION: inline version of the direct multi-step forecast
# (one LightGBM model per target column).
#n_test = outs = 10 # works with 10, but not with other horizons
train, test = train_test_split(df_weather_load, n_test)
train.dropna(inplace = True)
# The last `outs` columns are the targets, one per horizon.
response_vars = df_weather_load.columns[-(outs):]
predictions = list()
for h, response in enumerate(response_vars):
    # Inputs: every column except the last `outs` ones.
    cols = [x for x in df_weather_load.columns[:df_weather_load.shape[1] - outs]]
    print(cols)
    cols.append(response)
    data_ = train[cols]
    nrows = data_.shape[0]
    # Drop the last h training rows for the h-th target column.
    # NOTE(review): presumably to keep targets from the hold-out period out of training -- confirm.
    data_ = data_.iloc[:nrows-h, :] 
    data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
    model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
    model.fit(data_X, data_y)
    # All horizons are predicted from the first test row, using the columns up to "var1(t-1)".
    testX, testy = test.reset_index(drop=True).loc[0, :"var1(t-1)"], test.reset_index(drop=True).loc[0, response]
    pred = model.predict([testX])[0]
    print(f"Predicting {response}\n  > expected: {testy}, predicted: {pred}")
    predictions.append(pred)
# Compare the per-horizon forecasts with the "var1(t)" column of the test rows.
measures = get_measures(pd.Series(predictions), test["var1(t)"])
df_measures = pd.DataFrame([measures])
print(df_measures)

In [None]:
def multi_step_forecast(data, n):
    """Direct multi-step forecast: fit one LightGBM model per horizon.

    Parameters
    ----------
    data : pandas.DataFrame
        Supervised-learning frame whose last ``n`` columns are the targets
        (one per horizon); every other column is used as a model input.
        It must contain a ``"var1(t)"`` column, which is passed to
        ``get_measures`` as the observed series.
    n : int
        Size of the hold-out set passed to ``train_test_split`` and number
        of target columns (horizons).

    Returns
    -------
    pandas.DataFrame
        One-row frame built from the result of ``get_measures``.
    """
    train, test = train_test_split(data, n)
    # Non-mutating dropna: the split result may share data with the caller's
    # frame, so it is not modified in place.
    train = train.dropna()
    response_vars = data.columns[-n:]
    # Loop-invariant: the inputs are every column before the n target columns.
    feature_cols = list(data.columns[:data.shape[1] - n])
    # Every horizon is forecast from the same (first) test row. It is selected
    # once, with exactly the columns the models are trained on, and kept as a
    # one-row DataFrame so fit and predict see the same kind of input.
    testX = test.iloc[[0]][feature_cols]
    predictions = []
    for h, response in enumerate(response_vars):
        data_ = train[feature_cols + [response]]
        # Drop the last h training rows for the h-th target column.
        # NOTE(review): presumably this keeps targets that fall inside the
        # hold-out period out of training -- confirm.
        data_ = data_.iloc[:data_.shape[0] - h, :]
        data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
        model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
        model.fit(data_X, data_y)
        testy = test[response].iloc[0]
        pred = model.predict(testX)[0]
        print(f"Predicting {response}\n  > expected: {testy}, predicted: {pred}")
        predictions.append(pred)
    # NOTE(review): the two series carry different indexes (0..n-1 vs. the
    # test rows' labels); verify that get_measures compares them by position.
    measures = get_measures(pd.Series(predictions), test["var1(t)"])
    df_measures = pd.DataFrame([measures])
    return df_measures
# Run the direct multi-step forecast on the combined weather + load frame,
# with `outs` as both the hold-out size and the number of horizons.
multi_step_forecast(df_weather_load, outs)