# LightGBM

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
from load import load_data
from load import series_to_supervised
from load import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from load import get_measures
import lightgbm as lgb
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

rcParams['figure.figsize'] = 15, 5 # tamanho das figuras

In [2]:
def lightgbm_forecast(train, testX):
    """Fit a LightGBM regressor on ``train`` and predict a single step.

    Parameters
    ----------
    train : sequence of rows
        Converted with ``np.asarray``; every column but the last is used as
        input and the last column is used as the target.
    testX : sequence
        Input values of the single row to predict.

    Returns
    -------
    The first (and only) element of the model's prediction for ``testX``.
    """
    history = np.asarray(train)
    # Last column is the target, everything before it is the input.
    features = history[:, :-1]
    target = history[:, -1]
    regressor = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
    regressor.fit(features, target)
    # predict() expects a 2-D input, so the single row is wrapped in a list.
    return regressor.predict([testX])[0]

# Walk-forward validation for univariate data.
def walk_forward_validation(data, n_test):
    """Evaluate one-step forecasts by refitting on a growing history.

    ``data`` is split with ``train_test_split(data, n_test)``. For every test
    row a model is fitted on all rows seen so far (via ``lightgbm_forecast``),
    the row's last column is predicted from its other columns, and the true
    row is then appended to the history.

    Returns
    -------
    tuple
        ``(mae, mape, rmse, observed, predictions)`` where ``observed`` is the
        last column of the test split and ``predictions`` is a list.
    """
    train, test = train_test_split(data, n_test)
    # Seed the history with the training rows.
    history = list(train)
    observed = test[:, -1]
    predictions = []
    for row in test:
        # Fit on everything seen so far and predict this row's target.
        yhat = lightgbm_forecast(history, row[:-1])
        predictions.append(yhat)
        # The real observation becomes available for the next step.
        history.append(row)
        print('>expected = %.1f, predicted = %.1f' % (row[-1], yhat))
    # Error estimates over the whole test split.
    mae = mean_absolute_error(observed, predictions)
    mape = mean_absolute_percentage_error(observed, predictions)
    rmse = np.sqrt(mean_squared_error(observed, predictions))
    return mae, mape, rmse, observed, predictions

def multi_step_forecast(data, lag, n):
    """Direct multi-step forecast: one LightGBM model per horizon column.

    Parameters
    ----------
    data : pandas.DataFrame
        Supervised-learning frame. Its first ``lag`` columns are used as the
        model inputs and its last ``n`` columns as the responses (one per
        forecast step). It must contain a ``"var1(t)"`` column, which is used
        as the observed series when computing the error measures.
    lag : int
        Number of leading columns of ``data`` used as features.
    n : int
        Number of forecast steps; also passed to ``train_test_split`` as the
        test size (assumed to hold out the last ``n`` rows -- TODO confirm
        against ``load.train_test_split``).

    Returns
    -------
    tuple
        ``(predictions, df_measures)``: the list of per-step predictions and a
        one-row DataFrame built from the mapping returned by ``get_measures``.
    """
    n_test = outs = n
    train, test = train_test_split(data, n_test)
    # Non-mutating dropna: the split may be a slice of the caller's frame.
    train = train.dropna()
    # Positional index so row 0 is the first test row and so the observed
    # series below shares the 0..n-1 index of pd.Series(predictions).
    test = test.reset_index(drop=True)

    feature_cols = list(data.columns[:lag])
    response_vars = data.columns[-outs:]
    # Predict from exactly the columns the models are trained on. A label
    # slice up to "var1(t-1)" only matches them when the frame has no other
    # feature columns.
    testX = test.loc[0, feature_cols]

    predictions = []
    for h, response in enumerate(response_vars):
        data_ = train[feature_cols + [response]]
        # Drop the last h training rows for step h -- presumably so targets
        # that fall inside the test window are not used for fitting.
        data_ = data_.iloc[:data_.shape[0] - h, :]
        data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
        model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
        model.fit(data_X, data_y)
        testy = test.loc[0, response]
        pred = model.predict([testX])[0]
        print(f"Predicting {response}\n  > expected: {testy}, predicted: {pred}")
        predictions.append(pred)

    # Both series are indexed 0..n-1, so any index-aligned arithmetic inside
    # get_measures pairs step k with observation k.
    measures = get_measures(pd.Series(predictions), test["var1(t)"])
    df_measures = pd.DataFrame([measures])
    return predictions, df_measures

In [51]:
# Load the load frame (project helper) and the daily weather table.
df_load = load_data()
df_weather = pd.read_csv("../data/weather_daily_data.csv", parse_dates=["DATA"])

# Keep the same period in both frames: weather rows whose date appears in the
# load index, then load rows between the first and last remaining weather date.
df_weather = df_weather.loc[df_weather["DATA"].isin(df_load.index)]
df_load = df_load[df_weather["DATA"].min():df_weather["DATA"].max()]

# Outer-join weather and load on their date columns, drop the duplicated key,
# discard rows that are entirely empty and order the result by date.
df_load_2 = df_load.reset_index()
df_merged = (
    pd.merge(df_weather, df_load_2, left_on="DATA", right_on="date", how="outer")
    .drop("date", axis=1)
    .dropna(how="all")
    .sort_values(by="DATA")
)

# Fill gaps in the load column by linear interpolation.
df_merged["load_mwmed"] = df_merged["load_mwmed"].interpolate(method="linear")

In [53]:
# Values passed to series_to_supervised as n_in (lag) and n_out (outs);
# n_test is the size of the hold-out used by the forecasting cells.
lag = 15
outs = n_test = 15

# Load series indexed by date, from 2008 onwards. set_index returns a new
# frame, so the column object held by df_merged keeps its own index.
df_load_3 = df_merged.set_index("DATA")["load_mwmed"]
df_load_3 = df_load_3["2008-01-01":]
values = df_load_3.values.tolist()
data1 = series_to_supervised(values, n_in = lag, n_out=outs, dropnan=False)

# Index the weather table by date only if that has not happened yet, so the
# cell can be re-run without a KeyError on the missing "DATA" column.
# Original note on this step: TEST - MAPE went from 3.2 to 3.0.
if "DATA" in df_weather.columns:
    df_weather = df_weather.set_index("DATA")
df_weather = df_weather["2008-01-01":]

# One block of lagged columns per weather variable, concatenated once at the
# end instead of growing a DataFrame inside the loop.
weather_frames = []
for col in df_weather.columns:
    if col == "DATA":
        continue
    values = df_weather[col].values.tolist()
    df_ = series_to_supervised(values, n_in = lag, dropnan=False)
    # The response variable comes from the load frame, not from the weather.
    df_ = df_.drop("var1(t)", axis = 1)
    df_.columns = [f"{x}_{col}" for x in df_.columns]
    weather_frames.append(df_)
data2 = pd.concat(weather_frames, axis = 1) if weather_frames else pd.DataFrame()

# Weather lags first, then load lags and load horizons.
df_weather_load = pd.concat([data2, data1], axis = 1)
df_weather_load

Unnamed: 0,var1(t-15)_PRECIPITACAO_PR,var1(t-14)_PRECIPITACAO_PR,var1(t-13)_PRECIPITACAO_PR,var1(t-12)_PRECIPITACAO_PR,var1(t-11)_PRECIPITACAO_PR,var1(t-10)_PRECIPITACAO_PR,var1(t-9)_PRECIPITACAO_PR,var1(t-8)_PRECIPITACAO_PR,var1(t-7)_PRECIPITACAO_PR,var1(t-6)_PRECIPITACAO_PR,...,var1(t+5),var1(t+6),var1(t+7),var1(t+8),var1(t+9),var1(t+10),var1(t+11),var1(t+12),var1(t+13),var1(t+14)
0,,,,,,,,,,,...,6930.564167,9086.430417,9437.825000,9535.760417,9946.955417,9684.091667,8194.232083,6997.824167,9407.491250,9897.250000
1,,,,,,,,,,,...,9086.430417,9437.825000,9535.760417,9946.955417,9684.091667,8194.232083,6997.824167,9407.491250,9897.250000,9960.365417
2,,,,,,,,,,,...,9437.825000,9535.760417,9946.955417,9684.091667,8194.232083,6997.824167,9407.491250,9897.250000,9960.365417,9751.891667
3,,,,,,,,,,,...,9535.760417,9946.955417,9684.091667,8194.232083,6997.824167,9407.491250,9897.250000,9960.365417,9751.891667,9676.098333
4,,,,,,,,,,,...,9946.955417,9684.091667,8194.232083,6997.824167,9407.491250,9897.250000,9960.365417,9751.891667,9676.098333,8231.710417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5260,5.2,2.2,73.2,15.0,1.6,0.2,0.2,1.4,0.4,0.2,...,,,,,,,,,,
5261,2.2,73.2,15.0,1.6,0.2,0.2,1.4,0.4,0.2,0.0,...,,,,,,,,,,
5262,73.2,15.0,1.6,0.2,0.2,1.4,0.4,0.2,0.0,0.0,...,,,,,,,,,,
5263,15.0,1.6,0.2,0.2,1.4,0.4,0.2,0.0,0.0,0.0,...,,,,,,,,,,


In [None]:
# Function-free version of the direct multi-step forecast.
# Uses n_test / outs from the cell that builds df_weather_load.
# Original note (on a commented-out `n_test = outs = 10`): it works with 10,
# but not with other horizons.
train, test = train_test_split(df_weather_load, n_test)
# Non-mutating dropna: the split may be a slice of df_weather_load.
train = train.dropna()

# Loop invariants: every column except the last `outs` is a feature, the
# last `outs` columns are the per-step responses.
feature_cols = list(df_weather_load.columns[:df_weather_load.shape[1] - outs])
response_vars = df_weather_load.columns[-(outs):]
# All steps are predicted from the first test row, using exactly the
# columns the models are trained on.
first_test_row = test.reset_index(drop=True).loc[0]
testX = first_test_row[feature_cols]

predictions = list()
for h, response in enumerate(response_vars):
    data_ = train[feature_cols + [response]]
    # Drop the last h training rows for step h -- presumably so targets that
    # fall inside the test window are not used for fitting.
    data_ = data_.iloc[:data_.shape[0] - h, :]
    data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
    model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
    model.fit(data_X, data_y)
    testy = first_test_row[response]
    pred = model.predict([testX])[0]
    print(f"Predicting {response}\n  > expected: {testy}, predicted: {pred}")
    predictions.append(pred)

# Give the observed values the same 0..n-1 index as pd.Series(predictions) so
# any index-aligned arithmetic inside get_measures pairs matching steps.
measures = get_measures(pd.Series(predictions), test["var1(t)"].reset_index(drop=True))
df_measures = pd.DataFrame([measures])
df_measures

In [None]:
def multi_step_forecast(data, n):
    """Direct multi-step forecast: one LightGBM model per horizon column.

    Note: this definition replaces any earlier ``multi_step_forecast`` in the
    notebook; it takes no ``lag`` argument and returns only the measures.

    Parameters
    ----------
    data : pandas.DataFrame
        Supervised-learning frame. Its last ``n`` columns are the responses
        (one per forecast step) and every column before them is a feature.
        It must contain a ``"var1(t)"`` column, used as the observed series
        when computing the error measures.
    n : int
        Number of forecast steps; also passed to ``train_test_split`` as the
        test size (assumed to hold out the last ``n`` rows -- TODO confirm
        against ``load.train_test_split``).

    Returns
    -------
    pandas.DataFrame
        One-row frame built from the mapping returned by ``get_measures``.
    """
    n_test = outs = n
    train, test = train_test_split(data, n_test)
    # Non-mutating dropna: the split may be a slice of the caller's frame.
    train = train.dropna()
    # Positional index so row 0 is the first test row and so the observed
    # series below shares the 0..n-1 index of pd.Series(predictions).
    test = test.reset_index(drop=True)

    feature_cols = list(data.columns[:data.shape[1] - outs])
    response_vars = data.columns[-outs:]
    # All steps are predicted from the first test row, using exactly the
    # columns the models are trained on.
    testX = test.loc[0, feature_cols]

    predictions = []
    for h, response in enumerate(response_vars):
        data_ = train[feature_cols + [response]]
        # Drop the last h training rows for step h -- presumably so targets
        # that fall inside the test window are not used for fitting.
        data_ = data_.iloc[:data_.shape[0] - h, :]
        data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
        model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
        model.fit(data_X, data_y)
        testy = test.loc[0, response]
        pred = model.predict([testX])[0]
        print(f"Predicting {response}\n  > expected: {testy}, predicted: {pred}")
        predictions.append(pred)

    # Both series are indexed 0..n-1, so any index-aligned arithmetic inside
    # get_measures pairs step k with observation k.
    measures = get_measures(pd.Series(predictions), test["var1(t)"])
    df_measures = pd.DataFrame([measures])
    return df_measures

# Run the forecast on the combined weather + load frame with `outs` steps;
# as the last expression of the cell, the returned value is displayed.
multi_step_forecast(df_weather_load, outs)

In [59]:
# Values passed to series_to_supervised as n_in (lag) and n_out (outs);
# n_test is the hold-out size and horz the step between consecutive folds.
lag = 15
outs = n_test = horz = 15

# Load series indexed by date, from 2008 onwards. set_index returns a new
# frame, so the column object held by df_merged keeps its own index.
df_load_3 = df_merged.set_index("DATA")["load_mwmed"]
df_load_3 = df_load_3["2008-01-01":]
values = df_load_3.values.tolist()
data1 = series_to_supervised(values, n_in = lag, n_out=outs, dropnan=False)

# Index the weather table by date here if no earlier cell has done so; the
# date slice below needs a date index, so the cell no longer depends on
# which cells ran before it.
if "DATA" in df_weather.columns:
    df_weather = df_weather.set_index("DATA")
df_weather = df_weather["2008-01-01":]

# One block of lagged columns per weather variable, concatenated once.
weather_frames = []
for col in df_weather.columns:
    if col == "DATA":
        continue
    values = df_weather[col].values.tolist()
    df_ = series_to_supervised(values, n_in = lag, dropnan=False)
    # The response variable comes from the load frame, not from the weather.
    df_ = df_.drop("var1(t)", axis = 1)
    df_.columns = [f"{x}_{col}" for x in df_.columns]
    weather_frames.append(df_)
data2 = pd.concat(weather_frames, axis = 1) if weather_frames else pd.DataFrame()
df_weather_load = pd.concat([data2, data1], axis = 1)


folds = 15 # number of partitions
rows = df_weather_load.shape[0]
# Fold invariants: every column except the last `outs` is a feature, the
# last `outs` columns are the per-step responses.
feature_cols = list(df_weather_load.columns[:df_weather_load.shape[1] - outs])
response_vars = df_weather_load.columns[-(outs):]
out = defaultdict(dict)
for fold in range(folds,0,-1):
    # Each fold keeps the first `slide` rows; fold 1 uses the whole frame and
    # every further fold ends `horz` rows earlier.
    slide = rows-((fold-1)*horz)
    df_cv = df_weather_load.iloc[:slide,:]

    train, test = train_test_split(df_cv, n_test)
    # Non-mutating dropna: the split may be a slice of df_weather_load.
    train = train.dropna()
    # Positional index so row 0 is the first test row of this fold.
    test = test.reset_index(drop=True)
    # All steps are predicted from the first test row, using exactly the
    # columns the models are trained on.
    testX = test.loc[0, feature_cols]
    print(f"predicting for cv {fold}...")
    predictions = list()
    for h, response in enumerate(response_vars):
        data_ = train[feature_cols + [response]]
        # Drop the last h training rows for step h -- presumably so targets
        # that fall inside the test window are not used for fitting.
        data_ = data_.iloc[:data_.shape[0] - h, :]
        data_X, data_y = data_.iloc[:, :-1], data_.iloc[:, -1]
        model = lgb.LGBMRegressor(objective='regression', n_estimators=1000)
        model.fit(data_X, data_y)
        testy = test.loc[0, response]
        pred = model.predict([testX])[0]
        print(f"\tPredicting {response}\n\t\t> expected: {testy}, predicted: {pred}")
        predictions.append(pred)
    # Keep the predictions and the observed values of the fold side by side.
    out[f"cv_{fold}"]["pred"] = predictions
    out[f"cv_{fold}"]["test"] = test["var1(t)"].to_list()
d = dict(out)

predicting for cv 15...


In [56]:
# Inspect the cross-validation results: per fold, the "pred" and "test" lists.
d

{'cv_1': {'pred': [11742.29490159845,
   11752.398105879569,
   11673.18263280014,
   11756.246350118126,
   10086.618251221176,
   9414.412437259482,
   11647.149970102593,
   11914.721892459947,
   11408.0590723506,
   11993.182060853731,
   11603.84333900538,
   10317.489987196555,
   9163.67476409575,
   11211.218489322331,
   11729.157162021822],
  'test': [11964.909375,
   12269.051375,
   12021.41545833,
   11802.52645833,
   10256.970375,
   8938.579125,
   11713.10433333,
   12054.19504167,
   12186.721375,
   12482.52370833,
   12520.80383333,
   10525.490875,
   9074.21125,
   11648.70958333,
   12162.75679167]}}

In [57]:
# Score every cross-validation fold and collect its MAPE.
mapes = []
for fold_name, fold_result in d.items():
    meas = get_measures(pd.Series(fold_result["pred"]), pd.Series(fold_result["test"]))
    print(meas)
    mapes.append(meas["mape"])

{'erro': 2836397.641487, 'mae': 355.927546, 'mse': 189093.176099, 'rmse': 434.848452, 'mape': 0.030654, 'smape': 3.115692}


In [58]:
# Average MAPE over the folds scored above.
np.mean(mapes)

0.030654