## Imports

In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from sklearn.base import BaseEstimator
from typing import List, Dict
import numpy as np

## Configurações

In [2]:
data_path = Path(r'..\..\01_dados\03_dados_modelagem')

# Configuração para os lags
lags = range(5, 91, 5)

# Definindo os modelos
modelos = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Regression': SVR(),
}

## Funções e Classes Auxliares

In [7]:
def load_and_preprocess_data(file_path):
    """
    Carrega e pré-processa os dados.
    """
    df = pd.read_csv(file_path)
    df.fillna(0, inplace=True)
    feature_columns = df.columns.drop(['cmo', 'din_instante'])
    scaler = StandardScaler()
    df[feature_columns] = scaler.fit_transform(df[feature_columns])
    return df

def create_lags(df, lag, columns):
    """
    Cria lags para as colunas especificadas.
    """
    df_lagged = df.copy()
    for col in columns:
        df_lagged[f'{col}_lag_{lag}'] = df_lagged[col].shift(lag)
    return df_lagged.dropna()

def train_and_evaluate_model(train, test, feature_columns, lag, model):
    """
    Treina e avalia o modelo para um determinado lag.
    
    Args:
    train (pd.DataFrame): DataFrame de treino.
    test (pd.DataFrame): DataFrame de teste.
    feature_columns (List[str]): Lista de colunas de características.
    lag (int): Valor do lag para criar variáveis defasadas.
    model: Modelo de machine learning a ser treinado.
    model_type (str): Tipo do modelo ('sklearn' ou 'arima').

    Returns:
    Dict[str, float]: Dicionário contendo o lag, RMSE e R² do modelo.
    """
    train_lagged = create_lags(train, lag, feature_columns)
    X_train = train_lagged.drop(['cmo', 'din_instante'] + [f'{col}_lag_{lag}' for col in feature_columns], axis=1)
    y_train = train_lagged['cmo']

    test_lagged = create_lags(test, lag, feature_columns)
    X_test = test_lagged.drop(['cmo', 'din_instante'] + [f'{col}_lag_{lag}' for col in feature_columns], axis=1)
    y_test = test_lagged['cmo']

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)


    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    return {'lag': lag, 'rmse': rmse, 'r2': r2}


def train_and_evaluate_arima(train, test, order):
    model = ARIMA(train, order=order)
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=len(test))
    rmse = np.sqrt(mean_squared_error(test, predictions))
    r2 = r2_score(test, predictions)
    return {'rmse': rmse, 'r2': r2}


def train_and_evaluate_sarima(train, test, order, seasonal_order):
    model = SARIMAX(train, order=order, seasonal_order=seasonal_order)
    model_fit = model.fit(disp=False)
    predictions = model_fit.forecast(steps=len(test))
    rmse = np.sqrt(mean_squared_error(test, predictions))
    r2 = r2_score(test, predictions)
    return {'rmse': rmse, 'r2': r2}

## Importação dos dados

In [4]:
df_s = pd.read_csv(data_path / 'dados_sul.csv')
df_se = pd.read_csv(data_path / 'dados_sudeste.csv')
df_n = pd.read_csv(data_path / 'dados_norte.csv')
df_ne = pd.read_csv(data_path / 'dados_nordeste.csv')

## Avaliação Multitemporal

In [5]:
# Carregar e pré-processar os dados
df = load_and_preprocess_data(data_path / 'dados_nordeste.csv')
feature_columns = df.columns.drop(['cmo', 'din_instante'])

# Dividir os dados em treino e teste
train = df[df['din_instante'] < '2023-01-01']
test = df[df['din_instante'] >= '2023-01-01']

# Loop para avaliar cada modelo em cada lag
for nome_modelo, modelo in modelos.items():
    print(f"\n\nAvaliando o modelo: {nome_modelo}")
    for lag in lags:
        resultados = train_and_evaluate_model(train, test, feature_columns, lag, modelo)
        print(f"Lag: {lag}, RMSE: {resultados['rmse']:.3f}, R²: {resultados['r2']:.3f}")



Avaliando o modelo: Linear Regression
Lag: 5, RMSE: 148.808, R²: -5021863643396.422
Lag: 10, RMSE: 151.126, R²: -5065170587093.944
Lag: 15, RMSE: 152.247, R²: -5024595626564.265
Lag: 20, RMSE: 152.089, R²: -4898370503757.056
Lag: 25, RMSE: 152.682, R²: -4819966575146.108
Lag: 30, RMSE: 152.356, R²: -4683249961835.788
Lag: 35, RMSE: 153.150, R²: -4614810185370.591
Lag: 40, RMSE: 154.682, R²: -4587827091096.463
Lag: 45, RMSE: 156.978, R²: -4601741183800.727
Lag: 50, RMSE: 158.905, R²: -4777609427120.292
Lag: 55, RMSE: 160.089, R²: -4715003631656.312
Lag: 60, RMSE: 161.015, R²: -4634000438634.229
Lag: 65, RMSE: 162.797, R²: -4598513947403.562
Lag: 70, RMSE: 162.980, R²: -4469857612057.246
Lag: 75, RMSE: 164.569, R²: -4415788817181.162
Lag: 80, RMSE: 166.388, R²: -4369099215461.679
Lag: 85, RMSE: 167.805, R²: -4296523851159.960
Lag: 90, RMSE: 169.893, R²: -4253161327768.834


Avaliando o modelo: Random Forest
Lag: 5, RMSE: 107.958, R²: -2643150534548.269
Lag: 10, RMSE: 105.411, R²: -2464

para uma avaliação sequencial o modelo não performou, vamos tentar outras abrdagens.