## Imports

In [1]:
import pandas as pd
from pathlib import Path
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
import numpy as np

## Configurações

In [2]:
data_path = Path(r'..\..\01_dados\03_dados_modelagem')
save_data_path = Path(r'..\..\01_dados\04_dados_modelos')
save_model_path = Path(r'..\..\06_modelos')

missing_value_threshold = 0.50

## Funções e Classes Auxiliares

In [3]:
def create_lag_features(dataframe, lag):
    df = dataframe.copy()
    df['din_instante'] = pd.to_datetime(df['din_instante'])
    df.sort_values(by='din_instante', inplace=True)

    # Columns to be lagged (excluding 'din_instante' and 'cmo')
    lag_columns = [col for col in df.columns if col not in ['din_instante', 'cmo']]

    # Creating lagged columns
    for col in lag_columns:
        df[f'{col}_lag_{lag}'] = df[col].shift(lag)

    # Dropping original columns except 'din_instante' and 'cmo'
    df.drop(columns=lag_columns, inplace=True)
    
    return df


# Function to train and evaluate multiple regression models
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Ridge': Ridge(),
        'SVR': SVR()
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        results[name] = {'model': str(model), 'RMSE': rmse, 'R2': r2}

    return results


def train_specific_model(model_name, X_train, y_train):
    if model_name == 'RandomForestRegressor':
        model = RandomForestRegressor()
    elif model_name == 'GradientBoostingRegressor':
        model = GradientBoostingRegressor()
    elif model_name == 'LinearRegression':
        model = LinearRegression()
    elif model_name == 'Ridge':
        model = Ridge()
    elif model_name == 'SVR':
        model = SVR()
    else:
        raise ValueError("Unknown model name")

    model.fit(X_train, y_train)
    return model

## Importação dos dados

In [4]:
df_s = pd.read_csv(data_path / 'dados_sul.csv')
df_se = pd.read_csv(data_path / 'dados_sudeste.csv')
df_n = pd.read_csv(data_path / 'dados_norte.csv')
df_ne = pd.read_csv(data_path / 'dados_nordeste.csv')

df_list = [df_s, df_se, df_n, df_ne]

In [5]:
df_ne

Unnamed: 0,din_instante,cmo,"Geração no Centro de Gravidade - MW médios (Gp,j) - MWh","Fator de Abatimento de Perdas Internas Instantâneas (F_PDIp,j)","Fator de Rateio das Perdas de Geração (UXP_GLFp,j)*","Deslocamento Hidráulico Energético Preliminar (DH_ENER_PRE_UHp,j)","Garantia física modulada ajustada pelo fator de disponibilidade (GFIS_2p,j)","Garantia Física de Repasse de Risco Hidrológico Modulada e Ajustada (GFIS_2_RRHp,j) - MWh","Garantis Física Modulada Ajustada de Repasse do Risco Hidrológico (GFIS_3_RRHp,j) - MWh","Fator de Risco Hidrológico aceito pelo gerador, variando entre 0 e 11% (Fp,j)",...,val_verifunitcommitment,val_verifconstrainedoff,val_importacaoprogramada,val_importacaodespachada,val_importacaoverificada,val_preco_importacao,val_dispf,val_indisppf,val_indispff,val_intercambiomwmed
0,2018-04-17,117.138750,,,,,,,,,...,0.0,0.0,,,,,,,,0.0
1,2018-04-18,113.163542,,,,,,,,,...,0.0,0.0,,,,,,,,0.0
2,2018-04-19,90.739792,,,,,,,,,...,0.0,0.0,,,,,,,,0.0
3,2018-04-20,120.826458,,,,,,,,,...,0.0,0.0,,,,,,,,0.0
4,2018-04-21,101.858333,,,,,,,,,...,0.0,0.0,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1941,2023-08-10,0.000417,,,,,,,,,...,,,,,,,,,,
1942,2023-08-11,0.000417,,,,,,,,,...,,,,,,,,,,
1943,2023-08-12,0.000000,,,,,,,,,...,,,,,,,,,,
1944,2023-08-13,0.000000,,,,,,,,,...,,,,,,,,,,


## Tratamento dos dados

In [6]:
for df in df_list:
    df.fillna(-1, inplace=True)

### Modelagem

#### Nordeste

In [7]:
# Training models for lags 5 to 25 and collecting results
all_results = []
for lag in range(5, 76, 5):
    df_lagged = create_lag_features(df_ne, lag)
    df_lagged.dropna(inplace=True)

    X = df_lagged.drop(columns=['din_instante', 'cmo'])
    y = df_lagged['cmo']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    model_results = train_and_evaluate_models(X_train, X_test, y_train, y_test)
    for model_name, result in model_results.items():
        all_results.append({'model': model_name, 'lag': lag, 'RMSE': result['RMSE'], 'R2': result['R2']})

# Converting results to DataFrame
all_results_df = pd.DataFrame(all_results)
all_results_df.sort_values(by='R2', ignore_index=True, inplace=True, ascending=False)
all_results_df.head(20)

Unnamed: 0,model,lag,RMSE,R2
0,GradientBoostingRegressor,20,104.772467,0.924558
1,RandomForestRegressor,20,109.022271,0.918314
2,RandomForestRegressor,30,110.479533,0.912001
3,GradientBoostingRegressor,30,114.575437,0.905355
4,RandomForestRegressor,35,126.110521,0.868404
5,RandomForestRegressor,25,120.661276,0.851534
6,RandomForestRegressor,70,110.193905,0.835454
7,GradientBoostingRegressor,75,123.18821,0.834727
8,GradientBoostingRegressor,35,142.007845,0.833135
9,GradientBoostingRegressor,25,132.326391,0.82144


In [8]:
best_model_info = all_results_df.iloc[0]

best_model_name = best_model_info['model']
best_lag = best_model_info['lag']

best_model = train_specific_model(best_model_name, X_train, y_train)

model_path = save_model_path / 'melhor_modelo_ne.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

#### Norte

In [9]:
# Training models for lags 5 to 25 and collecting results
all_results = []
for lag in range(5, 76, 5):
    df_lagged = create_lag_features(df_n, lag)
    df_lagged.dropna(inplace=True)

    X = df_lagged.drop(columns=['din_instante', 'cmo'])
    y = df_lagged['cmo']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    model_results = train_and_evaluate_models(X_train, X_test, y_train, y_test)
    for model_name, result in model_results.items():
        all_results.append({'model': model_name, 'lag': lag, 'RMSE': result['RMSE'], 'R2': result['R2']})

# Converting results to DataFrame
all_results_df = pd.DataFrame(all_results)
all_results_df.sort_values(by='R2', ignore_index=True, inplace=True, ascending=False)
all_results_df.head(20)

Unnamed: 0,model,lag,RMSE,R2
0,RandomForestRegressor,35,97.517797,0.929564
1,RandomForestRegressor,60,92.066302,0.928421
2,RandomForestRegressor,25,89.390897,0.925934
3,RandomForestRegressor,70,82.303276,0.915239
4,GradientBoostingRegressor,25,101.818943,0.903907
5,RandomForestRegressor,75,97.336497,0.901697
6,GradientBoostingRegressor,50,114.712638,0.899376
7,GradientBoostingRegressor,30,126.709149,0.895985
8,GradientBoostingRegressor,40,133.28822,0.895579
9,GradientBoostingRegressor,35,120.163933,0.893051


In [10]:
best_model_info = all_results_df.iloc[0]

best_model_name = best_model_info['model']
best_lag = best_model_info['lag']

best_model = train_specific_model(best_model_name, X_train, y_train)

model_path = save_model_path / 'melhor_modelo_n.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

#### Sul

In [11]:
# Training models for lags 5 to 25 and collecting results
all_results = []
for lag in range(5, 76, 5):
    df_lagged = create_lag_features(df_s, lag)
    df_lagged.dropna(inplace=True)

    X = df_lagged.drop(columns=['din_instante', 'cmo'])
    y = df_lagged['cmo']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    model_results = train_and_evaluate_models(X_train, X_test, y_train, y_test)
    for model_name, result in model_results.items():
        all_results.append({'model': model_name, 'lag': lag, 'RMSE': result['RMSE'], 'R2': result['R2']})

# Converting results to DataFrame
all_results_df = pd.DataFrame(all_results)
all_results_df.sort_values(by='R2', ignore_index=True, inplace=True, ascending=False)
all_results_df.head(20)

Unnamed: 0,model,lag,RMSE,R2
0,RandomForestRegressor,35,84.351549,0.956479
1,RandomForestRegressor,60,82.515596,0.948736
2,GradientBoostingRegressor,35,95.411709,0.944318
3,RandomForestRegressor,15,95.522272,0.939681
4,RandomForestRegressor,50,96.008623,0.93951
5,GradientBoostingRegressor,15,103.075182,0.929765
6,RandomForestRegressor,75,82.40055,0.928117
7,RandomForestRegressor,40,125.625124,0.917751
8,GradientBoostingRegressor,30,121.176952,0.916966
9,RandomForestRegressor,30,123.262807,0.914083


In [12]:
best_model_info = all_results_df.iloc[0]

best_model_name = best_model_info['model']
best_lag = best_model_info['lag']

best_model = train_specific_model(best_model_name, X_train, y_train)

model_path = save_model_path / 'melhor_modelo_s.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

#### Sudeste

In [13]:
# Training models for lags 5 to 25 and collecting results
all_results = []
for lag in range(5, 76, 5):
    df_lagged = create_lag_features(df_se, lag)
    df_lagged.dropna(inplace=True)

    X = df_lagged.drop(columns=['din_instante', 'cmo'])
    y = df_lagged['cmo']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    model_results = train_and_evaluate_models(X_train, X_test, y_train, y_test)
    for model_name, result in model_results.items():
        all_results.append({'model': model_name, 'lag': lag, 'RMSE': result['RMSE'], 'R2': result['R2']})

# Converting results to DataFrame
all_results_df = pd.DataFrame(all_results)
all_results_df.sort_values(by='R2', ignore_index=True, inplace=True, ascending=False)
all_results_df.head(20)

Unnamed: 0,model,lag,RMSE,R2
0,RandomForestRegressor,35,66.718447,0.972884
1,GradientBoostingRegressor,35,85.717178,0.955243
2,RandomForestRegressor,60,77.814724,0.954584
3,GradientBoostingRegressor,30,91.036613,0.953262
4,RandomForestRegressor,30,94.683034,0.949442
5,RandomForestRegressor,50,96.957281,0.938587
6,RandomForestRegressor,75,76.622395,0.938069
7,GradientBoostingRegressor,60,92.631828,0.935642
8,GradientBoostingRegressor,15,103.073873,0.930044
9,RandomForestRegressor,40,117.990838,0.927698


In [14]:
best_model_info = all_results_df.iloc[0]

best_model_name = best_model_info['model']
best_lag = best_model_info['lag']

best_model = train_specific_model(best_model_name, X_train, y_train)

model_path = save_model_path / 'melhor_modelo_se.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

## Salvando Dataframes

In [15]:
df_lagged_se = create_lag_features(df_se, 35)
df_lagged_s = create_lag_features(df_s, 35)
df_lagged_n = create_lag_features(df_n, 35)
df_lagged_ne = create_lag_features(df_ne, 20)

df_lagged_se.to_csv(save_data_path / 'dados_se_lag_35.csv', index=False)
df_lagged_s.to_csv(save_data_path / 'dados_s_lag_35.csv', index=False)
df_lagged_n.to_csv(save_data_path / 'dados_n_lag_35.csv', index=False)
df_lagged_ne.to_csv(save_data_path / 'dados_ne_lag_20.csv', index=False)