# **Preparación de datos previa a proceso de modelado**

## **Librerías**

In [1]:
from modeling_auxiliary_functions import (apply_pca,
                                          train_model, 
                                          split_train_test_date
                                 )
from modeling_auxiliary_functions import add_total_load

import warnings
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
warnings.filterwarnings('ignore')

## **Modelado predictivo**

El objetivo es predecir las 24 horas de la demanda total de energía con una anticipación de 12,24 y 36 horas según las sesiones intradiarias del mercado eléctrico mayorista.

In [None]:
basic_dataset = pd.read_pickle('./modeling_datasets/basic_dataset.pkl')

In [4]:
offer_dataset = pd.read_pickle('./modeling_datasets/market_offer_dataset.pkl')
offer_dataset = add_total_load(offer_dataset, basic_dataset)
X_train, y_train, X_test, y_test = split_train_test_date(offer_dataset, "total_load_actual", dt.datetime(2018,1,1,0))

In [13]:
min_variance = 0.95
pca_summary, num_components, X_train, X_test = apply_pca(min_variance, X_train, X_test)
pca_summary

Unnamed: 0,components,accumulated_variance
0,1,0.355424
1,2,0.643161
2,3,0.850071
3,4,0.918046
4,5,0.94526
5,6,0.96699
6,7,0.985653
7,8,1.0


In [8]:
# rfo_model = train_model(X_train, y_train, "randomForest")

In [7]:
xgb_model = train_model(X_train, y_train, "XGBoost")

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [9]:
# mlp_model = train_model(X_train, y_train, "MLP")

In [None]:
models_dic = {
    "randomForest": rfo_model,
    "XGBoost": xgb_model,
    "MLP": mlp_model,
}

In [None]:
def prep_results_df(X_test, y_test, models_dic, offer_type):
    tmp_df_list = []
    for model_name, model_predictor in models_dic.items():
        tmp_df = y_test.copy()
        tmp_df["prediction"] = model_predictor.predict(X_test).round(0)
        tmp_df["model"] = model_name
        tmp_df["offer_type"] = offer_type
        tmp_df_list.append(tmp_df)

    results_df = pd.concat(tmp_df_list)
    return results_df

In [10]:
prep_results_df(X_test, y_test, models_dic, offer_type)

In [15]:
# Get feature importances
importances = xgb_model.feature_importances_

# Create a DataFrame for feature importances
feature_importances_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importances_df['Feature'], feature_importances_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importances')
plt.gca().invert_yaxis()
plt.grid()
plt.show()

ValueError: All arrays must be of the same length

In [None]:
import scipy.stats as stats

def check_gaussian_distribution(data):

    # Shapiro-Wilk Test
    shapiro_test = stats.shapiro(data)

    # D'Agostino's K-squared Test
    k2_test = stats.normaltest(data)

    # Conclusion based on p-values
    alpha = 0.05
    if shapiro_test.pvalue > alpha and k2_test.pvalue > alpha:
        print(f"Los residuos forman una distribución Gaussiana (fail to reject H0).")
    else:
        print(f"Los residuos no forman una distribución Gaussiana (reject H0).")

    # Kurtosis
    kurtosis = stats.kurtosis(data, fisher=True)
    print(f"Curtosis: {kurtosis}")
    
    if kurtosis > 0:
        print("La distribución es leptocúrtica")
    elif kurtosis < 0:
        print("La distribución es platicúrtica")
    else:
        print("La distribución tiene una kurtosis normal (mesocúrtica).")


In [None]:
def calc_metrics(test_data):

    test_data["pred_dif"] = (test_data["pred"] - test_data["total_load_actual"])
    test_data["pred_dif_abs"] = test_data["pred_dif"].abs()

    mae = test_data["pred_dif_abs"].mean()
    mape = (test_data["pred_dif_abs"]/test_data["total_load_actual"]).abs().mean()*100

    mdae = test_data["pred_dif_abs"].abs().median()
    mdape = (test_data["pred_dif_abs"]/test_data["total_load_actual"]).abs().median()*100

    dicc = {"metrics":["MAE (MW)", "MAPE (%)", "MDAE (MW)", "MDAPE (%)"],
            "values":[round(mae,0), round(mape,2), round(mdae,0), round(mdape,2),]}

    plt.figure(figsize=(10, 4))
    plt.hist(test_data["pred_dif"], bins=50)
    plt.grid()

    metrics_df = pd.DataFrame(dicc)

    check_gaussian_distribution(test_data["pred_dif"])

    return metrics_df