## Imports

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import warnings
from src.data_process import DataStorage, FeaturesGenerator
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import numpy as np
from xgboost import XGBRegressor
from codecarbon import EmissionsTracker

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

## Read data

In [None]:
# Load the stored data and run it through the feature generator to obtain
# the training frame.
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)
raw_training_frame = data_storage.df_data
train_data = features_generator.generate_features(raw_training_frame)

In [None]:
# Preprocessing parameters (also logged alongside the training parameters).
preprocessing_params = dict(
    n_features=60,
    is_holiday=True,
)

In [None]:
# Build the pairwise correlation matrix of the training frame.
correlation_matrix = train_data.corr()

# Rank the columns by the absolute value of their correlation with the target.
target_column = 'target'
correlation_with_target = correlation_matrix[target_column].abs().sort_values(ascending=False)

# Keep the N most correlated features, excluding the target itself.
# The target is removed by label instead of assuming it sorts into position 0,
# and head(N) yields exactly N entries (a [1:N] slice yields only N - 1).
top_n_features = (
    correlation_with_target
    .drop(target_column)
    .head(preprocessing_params["n_features"])
)

print("Top correlated features with", target_column)
print(top_n_features.index.tolist())

In [None]:
# Build a new DataFrame holding only the columns named in top_n_features.
selected_feature_names = top_n_features.index.tolist()
selected_features_df = train_data[selected_feature_names]

In [None]:
# Feature matrix and target vector.
X = selected_features_df
y = train_data[target_column]

# Splitter for time-series cross-validation.
tsvc = TimeSeriesSplit(n_splits=6)

# Model hyperparameters.
params = dict(
    learning_rate=0.1,
    max_depth=10,
    n_estimators=500,
)

# Run the time-series cross-validation.
cv_model = XGBRegressor(**params, enable_categorical=True)
scores = cross_val_score(
    cv_model,
    X,
    y,
    cv=tsvc,
    scoring="neg_mean_absolute_error",
)

# The scorer reports the negated MAE (neg_mean_absolute_error), so the sign of
# the averaged score is flipped to get a positive error value.
mean_score = -scores.mean()
print(f"Mean_score: {mean_score}")

In [None]:
# Create the emissions tracker.
tracker = EmissionsTracker()
# Define the model with the chosen hyperparameters.
my_model = XGBRegressor(**params, enable_categorical=True)
# Start tracking and train. The tracker is stopped in a finally clause so it
# does not keep running if training raises.
tracker.start()
try:
    my_model.fit(X, y)
finally:
    emissions = tracker.stop()
print(f"Emissions:{emissions}")

In [None]:
# Read the example test files; date columns are parsed while reading.
new_client = pd.read_csv("data/example_test_files/client.csv")
new_gas_prices = pd.read_csv("data/example_test_files/gas_prices.csv")
new_electricity_prices = pd.read_csv(
    "data/example_test_files/electricity_prices.csv",
    parse_dates=["forecast_date","origin_date"],
)
new_forecast_weather = pd.read_csv(
    "data/example_test_files/forecast_weather.csv",
    parse_dates=["origin_datetime", "forecast_datetime"],
)
new_historical_weather = pd.read_csv(
    "data/example_test_files/historical_weather.csv",
    parse_dates=["datetime"],
)
new_revealed_targets = pd.read_csv(
    "data/example_test_files/revealed_targets.csv",
    parse_dates=["datetime"],
)

# Feed the new data into the storage object.
data_storage.update_with_new_data(
    df_new_client=new_client,
    df_new_gas_prices=new_gas_prices,
    df_new_electricity_prices=new_electricity_prices,
    df_new_forecast_weather=new_forecast_weather,
    df_new_historical_weather=new_historical_weather,
    df_new_target=new_revealed_targets,
)

# Preprocess the test rows and generate their features (no target available).
raw_test = pd.read_csv("data/example_test_files/test.csv",  parse_dates=["prediction_datetime"])
df_test = data_storage.preprocess_test(raw_test)
df_test_features = features_generator.generate_features(df_test, has_target=False)

In [None]:
# Predict on the test features, restricted to the columns named in
# top_n_features.
prediction_columns = top_n_features.index.tolist()
predictions = my_model.predict(df_test_features[prediction_columns])

In [None]:
import mlflow

# Connect to the MLflow tracking server and select the experiment.
# NOTE(review): MLFLOW_SERVER_URL is not defined in the visible part of this
# notebook — confirm it is set before this cell runs.
mlflow.set_tracking_uri(uri=MLFLOW_SERVER_URL)
mlflow.set_experiment("Enefit-XGBoost")

# Per the original author's note, MLflow did not accept categorical variables
# at the time of writing, so these columns are cast to strings so that all
# features get registered. This modifies X in place.
categorical_columns = ["is_consumption", "product_type", "is_business", "county", "segment"]
X[categorical_columns] = X[categorical_columns].astype(str)
signature = mlflow.models.infer_signature(X, predictions)

In [None]:
# Record the experiment in MLflow.
with mlflow.start_run():
    # Log the training hyperparameters merged with the preprocessing parameters.
    mlflow.log_params(params | preprocessing_params)
    mlflow.log_metric("mean_absolute_error", mean_score)
    # Also store the emissions value returned by the tracker.
    mlflow.log_metric("emissions", emissions)
    mlflow.set_tag("Basic XGBoost experiment", "First experiment")
    model_info = mlflow.xgboost.log_model(
        xgb_model=my_model,
        artifact_path="enefit_model",
        signature=signature,
        # A few rows are enough as an input example; passing the whole
        # training frame would store every row with the model artifact.
        input_example=X.head(5),
        registered_model_name="enefit-xgboost-experiment",
    )