In [1]:
import pandas as pd
import warnings
from src.data_process import DataStorage, FeaturesGenerator
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import numpy as np
from codecarbon import EmissionsTracker
import lightgbm as lgb
import tensorflow as tf

# Install a blanket "ignore" filter: no Python warning of any category is shown
# from this point on.
# NOTE(review): this also hides deprecation and pandas assignment warnings raised
# by later cells - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw data through the project's storage object, then derive the
# training feature table from its `df_data` frame.
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)
train_data = features_generator.generate_features(
    data_storage.df_data,
)

In [4]:
# Preprocessing settings: how many top-correlated features to keep, plus the
# holiday flag. Both are logged to MLflow together with the model parameters.
preprocessing_params = dict(
    n_features=60,
    is_holiday=True,
)

In [5]:
# Pairwise correlation between all columns of the training table.
correlation_matrix = train_data.corr()

# Rank every column by the absolute value of its correlation with the target.
target_column = 'target'
correlation_with_target = correlation_matrix[target_column].abs().sort_values(ascending=False)

# Keep the N most correlated features, excluding the target itself.
# The target is dropped by label instead of assuming it sits at position 0, and
# head(N) returns exactly N features (the previous slice [1:N] returned N - 1).
top_n_features = (
    correlation_with_target
    .drop(labels=target_column)
    .head(preprocessing_params["n_features"])
)

print("Top correlated features with", target_column)
print(top_n_features.index.tolist())

Top correlated features with target
['target_168h', 'target_336h', 'target_144h', 'target_mean', 'target_192h', 'target_312h', 'target_48h', 'target_120h', 'target_72h', 'target_216h', 'target_96h', 'target_288h', 'target_240h', 'target_264h', 'target_std', 'installed_capacity', 'target_all_type_sum_168h', 'target_all_type_sum_336h', 'target_all_type_sum_48h', 'target_all_type_sum_72h', 'target_all_county_type_sum_168h', 'target_all_county_type_sum_336h', 'target_all_county_type_sum_48h', 'target_all_county_type_sum_72h', 'eic_count', 'is_consumption', 'product_type', 'is_business', 'county', 'segment', 'cos(hour)', 'surface_solar_radiation_downwards_forecast_local_0h', 'surface_solar_radiation_downwards', 'shortwave_radiation_historical_local_48h', 'shortwave_radiation', 'diffuse_radiation', 'shortwave_radiation_historical_local_168h', 'shortwave_radiation_historical_168h', 'surface_solar_radiation_downwards_forecast_168h', 'diffuse_radiation_historical_168h', 'surface_solar_radiation

In [6]:
# Build the model input frame from the selected feature columns only; the
# target column is not added here.
selected_feature_names = top_n_features.index.tolist()
selected_features_df = train_data[selected_feature_names]

In [15]:

from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score

ModuleNotFoundError: No module named 'keras'

In [19]:
# Hyperparameters of the neural network: activation function, number of
# training epochs, and the count and width of the hidden layers.
params = dict(
    activation_func="relu",
    epochs=15,
    n_hidden_layers=3,
    n_per_layer=64,
)

In [None]:
# KerasRegressor is imported in this cell because the earlier import cell only
# brings in KerasClassifier.
from scikeras.wrappers import KerasRegressor

# Feature matrix (the selected feature columns) and the regression target.
X = selected_features_df
y = train_data[target_column]


def create_network(params=params):
    """Build and compile the dense regression network.

    The network is a Dense(128) layer sized to the number of feature columns
    in ``X``, followed by ``n_hidden_layers`` Dense layers of ``n_per_layer``
    units each, and a single-unit output layer.

    Args:
        params: Hyperparameter dict. ``n_per_layer`` is required;
            ``n_hidden_layers`` (default 3) and ``activation_func``
            (default "relu") are optional. The default value is the
            ``params`` dict as it exists when this cell is executed.

    Returns:
        A compiled ``tf.keras`` Sequential model (Adam optimizer, MAE loss).
    """
    activation = params.get("activation_func", "relu")
    hidden_layer_count = params.get("n_hidden_layers", 3)

    # The input width is the number of feature columns; len(X) is the number of
    # rows and would not match the data fed to the network.
    layers = [
        tf.keras.layers.Dense(128, input_shape=(X.shape[1],), activation=activation)
    ]
    layers.extend(
        tf.keras.layers.Dense(params["n_per_layer"], activation=activation)
        for _ in range(hidden_layer_count)
    )
    layers.append(tf.keras.layers.Dense(1))

    model = tf.keras.models.Sequential(layers)
    model.compile(optimizer='adam', loss='mae', metrics=['mae'])
    return model


def _build_network():
    """Zero-argument builder handed to the scikeras wrapper.

    scikeras passes its own parameter dict to builder arguments named
    ``params``, which would replace the hyperparameter dict above. Going
    through this function keeps ``create_network`` on its default argument.
    """
    return create_network()


# The target is continuous and the network is trained on MAE, so the
# scikit-learn wrapper must be a regressor, not a classifier. `model=` replaces
# the deprecated `build_fn=` keyword.
my_model = KerasRegressor(model=_build_network, epochs=params["epochs"], batch_size=32,
                          verbose=0)
     

In [None]:
# Splitter for time-series cross-validation (6 folds).
# NOTE(review): assumes the rows of X / y are in chronological order - confirm.
tsvc = TimeSeriesSplit(n_splits=6)

# `params` is intentionally NOT redefined here: it must keep the network
# hyperparameters so that the values logged to MLflow later describe this model
# (the previous LightGBM-style dict was never used by the network).

# Run the time-series cross-validation of the network.
scores = cross_val_score(my_model, X, y, cv=tsvc, scoring="neg_mean_absolute_error")

# scikit-learn exposes this metric negated (neg_mean_absolute_error), so the
# mean is multiplied by -1 to report a positive MAE.
mean_score = np.mean(scores)*-1
print(f"Mean_score: {mean_score}")

In [None]:
# Create the emissions tracker.
tracker = EmissionsTracker()

# Track emissions while the model is fitted on the full training set. The
# tracker is stopped in `finally` so it does not keep running if fit() raises.
tracker.start()
try:
    my_model.fit(X, y)
finally:
    emissions = tracker.stop()
print(f"Emissions:{emissions}")

In [None]:
# Read the example test files (in the same order as before) and push them into
# the data storage, then preprocess the test rows and generate their features.
new_frames = {
    "df_new_client": pd.read_csv("data/example_test_files/client.csv"),
    "df_new_gas_prices": pd.read_csv("data/example_test_files/gas_prices.csv"),
    "df_new_electricity_prices": pd.read_csv(
        "data/example_test_files/electricity_prices.csv",
        parse_dates=["forecast_date","origin_date"],
    ),
    "df_new_forecast_weather": pd.read_csv(
        "data/example_test_files/forecast_weather.csv",
        parse_dates=["origin_datetime", "forecast_datetime"],
    ),
    "df_new_historical_weather": pd.read_csv(
        "data/example_test_files/historical_weather.csv",
        parse_dates=["datetime"],
    ),
    "df_new_target": pd.read_csv(
        "data/example_test_files/revealed_targets.csv",
        parse_dates=["datetime"],
    ),
}
data_storage.update_with_new_data(**new_frames)

raw_test_frame = pd.read_csv("data/example_test_files/test.csv",  parse_dates=["prediction_datetime"])
df_test = data_storage.preprocess_test(raw_test_frame)
df_test_features = features_generator.generate_features(df_test, has_target=False)

In [None]:
# Predict on the test features, restricted to the columns the model was fitted on.
prediction_columns = top_n_features.index.tolist()
predictions = my_model.predict(df_test_features[prediction_columns])

In [None]:
import mlflow

# Connect to the MLflow tracking server and select the experiment.
# NOTE(review): MLFLOW_SERVER_URL is not defined in any cell shown here -
# confirm where it is set before running on a fresh kernel.
mlflow.set_tracking_uri(uri=MLFLOW_SERVER_URL)
mlflow.set_experiment("Enefit-Nn")

# Per the original author's note, MLflow does not accept categorical variables,
# so these columns are cast to strings (in place on X) so that every feature is
# recorded in the signature.
string_cast_columns = ["is_consumption","product_type","is_business","county","segment"]
X[string_cast_columns] = X[string_cast_columns].astype(str)
signature = mlflow.models.infer_signature(X, predictions)

In [None]:
# Record the experiment in MLflow.
with mlflow.start_run():
    # Training parameters and preprocessing parameters are logged together.
    mlflow.log_params(params | preprocessing_params)
    mlflow.log_metric("mean_absolute_error", mean_score)
    # The tracked emissions are stored as a metric as well.
    mlflow.log_metric("emissions", emissions)
    mlflow.set_tag("NN experiment", "First experiment")
    model_info = mlflow.keras.log_model(
        # The Keras flavor needs the fitted Keras model, which the scikeras
        # wrapper exposes as `model_` after fit(); the wrapper itself is not
        # a Keras model.
        keras_model=my_model.model_,
        artifact_path="enefit_model",
        signature=signature,
        # A handful of rows is enough as an example; passing all of X would
        # store the whole training frame alongside the model.
        input_example=X.head(5),
        registered_model_name="enefit-nn-experiment"
    )