In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import mlflow
from mlflow.models import infer_signature

In [3]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import shap
from cubist import Cubist

# Fetch the dataset registered under id 851 through ucimlrepo
STEEL_DATASET_ID = 851
steel_industry_energy_consumption = fetch_ucirepo(id=STEEL_DATASET_ID)

In [4]:
# Build the working DataFrame from the `original` table of the fetched dataset
original_table = steel_industry_energy_consumption.data.original
df = pd.DataFrame(original_table)

In [5]:
# Parse the 'date' strings (day/month/year hour:minute) into datetimes
DATE_FORMAT = '%d/%m/%Y %H:%M'
df['date'] = pd.to_datetime(df['date'], format=DATE_FORMAT)

In [6]:
# One-hot encode "Day_of_week" and "Load_Type"; the source columns are replaced
# by their indicator columns and the first level of each one is dropped.
categorical_columns = ["Day_of_week", "Load_Type"]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [7]:
# Visualizamos las columnas actuales del DataFrame
# df.columns.to_list()

In [8]:
# Crear nuevas columnas descomponiendo la fecha y hora en sus componentes
# df['year'] = df['date'].dt.year
# df['month'] = df['date'].dt.month
# df['day'] = df['date'].dt.day
# df['hour'] = df['date'].dt.hour
# df['minute'] = df['date'].dt.minute
# df['second'] = df['date'].dt.second
# df['dayofweek'] = df['date'].dt.dayofweek  # Lunes=0, Domingo=6
# df['dayofyear'] = df['date'].dt.dayofyear
# df['weekofyear'] = df['date'].dt.isocalendar().week
# df['quarter'] = df['date'].dt.quarter

In [9]:
# Boolean flag that separates weekend rows from working-day rows
df['IsWeekend'] = df['WeekStatus'].eq('Weekend')

In [10]:
# 'date' is dropped because, according to the EDA, power depends very little on time.
# 'WeekStatus' is dropped as well; according to the EDA power differs between
# working days and weekends, and that split is already held in 'IsWeekend'.
df = df.drop(columns=['date', 'WeekStatus'])

In [11]:
#df.dtypes

In [12]:
# Point MLflow at the tracking server used by every experiment in this notebook
TRACKING_URI = "http://localhost:5001"
mlflow.set_tracking_uri(uri=TRACKING_URI)

In [13]:
# Feature matrix X (every column except the target) and dependent variable y.
# The former `X = df` assignment was dead code: it was overwritten on the next line.
TARGET_COLUMN = 'Usage_kWh'
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

In [14]:
experiment_name = "RegresionLineal_Kfolds"
# Reuse the experiment when it already exists; otherwise create it.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment:
    experiment_id = existing_experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment(experiment_name)

In [15]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [16]:
# End any run that is still active before opening a new one
if mlflow.active_run() is not None:
    mlflow.end_run()

# Number of K-fold partitions. A single constant feeds both the splitter and the
# logged parameter so the value recorded in MLflow cannot drift from the one used.
N_SPLITS = 10

with mlflow.start_run(experiment_id=experiment_id):

    # Linear regression: hold out 20% of the data as the final test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to NumPy arrays so the integer indices produced by KFold can be used
    # for positional indexing. The explicit float dtype avoids the object-dtype
    # array pandas returns when boolean columns (e.g. IsWeekend) are mixed with
    # numeric ones.
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.to_numpy(dtype=float)
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.to_numpy(dtype=float)
    if isinstance(y_train, pd.Series):
        y_train = y_train.to_numpy()
    if isinstance(y_test, pd.Series):
        y_test = y_test.to_numpy()

    # Linear regression model
    model = LinearRegression()

    # K-fold cross-validation over the training portion only
    kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

    mse_scores = []
    mae_scores = []
    r2_scores = []

    for train_index, test_index in kfold.split(X_train):
        X_kfold_train, X_kfold_test = X_train[train_index], X_train[test_index]
        y_kfold_train, y_kfold_test = y_train[train_index], y_train[test_index]

        # fit() re-estimates the model from scratch on every fold
        model.fit(X_kfold_train, y_kfold_train)
        y_pred = model.predict(X_kfold_test)

        mse_scores.append(mean_squared_error(y_kfold_test, y_pred))
        mae_scores.append(mean_absolute_error(y_kfold_test, y_pred))
        r2_scores.append(r2_score(y_kfold_test, y_pred))

    # Average the per-fold metrics
    avg_mse = np.mean(mse_scores)
    avg_mae = np.mean(mae_scores)
    avg_r2 = np.mean(r2_scores)

    # Log the cross-validation summary in MLflow
    mlflow.log_param("kfolds", N_SPLITS)
    mlflow.log_metric("avg_mse", avg_mse)
    mlflow.log_metric("avg_mae", avg_mae)
    mlflow.log_metric("avg_r2", avg_r2)

    # Train the final model on the whole training set
    model.fit(X_train, y_train)

    # Evaluate the final model on the held-out test set
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Mean Squared Error (MSE): {mse}')
    print(f'Root Mean Squared Error (RMSE): {rmse}')
    print(f'Mean Absolute Error (MAE): {mae}')
    print(f'R^2 Score: {r2}')

    # Log the final evaluation metrics in MLflow
    mlflow.log_metric("test_mse", mse)
    mlflow.log_metric("test_rmse", rmse)
    mlflow.log_metric("test_mae", mae)
    mlflow.log_metric("test_r2", r2)

    # (Optional) store the final model as an artifact
    mlflow.sklearn.log_model(model, "model")

# Agregamos los modelos a comparar: Regresión Lineal, Árbol de Decisión, RandomForest, XGBoost Regressor

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [19]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from cubist import Cubist
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Candidate models, each paired with the name used for its MLflow experiment.
# The random forest is seeded so repeated executions give the same metrics.
modelos = [
    ('LinearRegression', LinearRegression()),
    ('DecisionTreeRegressor', DecisionTreeRegressor(max_depth=2)),
    ('RandomForestRegressor', RandomForestRegressor(min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1, oob_score=True, random_state=42, verbose=0, warm_start=False)),
    ('XGBRegressor', xgb.XGBRegressor(learning_rate=0.01, n_estimators=500, max_depth=5, eval_metric='rmsle'))
]

# Train, evaluate and log every candidate. Uses the X_train / X_test / y_train /
# y_test split left in the namespace by the linear-regression cell.
for nombre, model in modelos:
    experiment_name = nombre
    # Reuse the experiment named after the model when it exists; otherwise create it
    existing_experiment = mlflow.get_experiment_by_name(experiment_name)
    if existing_experiment is not None:
        experiment_id = existing_experiment.experiment_id
    else:
        experiment_id = mlflow.create_experiment(experiment_name)

    with mlflow.start_run(experiment_id=experiment_id):
        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Compute the metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Log parameters and metrics in MLflow
        mlflow.log_param("model_name", nombre)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        # Log the model
        mlflow.sklearn.log_model(model, "model")

        # score() must be given the true targets: scoring against y_pred compares
        # the predictions with themselves and always yields 1.0.
        print(f'{nombre} score: {model.score(X_test, y_test):.03f}', end=" ")
        print(f'Mean Squared Error (MSE): {mse:.03f}', end=" ")
        print(f'Root Mean Squared Error (RMSE): {rmse:.03f}', end=" ")
        print(f'Mean Absolute Error (MAE): {mae:.03f}', end=" ")
        print(f'R^2 Score: {r2:.03f}', end=" ")
        print("")

# Modelo CUBIST

In [20]:
from cubist import Cubist

# Fresh 80/20 split (same seed as before); from here on the splits are pandas objects
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Cubist model
cubist_model = Cubist(n_rules=500, neighbors=None, unbiased=True, auto=False, extrapolation=0.1, n_committees=5)

experiment_name = 'Cubist'
# Reuse the experiment when it exists; otherwise create it
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is not None:
    experiment_id = existing_experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment(experiment_name)

# Train and evaluate the Cubist model inside its own MLflow run
with mlflow.start_run(experiment_id=experiment_id):
    # Train the model
    cubist_model.fit(X_train, y_train)
    y_pred = cubist_model.predict(X_test)

    # Compute the metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log parameters and metrics in MLflow
    mlflow.log_param("model_name", experiment_name)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Log the model (the artifact path is the experiment name, as before)
    mlflow.sklearn.log_model(cubist_model, experiment_name)

    # Print the results. score() receives the true targets; scoring against
    # y_pred compares the predictions with themselves and always yields 1.0.
    print(f'{experiment_name} score: {cubist_model.score(X_test, y_test):.03f}', end=" ")
    print(f'Mean Squared Error (MSE): {mse:.03f}', end=" ")
    print(f'Root Mean Squared Error (RMSE): {rmse:.03f}', end=" ")
    print(f'Mean Absolute Error (MAE): {mae:.03f}', end=" ")
    print(f'R^2 Score: {r2:.03f}', end=" ")
    print("")


# Buscamos mejores valores de hiperparámetros para los 2 modelos con mejores resultados, XGBRegressor y Cubist
## 1. XGBRegressor

In [21]:
from pprint import pprint
import numpy as np

# Candidate values for each hyperparameter
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
max_leaves = [0, 8, 10, 12, 16, 20]  # 0 means 'no limit'
max_depth = np.linspace(10, 110, num=11).astype(int).tolist()
max_depth.append(None)

random_grid = dict(n_estimators=n_estimators,
                   max_leaves=max_leaves,
                   max_depth=max_depth)

print('Los valores a probar en la búsqueda aleatoria son:')
pprint(random_grid)

print()
# Size of the full grid: product of the number of candidates per hyperparameter
total_combinations = 1
for candidates in random_grid.values():
    total_combinations *= len(candidates)
print('El número total de combinaciones de parámetros de entrenamiento es', total_combinations)

## Usaremos la recomendación de búsqueda con el 10%

In [22]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search
random_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Base estimator and randomized search (20 sampled configurations, 3-fold CV)
xgbb = xgb.XGBRegressor()
xgb_random = RandomizedSearchCV(estimator=xgbb,
                                param_distributions=random_grid,
                                n_iter=20,
                                cv=3,
                                verbose=2,
                                random_state=0,
                                n_jobs=-1)

# Create or fetch the MLflow experiment
model_name = 'XGBRegressor_RandomSearch'
experiment_name = model_name
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is not None:
    experiment_id = existing_experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment(experiment_name)

# Train and evaluate XGBRegressor with the randomized search
with mlflow.start_run(experiment_id=experiment_id):
    # Run the search and keep the best estimator found
    xgb_random.fit(X_train, y_train)
    best_model = xgb_random.best_estimator_
    y_pred = best_model.predict(X_test)

    # Compute the metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log the selected hyperparameters and the metrics in MLflow
    mlflow.log_params(xgb_random.best_params_)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Log the model
    mlflow.sklearn.log_model(best_model, "model")

    # Print the results. score() receives the true targets; scoring against
    # y_pred compares the predictions with themselves and always yields 1.0.
    print(f'{model_name} best score: {best_model.score(X_test, y_test):.03f}', end=" ")
    print(f'Mean Squared Error (MSE): {mse:.03f}', end=" ")
    print(f'Root Mean Squared Error (RMSE): {rmse:.03f}', end=" ")
    print(f'Mean Absolute Error (MAE): {mae:.03f}', end=" ")
    print(f'R^2 Score: {r2:.03f}', end=" ")
    print("")


In [23]:
# Best estimator found by the randomized search, plus its score on both splits
xgb_random_best = xgb_random.best_estimator_
xgb_best_params = xgb_random.best_params_

print('Los hiperparámetros del mejor modelo son:')
pprint(xgb_best_params)
print()

xgb_train_score = xgb_random_best.score(X_train, y_train)
xgb_test_score = xgb_random_best.score(X_test, y_test)
print('Exactitud luego de búsqueda aleatoria en entrenamiento:', xgb_train_score)
print('Exactitud luego de búsqueda aleatoria en validación:', xgb_test_score)

## 2. Búsqueda de hiperparámetros para el método Cubist

# Agregamos modelos a utilizar, Árbol de Decisión, RandomForest, XGBoost Regressor, CUBIST

In [24]:
from pprint import pprint
import numpy as np

### Hyperparameters considered
# n_rules (int, default=500)
# n_committees (int, default=0):
# neighbors (int, default=None)
# extrapolation (float, default=0.05):

n_rules = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
n_committees = [1, 2, 5, 10, 15]  # 5 is the value recommended in the documentation
# Every integer from 1 to 9. The previous np.linspace(1, 9, num=1) produced the
# single value [1], so the rest of the range was never part of the grid.
neighbors = list(range(1, 10))
extrapolation = [0.01, 0.03, 0.05, 0.07, 0.09]  # 0.05 = 5% is the recommended value


random_grid = {'n_rules': n_rules,
               'n_committees': n_committees,
               'neighbors': neighbors,
               'extrapolation':extrapolation}

print('Los valores a probar en la búsqueda aleatoria son:')
pprint(random_grid)

print()
print('El número total de combinaciones de parámetros de entrenamiento es',
      len(random_grid['n_rules']) *
      len(random_grid['n_committees']) *
      len(random_grid['neighbors']) *
      len(random_grid['extrapolation'])
      )

In [25]:
from cubist import Cubist

# Search space for the randomized search
random_grid = {
    'n_rules': [50, 100, 200, 500],
    'neighbors': [0, 1, 5],  # NOTE(review): confirm that Cubist accepts 0 here
    'unbiased': [True, False],
    'extrapolation': [0.0, 0.1, 0.5],
    'n_committees': [1, 3, 5, 10]
}

# Base estimator and randomized search (5 sampled configurations, 3-fold CV)
cbst = Cubist()
cbst_random = RandomizedSearchCV(estimator=cbst,
                                 param_distributions=random_grid,
                                 n_iter=5,
                                 cv=3,
                                 verbose=2,
                                 random_state=0,
                                 n_jobs=-1)

# Create or fetch the MLflow experiment
model_name = 'Cubist_10percent'
experiment_name = model_name
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is not None:
    experiment_id = existing_experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment(experiment_name)

# Train and evaluate Cubist with the randomized search
with mlflow.start_run(experiment_id=experiment_id):
    # Run the search and keep the best estimator found
    cbst_random.fit(X_train, y_train)
    best_model = cbst_random.best_estimator_
    y_pred = best_model.predict(X_test)

    # Compute the metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log the selected hyperparameters and the metrics in MLflow
    mlflow.log_params(cbst_random.best_params_)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Log the model
    mlflow.sklearn.log_model(best_model, "model")

    # Print the results. score() receives the true targets; scoring against
    # y_pred compares the predictions with themselves and always yields 1.0.
    print(f'{model_name} best score: {best_model.score(X_test, y_test):.03f}', end=" ")
    print(f'Mean Squared Error (MSE): {mse:.03f}', end=" ")
    print(f'Root Mean Squared Error (RMSE): {rmse:.03f}', end=" ")
    print(f'Mean Absolute Error (MAE): {mae:.03f}', end=" ")
    print(f'R^2 Score: {r2:.03f}', end=" ")
    print("")


In [26]:
# Best estimator found by the randomized search, plus its score on both splits
cbst_random_best = cbst_random.best_estimator_
cbst_best_params = cbst_random.best_params_

print('Los hiperparámetros del mejor modelo son:')
pprint(cbst_best_params)
print()

cbst_train_score = cbst_random_best.score(X_train, y_train)
cbst_test_score = cbst_random_best.score(X_test, y_test)
print('Exactitud luego de búsqueda aleatoria en entrenamiento:', cbst_train_score)
print('Exactitud luego de búsqueda aleatoria en validación:', cbst_test_score)

## Uso de la librería OPTUNA para encontrar los mejores hiperparámetros

Estimamos los mejores hiperparámetros usando la librería OPTUNA, sugerida en el planteamiento del proyecto.

In [27]:
import optuna
import optuna.visualization as vis

# Optuna needs an objective function; the study is later told to minimize the
# value it returns (here the mean absolute error).
def objective(trial):
  """Fit a Cubist model with trial-suggested hyperparameters and return its MAE.

  Parameters
  ----------
  trial : object providing ``suggest_int`` / ``suggest_float`` (an Optuna trial).

  Returns
  -------
  The mean absolute error of the fitted model's predictions for ``X_test``
  against ``y_test``. Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test``
  from the notebook namespace.
  """
  # Let Optuna suggest the hyperparameters for this trial
  n_rules = trial.suggest_int("n_rules", 200, 1000)
  n_committees = trial.suggest_int("n_committees", 1, 15)
  neighbors = trial.suggest_int("neighbors", 1, 9)
  extrapolation = trial.suggest_float("extrapolation",0.01, 0.09)

  # Cubist model built from the suggested hyperparameters
  cbst = Cubist(n_rules= n_rules, n_committees=n_committees, neighbors=neighbors, extrapolation=extrapolation)
  cbst.fit(X_train, y_train)

  # Predict with the model fitted in THIS trial. The previous code predicted with
  # the unrelated global `model`, so every trial returned the same value.
  y_pred = cbst.predict(X_test)

  # Return MAE
  return mean_absolute_error(y_test, y_pred)

In [28]:
# Create the "study" object, as Optuna calls it; the objective value is minimized
study = optuna.create_study(direction="minimize")

# Run the optimization itself, passing the objective function defined earlier
N_TRIALS = 10
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

In [29]:
# Report the outcome of the Optuna study
best_trial = study.best_trial
best_hyperparameters = study.best_params
print("Best trial:", best_trial)
print("Best hyperparameters:", best_hyperparameters)

In [30]:
import plotly
import plotly.io as pio

import plotly.graph_objs as go
# Useful plots from Optuna's visualization module.
# A notebook cell only renders its last expression automatically, so each figure
# is shown explicitly; the placeholder bar chart used to test plotly was removed.

# Optimization history
vis.plot_optimization_history(study).show()

# Slice plot
#vis.plot_slice(study, params=["n_rules", "n_committees"])

# Contour plot
#vis.plot_contour(study, params=["neighbors", "extrapolation"])

# Parallel-coordinate plot
vis.plot_parallel_coordinate(study).show()

In [31]:
import plotly.graph_objs as go
# Optimization history of the study, rendered as the cell's output.
# The placeholder bar chart that preceded it was leftover plotly test code.
optuna.visualization.plot_optimization_history(study)