In [1]:
import warnings
# Suppress every warning for the rest of the session (no category filter is given,
# so deprecation and convergence warnings are hidden as well).
warnings.filterwarnings('ignore')

In [2]:
# !pip install ucimlrepo shap --quiet
# !pip install --upgrade cubist --quiet
# !pip install --upgrade optuna --quiet

In [3]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import shap
from cubist import Cubist

# Fetch the dataset registered under id 851 in the UCI ML repository
# (presumably requires network access — TODO confirm whether ucimlrepo caches it).
steel_industry_energy_consumption = fetch_ucirepo(id=851)

In [4]:
# Build the working DataFrame from the dataset's `data.original` attribute.
df = pd.DataFrame(steel_industry_energy_consumption.data.original)

In [5]:
# Peek at 10 randomly drawn rows (no seed is passed, so the rows differ between runs).
df.sample(n=10)

In [6]:
# Parse the 'date' column into datetimes using the day/month/year hour:minute format.
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y %H:%M')

In [7]:
# Display the dataset's `variables` attribute as a table.
pd.DataFrame(steel_industry_energy_consumption.variables)

In [8]:
# Inspect column dtypes after the 'date' column was converted to datetime.
df.dtypes

In [9]:
# Derive one new column per calendar/clock component of the timestamp.
stamp = df['date'].dt
for part in (
    'year', 'month', 'day', 'hour', 'minute', 'second',
    'dayofweek',  # Monday=0, Sunday=6
    'dayofyear',
):
    df[part] = getattr(stamp, part)
# The ISO week number comes from isocalendar() rather than a plain accessor attribute.
df['weekofyear'] = stamp.isocalendar().week
df['quarter'] = stamp.quarter

In [10]:
# One-hot encode the "Day_of_week" and "Load_Type" columns; drop_first=True omits
# the first level of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(df, columns=["Day_of_week", "Load_Type"], drop_first=True)

In [11]:
# List the column names after one-hot encoding.
df.columns.to_list()

In [12]:
# Boolean flag: True where the WeekStatus column holds the label 'Weekend'.
df['IsWeekend'] = df['WeekStatus'].eq('Weekend')

In [13]:
# Remove the raw timestamp and the WeekStatus label; both were replaced by derived columns.
df = df.drop(columns=['date', 'WeekStatus'])

In [14]:
# Inspect the dtypes of the final feature table before modelling.
df.dtypes

In [15]:
# Separate predictors from the target: X is every column except 'Usage_kWh', y is 'Usage_kWh'.
# (The former `X = df` line was a dead assignment, overwritten on the next line.)
X = df.drop(columns='Usage_kWh')
y = df['Usage_kWh']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline model: linear regression fitted on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Evaluation metrics on the held-out split: MSE, RMSE, MAE and R^2.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(
    f'Mean Squared Error (MSE): {mse}',
    f'Root Mean Squared Error (RMSE): {rmse}',
    f'Mean Absolute Error (MAE): {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Agregamos los modelos a utilizar: Árbol de Decisión, Random Forest, XGBoost Regressor y Cubist.

In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Candidate regressors to benchmark, as (display name, estimator) pairs.
modelos = [
    ('LinearRegression', LinearRegression()),
    ('DecisionTreeRegressor', DecisionTreeRegressor(max_depth=2)),
    # random_state is fixed so the forest (and its metrics) are repeatable between runs.
    ('RandomForestRegressor', RandomForestRegressor(min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1, oob_score=True, random_state=42, verbose=0, warm_start=False)),
    ('XGBRegressor', xgb.XGBRegressor(learning_rate=0.01, n_estimators=500, max_depth=5, eval_metric='rmsle')),
    ('Cubist', Cubist(n_rules=500, neighbors=None, unbiased=True, auto=False, extrapolation=0.1, n_committees=5)),
]


# Fit every candidate on the training split and report its metrics on the held-out split.
for nombre, model in modelos:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)

  # Score against the true targets: scoring against y_pred compares the model with
  # its own predictions and therefore always reports a perfect 1.0.
  score = model.score(X_test, y_test)

  print(
      f'{nombre} score: {score:.03f}',
      f'Mean Squared Error (MSE): {mse:.03f}',
      f'Root Mean Squared Error (RMSE): {rmse:.03f}',
      f'Mean Absolute Error (MAE): {mae:.03f}',
      f'R^2 Score: {r2:.03f}',
  )

Buscamos mejores valores de hiperparámetros para los dos modelos con mejores resultados: XGBRegressor y Cubist.
1. XGBRegressor

In [18]:
from pprint import pprint
import numpy as np

# Candidate values for each XGBRegressor hyperparameter explored by the random search.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
max_leaves = [0, 8, 10, 12, 16, 20]  # 0 is equivalent to 'no limit'
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]


random_grid = {
    'n_estimators': n_estimators,
    'max_leaves': max_leaves,
    'max_depth': max_depth,
}

print('Los valores a probar en la búsqueda aleatoria son:')
pprint(random_grid)

print()
# Size of the full grid: the product of the number of candidates per hyperparameter.
n_combinations = 1
for candidates in random_grid.values():
    n_combinations *= len(candidates)
print('El número total de combinaciones de parámetros de entrenamiento es',
      n_combinations)

Usaremos la recomendación de explorar, mediante búsqueda aleatoria, alrededor del 10% de las combinaciones posibles.

In [19]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for XGBRegressor over `random_grid`.
# NOTE(review): n_iter=20 samples 20 candidates; confirm this matches the intended
# share of the grid mentioned in the accompanying text.
xgbb = xgb.XGBRegressor()
xgb_search_kwargs = dict(
    n_iter=20,
    cv=3,            # 3-fold cross-validation
    verbose=2,
    random_state=0,
    n_jobs=-1,       # parallelize across all available cores
)
xgb_random = RandomizedSearchCV(xgbb, random_grid, **xgb_search_kwargs)
xgb_random.fit(X_train, y_train)

In [20]:
# Best estimator found by the search (exposed by the search object as `best_estimator_`).
xgb_random_best = xgb_random.best_estimator_

print('Los hiperparámetros del mejor modelo son:')
pprint(xgb_random.best_params_)
print()

# The estimator's score() is evaluated on the training split and on the held-out split.
xgb_train_score = xgb_random_best.score(X_train, y_train)
xgb_test_score = xgb_random_best.score(X_test, y_test)
print('Exactitud luego de búsqueda aleatoria en entrenamiento:', xgb_train_score)
print('Exactitud luego de búsqueda aleatoria en validación:', xgb_test_score)

2. Búsqueda de hiperparámetros para el método Cubist

In [21]:
from pprint import pprint
import numpy as np

# Hyperparameters considered (defaults as noted by the original author):
# n_rules (int, default=500)
# n_committees (int, default=0)
# neighbors (int, default=None)
# extrapolation (float, default=0.05)

n_rules = [int(x) for x in np.linspace(start=200, stop=1000, num=10)]
n_committees = [1, 2, 5, 10, 15]  # 5 is the value recommended in the documentation (author's note)
# Every integer from 1 to 9. The previous np.linspace(1, 9, num=1) produced only [1],
# so the search never varied this hyperparameter.
neighbors = list(range(1, 10))
extrapolation = [0.01, 0.03, 0.05, 0.07, 0.09]  # 0.05 = 5% is the recommended value (author's note)


random_grid = {'n_rules': n_rules,
               'n_committees': n_committees,
               'neighbors': neighbors,
               'extrapolation': extrapolation}

print('Los valores a probar en la búsqueda aleatoria son:')
pprint(random_grid)

print()
# Size of the full grid: the product of the number of candidates per hyperparameter.
print('El número total de combinaciones de parámetros de entrenamiento es',
      len(random_grid['n_rules']) *
      len(random_grid['n_committees']) *
      len(random_grid['neighbors']) *
      len(random_grid['extrapolation'])
      )

In [22]:
# Randomized hyperparameter search for Cubist over `random_grid`.
# The original author's note says this follows the recommendation of trying 10% of the grid.
# NOTE(review): n_iter=5 samples only five candidates; confirm this matches that intent.

cbst = Cubist()
cbst_search_kwargs = dict(
    n_iter=5,
    cv=3,            # 3-fold cross-validation
    verbose=2,
    random_state=0,
    n_jobs=-1,       # parallelize across all available cores
)
cbst_random = RandomizedSearchCV(cbst, random_grid, **cbst_search_kwargs)
cbst_random.fit(X_train, y_train)

In [23]:
# Best estimator found by the search (exposed by the search object as `best_estimator_`).
cbst_random_best = cbst_random.best_estimator_

print('Los hiperparámetros del mejor modelo son:')
pprint(cbst_random.best_params_)
print()

# The estimator's score() is evaluated on the training split and on the held-out split.
cbst_train_score = cbst_random_best.score(X_train, y_train)
cbst_test_score = cbst_random_best.score(X_test, y_test)
print('Exactitud luego de búsqueda aleatoria en entrenamiento:', cbst_train_score)
print('Exactitud luego de búsqueda aleatoria en validación:', cbst_test_score)

**Uso de la librería OPTUNA para encontrar los mejores hiperparámetros**

Estimamos los mejores hiperparámetros usando la librería Optuna, sugerida en el planteamiento del proyecto.

In [24]:
import optuna
import optuna.visualization as vis

# Optuna needs an objective function; the study is then told to minimize its return
# value, which here is the mean absolute error (MAE).
def objective(trial):
  """Train a Cubist model with trial-suggested hyperparameters and return its MAE.

  Args:
    trial: Optuna trial object used to sample `n_rules`, `n_committees`,
      `neighbors` and `extrapolation`.

  Returns:
    Mean absolute error of the fitted model's predictions on `X_test` / `y_test`.
  """
  # Let Optuna suggest the hyperparameter values for this trial.
  n_rules = trial.suggest_int("n_rules", 200, 1000)
  n_committees = trial.suggest_int("n_committees", 1, 15)
  neighbors = trial.suggest_int("neighbors", 1, 9)
  extrapolation = trial.suggest_float("extrapolation", 0.01, 0.09)

  # Build and fit the Cubist model under evaluation.
  cbst = Cubist(n_rules=n_rules, n_committees=n_committees, neighbors=neighbors, extrapolation=extrapolation)
  cbst.fit(X_train, y_train)

  # Predict with the model fitted in THIS trial. Using the notebook-level `model`
  # variable here would make every trial return the same value.
  # NOTE(review): hyperparameters are tuned against the same split used for the final
  # evaluation; consider a separate validation split or cross-validation.
  y_pred = cbst.predict(X_test)

  return mean_absolute_error(y_test, y_pred)

In [25]:
# Create the Optuna "study" object; direction="minimize" because the objective returns an error metric.
study = optuna.create_study(direction="minimize")

# Run the optimization itself for 10 trials, passing the objective function defined earlier.
study.optimize(objective, n_trials=10, show_progress_bar=True)

In [26]:
# Print the outcome of the Optuna study: the best trial and its hyperparameters.
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

In [27]:
# Some useful plots from Optuna's visualization module.
# Optimization history. Only a cell's last expression is rendered automatically,
# so this figure is shown explicitly instead of being silently discarded.
vis.plot_optimization_history(study).show()

# Slice plot
#vis.plot_slice(study, params=["n_rules", "n_committees"])

# Contour plot
#vis.plot_contour(study, params=["neighbors", "extrapolation"])

# Parallel-coordinate plot (rendered as the cell's output)
vis.plot_parallel_coordinate(study)

In [28]:
# Render the optimization history as this cell's output.
optuna.visualization.plot_optimization_history(study)