In [1]:
!pip install ucimlrepo shap --quiet
!pip install --upgrade cubist --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.5/540.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m582.0/582.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import shap
from cubist import Cubist

# fetch dataset
steel_industry_energy_consumption = fetch_ucirepo(id=851)

In [3]:
df = pd.DataFrame(steel_industry_energy_consumption.data.original)

In [4]:
df.sample(n=10)

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
34394,25/12/2018 06:45,3.2,2.3,0.07,0.0,81.2,99.98,24300,Weekday,Tuesday,Light_Load
3581,07/02/2018 07:30,6.08,5.58,0.0,0.0,73.68,100.0,27000,Weekday,Wednesday,Light_Load
24678,15/09/2018 01:45,2.99,5.8,0.0,0.0,45.82,100.0,6300,Weekend,Saturday,Light_Load
33315,14/12/2018 01:00,5.29,2.88,0.0,0.0,87.83,100.0,3600,Weekday,Friday,Light_Load
29390,03/11/2018 03:45,3.2,4.86,0.0,0.0,54.99,100.0,13500,Weekend,Saturday,Light_Load
28337,23/10/2018 04:30,3.02,5.83,0.0,0.0,46.0,100.0,16200,Weekday,Tuesday,Light_Load
28836,28/10/2018 09:15,3.67,0.0,15.66,0.0,100.0,22.82,33300,Weekend,Sunday,Light_Load
4894,20/02/2018 23:45,97.92,46.91,0.0,0.04,90.19,100.0,85500,Weekday,Tuesday,Light_Load
21549,13/08/2018 11:30,81.54,47.16,0.0,0.04,86.56,100.0,41400,Weekday,Monday,Maximum_Load
31040,20/11/2018 08:15,25.88,17.21,0.36,0.01,83.27,99.99,29700,Weekday,Tuesday,Light_Load


In [5]:
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y %H:%M')

In [6]:
pd.DataFrame(steel_industry_energy_consumption.variables)

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,date,Other,Date,,,,no
1,Usage_kWh,Feature,Continuous,,Industry Energy Consumption,kWh,no
2,Lagging_Current_Reactive.Power_kVarh,Feature,Continuous,,,kVarh,no
3,Leading_Current_Reactive_Power_kVarh,Feature,Continuous,,,kVarh,no
4,CO2(tCO2),Feature,Continuous,,,ppm,no
5,Lagging_Current_Power_Factor,Feature,Continuous,,,%,no
6,Leading_Current_Power_Factor,Feature,Continuous,,,%,no
7,NSM,Feature,Integer,,,s,no
8,WeekStatus,Feature,Categorical,,Weekend (0) or a Weekday(1),,no
9,Day_of_week,Feature,Categorical,,"Sunday, Monday, ..., Saturday",,no


In [7]:
df.dtypes

date                                    datetime64[ns]
Usage_kWh                                      float64
Lagging_Current_Reactive.Power_kVarh           float64
Leading_Current_Reactive_Power_kVarh           float64
CO2(tCO2)                                      float64
Lagging_Current_Power_Factor                   float64
Leading_Current_Power_Factor                   float64
NSM                                              int64
WeekStatus                                      object
Day_of_week                                     object
Load_Type                                       object
dtype: object

In [8]:
# Crear nuevas columnas descomponiendo la fecha y hora en sus componentes
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
df['second'] = df['date'].dt.second
df['dayofweek'] = df['date'].dt.dayofweek  # Lunes=0, Domingo=6
df['dayofyear'] = df['date'].dt.dayofyear
df['weekofyear'] = df['date'].dt.isocalendar().week
df['quarter'] = df['date'].dt.quarter

In [9]:
df = pd.get_dummies(df, columns=["Day_of_week", "Load_Type"], drop_first=True)

In [10]:
df.columns.to_list()

['date',
 'Usage_kWh',
 'Lagging_Current_Reactive.Power_kVarh',
 'Leading_Current_Reactive_Power_kVarh',
 'CO2(tCO2)',
 'Lagging_Current_Power_Factor',
 'Leading_Current_Power_Factor',
 'NSM',
 'WeekStatus',
 'year',
 'month',
 'day',
 'hour',
 'minute',
 'second',
 'dayofweek',
 'dayofyear',
 'weekofyear',
 'quarter',
 'Day_of_week_Monday',
 'Day_of_week_Saturday',
 'Day_of_week_Sunday',
 'Day_of_week_Thursday',
 'Day_of_week_Tuesday',
 'Day_of_week_Wednesday',
 'Load_Type_Maximum_Load',
 'Load_Type_Medium_Load']

In [11]:
df['IsWeekend'] = df['WeekStatus'] == 'Weekend'

In [12]:
df = df.drop(['date', 'WeekStatus'], axis=1)

In [13]:
df.dtypes

Usage_kWh                               float64
Lagging_Current_Reactive.Power_kVarh    float64
Leading_Current_Reactive_Power_kVarh    float64
CO2(tCO2)                               float64
Lagging_Current_Power_Factor            float64
Leading_Current_Power_Factor            float64
NSM                                       int64
year                                      int32
month                                     int32
day                                       int32
hour                                      int32
minute                                    int32
second                                    int32
dayofweek                                 int32
dayofyear                                 int32
weekofyear                               UInt32
quarter                                   int32
Day_of_week_Monday                         bool
Day_of_week_Saturday                       bool
Day_of_week_Sunday                         bool
Day_of_week_Thursday                    

In [14]:
X = df
X = df.drop('Usage_kWh', axis=1)
y = df['Usage_kWh']

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear el modelo de regresión lineal
model = LinearRegression()

# Entrenar el modelo
model.fit(X_train, y_train)

# Realizar predicciones
y_pred = model.predict(X_test)

# Evaluar el modelo
# Calcular MSE
mse = mean_squared_error(y_test, y_pred)

# Calcular RMSE
rmse = np.sqrt(mse)

# Calcular MAE
mae = mean_absolute_error(y_test, y_pred)

# Calcular R^2
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R^2 Score: {r2}')

Mean Squared Error (MSE): 17.717200552024053
Root Mean Squared Error (RMSE): 4.209180508367877
Mean Absolute Error (MAE): 2.595219175169306
R^2 Score: 0.984413783416719


Agregamos modelos a utilizar, Arbol de Decisión, RandomForest, XGBoost Regressor, CUBIST

In [40]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

modelos = [('LinearRegression',LinearRegression()),
    ('DecisionTreeRegressor', DecisionTreeRegressor(max_depth=2)),
    ('RandomForestRegressor', RandomForestRegressor(min_samples_split=2, min_weight_fraction_leaf=0.0,n_estimators=40, n_jobs=-1, oob_score=True,random_state=None, verbose=0, warm_start=False)),
    ('XGBRegressor', xgb.XGBRegressor(learning_rate=0.01,n_estimators=500,max_depth=5,eval_metric='rmsle')),
    ('Cubist', Cubist(n_rules=500,neighbors=None,unbiased=True,auto=False,extrapolation=0.1,n_committees=5))
]


#procedimiento para comparar los modelos
for nombre, model in modelos:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)

  print(f'{nombre}," score: ",{model.score(X_test,y_pred):.03f}',end=" ")
  print(f'Mean Squared Error (MSE): {mse:.03f}',end=" ")
  print(f'Root Mean Squared Error (RMSE): {rmse:.03f}',end=" ")
  print(f'Mean Absolute Error (MAE): {mae:.03f}',end=" ")
  print(f'R^2 Score: {r2:.03f}',end=" ")
  print("")

LinearRegression," score: ",1.000 Mean Squared Error (MSE): 17.717 Root Mean Squared Error (RMSE): 4.209 Mean Absolute Error (MAE): 2.595 R^2 Score: 0.984 
DecisionTreeRegressor," score: ",1.000 Mean Squared Error (MSE): 65.380 Root Mean Squared Error (RMSE): 8.086 Mean Absolute Error (MAE): 4.597 R^2 Score: 0.942 
RandomForestRegressor," score: ",1.000 Mean Squared Error (MSE): 1.258 Root Mean Squared Error (RMSE): 1.122 Mean Absolute Error (MAE): 0.393 R^2 Score: 0.999 
XGBRegressor," score: ",1.000 Mean Squared Error (MSE): 5.291 Root Mean Squared Error (RMSE): 2.300 Mean Absolute Error (MAE): 1.352 R^2 Score: 0.995 
Cubist," score: ",1.000 Mean Squared Error (MSE): 0.115 Root Mean Squared Error (RMSE): 0.339 Mean Absolute Error (MAE): 0.076 R^2 Score: 1.000 


Buscamos mejores valores de hiperparámetros para los 2 modelos con mejores resultados, XGBRegressor y Cubist
1. XGBRegressor

In [35]:
from pprint import pprint
import numpy as np

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_leaves = [0, 8, 10, 12, 16, 20]  # 0 equivale a 'no limit'
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] + [None]


random_grid = {'n_estimators': n_estimators,
               'max_leaves': max_leaves,
               'max_depth': max_depth}

print('Los valores a probar en la búsqueda aleatoria son:')
pprint(random_grid)

print()
print('El número total de combinaciones de parámetros de entrenamiento es',
      len(random_grid['n_estimators']) *
      len(random_grid['max_leaves']) *
      len(random_grid['max_depth'])
      )

Los valores a probar en la búsqueda aleatoria son:
{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_leaves': [0, 8, 10, 12, 16, 20],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

El número total de combinaciones de parámetros de entrenamiento es 720


Usaremos la recomendación de busqueda con el 10%

In [46]:
from sklearn.model_selection import RandomizedSearchCV

xgbb = xgb.XGBRegressor()
xgb_random = RandomizedSearchCV(estimator = xgbb,
                               param_distributions = random_grid,
                               n_iter = 20,
                               cv = 3,          # Validación cruzada 3-fold
                               verbose=2,
                               random_state=0,
                               n_jobs = -1      # Paralelizar en todos los cores disponibles
                               )
xgb_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "extrapolation", "n_committees", "n_rules", "neighbors" } are not used.



In [47]:
xgb_random_best = xgb_random.best_estimator_

print('Los hiperparámetros del mejor modelo son:')
pprint(xgb_random.best_params_)
print()

print('Exactitud luego de búsqueda aleatoria en entrenamiento:', xgb_random_best.score(X_train, y_train))
print('Exactitud luego de búsqueda aleatoria en validación:', xgb_random_best.score(X_test, y_test))

Los hiperparámetros del mejor modelo son:
{'extrapolation': 0.09, 'n_committees': 5, 'n_rules': 644, 'neighbors': 1}

Exactitud luego de búsqueda aleatoria en entrenamiento: 0.9996738076101616
Exactitud luego de búsqueda aleatoria en validación: 0.9989899659844007


2. Búsqueda de hiperparámetros para el método Cubist

In [51]:
from pprint import pprint
import numpy as np

###Hiperparámetros a considerar
# n_rules (int, default=500)
# n_committees (int, default=0):
# neighbors (int, default=None)
# extrapolation (float, default=0.05):

n_rules = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
n_committees = [1, 2, 5, 10, 15]  # 5 es el recomendado en la documentación
neighbors = [int(x) for x in np.linspace(1, 9, num = 1)]
extrapolation = [0.01, 0.03, 0.05, 0.07, 0.09] #0,05 = 5% es el recomendado


random_grid = {'n_rules': n_rules,
               'n_committees': n_committees,
               'neighbors': neighbors,
               'extrapolation':extrapolation}

print('Los valores a probar en la búsqueda aleatoria son:')
pprint(random_grid)

print()
print('El número total de combinaciones de parámetros de entrenamiento es',
      len(random_grid['n_rules']) *
      len(random_grid['n_committees']) *
      len(random_grid['neighbors']) *
      len(random_grid['extrapolation'])
      )

Los valores a probar en la búsqueda aleatoria son:
{'extrapolation': [0.01, 0.03, 0.05, 0.07, 0.09],
 'n_committees': [1, 2, 5, 10, 15],
 'n_rules': [200, 288, 377, 466, 555, 644, 733, 822, 911, 1000],
 'neighbors': [1]}

El número total de combinaciones de parámetros de entrenamiento es 250


In [52]:
#usamos la recomendación de probar con el 10%

cbst = Cubist()
cbst_random = RandomizedSearchCV(estimator = cbst,
                               param_distributions = random_grid,
                               n_iter = 20,
                               cv = 3,          # Validación cruzada 3-fold
                               verbose=2,
                               random_state=0,
                               n_jobs = -1      # Paralelizar en todos los cores disponibles
                               )
cbst_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [53]:
cbst_random_best = cbst_random.best_estimator_

print('Los hiperparámetros del mejor modelo son:')
pprint(cbst_random.best_params_)
print()

print('Exactitud luego de búsqueda aleatoria en entrenamiento:', cbst_random_best.score(X_train, y_train))
print('Exactitud luego de búsqueda aleatoria en validación:', cbst_random_best.score(X_test, y_test))

Los hiperparámetros del mejor modelo son:
{'extrapolation': 0.09, 'n_committees': 10, 'n_rules': 288, 'neighbors': 1}

Exactitud luego de búsqueda aleatoria en entrenamiento: 0.999999999999999
Exactitud luego de búsqueda aleatoria en validación: 0.9999135268905981
