Objetivo desse notebook é avaliar, a partir dos resultados do Pycaret, os possíveis modelos a serem testados durante a modelagem.

OBS: Recomenda-se criar um ambiente virtual exclusivo para instalar o Pycaret para evitar conflitos com as versões deste projeto.

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import logging
import pycaret.regression as reg

In [2]:
# region: parâmetros necessários para uso do logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
console_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler()
console_handler.setFormatter(console_format)
console_handler.setLevel(logging.INFO)
logger.addHandler(console_handler)
# endregion
base_path = sys.path[0]

path_manipulate_data = os.path.abspath(os.path.join(base_path, "..", "0_utils"))

sys.path.append(str(path_manipulate_data))

from class_manipulate_data import ManipulateData

manipulate_data = ManipulateData()
path_preprocessing_output = manipulate_data.get_path_preprocessing_output()

In [3]:
logger.info("Definindo as entradas, a saída e o equipamento.")

input_model = ['setting_1', 'setting_2', 'sensor_2', 'sensor_3',
       'sensor_4', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_11',
       'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17',
       'sensor_20', 'sensor_21']

output_model = ['RUL']

equipment_name = 'FD001'

__main__ - INFO - Definindo as entradas, a saída e o equipamento.


In [4]:
logger.info("Lendo os dados de treino.")
path_dataset_train = os.path.join(path_preprocessing_output, f"train_{equipment_name}.csv")

df_train = pd.read_csv(path_dataset_train)

logger.info("Lendo os dados de teste.")
path_dataset_test = os.path.join(path_preprocessing_output, f"test_{equipment_name}.csv")

df_test = pd.read_csv(path_dataset_test)

__main__ - INFO - Lendo os dados de treino.
__main__ - INFO - Lendo os dados de teste.


In [5]:
# concateando os dados de treino e teste

df_train = df_train.drop(columns=["unit_number", "time"])
df_test = df_test.drop(columns=["unit_number", "time"])

df_data = pd.concat([df_train, df_test])

In [6]:
df_data = df_data[input_model + output_model]

In [7]:
reg1 = reg.setup(df_data,
                 target=output_model[0],
                 train_size=0.8,
                 normalize=True,
                 normalize_method='zscore',
                 experiment_name='Turbofan')

Unnamed: 0,Description,Value
0,session_id,4497
1,Target,RUL
2,Original Data,"(33727, 18)"
3,Missing Values,False
4,Numeric Features,16
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(26981, 17)"


In [8]:
best_model = reg.compare_models(fold=10, sort='RMSE', exclude = ['huber'], n_select=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,31.2099,1842.2973,42.9182,0.5918,0.3779,0.3355,0.117
gbr,Gradient Boosting Regressor,31.4162,1846.6454,42.9683,0.5909,0.3955,0.3515,0.855
et,Extra Trees Regressor,31.2847,1852.3154,43.0359,0.5896,0.3717,0.3347,1.634
rf,Random Forest Regressor,31.4687,1870.8277,43.2503,0.5855,0.3742,0.3363,2.939
br,Bayesian Ridge,35.4959,2111.9014,45.9522,0.5321,0.5393,0.6752,0.033
lr,Linear Regression,35.4956,2111.9439,45.9527,0.5321,0.5395,0.6758,0.57
ridge,Ridge Regression,35.4956,2111.9433,45.9527,0.5321,0.5395,0.6758,0.017
lar,Least Angle Regression,35.4956,2111.9438,45.9527,0.5321,0.5395,0.6758,0.019
lasso,Lasso Regression,35.5342,2114.3386,45.9786,0.5316,0.5354,0.6519,0.225
en,Elastic Net,35.7541,2134.3492,46.1955,0.5272,0.5226,0.6068,0.02


In [9]:
best_model[0]

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=4497, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# Conclusão

Pode-se observar que os modelos de Boosting e de árvore foram os melhores em relação as métricas da regressão.