O objetivo deste notebook é avaliar, a partir dos resultados do PyCaret, os possíveis modelos a serem testados durante a modelagem.

OBS: Recomenda-se criar um ambiente virtual Python exclusivo para instalar o PyCaret, a fim de evitar conflitos com as versões dos pacotes deste projeto.

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import logging
import pycaret.regression as reg

# importando pacote de utilidade do projeto
import util
from class_manipulate_data import ManipulateData

In [6]:
# region: logger configuration
# Module-level logger. The logger itself accepts DEBUG and above, while the
# console handler only emits INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
console_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler()
console_handler.setFormatter(console_format)
console_handler.setLevel(logging.INFO)
# logging.getLogger returns the same logger object for the same name, so
# executing this cell again would stack another StreamHandler on it and every
# message would be printed once per execution. Attach the handler only when
# the logger does not have one yet.
if not logger.handlers:
    logger.addHandler(console_handler)
# endregion

In [4]:
# Project helper object (imported from class_manipulate_data).
manipulate_data = ManipulateData()
# Base path later joined with the train/test CSV file names.
# NOTE(review): presumably the folder written by the pre-processing step - confirm in ManipulateData.
path_preprocessing_output = manipulate_data.get_path_preprocessing_output()

In [7]:
logger.info("Definindo as entradas, a saída e o equipamento.")

# Model inputs: the 'time' column, followed by setting_1..setting_3 and
# sensor_1..sensor_21, in that order.
setting_columns = [f'setting_{idx}' for idx in range(1, 4)]
sensor_columns = [f'sensor_{idx}' for idx in range(1, 22)]
input_model = ['time', *setting_columns, *sensor_columns]

# Single output column, used as the regression target.
output_model = ['RUL']

# Identifier used to build the train/test CSV file names.
equipment_name = 'FD001'

__main__ - INFO - Definindo as entradas, a saída e o equipamento.


In [8]:
# Training data: <preprocessing output>/train_<equipment>.csv
logger.info("Lendo os dados de treino.")
train_file_name = f"train_{equipment_name}.csv"
path_dataset_train = os.path.join(path_preprocessing_output, train_file_name)
df_train = pd.read_csv(path_dataset_train)

# Test data: <preprocessing output>/test_<equipment>.csv
logger.info("Lendo os dados de teste.")
test_file_name = f"test_{equipment_name}.csv"
path_dataset_test = os.path.join(path_preprocessing_output, test_file_name)
df_test = pd.read_csv(path_dataset_test)

__main__ - INFO - Lendo os dados de treino.
__main__ - INFO - Lendo os dados de teste.


In [9]:
# Concatenate the train and test data into a single frame.

# "unit_number" is not one of the model inputs, so it is removed from both frames.
df_train = df_train.drop(columns=["unit_number"])
df_test = df_test.drop(columns=["unit_number"])

# Both frames were read with the default 0..n-1 index, so a plain concat would
# repeat index labels (0, 1, 2, ... appear once per frame). ignore_index=True
# gives the combined frame a fresh, unique index, which keeps label-based
# selection and the later train/test split unambiguous.
df_data = pd.concat([df_train, df_test], ignore_index=True)

In [10]:
# Keep only the model inputs followed by the target column, in that order.
selected_columns = input_model + output_model
df_data = df_data[selected_columns]

# Show the first rows as the cell output.
df_data.head()

Unnamed: 0,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1.0,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419,191.0
1,2.0,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392.0,2388.0,100.0,39.0,23.4236,190.0
2,3.0,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390.0,2388.0,100.0,38.95,23.3442,189.0
3,4.0,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392.0,2388.0,100.0,38.88,23.3739,188.0
4,5.0,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393.0,2388.0,100.0,38.9,23.4044,187.0


In [11]:
# Initialise the PyCaret regression experiment on the combined data:
# 80% of the rows go to the training set and features are z-score normalised.
#
# session_id fixes the random seed. Without it PyCaret draws a new seed on
# every run, so the split and the comparison table below change between
# executions. 576 is the seed reported by the recorded run of this notebook.
#
# NOTE(review): the recorded setup summary reports one categorical feature
# although every input appears to be a numeric reading; consider passing
# numeric_features=input_model - confirm against the PyCaret type inference.
reg1 = reg.setup(df_data,
                 target=output_model[0],
                 train_size=0.8,
                 normalize=True,
                 normalize_method='zscore',
                 session_id=576,
                 experiment_name='Turbofan')

Unnamed: 0,Description,Value
0,session_id,576
1,Target,RUL
2,Original Data,"(33727, 26)"
3,Missing Values,False
4,Numeric Features,24
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(26981, 25)"


In [12]:
# Cross-validate the available regressors (10 folds), rank them by RMSE and
# keep the five best; the 'huber' estimator is left out of the comparison.
best_model = reg.compare_models(
    n_select=5,
    sort='RMSE',
    fold=10,
    exclude=['huber'],
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,27.3317,1426.8887,37.7658,0.6872,0.3388,0.2895,0.132
gbr,Gradient Boosting Regressor,27.5472,1433.0701,37.8466,0.6859,0.3517,0.3039,0.883
et,Extra Trees Regressor,27.5808,1438.7386,37.9224,0.6846,0.332,0.29,1.801
rf,Random Forest Regressor,27.6245,1448.3938,38.0492,0.6825,0.3338,0.2897,3.301
br,Bayesian Ridge,32.1167,1719.5964,41.4606,0.623,0.5354,0.5844,0.051
lr,Linear Regression,32.1189,1719.6194,41.4609,0.623,0.5356,0.5845,0.712
ridge,Ridge Regression,32.1189,1719.6186,41.4609,0.623,0.5356,0.5845,0.02
lar,Least Angle Regression,32.1189,1719.6194,41.4609,0.623,0.5356,0.5845,0.021
lasso,Lasso Regression,32.1156,1722.1374,41.4907,0.6225,0.5295,0.5658,0.292
knn,K Neighbors Regressor,30.7984,1790.1821,42.3051,0.6075,0.3626,0.3192,0.488


In [13]:
# Display the first of the selected models, i.e. the top entry of the RMSE ranking.
best_model[0]

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=576, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# Conclusão

Pode-se observar que os modelos de Boosting e de árvore (LightGBM, Gradient Boosting, Extra Trees e Random Forest) foram os melhores em relação às métricas da regressão.