In [1]:
import os
import pandas as pd
import numpy as np
import datetime

import pycaret
from pycaret.classification import *

In [2]:
data = pd.read_csv('../src/data/final_data/final_data.csv')

In [3]:
train_size = 0.8
session_id = 2022
target = 'State'
experiment_name='predictive_maintenance'

In [4]:
timer = datetime.datetime.now()
experiment = setup(data=data, target=target, session_id=session_id, train_size=train_size, log_experiment = True, experiment_name=experiment_name, fold_shuffle=True, ignore_features=['unit_number', 'RUL', 'cycles'], ignore_low_variance=False, normalize=True)

Unnamed: 0,Description,Value
0,session_id,2022
1,Target,State
2,Target Type,Multiclass
3,Label Encoded,"Attention required: 0, Brand new: 1, Healthy: 2, Mature: 3"
4,Original Data,"(20631, 81)"
5,Missing Values,0
6,Numeric Features,73
7,Categorical Features,4
8,Ordinal Features,0
9,High Cardinality Features,0


In [5]:
fold = 15

In [6]:
best_model = compare_models(sort='F1', fold=fold)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9795,0.9994,0.9795,0.9796,0.9795,0.9727,0.9727,0.6067
lightgbm,Light Gradient Boosting Machine,0.9775,0.9991,0.9774,0.9776,0.9775,0.9699,0.97,1.3633
xgboost,Extreme Gradient Boosting,0.9769,0.9992,0.9769,0.977,0.9769,0.9692,0.9692,6.9947
rf,Random Forest Classifier,0.9761,0.9992,0.976,0.9762,0.9761,0.9681,0.9681,0.9533
catboost,CatBoost Classifier,0.9727,0.9987,0.9727,0.9729,0.9727,0.9636,0.9637,20.324
knn,K Neighbors Classifier,0.9409,0.9917,0.9407,0.9413,0.941,0.9212,0.9213,0.6907
dt,Decision Tree Classifier,0.9377,0.9585,0.9376,0.9378,0.9377,0.9169,0.917,0.1473
gbc,Gradient Boosting Classifier,0.8973,0.9842,0.897,0.8991,0.8978,0.8631,0.8633,11.5893
lr,Logistic Regression,0.7326,0.9213,0.731,0.7322,0.7318,0.6434,0.6437,1.8053
lda,Linear Discriminant Analysis,0.6836,0.9021,0.6823,0.7013,0.6876,0.5782,0.5811,0.1013


In [7]:
model_type = 'lightgbm'

In [8]:
model = create_model(model_type, fold=fold)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9773,0.9992,0.9772,0.9773,0.9773,0.9697,0.9697
1,0.9737,0.9989,0.9736,0.9737,0.9737,0.9649,0.9649
2,0.9827,0.9995,0.9827,0.9828,0.9828,0.977,0.977
3,0.9764,0.9992,0.9762,0.9764,0.9764,0.9685,0.9685
4,0.9736,0.9989,0.9736,0.9739,0.9737,0.9648,0.9649
5,0.9809,0.9992,0.9809,0.981,0.9809,0.9745,0.9746
6,0.9773,0.9991,0.9772,0.9777,0.9773,0.9697,0.9698
7,0.9736,0.9993,0.9736,0.9737,0.9737,0.9648,0.9649
8,0.9818,0.9992,0.9818,0.9819,0.9818,0.9758,0.9758
9,0.9745,0.9992,0.9744,0.9745,0.9745,0.9661,0.9661


In [9]:
tunned_model = tune_model(model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9764,0.999,0.9763,0.9765,0.9764,0.9685,0.9685
1,0.9746,0.9989,0.9745,0.9746,0.9746,0.9661,0.9661
2,0.9697,0.9987,0.9696,0.9697,0.9697,0.9596,0.9596
3,0.9624,0.9984,0.9623,0.9626,0.9625,0.9499,0.9499
4,0.9709,0.9989,0.9709,0.9712,0.971,0.9612,0.9613
5,0.9758,0.999,0.9757,0.9759,0.9758,0.9677,0.9677
6,0.9739,0.999,0.9739,0.974,0.974,0.9653,0.9653
7,0.9739,0.9988,0.9739,0.974,0.974,0.9653,0.9653
8,0.9673,0.9983,0.9672,0.9674,0.9673,0.9564,0.9564
9,0.9745,0.9987,0.9745,0.9748,0.9746,0.9661,0.9661


In [10]:
plot_model(tunned_model, plot = 'auc', save=True)

'AUC.png'

In [11]:
plot_model(tunned_model, plot = 'pr', save=True)

'Precision Recall.png'

In [12]:
plot_model(tunned_model, plot='feature', save=True)

'Feature Importance.png'

In [13]:
plot_model(tunned_model, plot = 'confusion_matrix', save=True)

'Confusion Matrix.png'

In [14]:
save_model(tunned_model, f'../src/models/{timer.day}_{timer.month}_{timer.year}__{timer.hour}{timer.minute}{timer.second}')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['unit_number', 'RUL',
                                                        'cycles'],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='State',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_n...
                                 colsample_bytree=1.0, feature_fraction=1.0,
                                 importance_type='split', learning_rate=0.123,
                                 max_depth=-1, min_chil