In [11]:
import pandas as pd
from utils_manual import get_Xy, model_generator, evaluate

from utils import _find_scaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Import RandomOverSampler from imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
df = pd.read_csv('../data/predictive_maintenance.csv')

In [14]:
X_train, X_test, y_train, y_test = get_Xy(df, 'Target', drop_cols=['UDI', 'Product ID', 'Failure Type'])

In [15]:
standard, minmax = _find_scaler(X_train)

# OneHotEncoder for 'Type' feature
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train[["Type"]])
X_train_type_encoded = ohe.transform(X_train[["Type"]])
X_test_type_encoded = ohe.transform(X_test[["Type"]])

# StandardScaler for 'Torque [Nm]'
s_scaler = StandardScaler().fit(X_train[standard])
X_train_continuous_norm_scaled = s_scaler.transform(X_train[standard])
X_test_continuous_norm_scaled = s_scaler.transform(X_test[standard])

# MinMaxScaler for continuous features
mm_scaler = MinMaxScaler().fit(X_train[minmax])
X_train_continuous_scaled = mm_scaler.transform(X_train[minmax])
X_test_continuous_scaled = mm_scaler.transform(X_test[minmax])

# Combine encoded categorical features and scaled continuous features
X_train_processed = pd.concat([pd.DataFrame(X_train_type_encoded, columns=['L', 'M', 'H']), pd.DataFrame(X_train_continuous_norm_scaled, columns=standard), pd.DataFrame(X_train_continuous_scaled, columns=minmax)], axis=1)
X_test_processed = pd.concat([pd.DataFrame(X_test_type_encoded, columns=['L', 'M', 'H']), pd.DataFrame(X_test_continuous_norm_scaled, columns=standard), pd.DataFrame(X_test_continuous_scaled, columns=minmax)], axis=1)

In [16]:
print(X_train_processed.shape)
X_train_processed.head(2)

(8000, 8)


Unnamed: 0,L,M,H,Torque [Nm],Air temperature [K],Process temperature [K],Rotational speed [rpm],Tool wear [min]
0,0.0,1.0,0.0,0.666192,0.478261,0.567901,0.114522,0.007905
1,0.0,1.0,0.0,-0.823706,0.402174,0.407407,0.207792,0.098814


In [17]:
# Undersampling and oversampling
ros = RandomOverSampler()
rus = RandomUnderSampler()

X_oversampled, y_oversampled = ros.fit_resample(X_train_processed, y_train)
X_undersampled, y_undersampled = rus.fit_resample(X_train_processed, y_train)

In [18]:
raw_pipeline = model_generator(X_train_processed, y_train, models=None)
over_pipeline = model_generator(X_oversampled, y_oversampled, models=None)
under_pipeline = model_generator(X_undersampled, y_undersampled, models=None)

(8000, 8)
(8000, 8)
(8000, 8)
(8000, 8)
(8000, 8)
(8000, 8)
(8000, 8)
(8000, 8)




(8000, 8)
(15448, 8)
(15448, 8)
(15448, 8)
(15448, 8)
(15448, 8)
(15448, 8)
(15448, 8)




(15448, 8)
(15448, 8)
(552, 8)
(552, 8)
(552, 8)
(552, 8)
(552, 8)
(552, 8)
(552, 8)
(552, 8)
(552, 8)




In [20]:
evals_raw = evaluate(raw_pipeline, X_test_processed, y_test)
for model in evals_raw:
    print(model["name"], '-' * (60 - len(model)))
    print()
    print(model["report"]) 
    print("Balanced accuracy: ", model["balanced_accuracy"])
    print("\n")

Logistic Regression ---------------------------------------------------------

              precision    recall  f1-score   support

    negative       0.97      1.00      0.99      1937
    positive       1.00      0.19      0.32        63

    accuracy                           0.97      2000
   macro avg       0.99      0.60      0.65      2000
weighted avg       0.98      0.97      0.97      2000

Balanced accuracy:  0.5952380952380952


KNN ---------------------------------------------------------

              precision    recall  f1-score   support

    negative       0.98      1.00      0.99      1937
    positive       0.86      0.38      0.53        63

    accuracy                           0.98      2000
   macro avg       0.92      0.69      0.76      2000
weighted avg       0.98      0.98      0.97      2000

Balanced accuracy:  0.6894436659537331


Decision Tree ---------------------------------------------------------

              precision    recall  f1-score   sup

In [21]:
evals_over = evaluate(over_pipeline, X_test_processed, y_test)
for model in evals_over:
    print(model["name"], '-' * (60 - len(model)))
    print()
    print(model["report"])  
    print("Balanced accuracy: ", model["balanced_accuracy"])
    print("\n")

Logistic Regression ---------------------------------------------------------

              precision    recall  f1-score   support

    negative       0.99      0.81      0.89      1937
    positive       0.12      0.84      0.22        63

    accuracy                           0.81      2000
   macro avg       0.56      0.82      0.55      2000
weighted avg       0.97      0.81      0.87      2000

Balanced accuracy:  0.8238357466545386


KNN ---------------------------------------------------------

              precision    recall  f1-score   support

    negative       0.98      0.98      0.98      1937
    positive       0.39      0.37      0.38        63

    accuracy                           0.96      2000
   macro avg       0.68      0.67      0.68      2000
weighted avg       0.96      0.96      0.96      2000

Balanced accuracy:  0.6732469618375658


Decision Tree ---------------------------------------------------------

              precision    recall  f1-score   sup

In [22]:
evals_under = evaluate(under_pipeline, X_test_processed, y_test)
for model in evals_under:
    print(model["name"], '-' * (60 - len(model)))
    print()
    print(model["report"]) 
    print("Balanced accuracy: ", model["balanced_accuracy"])
    print("\n")

Logistic Regression ---------------------------------------------------------

              precision    recall  f1-score   support

    negative       0.99      0.78      0.87      1937
    positive       0.10      0.78      0.18        63

    accuracy                           0.78      2000
   macro avg       0.55      0.78      0.53      2000
weighted avg       0.96      0.78      0.85      2000

Balanced accuracy:  0.7784087649859461


KNN ---------------------------------------------------------

              precision    recall  f1-score   support

    negative       0.99      0.83      0.90      1937
    positive       0.13      0.81      0.23        63

    accuracy                           0.83      2000
   macro avg       0.56      0.82      0.56      2000
weighted avg       0.97      0.83      0.88      2000

Balanced accuracy:  0.8182879760060968


Decision Tree ---------------------------------------------------------

              precision    recall  f1-score   sup