In [1]:
import pandas as pd
from utils import get_Xy, preprocess, model_generator, evaluate

In [2]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local `utils` module) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the raw predictive-maintenance dataset; the path is relative to this notebook's directory.
df = pd.read_csv('../data/predictive_maintenance.csv')

In [4]:
# Alternative split kept for reference (label taken from 'Failure Type' with a `binary` argument):
# X_train, X_test, y_train, y_test = get_Xy(df, 'Failure Type', binary=['No Failure'], drop_cols=['UDI', 'Product ID'])

# Active split: 'Target' is the label; the two ID columns and 'Failure Type'
# are passed as drop_cols.
X_train, X_test, y_train, y_test = get_Xy(
    df,
    'Target',
    drop_cols=['UDI', 'Product ID', 'Failure Type'],
)

In [5]:
# Build the preprocessing object from the training split only.
# Earlier variant, kept for reference:
# preprocessor = preprocess(X_train, y_train, label_cols=['Target'])
preprocessor = preprocess(X_train, y_train)

In [6]:
# Display the element at index 1 of `preprocessor`; this is the object the
# manual-preprocessing cell calls fit_transform / transform on.
preprocessor[1]

In [None]:
# Manually preprocess the data and pickle it so that other group members can
# compare the output of their own pipelines against it.

# Positional index -> column label for the transformed output: three labels
# 'H', 'L', 'M' followed by the names in df.columns[3:]. zip() stops at
# range(8), so only the first eight labels are used.
feature_names = ['H', 'L', 'M'] + list(df.columns[3:])
cols = dict(zip(range(8), feature_names))

transformer = preprocessor[1]

# Fit on the training split only, then label and persist the result.
X_train_transformed = pd.DataFrame(transformer.fit_transform(X_train)).rename(columns=cols)
X_train_transformed.to_pickle("X_train_transformed.pickle")

# Re-use the already fitted transformer for the test split (transform, not fit_transform).
X_test_transformed = pd.DataFrame(transformer.transform(X_test)).rename(columns=cols)
X_test_transformed.to_pickle("X_test_transformed.pickle")

In [7]:
# To load the previously pickled, preprocessed data instead of recomputing it,
# uncomment the two lines below:
# X_train_transformed = pd.read_pickle("X_train_transformed.pickle")
# X_test_transformed = pd.read_pickle("X_test_transformed.pickle")

In [8]:
# Show the transformed training features as a sanity check of the column labels and values.
X_train_transformed

Unnamed: 0,H,L,M,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,0.0,1.0,0.0,3.736597,0.554348,0.679012,0.176950,0.739130
1,0.0,1.0,0.0,4.357693,0.434783,0.629630,0.183353,0.059289
2,0.0,1.0,0.0,3.035359,0.543478,0.419753,0.309080,0.703557
3,0.0,1.0,0.0,5.319391,0.445652,0.604938,0.145518,0.039526
4,0.0,1.0,0.0,3.476137,0.402174,0.345679,0.253783,0.628458
...,...,...,...,...,...,...,...,...
7995,0.0,1.0,0.0,4.988807,0.641304,0.580247,0.123981,0.454545
7996,0.0,1.0,0.0,4.497941,0.782609,0.765432,0.135623,0.802372
7997,0.0,1.0,0.0,4.367711,0.445652,0.444444,0.121653,0.110672
7998,0.0,1.0,0.0,5.409550,0.358696,0.493827,0.109430,0.023715


In [9]:
# Load the fitted pipelines from the on-disk cache when it exists; otherwise
# build them with model_generator and save the result so later runs can reuse it.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# pipeline.pickle that was produced by this notebook.
PIPELINE_CACHE = "pipeline.pickle"

try:
    pipelines = pd.read_pickle(PIPELINE_CACHE)
    print("reading pipeline from file")
except Exception as load_error:
    # `Exception` instead of a bare `except:` keeps the "regenerate on any load
    # failure" behaviour but lets KeyboardInterrupt / SystemExit propagate, and
    # the reason for the failed load is reported instead of being hidden.
    print(f"could not load {PIPELINE_CACHE}: {load_error!r}")
    print("writing pipeline to file")
    pipelines = model_generator(X_train, y_train, preprocessor, models=None)
    # The cell announced a write but never performed one, so the cache was
    # never created by this cell. Persist the result here; if model_generator
    # already saves the file itself (not visible from this notebook), this
    # simply overwrites it with the same object.
    pd.to_pickle(pipelines, PIPELINE_CACHE)

writing pipeline to file
Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.990 total time=   0.1s
[CV 2/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.991 total time=   0.1s
[CV 3/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.990 total time=   0.1s
[CV 4/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.989 total time=   0.1s
[CV 5/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.991 total time=   0.1s
[CV 1/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.990 total time=   0.0s
[CV 2/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.991 total time=   0.0s
[CV 3/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.990 total time=   0.0s
[CV 4/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.989 total time=   0.0s
[CV 5/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.991 total time=   0.0



Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END .....................kernel=linear;, score=0.821 total time=   1.5s
[CV 2/5] END .....................kernel=linear;, score=0.829 total time=   1.5s
[CV 3/5] END .....................kernel=linear;, score=0.825 total time=   1.5s
[CV 4/5] END .....................kernel=linear;, score=0.821 total time=   1.6s
[CV 5/5] END .....................kernel=linear;, score=0.836 total time=   1.5s
[CV 1/5] END .......................kernel=poly;, score=0.864 total time=   1.3s
[CV 2/5] END .......................kernel=poly;, score=0.871 total time=   1.3s
[CV 3/5] END .......................kernel=poly;, score=0.870 total time=   1.3s
[CV 4/5] END .......................kernel=poly;, score=0.863 total time=   1.3s
[CV 5/5] END .......................kernel=poly;, score=0.882 total time=   1.3s
[CV 1/5] END ........................kernel=rbf;, score=0.861 total time=   1.5s
[CV 2/5] END ........................kernel=rbf;,

In [25]:
# Score the pipelines on the held-out test split. Later cells treat the result
# as a nested mapping: evals[<model name>][<sampler name>] -> report.
evals = evaluate(pipelines, X_test, y_test)

dict_keys(['preprocessor', 'over-sampler', 'classifier'])
-----------------
over-sampler
{'Logistic Regression': {'over-sampler': '              precision    recall  f1-score   support\n\n    negative       0.99      0.78      0.87      1928\n    positive       0.11      0.75      0.20        72\n\n    accuracy                           0.78      2000\n   macro avg       0.55      0.77      0.53      2000\nweighted avg       0.96      0.78      0.85      2000\n'}}
dict_keys(['preprocessor', 'under-sampler', 'classifier'])
-----------------
under-sampler
{'Logistic Regression': {'over-sampler': '              precision    recall  f1-score   support\n\n    negative       0.99      0.78      0.87      1928\n    positive       0.11      0.75      0.20        72\n\n    accuracy                           0.78      2000\n   macro avg       0.55      0.77      0.53      2000\nweighted avg       0.96      0.78      0.85      2000\n', 'under-sampler': '              precision    recall  f1-score

In [28]:
# List which sampler variants have a stored report for one example model.
evals['Logistic Regression'].keys()

dict_keys(['over-sampler', 'under-sampler'])

In [38]:
# Print every stored report, grouped under a dashed header line per model.
for model_name, reports_by_sampler in evals.items():
    # Pad the header with dashes so all model headers end at the same column.
    rule = '-' * (60 - len(model_name))
    print(model_name, rule, end="\n\n")
    for sampler_name, report_text in reports_by_sampler.items():
        print(sampler_name, report_text, sep="  |  ")
    print("\n")

Logistic Regression -----------------------------------------

over-sampler  |  under-sampler  |  

KNN ---------------------------------------------------------

over-sampler  |  under-sampler  |  

Decision Tree -----------------------------------------------

over-sampler  |  under-sampler  |  

Random Forest -----------------------------------------------

over-sampler  |  under-sampler  |  

Extremely Random Trees --------------------------------------

over-sampler  |  under-sampler  |  

Gradient Boosting -------------------------------------------

over-sampler  |  under-sampler  |  

AdaBoost ----------------------------------------------------

over-sampler  |  under-sampler  |  

SVM ---------------------------------------------------------

over-sampler  |  under-sampler  |  

Naive Bayes -------------------------------------------------

over-sampler  |  under-sampler  |  



--- WITHOUT OVERSAMPLING ---

Logistic Regression -----------------------------------
              precision    recall  f1-score   support

    negative       0.95      0.96      0.95        76
    positive       1.00      1.00      1.00      1924

    accuracy                           1.00      2000
   macro avg       0.97      0.98      0.98      2000
weighted avg       1.00      1.00      1.00      2000

KNN ---------------------------------------------------
              precision    recall  f1-score   support

    negative       0.96      0.89      0.93        76
    positive       1.00      1.00      1.00      1924

    accuracy                           0.99      2000
   macro avg       0.98      0.95      0.96      2000
weighted avg       0.99      0.99      0.99      2000

Decision Tree -----------------------------------------
              precision    recall  f1-score   support

    negative       0.93      0.93      0.93        76
    positive       1.00      1.00      1.00      1924

    accuracy                           0.99      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.99      0.99      0.99      2000

Random Forest -----------------------------------------
              precision    recall  f1-score   support

    negative       0.95      0.96      0.95        76
    positive       1.00      1.00      1.00      1924

    accuracy                           1.00      2000
   macro avg       0.97      0.98      0.98      2000
weighted avg       1.00      1.00      1.00      2000

Extremely Random Trees --------------------------------
              precision    recall  f1-score   support

    negative       0.95      0.96      0.95        76
    positive       1.00      1.00      1.00      1924

    accuracy                           1.00      2000
   macro avg       0.97      0.98      0.98      2000
weighted avg       1.00      1.00      1.00      2000

Gradient Boosting -------------------------------------
              precision    recall  f1-score   support

    negative       0.95      0.96      0.95        76
    positive       1.00      1.00      1.00      1924

    accuracy                           1.00      2000
   macro avg       0.97      0.98      0.98      2000
weighted avg       1.00      1.00      1.00      2000

AdaBoost ----------------------------------------------
              precision    recall  f1-score   support

    negative       0.95      0.95      0.95        76
    positive       1.00      1.00      1.00      1924

    accuracy                           1.00      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       1.00      1.00      1.00      2000

SVM ---------------------------------------------------
              precision    recall  f1-score   support

    negative       0.95      0.96      0.95        76
    positive       1.00      1.00      1.00      1924

    accuracy                           1.00      2000
   macro avg       0.97      0.98      0.98      2000
weighted avg       1.00      1.00      1.00      2000



--- WITH OVERSAMPLING ---

Logistic Regression -----------------------------------
              precision    recall  f1-score   support

    negative       0.95      0.97      0.96        71
    positive       1.00      1.00      1.00      1929

    accuracy                           1.00      2000
   macro avg       0.97      0.98      0.98      2000
weighted avg       1.00      1.00      1.00      2000

KNN ---------------------------------------------------
              precision    recall  f1-score   support

    negative       0.89      0.92      0.90        71
    positive       1.00      1.00      1.00      1929

    accuracy                           0.99      2000
   macro avg       0.94      0.96      0.95      2000
weighted avg       0.99      0.99      0.99      2000

Decision Tree -----------------------------------------
              precision    recall  f1-score   support

    negative       0.92      0.93      0.92        71
    positive       1.00      1.00      1.00      1929

    accuracy                           0.99      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.99      0.99      0.99      2000

Random Forest -----------------------------------------
              precision    recall  f1-score   support

    negative       0.96      0.97      0.97        71
    positive       1.00      1.00      1.00      1929

    accuracy                           1.00      2000
   macro avg       0.98      0.99      0.98      2000
weighted avg       1.00      1.00      1.00      2000

Extremely Random Trees --------------------------------
              precision    recall  f1-score   support

    negative       0.96      0.97      0.97        71
    positive       1.00      1.00      1.00      1929

    accuracy                           1.00      2000
   macro avg       0.98      0.99      0.98      2000
weighted avg       1.00      1.00      1.00      2000

Gradient Boosting -------------------------------------
              precision    recall  f1-score   support

    negative       0.92      0.97      0.95        71
    positive       1.00      1.00      1.00      1929

    accuracy                           1.00      2000
   macro avg       0.96      0.98      0.97      2000
weighted avg       1.00      1.00      1.00      2000

AdaBoost ----------------------------------------------
              precision    recall  f1-score   support

    negative       0.87      0.97      0.92        71
    positive       1.00      0.99      1.00      1929

    accuracy                           0.99      2000
   macro avg       0.94      0.98      0.96      2000
weighted avg       0.99      0.99      0.99      2000

SVM ---------------------------------------------------
              precision    recall  f1-score   support

    negative       0.96      0.97      0.97        71
    positive       1.00      1.00      1.00      1929

    accuracy                           1.00      2000
   macro avg       0.98      0.99      0.98      2000
weighted avg       1.00      1.00      1.00      2000

Naive Bayes -------------------------------------------
              precision    recall  f1-score   support

    negative       0.09      0.18      0.12        71
    positive       0.97      0.93      0.95      1929

    accuracy                           0.90      2000
   macro avg       0.53      0.56      0.53      2000
weighted avg       0.94      0.90      0.92      2000



In [None]:
# Multiclass classification: use the 'Failure Type' label itself as the target.
# The cell previously repeated the binary 'Target' split verbatim, which did not
# match its heading. 'Target' is now dropped together with the ID columns so it
# is not passed through as a feature.
# NOTE(review): assumes get_Xy keeps every class when `binary` is omitted --
# confirm against utils.get_Xy. This also rebinds the X_*/y_* names of the
# binary split above.
X_train, X_test, y_train, y_test = get_Xy(df, 'Failure Type', drop_cols=['UDI', 'Product ID', 'Target'])