In [1]:
# import basic data science libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import datetime


In [2]:

# import required machine learning libraries

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [3]:
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00601/ai4i2020.csv")
print( df.head() )

print( df.info() )

   UDI Product ID Type  Air temperature [K]  ...  HDF  PWF  OSF  RNF
0    1     M14860    M                298.1  ...    0    0    0    0
1    2     L47181    L                298.2  ...    0    0    0    0
2    3     L47182    L                298.1  ...    0    0    0    0
3    4     L47183    L                298.2  ...    0    0    0    0
4    5     L47184    L                298.2  ...    0    0    0    0

[5 rows x 14 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]       

In [4]:
# map categorical variable 'diagnosis' into numeric

df["Type"] = df["Type"].map({'H': 2, 'M': 1, 'L': 0})

df.head()



Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,1,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,0,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,0,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,0,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,0,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [5]:
# drop redundant columns / unneeded columns
# df.drop('id', axis=1, inplace=True)

df.drop('Product ID', axis=1, inplace=True)

df.drop('UDI', axis=1, inplace=True)
df.drop('TWF', axis=1, inplace=True)
df.drop('HDF', axis=1, inplace=True)
df.drop('PWF', axis=1, inplace=True)
df.drop('OSF', axis=1, inplace=True)
df.drop('RNF', axis=1, inplace=True)

In [6]:
df.head()


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure
0,1,298.1,308.6,1551,42.8,0,0
1,0,298.2,308.7,1408,46.3,3,0
2,0,298.1,308.5,1498,49.4,5,0
3,0,298.2,308.6,1433,39.5,7,0
4,0,298.2,308.7,1408,40.0,9,0


In [7]:
df['Machine failure'].value_counts(normalize=True) * 100


0    96.61
1     3.39
Name: Machine failure, dtype: float64

In [8]:
#x_train = df

# SKIP THIS, older data split code

print("Raw Dataset shape: " + str(df.shape))

#sklearn 
dataset_size = int(df.shape[0])

validate_percent = 0.15
test_percent = 0.10

#train_size = int(df.shape[0] * (validate_percent + test_percent))
#validate_size = int(train_size * validate_percent)
train_percent = 1.0 - validate_percent - test_percent

train, validate, test = np.split(df.sample(frac=1), [int(train_percent * len(df)), int((1.0 - validate_percent) * len(df))])

print("Train shape: " + str(train.shape))

print("Validate shape: " + str(validate.shape))

print("Test shape: " + str(test.shape))


Raw Dataset shape: (10000, 7)
Train shape: (7500, 7)
Validate shape: (1000, 7)
Test shape: (1500, 7)


In [9]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

df_full = df.copy()

y_train_full = df_full.pop("Machine failure")

y_train_full = pd.DataFrame(y_train_full)



# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(df_full, y_train_full, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 


#print(x_train, x_val, x_test)

#print(y_train, y_val, y_test)

In [10]:
from sklearn.dummy import DummyClassifier

clf = DummyClassifier(strategy='most_frequent',random_state=0)
clf.fit(x_train, y_train)
clf.score(x_val, y_val)


dummy_predicted = clf.predict(x_val)

print('Dummy Classifier Accuracy is: {}'.format(accuracy_score(y_val, dummy_predicted)))

print('Dummy Classification Report')
print(classification_report(y_val, dummy_predicted))

# this is where we make the point about accuracy vs usefulness (confusion matrix)

Dummy Classifier Accuracy is: 0.9646666666666667
Dummy Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1447
           1       0.00      0.00      0.00        53

    accuracy                           0.96      1500
   macro avg       0.48      0.50      0.49      1500
weighted avg       0.93      0.96      0.95      1500



  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Normalize the featues

from sklearn.preprocessing import Normalizer

norm = Normalizer()
norm.fit(x_train)
X_train_norm = norm.transform(x_train)
X_val_norm = norm.transform(x_val)

In [12]:
#SVM_params = {'C':[0.001, 0.1, 10, 100], 'kernel':['rbf' ,'linear', 'poly', 'sigmoid'], 'gamma':['auto']}
#LR_params = {'C':[0.001, 0.1, 1, 10, 100], 'solver':['liblinear']}
#LDA_params = {'n_components':[None, 1,2,3], 'solver':['svd'], 'shrinkage':[None]}
RF_params = {'n_estimators':[10,50,100], 'random_state':[42]}
#GBC_params = {'n_estimators':[10, 50, 100], 'random_state':[42]}

models_opt = []

#models_opt.append(('LR', LogisticRegression(), LR_params))
#models_opt.append(('LDA', LinearDiscriminantAnalysis(), LDA_params))
models_opt.append(('RFC', RandomForestClassifier(), RF_params))
#models_opt.append(('SVM', SVC(), SVM_params))
#models_opt.append(('GBC', GradientBoostingClassifier(), GBC_params))

results = []
names = []

for name, model, params in models_opt:
    kfold = KFold(n_splits=5, random_state=42, shuffle=True)
    model_grid = GridSearchCV(model, params, cv=5) #, iid=True)
    cv_results = cross_val_score(model_grid, X_train_norm, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "Cross Validation Accuracy {} Accarcy: {} SD: {}".format(name, cv_results.mean(), cv_results.std())
    print(msg)

Cross Validation Accuracy RFC Accarcy: 0.9736 SD: 0.0010832051206181373


In [13]:
GB_params = {'n_estimators':[10, 50, 100, 200]}
gbc = GradientBoostingClassifier(random_state=42)

# Instantiate gridsearch using GBC model and search for the best parameters
gbc_grid = GridSearchCV(gbc, GB_params, cv=3, iid=True)

# Fit model to training data
gbc_grid.fit(X_train_norm, y_train)

print('Optimized number of estimators: {}'.format(gbc_grid.best_params_.values()))

# Instantiate GBC with optimal parameters
gbc_best = GradientBoostingClassifier(**gbc_grid.best_params_, random_state=42)

# Fit GBC to training data
gbc_best.fit(X_train_norm, y_train)

# Evalaute GBC with validation data
gbc_best_predicted = gbc_best.predict(X_val_norm)
print('Model accuracy on validation data: {}'.format(accuracy_score(y_val, gbc_best_predicted)))
print('GBC Classification Report')
print(classification_report(y_val, gbc_best_predicted))



Optimized number of estimators: dict_values([200])
Model accuracy on validation data: 0.9766666666666667
GBC Classification Report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1447
           1       0.80      0.45      0.58        53

    accuracy                           0.98      1500
   macro avg       0.89      0.72      0.78      1500
weighted avg       0.97      0.98      0.97      1500

