In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_raw = pd.read_csv("../data/predictive_maintenance.csv")
df_raw.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [3]:
df_model = df_raw.copy()

## Feature Engineering 

In [4]:
df_model['Type'].replace({'L':"Low", 'M':"Medium",'H':"High"}, inplace=True)

In [5]:
df_model['var_temperature'] = df_model['Process temperature [K]'] - df_model['Air temperature [K]']

In [6]:
dummy_model = pd.get_dummies(df_model['Type'])
df_model = pd.concat([df_model.drop('Type', axis=1), dummy_model], axis=1)
df_model.head()

Unnamed: 0,UDI,Product ID,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,var_temperature,High,Low,Medium
0,1,M14860,298.1,308.6,1551,42.8,0,0,No Failure,10.5,0,0,1
1,2,L47181,298.2,308.7,1408,46.3,3,0,No Failure,10.5,0,1,0
2,3,L47182,298.1,308.5,1498,49.4,5,0,No Failure,10.4,0,1,0
3,4,L47183,298.2,308.6,1433,39.5,7,0,No Failure,10.4,0,1,0
4,5,L47184,298.2,308.7,1408,40.0,9,0,No Failure,10.5,0,1,0


In [13]:
coluna = 'High'

df_model[coluna] = 1

In [7]:
df_model.drop(columns=['UDI', 'Product ID', 'Target'], inplace=True)

 ## Model building 



In [9]:
X = df_model.drop('Failure Type', axis=1)
y = df_model['Failure Type']

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier # teste com 300 arvores
from sklearn.model_selection  import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score ,plot_confusion_matrix, recall_score


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,stratify=y)

dtree = RandomForestClassifier(n_estimators=300, class_weight='balanced')

# dtree = DecisionTreeClassifier()

dtree.fit(X_train,y_train)
y_pred = dtree.predict(X_test)
y_proba = dtree.predict_proba(X_test)


# print("Recall:", recall_score(y_test, y_pred, average="macro"))
print("ROC_AUC:", roc_auc_score(y_test, y_proba, multi_class = 'ovo', average='weighted'))
print("Confusion matrix:" ,confusion_matrix(y_pred, y_test))
print("Acurácia: ", accuracy_score(y_pred, y_test))

ROC_AUC: 0.8759758044167102
Confusion matrix: [[  32    5    0    0    0    0]
 [   1 2889   11   13    5   14]
 [   0    0   11    1    0    0]
 [   1    2    1   14    0    0]
 [   0    0    0    0    0    0]
 [   0    0    0    0    0    0]]
Acurácia:  0.982


In [11]:
# save model
import pickle
filename = 'finalized_model.sav'
pickle.dump(dtree, open(filename, 'wb'))