# Importando bibliotecas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Algoritmos de ML de Classificação
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# para calcular a performance dos modelos
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

# Leitura dos dados 

In [2]:
df_train = pd.read_csv('desafio_manutencao_preditiva_treino.csv')
df_test = pd.read_csv('desafio_manutencao_preditiva_teste.csv')

In [3]:
df_train.head()

Unnamed: 0,udi,product_id,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,failure_type
0,1,M14860,M,298.1,308.6,1551,42.8,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,No Failure
2,5,L47184,L,298.2,308.7,1408,40.0,9,No Failure
3,6,M14865,M,298.1,308.6,1425,41.9,11,No Failure
4,7,L47186,L,298.1,308.6,1558,42.4,14,No Failure


As colunas 'udi' e 'product_id' serão removidas por se tratarem de identificação dos produtos e não influenciarem na criação dos modelos de Machine Learning.

In [4]:
df_train = df_train.drop(['udi', 'product_id'], axis=1)
df_train

Unnamed: 0,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,failure_type
0,M,298.1,308.6,1551,42.8,0,No Failure
1,L,298.2,308.7,1408,46.3,3,No Failure
2,L,298.2,308.7,1408,40.0,9,No Failure
3,M,298.1,308.6,1425,41.9,11,No Failure
4,L,298.1,308.6,1558,42.4,14,No Failure
...,...,...,...,...,...,...,...
6662,L,298.8,308.3,1634,27.9,12,No Failure
6663,M,298.8,308.4,1604,29.5,14,No Failure
6664,H,298.9,308.4,1632,31.8,17,No Failure
6665,H,299.0,308.7,1408,48.5,25,No Failure


# Análise exploratória dos dados

In [5]:
df_train.shape

(6667, 7)

In [6]:
df_train.describe()

Unnamed: 0,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min
count,6667.0,6667.0,6667.0,6667.0,6667.0
mean,299.992515,309.99262,1537.419529,40.058512,108.098095
std,1.99471,1.488101,177.182908,9.950804,63.359915
min,295.3,305.7,1168.0,3.8,0.0
25%,298.3,308.8,1422.5,33.2,54.0
50%,300.0,310.0,1503.0,40.2,108.0
75%,301.5,311.1,1612.0,46.8,162.0
max,304.5,313.8,2886.0,76.6,251.0


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6667 entries, 0 to 6666
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   type                   6667 non-null   object 
 1   air_temperature_k      6667 non-null   float64
 2   process_temperature_k  6667 non-null   float64
 3   rotational_speed_rpm   6667 non-null   int64  
 4   torque_nm              6667 non-null   float64
 5   tool_wear_min          6667 non-null   int64  
 6   failure_type           6667 non-null   object 
dtypes: float64(3), int64(2), object(2)
memory usage: 364.7+ KB


In [8]:
count = pd.DataFrame(df_train['failure_type'].value_counts())
count

Unnamed: 0,failure_type
No Failure,6435
Heat Dissipation Failure,75
Power Failure,63
Overstrain Failure,52
Tool Wear Failure,30
Random Failures,12


# Pré-processamento dos dados

Os dados categóricos foram convertidos para dados numéricos para criação dos modelos de Machine Learning. Os atributos foram denominados como a variável 'X' e a variável alvo (target) como variável 'y'. Os atributos foram normalizados com a função MinMaxScaler para que a diferença de suas grandezas não causassem influencia no treinamento dos modelos.

In [9]:
df_train = df_train.replace({'type': {'M':0, 'L': 1, 'H':2}})
df_train = df_train.replace({'failure_type': {'No Failure': 0, 'Power Failure': 1, 'Tool Wear Failure': 2, 'Overstrain Failure': 3, 'Random Failures': 4, 'Heat Dissipation Failure': 5}})
df_train

Unnamed: 0,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,failure_type
0,0,298.1,308.6,1551,42.8,0,0
1,1,298.2,308.7,1408,46.3,3,0
2,1,298.2,308.7,1408,40.0,9,0
3,0,298.1,308.6,1425,41.9,11,0
4,1,298.1,308.6,1558,42.4,14,0
...,...,...,...,...,...,...,...
6662,1,298.8,308.3,1634,27.9,12,0
6663,0,298.8,308.4,1604,29.5,14,0
6664,2,298.9,308.4,1632,31.8,17,0
6665,2,299.0,308.7,1408,48.5,25,0


In [10]:
X = df_train.drop('failure_type', axis=1)
y = df_train['failure_type']
X

Unnamed: 0,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min
0,0,298.1,308.6,1551,42.8,0
1,1,298.2,308.7,1408,46.3,3
2,1,298.2,308.7,1408,40.0,9
3,0,298.1,308.6,1425,41.9,11
4,1,298.1,308.6,1558,42.4,14
...,...,...,...,...,...,...
6662,1,298.8,308.3,1634,27.9,12
6663,0,298.8,308.4,1604,29.5,14
6664,2,298.9,308.4,1632,31.8,17
6665,2,299.0,308.7,1408,48.5,25


In [11]:
y

0       0
1       0
2       0
3       0
4       0
       ..
6662    0
6663    0
6664    0
6665    0
6666    0
Name: failure_type, Length: 6667, dtype: int64

In [12]:
normalization = MinMaxScaler()
X = pd.DataFrame(normalization.fit_transform(X), columns=X.columns)

In [13]:
X

Unnamed: 0,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min
0,0.0,0.304348,0.358025,0.222934,0.535714,0.000000
1,0.5,0.315217,0.370370,0.139697,0.583791,0.011952
2,0.5,0.315217,0.370370,0.139697,0.497253,0.035857
3,0.0,0.304348,0.358025,0.149593,0.523352,0.043825
4,0.5,0.304348,0.358025,0.227008,0.530220,0.055777
...,...,...,...,...,...,...
6662,0.5,0.380435,0.320988,0.271246,0.331044,0.047809
6663,0.0,0.380435,0.333333,0.253783,0.353022,0.055777
6664,1.0,0.391304,0.333333,0.270081,0.384615,0.067729
6665,1.0,0.402174,0.370370,0.139697,0.614011,0.099602


O conjunto dos dados foi dividido em 70% dados de treinamento e 30% para dados de teste. Foram escolhidos 4 modelos de classificação, sendo eles:

-Randon Florest;

-Decision Tree;

-Gaussian Naive Bayes;

-K-Neighbors

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [15]:
X_train.shape

(4666, 6)

In [16]:
y_train.shape

(4666,)

# Avaliação do modelo:
É importante ver o desempenho dos modelos. Existem algumas maneiras de avaliar a precisão de um modelo de aprendizado de máquina:

a) Precisão : a proporção do número total de previsões que estavam corretas;

b) Sensibilidade ou Recall: a proporção de casos positivos reais que são corretamente identificados;

c) Especificidade: a proporção de casos negativos reais que são identificados corretamente.

In [17]:
models = {
    'Random Forest': RandomForestClassifier,
    'Decision Tree': DecisionTreeClassifier,
    'GaussianNB': GaussianNB,
    'k-nearest neighbors': KNeighborsClassifier,
}
prev = pd.DataFrame([])
for classifier, model in models.items():
    clf = model()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    sensitivity=recall_score(y_test,y_pred, average='weighted')
    specificity=recall_score(y_test,y_pred, average='weighted')
    temp_prev = pd.DataFrame({'Algoritmo':[classifier], 'Acuracia':[np.round(accuracy,2)], 'sensitivity':[np.round(sensitivity,2)], 'specificity':[np.round(specificity,2)]})
    prev = pd.concat( [prev, temp_prev] )
    prev = prev[['Algoritmo','Acuracia', 'sensitivity', 'specificity']]

prev.reset_index(drop=True)

Unnamed: 0,Algoritmo,Acuracia,sensitivity,specificity
0,Random Forest,0.98,0.98,0.98
1,Decision Tree,0.97,0.97,0.97
2,GaussianNB,0.89,0.89,0.89
3,k-nearest neighbors,0.97,0.97,0.97


# Após o treinamento dos modelos, pode-se observar que todos apresentaram alta acurácia, assim como a sensibilidade e especificidade dos modelos. Foi escolhido o Randon Florest para fazer a previsão nos dados de teste.

Existem alguns prós e contras a serem levados em consideração do uso de Random Florest. 

Prós: fácil de interpretar
Lida bem com dados categóricos e contínuos.
Funciona bem em um grande  conjunto de dados .
Não é sensível a outliers.
Natureza não paramétrica.

Contras:
Estes são propensos a overfitting.
Pode ser bastante grande, tornando necessária a poda.
Não pode garantir árvores ótimas.
Ele fornece baixa precisão de previsão para um conjunto de dados em comparação com outros algoritmos de aprendizado de máquina .
Os cálculos podem se tornar complexos quando há muitas variáveis ​​de classe.
Alta variação (o modelo mudará rapidamente com uma alteração nos dados de treinamento)

Leitura dos dados

In [18]:
df_test.head()

Unnamed: 0,udi,product_id,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min
0,446,L47625,L,297.5,308.6,1793,26.7,70
1,7076,L54255,L,300.7,310.5,1536,47.4,192
2,1191,L48370,L,297.2,308.4,1460,42.1,41
3,2618,L49797,L,299.4,309.1,1670,35.9,68
4,5067,L52246,L,304.1,313.1,1550,30.9,9


O pré-processamento dos dados de teste foi o mesmo utilizado nos dados de treinamento.

In [19]:
df_test = df_test.drop(['udi', 'product_id'], axis=1)
df_test

Unnamed: 0,type,air_temperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min
0,L,297.5,308.6,1793,26.7,70
1,L,300.7,310.5,1536,47.4,192
2,L,297.2,308.4,1460,42.1,41
3,L,299.4,309.1,1670,35.9,68
4,L,304.1,313.1,1550,30.9,9
...,...,...,...,...,...,...
3328,L,302.5,311.9,1306,59.7,172
3329,L,300.7,311.0,1413,52.0,91
3330,L,300.8,311.2,1481,38.5,181
3331,L,302.8,312.2,1509,36.5,52


In [20]:
df_test = df_test.replace({'type': {'M':0, 'L': 1, 'H':2}})
normalization = MinMaxScaler()
df_test = pd.DataFrame(normalization.fit_transform(df_test), columns=df_test.columns)

In [21]:
rfc_clf = RandomForestClassifier()
rfc_clf.fit(X_train, y_train)
 
y_pred_test = rfc_clf.predict(df_test)

Criação e exportação do arquivo 'predicted.csv'.

In [22]:
prev_test = pd.DataFrame(y_pred_test)
prev_test['rowNumber'] = prev_test.index
prev_test = prev_test.rename(columns={0: 'predictedValues'})
prev_test = prev_test[['rowNumber', 'predictedValues']]

In [23]:
prev_test = prev_test.replace({'predictedValues': {0: 'No Failure', 1: 'Power Failure', 2: 'Tool Wear Failure', 3: 'Overstrain Failure', 4: 'Random Failures', 5: 'Heat Dissipation Failure'}})

In [24]:
prev_test.to_csv('predicted.csv', index=False)