In [1]:
# Importamos librerías.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
# sklearn.
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.preprocessing import label_binarize
# Hypertuning utilizando grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Oversampler.
from imblearn.over_sampling import  RandomOverSampler
datos_url_pc= "C://Users/maxib/Source/repos/proyecto_final_coder/datos/Base_predictive_maintenance.csv"

In [2]:
# Leer .csv y creo un DF.
df = pd.read_csv(datos_url_pc, delimiter=",", index_col= ["UDI", "Product_ID"])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Type,Air_temperature_[K],Process_temperature_[K],Rotational_speed_[rpm],Torque_[Nm],Tool_wear_[min],Target,Failure_Type
UDI,Product_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


# Definimos Variables 

In [3]:
# Definimos datos base.
data_x= df.drop(["Failure_Type", "Target"], axis=1)
data_y_ft= df["Failure_Type"]
data_y_t= df["Target"]
# Datos para testear.
data_test_x= pd.get_dummies(data_x)
data_test_y_ft_dum= pd.get_dummies(data_y_ft)

## Resampleo variable Failure type

In [4]:
# Oversampleamos variable Failure Type.
# Preparamos variable Target para entrenar los algoritmos.
X= data_x
y= data_y_ft
# Instantiating the random over sampler.
ros = RandomOverSampler(sampling_strategy={"Heat Dissipation Failure":1000,"Random Failures":1000,"Tool Wear Failure":1000,
                                            "Overstrain Failure":1000, "Power Failure":1000})
# Resampling X, y
X_ros_failure, y_ros_failure = ros.fit_resample(X, y)
X_ros_failure.info()
X_ros_failure["Type"].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14652 entries, 0 to 14651
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     14652 non-null  object 
 1   Air_temperature_[K]      14652 non-null  float64
 2   Process_temperature_[K]  14652 non-null  float64
 3   Rotational_speed_[rpm]   14652 non-null  int64  
 4   Torque_[Nm]              14652 non-null  float64
 5   Tool_wear_[min]          14652 non-null  int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 686.9+ KB


array(['M', 'L', 'H'], dtype=object)

In [5]:
# Adquerimos Dummies de Failure Type (Para Train)
data_y_dum_train= pd.get_dummies(y_ros_failure)
data_X= pd.get_dummies(X_ros_failure)
data_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14652 entries, 0 to 14651
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Air_temperature_[K]      14652 non-null  float64
 1   Process_temperature_[K]  14652 non-null  float64
 2   Rotational_speed_[rpm]   14652 non-null  int64  
 3   Torque_[Nm]              14652 non-null  float64
 4   Tool_wear_[min]          14652 non-null  int64  
 5   Type_H                   14652 non-null  uint8  
 6   Type_L                   14652 non-null  uint8  
 7   Type_M                   14652 non-null  uint8  
dtypes: float64(3), int64(2), uint8(3)
memory usage: 615.4 KB


# Randon Forest Hiper Failure Type

In [11]:
rf= RandomForestClassifier() # Instanciamos el modelo
# UNO VS EL RESTO
model_rf= OneVsRestClassifier(rf)

# Definimos los parámetros de búsqueda
params ={ 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
grid_random_forest = GridSearchCV(estimator = rf,
                                  param_grid = params,
                                  scoring = 'neg_mean_absolute_error',
                                  cv = 5, 
                                  verbose = 1, # Muestra el resultado en pantalla
                                  n_jobs = -1) # corrida en paralelo

In [12]:
grid_random_forest.fit(data_X, data_y_dum_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]},
             scoring='neg_mean_absolute_error', verbose=1)

In [13]:
#Obtenemos el mejor modelo!
grid_random_forest.best_estimator_
print("Mejores parametros: "+str(grid_random_forest.best_params_))
print("Mejor Score: "+str(grid_random_forest.best_score_)+'\n')
scores = pd.DataFrame(grid_random_forest.cv_results_)
scores

Mejores parametros: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 200}
Mejor Score: -14.821693140176274



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3.789445,0.214618,0.383313,0.026464,gini,4,auto,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",-8.397532,-32.183953,-7.296815,-26.047668,-41.618885,-23.108971,13.416847,60
1,9.497965,0.220246,1.092762,0.065906,gini,4,auto,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",-8.005914,-32.517514,-7.036007,-25.683618,-41.641126,-22.976836,13.60107,59
2,3.770816,0.196668,0.457236,0.068614,gini,4,sqrt,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",-5.396679,-32.198965,-7.050683,-25.844027,-41.660808,-22.430232,14.167354,54
3,9.614704,0.349447,1.133188,0.049162,gini,4,sqrt,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",-7.454737,-32.387297,-7.094482,-26.046359,-41.66058,-22.928691,13.713017,58
4,4.558791,0.348629,0.440668,0.030525,gini,4,log2,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",-4.766064,-32.726942,-6.59215,-26.034983,-38.654778,-21.754983,13.732008,52
5,12.398757,0.424882,1.042988,0.028576,gini,4,log2,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",-4.447003,-33.000171,-6.621331,-26.022127,-38.522924,-21.722711,13.816041,50
6,4.662121,0.323076,0.481468,0.061806,gini,5,auto,200,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",-5.895201,-31.681679,-6.894824,-26.002844,-38.191297,-21.733169,13.107942,51
7,10.989257,0.381824,1.018022,0.038826,gini,5,auto,500,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",-5.617252,-31.261401,-6.634016,-25.959841,-38.692435,-21.632989,13.296057,47
8,4.261568,0.358303,0.452124,0.060116,gini,5,sqrt,200,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",-5.834129,-31.376834,-6.909841,-25.9,-38.41132,-21.686425,13.122675,48
9,9.868501,0.308254,1.081138,0.105982,gini,5,sqrt,500,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",-5.561299,-31.26106,-6.749602,-25.944141,-38.439306,-21.591082,13.217744,46


In [19]:
#Predicción de casos nuevos
# Prediccion en Train
y_train_pred= grid_random_forest.predict(data_X)
# Prediccion en Test
y_test_pred= grid_random_forest.predict(data_test_x)
# 
y_score= grid_random_forest.best_estimator_.predict_proba(data_test_x)
#Accuracy
print('Exactitud:', accuracy_score(data_test_y_ft_dum, y_test_pred))

Exactitud: 0.9714


In [29]:
# Analisis score F1
print(f"F1 Score TEST del classificador macro: {f1_score(data_test_y_ft_dum, y_test_pred, average='macro')}")

print(f"F1 Score TEST del classificador micro: {f1_score(data_test_y_ft_dum, y_test_pred, average='micro')}")

print(f"F1 Score TEST del classificador weighted: {f1_score(data_test_y_ft_dum, y_test_pred, average='weighted')}")

print(f"F1 Score TEST del classificador: {f1_score(data_test_y_ft_dum, y_test_pred, average=None)}")
# Analisis jaccard
print(f"Jaccard Score TEST del classificador macro: {metrics.jaccard_score(data_test_y_ft_dum, y_test_pred, average='macro')}")

print(f"Jaccard Score TEST del classificador micro: {metrics.jaccard_score(data_test_y_ft_dum, y_test_pred, average='micro')}")

print(f"Jaccard Score TEST del classificador weighted: {metrics.jaccard_score(data_test_y_ft_dum, y_test_pred, average='weighted')}")


F1 Score TEST del classificador macro: 0.7656481635822555
F1 Score TEST del classificador micro: 0.9743718340939866
F1 Score TEST del classificador weighted: 0.978585252022904
F1 Score TEST del classificador: [0.71111111 0.9850323  0.96296296 1.         0.5        0.43478261]
Jaccard Score TEST del classificador macro: 0.6769854552723441
Jaccard Score TEST del classificador micro: 0.9500244498777506
Jaccard Score TEST del classificador weighted: 0.961504610828262


In [30]:
metrics.classification_report(data_test_y_ft_dum, y_test_pred)

  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.55      1.00      0.71       112\n           1       1.00      0.97      0.99      9652\n           2       0.93      1.00      0.96        78\n           3       1.00      1.00      1.00        95\n           4       1.00      0.33      0.50        18\n           5       0.28      1.00      0.43        45\n\n   micro avg       0.98      0.97      0.97     10000\n   macro avg       0.79      0.88      0.77     10000\nweighted avg       0.99      0.97      0.98     10000\n samples avg       0.97      0.97      0.97     10000\n'