In [21]:
# Importamos librerías.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
# sklearn.
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.preprocessing import label_binarize
# Hypertuning utilizando grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Oversampler.
from imblearn.over_sampling import  RandomOverSampler
datos_url_pc= "C://Users/maxib/Source/repos/proyecto_final_coder/datos/Base_predictive_maintenance.csv"

In [2]:
# Leer .csv y creo un DF.
df = pd.read_csv(datos_url_pc, delimiter=",", index_col= ["UDI", "Product_ID"])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Type,Air_temperature_[K],Process_temperature_[K],Rotational_speed_[rpm],Torque_[Nm],Tool_wear_[min],Target,Failure_Type
UDI,Product_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


# Definimos Variables 

In [3]:
# Definimos datos base.
data_x= df.drop(["Failure_Type", "Target"], axis=1)
data_y_ft= df["Failure_Type"]
data_y_t= df["Target"]
# Datos para testear.
data_test_x= pd.get_dummies(data_x)
data_test_y_ft_dum= pd.get_dummies(data_y_ft)

## Resampleo variable Failure type

In [4]:
# Oversampleamos variable Failure Type.
# Preparamos variable Target para entrenar los algoritmos.
X= data_x
y= data_y_ft
# Instantiating the random over sampler.
ros = RandomOverSampler(sampling_strategy={"Heat Dissipation Failure":1000,"Random Failures":1000,"Tool Wear Failure":1000,
                                            "Overstrain Failure":1000, "Power Failure":1000})
# Resampling X, y
X_ros_failure, y_ros_failure = ros.fit_resample(X, y)
X_ros_failure.info()
X_ros_failure["Type"].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14652 entries, 0 to 14651
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     14652 non-null  object 
 1   Air_temperature_[K]      14652 non-null  float64
 2   Process_temperature_[K]  14652 non-null  float64
 3   Rotational_speed_[rpm]   14652 non-null  int64  
 4   Torque_[Nm]              14652 non-null  float64
 5   Tool_wear_[min]          14652 non-null  int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 686.9+ KB


array(['M', 'L', 'H'], dtype=object)

In [5]:
# Adquerimos Dummies de Failure Type (Para Train)
data_y_dum_train= pd.get_dummies(y_ros_failure)
data_X= pd.get_dummies(X_ros_failure)
data_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14652 entries, 0 to 14651
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Air_temperature_[K]      14652 non-null  float64
 1   Process_temperature_[K]  14652 non-null  float64
 2   Rotational_speed_[rpm]   14652 non-null  int64  
 3   Torque_[Nm]              14652 non-null  float64
 4   Tool_wear_[min]          14652 non-null  int64  
 5   Type_H                   14652 non-null  uint8  
 6   Type_L                   14652 non-null  uint8  
 7   Type_M                   14652 non-null  uint8  
dtypes: float64(3), int64(2), uint8(3)
memory usage: 615.4 KB


# Randon Forest Hiper Failure Type

In [33]:
rf= RandomForestClassifier() # Instanciamos el modelo
# UNO VS EL RESTO
model_rf= OneVsRestClassifier(rf)

# Definimos los parámetros de búsqueda
params ={ 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
# metricas= {'f1_score': make_scorer(f1_score),
#            'precision_score': make_scorer(metrics.precision_score) ,
#            'recall_score': make_scorer(recall_score) }
grid_random_forest = GridSearchCV(estimator = rf,
                                  param_grid = params,
                                #   scoring = metricas,
                                #   refit=f1_score,
                                  cv = 5, 
                                  verbose = 1, # Muestra el resultado en pantalla
                                  n_jobs = -1) # corrida en paralelo

In [34]:
grid_random_forest.fit(data_X, data_y_dum_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]},
             verbose=1)

In [35]:
#Obtenemos el mejor modelo!
grid_random_forest.best_estimator_
print("Mejores parametros: "+str(grid_random_forest.best_params_))
print("Mejor Score: "+str(grid_random_forest.best_score_)+'\n')
scores = pd.DataFrame(grid_random_forest.cv_results_)
scores

Mejores parametros: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 200}
Mejor Score: 0.6489161755647237



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3.406869,0.233453,0.35437,0.043911,gini,4,auto,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.800409,0.227567,0.831058,0.394198,0.025256,0.455698,0.316489,60
1,9.101707,0.408669,1.041245,0.167723,gini,4,auto,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.816104,0.225861,0.833447,0.392491,0.023549,0.458291,0.321287,59
2,3.288677,0.089546,0.346948,0.023439,gini,4,sqrt,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.863187,0.239509,0.830375,0.388055,0.030375,0.4703,0.327895,54
3,9.011809,0.650962,1.011545,0.148974,gini,4,sqrt,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.844763,0.233026,0.833788,0.393857,0.02116,0.465319,0.327442,58
4,4.037557,0.288779,0.422597,0.053882,gini,4,log2,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.893893,0.224497,0.848464,0.389078,0.102389,0.491664,0.323274,50
5,10.063288,0.278312,0.980443,0.039619,gini,4,log2,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.896963,0.223814,0.848464,0.388737,0.104096,0.492415,0.323765,48
6,3.67977,0.254919,0.367214,0.03121,gini,5,auto,200,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.882293,0.25725,0.843003,0.389078,0.095904,0.493506,0.31563,47
7,9.268859,1.053534,1.120106,0.239922,gini,5,auto,500,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.881269,0.258274,0.84471,0.388737,0.098976,0.494393,0.314854,45
8,3.595292,0.105397,0.39631,0.059281,gini,5,sqrt,200,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.869669,0.262027,0.847782,0.389078,0.099659,0.493643,0.311966,46
9,10.134724,1.365828,1.165456,0.307628,gini,5,sqrt,500,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.873081,0.258274,0.846416,0.390785,0.091468,0.492005,0.315002,49


In [36]:
#Predicción de casos nuevos
# Prediccion en Train
y_train_pred= grid_random_forest.predict(data_X)
# Prediccion en Test
y_test_pred= grid_random_forest.predict(data_test_x)
# 
y_score= grid_random_forest.best_estimator_.predict_proba(data_test_x)
#Accuracy
print('Exactitud:', accuracy_score(data_test_y_ft_dum, y_test_pred))

Exactitud: 0.973


In [37]:
# Analisis score F1
print(f"F1 Score TEST del classificador macro: {f1_score(data_test_y_ft_dum, y_test_pred, average='macro')}")

print(f"F1 Score TEST del classificador micro: {f1_score(data_test_y_ft_dum, y_test_pred, average='micro')}")

print(f"F1 Score TEST del classificador weighted: {f1_score(data_test_y_ft_dum, y_test_pred, average='weighted')}")

print(f"F1 Score TEST del classificador: {f1_score(data_test_y_ft_dum, y_test_pred, average=None)}")
# Analisis jaccard
print(f"Jaccard Score TEST del classificador macro: {metrics.jaccard_score(data_test_y_ft_dum, y_test_pred, average='macro')}")

print(f"Jaccard Score TEST del classificador micro: {metrics.jaccard_score(data_test_y_ft_dum, y_test_pred, average='micro')}")

print(f"Jaccard Score TEST del classificador weighted: {metrics.jaccard_score(data_test_y_ft_dum, y_test_pred, average='weighted')}")


F1 Score TEST del classificador macro: 0.7723638369834158
F1 Score TEST del classificador micro: 0.9758299067295156
F1 Score TEST del classificador weighted: 0.9796009159998578
F1 Score TEST del classificador: [0.71565495 0.98583421 0.98113208 0.99470899 0.5        0.45685279]
Jaccard Score TEST del classificador macro: 0.6851834485708723
Jaccard Score TEST del classificador micro: 0.9528006267136702
Jaccard Score TEST del classificador weighted: 0.9633204605794284


In [38]:
metrics.classification_report(data_test_y_ft_dum, y_test_pred)

  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.56      1.00      0.72       112\n           1       1.00      0.97      0.99      9652\n           2       0.96      1.00      0.98        78\n           3       1.00      0.99      0.99        95\n           4       1.00      0.33      0.50        18\n           5       0.30      1.00      0.46        45\n\n   micro avg       0.98      0.97      0.98     10000\n   macro avg       0.80      0.88      0.77     10000\nweighted avg       0.99      0.97      0.98     10000\n samples avg       0.97      0.97      0.97     10000\n'