In [1]:
# Importamos librerías.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
# sklearn.
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.preprocessing import label_binarize
# Hypertuning utilizando grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Oversampler.
from imblearn.over_sampling import  RandomOverSampler
datos_url_pc= "C://Users/maxib/Source/repos/proyecto_final_coder/datos/Base_predictive_maintenance.csv"

In [2]:
# Leer .csv y creo un DF.
df = pd.read_csv(datos_url_pc, delimiter=",", index_col= ["UDI", "Product_ID"])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Type,Air_temperature_[K],Process_temperature_[K],Rotational_speed_[rpm],Torque_[Nm],Tool_wear_[min],Target,Failure_Type
UDI,Product_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


# Definimos Variables 

In [3]:
# Definimos datos base.
data_x= df.drop(["Failure_Type", "Target"], axis=1)
data_y_ft= df["Failure_Type"]
data_y_t= df["Target"]
# Datos para testear.
data_test_x= pd.get_dummies(data_x)
data_test_y_ft_dum= pd.get_dummies(data_y_ft)

## Resampleo variable Failure type

In [4]:
# Oversampleamos variable Failure Type.
# Preparamos variable Target para entrenar los algoritmos.
X= data_x
y= data_y_ft
# Instantiating the random over sampler.
ros = RandomOverSampler(sampling_strategy={"Heat Dissipation Failure":1000,"Random Failures":1000,"Tool Wear Failure":1000,
                                            "Overstrain Failure":1000, "Power Failure":1000})
# Resampling X, y
X_ros_failure, y_ros_failure = ros.fit_resample(X, y)
X_ros_failure.info()
X_ros_failure["Type"].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14652 entries, 0 to 14651
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     14652 non-null  object 
 1   Air_temperature_[K]      14652 non-null  float64
 2   Process_temperature_[K]  14652 non-null  float64
 3   Rotational_speed_[rpm]   14652 non-null  int64  
 4   Torque_[Nm]              14652 non-null  float64
 5   Tool_wear_[min]          14652 non-null  int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 686.9+ KB


array(['M', 'L', 'H'], dtype=object)

In [5]:
# Adquerimos Dummies de Failure Type (Para Train)
data_y_dum_train= pd.get_dummies(y_ros_failure)
data_X= pd.get_dummies(X_ros_failure)
data_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14652 entries, 0 to 14651
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Air_temperature_[K]      14652 non-null  float64
 1   Process_temperature_[K]  14652 non-null  float64
 2   Rotational_speed_[rpm]   14652 non-null  int64  
 3   Torque_[Nm]              14652 non-null  float64
 4   Tool_wear_[min]          14652 non-null  int64  
 5   Type_H                   14652 non-null  uint8  
 6   Type_L                   14652 non-null  uint8  
 7   Type_M                   14652 non-null  uint8  
dtypes: float64(3), int64(2), uint8(3)
memory usage: 615.4 KB


# Randon Forest Hiper Failure Type

In [34]:
rf= RandomForestClassifier() # Instanciamos el modelo
# UNO VS EL RESTO
model_rf= OneVsRestClassifier(rf)

# Definimos los parámetros de búsqueda
params ={ 
    'n_estimators': [200,500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [6,7,8],
    'criterion' :['entropy']
}

# metricas= {'f1_score': make_scorer(f1_score),
#            'precision_score': make_scorer(metrics.precision_score) ,
#            'recall_score': make_scorer(recall_score) }
grid_random_forest = GridSearchCV(estimator = rf,
                                  param_grid = params,
                                  scoring = "f1",
                                #   refit=f1_score,
                                  cv = 5, 
                                  verbose = 1, # Muestra el resultado en pantalla
                                  n_jobs = -1) # corrida en paralelo

In [7]:
metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [35]:
grid_random_forest.fit(data_X, data_y_dum_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [6, 7, 8],
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': [200, 500]},
             scoring='f1', verbose=1)

In [36]:
#Obtenemos el mejor modelo!
grid_random_forest.best_estimator_
print("Mejores parametros: "+str(grid_random_forest.best_params_))
scores = pd.DataFrame(grid_random_forest.cv_results_)
scores

Mejores parametros: {'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 200}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.949825,0.150125,0.419797,0.030478,entropy,6,sqrt,200,"{'criterion': 'entropy', 'max_depth': 6, 'max_...",,,,,,,,1
1,12.159972,0.258612,1.010016,0.052901,entropy,6,sqrt,500,"{'criterion': 'entropy', 'max_depth': 6, 'max_...",,,,,,,,2
2,5.714398,0.175937,0.419221,0.027475,entropy,6,log2,200,"{'criterion': 'entropy', 'max_depth': 6, 'max_...",,,,,,,,3
3,14.534621,0.594181,1.00264,0.013091,entropy,6,log2,500,"{'criterion': 'entropy', 'max_depth': 6, 'max_...",,,,,,,,4
4,5.090355,0.192099,0.447962,0.093467,entropy,7,sqrt,200,"{'criterion': 'entropy', 'max_depth': 7, 'max_...",,,,,,,,5
5,13.223,0.303697,1.198428,0.197627,entropy,7,sqrt,500,"{'criterion': 'entropy', 'max_depth': 7, 'max_...",,,,,,,,6
6,6.137316,0.17418,0.40831,0.040239,entropy,7,log2,200,"{'criterion': 'entropy', 'max_depth': 7, 'max_...",,,,,,,,7
7,15.634083,0.311816,1.009599,0.044494,entropy,7,log2,500,"{'criterion': 'entropy', 'max_depth': 7, 'max_...",,,,,,,,8
8,5.445685,0.199707,0.401585,0.041915,entropy,8,sqrt,200,"{'criterion': 'entropy', 'max_depth': 8, 'max_...",,,,,,,,9
9,13.322622,0.190251,1.009037,0.035152,entropy,8,sqrt,500,"{'criterion': 'entropy', 'max_depth': 8, 'max_...",,,,,,,,10


In [37]:
#Predicción de casos nuevos
# Prediccion en Train
y_train_pred= grid_random_forest.predict(data_X)
# Prediccion en Test
y_test_pred= grid_random_forest.predict(data_test_x)
# 
y_score= grid_random_forest.best_estimator_.predict_proba(data_test_x)
#Accuracy
print('Exactitud:', accuracy_score(data_test_y_ft_dum, y_test_pred))

Exactitud: 0.9407


In [40]:
print(metrics.classification_report(data_test_y_ft_dum, y_test_pred))

              precision    recall  f1-score   support

           0       0.57      0.93      0.71       112
           1       1.00      0.94      0.97      9652
           2       0.75      0.92      0.83        78
           3       0.84      0.79      0.82        95
           4       1.00      0.22      0.36        18
           5       0.37      0.76      0.50        45

   micro avg       0.98      0.94      0.96     10000
   macro avg       0.75      0.76      0.70     10000
weighted avg       0.99      0.94      0.96     10000
 samples avg       0.94      0.94      0.94     10000



  _warn_prf(average, modifier, msg_start, len(result))
