In [None]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm

In [None]:
dataset = pd.read_csv('../input/machine-predictive-maintenance-classification/predictive_maintenance.csv')


In [None]:
dataset.describe()

In [None]:
dataset.groupby('Failure Type').count()

In [None]:
dataset.groupby('Failure Type').mean()

In [None]:
dataset.groupby(['Failure Type','Type']).mean()

### Developing the model:

#### Preparing the X and Y Values (fts).

#### Since we have a Type Column for the Quality, lets use the OHE in order to convert this data. 

In [None]:
enc = OneHotEncoder() 
X = pd.get_dummies(dataset, columns=['Type']).set_index('UDI') #Changing the Type using OHE and setting the Index to the UDI column. 

In [None]:
X = X.drop(['Failure Type','Product ID'], axis=1) # Droping the Columns not used anymore. 
                                                # Failure Type = OHE; Product ID = Product Identification
y = dataset[['Failure Type']] #Predicted Value. 

In [None]:
# Now our X dataset is ready to be used. 
X

# Random Forest Model:

In [None]:
# Splitting the dataset on Train x Test: [RandomState Seed = 42] and Test Size= 40%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=11)

## Selecting the n_estimators using some few tests:



In [None]:

scores = []
n = 100
for k in range(1,n):
    forest = RandomForestClassifier(n_estimators=k)
    forest.fit(X_train, y_train.values.ravel())
    y_pred_test = forest.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred_test))


# Plotting the Results and Reletionship (K x Accuracy):

plt.plot(range(1,n), scores)
plt.xlabel('Value of n_estimators for Random Forest Classifier')
plt.ylabel('Testing Accuracy')

# Display the Highest Value:
max_score = max(scores)
index = scores.index(max(scores))

print(f'The best score found was {max_score} using n_estimators ={index} in a range to 1. to {n} tests.')


### Applying the Random Forest Model:

In [None]:
# Importing the random forest classifier: 
forest = RandomForestClassifier(n_estimators= 49)

forest.fit(X_train, y_train)
y_pred_test = forest.predict(X_test)

In [None]:
#Accuracy Score:
accuracy_score(y_test, y_pred_test)

In [201]:
y_predicted = forest.predict(X_test)
cm = confusion_matrix(y_test,y_pred_test)
cm

array([[  45,    0,    0,    0,    0,    0],
       [   1, 3859,    0,    1,    0,    0],
       [   1,    0,   31,    1,    0,    0],
       [   0,    0,    1,   33,    0,    0],
       [   0,   12,    0,    0,    0,    0],
       [   0,    0,    1,    0,    0,   14]])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#reshaping the confusion matrix:
matrix = cm.astype('float') / cm.sum(axis=1) [:, np.newaxis]

# Plotting:

plt.figure(figsize=(14,7))
sns.set(font_scale = 1.4)
sn.heatmap(matrix, annot=True,fmt='g', annot_kws={'size':15}, cmap=plt.cm.Blues,linewidths=1)

# Adding the labels:

class_names =['Heat Dissipation Failure',
'No Failure',
'Overstrain Failure',
'Power Failure',
'Ramdom Failures',
'Tool Wear Failure',
]
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Random Forest Model: Machine Failure Prevention')
plt.show()

In [None]:
print(classification_report(y_test, y_pred_test))

 Based on the infos above, the biggest issue is under the Random Failure state, were it's not really clear where the failure is located, so the model failed to predict this failure type. 