In [10]:
import os

import mlflow
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

In [2]:
# Show the kernel's working directory; the relative "../artifacts/..." paths
# used by the loading cells below are resolved against it.
%pwd

'/local/home/hadjmefm/Predictive-Maintenance-System/notebooks'

In [3]:
# Read the training split from the artifacts directory and preview its first rows.
train_csv_path = "../artifacts/train.csv"
train = pd.read_csv(train_csv_path)
train.head(n=5)

Unnamed: 0,Type,Machine failure,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],type_of_failure
0,0.0,1,0.118955,0.742842,0.762951,0.659044,0.639374,1
1,0.0,0,0.32596,0.383242,0.525692,0.554348,0.703704,5
2,2.0,0,0.103609,0.699176,0.774704,0.728261,0.604938,5
3,0.0,0,0.159445,0.643389,0.524459,0.45749,0.444252,3
4,1.0,0,0.088475,0.802198,0.561265,0.543478,0.469136,5


In [4]:
# Read the held-out test split from the artifacts directory and preview its first rows.
test_csv_path = "../artifacts/test.csv"
test = pd.read_csv(test_csv_path)
test.head(n=5)

Unnamed: 0,Type,Machine failure,Rotational speed [rpm],Torque [Nm],Tool wear [min],Air temperature [c],Process temperature [c],type_of_failure
0,0.0,0,0.17458,0.640162,0.367495,0.458352,0.550464,3
1,0.0,0,0.186263,0.531593,0.577075,0.195652,0.333333,5
2,0.0,0,0.118743,0.623626,0.770751,0.532609,0.728395,5
3,0.0,1,0.109028,0.647565,0.74657,0.861843,0.721412,0
4,0.0,1,0.123728,0.863708,0.04103,0.275379,0.44048,2


## 1- Predict Machine Failure 

In [16]:
# Both label columns are removed from the feature matrices; only
# "Machine failure" is used as the target in this section.
label_cols = ["Machine failure", "type_of_failure"]

x_train, y_train = train.drop(columns=label_cols), train["Machine failure"]
x_test, y_test = test.drop(columns=label_cols), test["Machine failure"]

In [17]:
# Hyper-parameters hard-coded from the "Best parameters found" output of the
# grid-search cell in this notebook.
best_params = {
    'max_depth': 15,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 50
}

# Build the forest directly from the dict; the seed is fixed at 42.
model = RandomForestClassifier(random_state=42, **best_params)

# 5-fold cross-validation on the training split (estimator's default scorer).
cv_scores = cross_val_score(model, x_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Fit on the whole training split.
model.fit(x_train, y_train)

# Accuracy on the data the model was fitted on, and on the test split.
# Note that the figure printed as "Validation accuracy" is computed on
# x_test / y_test (test.csv).
train_accuracy = model.score(x_train, y_train)
test_predictions = model.predict(x_test)
val_accuracy = accuracy_score(y_test, test_predictions)

print("Training accuracy:", train_accuracy)
print("Validation accuracy:", val_accuracy)

Cross-validation scores: [0.99050291 0.99158213 0.99276926 0.99136629 0.98985429]
Mean cross-validation score: 0.9912149763498885
Training accuracy: 0.9959420665242072
Validation accuracy: 0.9897263230596564


### Default RandomForest Model

In [6]:
# Baseline: a RandomForest with default hyper-parameters, tracked in MLflow.
# The seed is fixed so the logged metrics are reproducible across re-runs and
# comparable with the models built from best_params, which use random_state=42.
rf = RandomForestClassifier(random_state=42)
mlflow.set_experiment("Faillure machine Model Evaluation ")

# The context manager closes the run on exit (including on error), so no
# explicit mlflow.end_run() is required afterwards.
with mlflow.start_run():
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)

    # Accuracy on the training split and on the test split, plus the
    # precision / recall / F1 of the test-split predictions.
    train_acc = rf.score(x_train, y_train)
    eval_acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log every metric in a single call; the keys are unchanged.
    mlflow.log_metrics(
        {
            "train_acc": train_acc,
            "eval_acc": eval_acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1,
        }
    )
    mlflow.sklearn.log_model(rf, "default_randomforest_model")

print("training accuracy", train_acc)
print("eval accuracy", eval_acc)

  

2024/07/22 09:03:13 INFO mlflow.tracking.fluent: Experiment with name 'Faillure machine Model Evaluation ' does not exist. Creating a new experiment.


training accuracy 1.0
eval accuracy 0.9917119917119918


**The 100% training accuracy strongly indicates model overfitting.**

**This conclusion is further supported by the evaluation accuracy (≈ 99.2%) being lower than the training accuracy, although the gap is small (less than one percentage point).**

### RandomForest with Hyperparameter Tuning

In [7]:
# Load the transformed dataset and separate the failure target from the features.
data = pd.read_csv("../artifacts/transformed_data.csv")
y = data["Machine failure"]
X = data.drop(columns=["Machine failure", "type_of_failure"])

In [8]:


# Exhaustive search over 3 x 3 x 3 x 3 = 81 RandomForest configurations,
# each evaluated with 5-fold cross-validation (405 fits in total).
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [50, 100, 200]
}

# A fixed seed makes the search reproducible and keeps it consistent with the
# models later rebuilt from the winning parameters (random_state=42).
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# NOTE(review): the search is fitted on x_train / y_train (from train.csv),
# not on the X / y loaded from transformed_data.csv — confirm this is intended.
#
# Everything is logged inside an explicit run so that the run is closed when
# the block exits instead of being started implicitly and left active.
with mlflow.start_run() as run:
    grid_search.fit(x_train, y_train)
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_cv_score", grid_search.best_score_)
    mlflow.sklearn.log_model(grid_search.best_estimator_, "best randomforest model")

print("Best parameters found: ", grid_search.best_params_)
# run_id replaces the deprecated run_uuid attribute; the value is the same.
print("Model saved in run %s" % run.info.run_id)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.8s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.9s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.0s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.1s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=5, min_samples_leaf=1, m

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Model saved in run bcb5501b4d314521bfc6372d6c954983


### Model with best params

In [11]:

# Hold out 20% of the transformed dataset (X, y) for validation.
# NOTE(review): this rebinds y_train, which earlier cells pair with x_train
# (built from train.csv). Re-running those cells after this one would pair
# x_train with labels from a different split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Hyper-parameters hard-coded from the "Best parameters found" output of the
# grid search.
best_params = {
    'max_depth': 15,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 50
}

# Build the forest directly from the dict; the seed is fixed at 42.
model = RandomForestClassifier(random_state=42, **best_params)

# 5-fold cross-validation on the training part of the split.
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Fit on the whole training part.
model.fit(X_train, y_train)

# Accuracy on the fitted data and on the held-out 20%.
train_accuracy = model.score(X_train, y_train)
val_predictions = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)

print("Training accuracy:", train_accuracy)
print("Validation accuracy:", val_accuracy)

Cross-validation scores: [0.99007123 0.99136629 0.99255342 0.99061084 0.99114949]
Mean cross-validation score: 0.9911502514458734
Training accuracy: 0.996093159791923
Validation accuracy: 0.9902443235776569


1. **The cross-validation scores are consistently high, indicating stable model performance across different subsets of the data.**
2. **The mean cross-validation score is very close to the individual fold scores, reinforcing the model's reliability.**
3. **The training accuracy is very high, suggesting effective learning from the training data.**
4. **The validation accuracy is also high, indicating good generalization to new, unseen data.**

In [21]:
# classification_report and roc_auc_score are imported in the notebook's
# top-level import cell.

# Hard class predictions for the per-class report.
y_val_pred = model.predict(X_val)

# Probability of the second class in model.classes_ (label 1 in the report
# below), which is the score ROC-AUC is computed from.
y_pred_proba = model.predict_proba(X_val)[:, 1]

# Classification report
print(classification_report(y_val, y_val_pred))

# ROC-AUC score
roc_auc = roc_auc_score(y_val, y_pred_proba)
print("ROC-AUC Score:", roc_auc)

              precision    recall  f1-score   support

           0       1.00      0.97      0.99      3873
           1       0.99      1.00      0.99      7710

    accuracy                           0.99     11583
   macro avg       0.99      0.99      0.99     11583
weighted avg       0.99      0.99      0.99     11583

ROC-AUC Score: 0.9995589204988609
