In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

In [None]:
# --- Data preparation -------------------------------------------------------
# Read the CSV and, in this exact order: drop the two identifier columns,
# one-hot encode 'Type', then drop the five remaining non-feature columns.
id_columns = ['UDI', 'Product ID']
# NOTE(review): presumably per-failure-mode indicator columns that would leak
# the target if kept as features -- confirm against the dataset description.
failure_mode_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
target_column = 'Machine failure'

df_maint = (
    pd.read_csv("ai4i2020.csv")
    .drop(columns=id_columns)
    .pipe(pd.get_dummies, columns=['Type'])
    .drop(columns=failure_mode_columns)
)

# Separate the label from the feature matrix (df_maint itself is not modified).
y = df_maint[target_column]
X = df_maint.drop(columns=target_column)

# --- Train / test split -----------------------------------------------------
# 20% of the rows are held out; stratifying on y keeps the label ratio the
# same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Hyperparameter search --------------------------------------------------
# Candidate values from which the randomized search draws its configurations.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt', 'log2', None],
)

rf = RandomForestClassifier(random_state=42)

# 10 sampled configurations, each scored by F1 under 5-fold cross-validation,
# using all available cores.
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)

# --- Evaluation on the held-out split ---------------------------------------
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

In [None]:
import joblib

# Persist the tuned random forest (best_model from the search cell) to disk so
# it can be reloaded later without retraining. The call is left as the final
# expression so the notebook still displays the list of written file names.
joblib.dump(value=best_model, filename='failure_prediction_model.joblib')

In [None]:
# Later on, in the deployment environment, reload the saved model:
# model_loaded = joblib.load('failure_prediction_model.joblib')