In [1]:
# Import the required libraries

import numpy as np
import pandas as pd
from keras.datasets import mnist
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import mlflow
import mlflow.sklearn

In [2]:
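# Optional one-time setup: uncomment if keras / tensorflow are not installed.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
#%pip install keras
#%pip install tensorflow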

In [3]:
# Fetch MNIST; load_data() yields a (train, test) pair of (images, labels) tuples
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [4]:
# RandomForest expects a 2-D feature matrix, so every 28x28 image is
# unrolled into a single row of 784 pixel features.
x_train_flat = np.reshape(x_train, (-1, 784))
x_test_flat = np.reshape(x_test, (-1, 784))

# Rescale raw pixel intensities from the 0-255 range down to 0-1.
X_train = np.true_divide(x_train_flat, 255.0)
X_test = np.true_divide(x_test_flat, 255.0)


In [5]:
## Configure MLflow before any run is started.
# The tracking URI and the experiment have to be selected *before* the first
# `mlflow.start_run()`; otherwise, when the notebook is executed top to bottom,
# that run is recorded under whatever default location/experiment is active.
# NOTE: an MLflow tracking server must be reachable at this address.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("Random Forest Hyperparameter Tuning on mnist")

## enable autologging
# Kept last so the cell produces no output (autolog() returns None).
mlflow.sklearn.autolog()

In [6]:
# Search space handed to GridSearchCV: 3 values for each of 4 hyperparameters,
# i.e. 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    # bootstrap=[True, False],  # currently excluded from the search
)

In [7]:
# Initialize the Random Forest Classifier.
# random_state pins the forest's randomness so repeated fits are reproducible.
# n_jobs=-1 builds the trees of each forest on all available cores; without it
# every one of the 405 fits (81 candidates x 5 folds) runs single-threaded.
# The fitted model does not depend on n_jobs when random_state is fixed.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Set up GridSearchCV (5-fold cross validation, candidates ranked by accuracy).
# Parallelism is applied inside the forest rather than across candidates, so
# the search itself keeps its default n_jobs and the two levels are not nested.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)

In [12]:
# Everything inside this block is recorded in one MLflow run, stored under
# whichever tracking URI / experiment is active when the cell executes.
with mlflow.start_run():
    # Exhaustive search over param_grid on the training split
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Record the searched grid together with the winning combination
    mlflow.log_param("param_grid", param_grid)
    mlflow.log_params(best_params)

    # Score the selected model on the held-out test split and record it
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("test_accuracy", test_accuracy)

    # Store the selected model as an artifact of this run
    mlflow.sklearn.log_model(best_model, "best_random_forest_model")

    # Human-readable summary in the cell output
    print("Best Hyperparameters:", best_params)
    print("Best Cross-Validated Accuracy:", grid_search.best_score_)
    print("Test Accuracy:", test_accuracy)

# Leaving the `with` block ends the MLflow run

Fitting 5 folds for each of 81 candidates, totalling 405 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2025/02/12 01:47:42 INFO mlflow.sklearn.utils: Logging the 5 best runs, 76 runs will be omitted.


Best Hyperparameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Cross-Validated Accuracy: 0.9679666666666668
Test Accuracy: 0.972


In [8]:
# Point MLflow at the tracking server that should receive runs logged after
# this cell (presumably a locally started MLflow server — confirm it is up).
TRACKING_URI = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(TRACKING_URI)

In [9]:
# Select the experiment under which subsequent runs are grouped.
# This does not start a run by itself; the returned Experiment is displayed
# as the cell output.
mlflow.set_experiment(
    experiment_name="Random Forest Hyperparameter Tuning on mnist"
)

<Experiment: artifact_location='mlflow-artifacts:/253364769243853118', creation_time=1739379148662, experiment_id='253364769243853118', last_update_time=1739379148662, lifecycle_stage='active', name='Random Forest Hyperparameter Tuning on mnist', tags={}>

In [None]:
# NOTE(review): this cell performs the same search-and-log steps as an earlier
# cell in this notebook; executing both fits the full grid (405 fits) twice.
with mlflow.start_run():
    # Run the grid search on the training data
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Log the search space and the best parameter combination
    mlflow.log_param("param_grid", param_grid)
    mlflow.log_params(best_params)

    # Test-set evaluation of the selected model
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("test_accuracy", test_accuracy)

    # Attach the selected model to the run as an artifact
    mlflow.sklearn.log_model(best_model, "best_random_forest_model")

    # Print the outcome of the search
    print("Best Hyperparameters:", best_params)
    print("Best Cross-Validated Accuracy:", grid_search.best_score_)
    print("Test Accuracy:", test_accuracy)

# The MLflow run is closed once the `with` block exits

Fitting 5 folds for each of 81 candidates, totalling 405 fits
