# Interactive Random Forest Classifier on MNIST Dataset

This notebook demonstrates how to use a Random Forest Classifier on the MNIST dataset. Interactively tuning the hyperparameters will help you understand how the splitting criterion (entropy vs. Gini impurity) and the other settings affect model performance, with a focus on the principles of supervised learning, feature extraction, and classification boundaries.

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from ipywidgets import interact, IntSlider, Dropdown, ToggleButtons
import warnings
warnings.filterwarnings('ignore')

## Load and Preprocess the MNIST Dataset

In [None]:
# Load the MNIST dataset from OpenML as plain NumPy arrays (as_frame=False).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Cast the labels to small integers (the digits 0-9 fit in int8).
X, y = mnist.data, mnist.target.astype(np.int8)

# Split BEFORE scaling so the held-out test set never influences preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply that same transform to
# the test split. Fitting on the full dataset would leak test-set statistics
# (per-pixel mean and variance) into the training pipeline.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

## Define Function to Train, Evaluate the Model, and Visualize Results

In [None]:
def train_evaluate_rf(n_estimators, max_depth, max_features, criterion):
    """
    Train a Random Forest on the MNIST training split and report its performance.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    arrays. Prints the test accuracy and a classification report, and draws
    three figures: the confusion matrix, the ten most important features, and
    test accuracy as a function of the number of trees.

    Parameters:
    - n_estimators: Number of trees in the forest
    - max_depth: Maximum depth of each tree; 0 (or None) means unlimited depth
    - max_features: Maximum number of features considered for splitting.
      The legacy value 'auto' is accepted and treated as 'sqrt'.
    - criterion: Splitting criterion ('gini' or 'entropy')

    Returns:
    - None. All results are printed or plotted.
    """
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
    # classifiers it was an alias of 'sqrt', so map it to keep old callers working.
    if max_features == 'auto':
        max_features = 'sqrt'

    # Shared by the main model and the tree-count sweep below, so both describe
    # the same hyperparameter configuration.
    rf_params = dict(
        # 0 (and None) both mean "no depth limit".
        max_depth=max_depth if max_depth and max_depth > 0 else None,
        max_features=max_features,
        criterion=criterion,
        random_state=42,
        n_jobs=-1,
    )

    rf = RandomForestClassifier(n_estimators=n_estimators, **rf_params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Confusion matrix: rows are the true digits, columns the predicted digits.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Feature importance visualization: ten most important features, largest first.
    feature_importances = rf.feature_importances_
    top_idx = np.argsort(feature_importances)[-10:][::-1]
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(
        x=feature_importances[top_idx],
        # String labels are treated as categories in the given order; numeric
        # labels would be re-sorted by index, hiding the importance ranking.
        y=top_idx.astype(str),
        orient='h',
        palette='viridis',
        ax=ax,
    )
    ax.set_xlabel('Feature Importance')
    ax.set_ylabel('Feature Index')
    ax.set_title('Top 10 Feature Importances')
    plt.show()

    # Accuracy over a range of trees for visualization
    tree_counts = list(range(10, n_estimators + 1, 10))
    if not tree_counts:
        print("Fewer than 10 trees requested; skipping the accuracy-vs-trees plot.")
        return

    # warm_start=True keeps the already-fitted trees and only adds the missing
    # ones on each fit, so the sweep grows a single forest instead of retraining
    # a new forest from scratch for every tree count.
    sweep_rf = RandomForestClassifier(
        n_estimators=tree_counts[0], warm_start=True, **rf_params
    )
    accuracy_list = []
    for n in tree_counts:
        sweep_rf.set_params(n_estimators=n)
        sweep_rf.fit(X_train, y_train)
        accuracy_list.append(accuracy_score(y_test, sweep_rf.predict(X_test)))

    fig, ax = plt.subplots(figsize=(10, 6))
    # Markers keep the plot readable when the sweep has only a single point.
    sns.lineplot(x=tree_counts, y=accuracy_list, marker='o', ax=ax)
    ax.set_xlabel('Number of Trees')
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy vs. Number of Trees in Random Forest')
    plt.show()

## Interactive Widget for Hyperparameter Tuning

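Select the parameters below to see how different settings affect model performance. Consider how the choice of splitting criterion (Gini impurity vs. entropy) influences the splits each tree makes, and observe the changes in accuracy, the confusion matrix, the feature importances, and the effect of different numbers of trees. Setting `max_depth` to 0 removes the depth limit. Every parameter change retrains the forest on the full training set, so expect a delay before the outputs refresh.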

In [None]:
# continuous_update=False makes a slider fire only when it is released, so
# dragging it does not trigger a full retrain at every intermediate value.
n_estimators_slider = IntSlider(
    value=100,
    min=10,
    max=500,
    step=10,
    description='n_estimators',
    continuous_update=False
)

# A max_depth of 0 is interpreted by train_evaluate_rf as "no depth limit".
max_depth_slider = IntSlider(
    value=10,
    min=0,
    max=50,
    step=1,
    description='max_depth',
    continuous_update=False
)

# 'auto' is intentionally not offered: it was removed in scikit-learn 1.3.
# For classifiers it was equivalent to 'sqrt', which is the default here.
max_features_dropdown = Dropdown(
    options=['sqrt', 'log2'],
    value='sqrt',
    description='max_features'
)

criterion_toggle = ToggleButtons(
    options=['gini', 'entropy'],
    description='criterion',
    button_style='info'
)

# Re-runs train_evaluate_rf with the current widget values on every change.
# The trailing semicolon suppresses the repr of the returned object.
interact(
    train_evaluate_rf,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
    max_features=max_features_dropdown,
    criterion=criterion_toggle
);