#### Comparing Random Forests and Histogram Gradient Boosting models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier,  HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import plotly.colors as colors
import plotly.express as px
from plotly.subplots import make_subplots


In [2]:
df = pd.read_csv('./Maternal Health Risk Data Set.csv')
X = df.drop('RiskLevel', axis=1)
y = df['RiskLevel']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

X_train.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
127,55,140,95,19.0,98.0,77
491,23,120,90,7.9,98.0,70
420,60,120,80,6.8,98.0,77
993,25,120,90,15.0,98.0,80
995,32,140,90,18.0,98.0,88


In [3]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Hist Gradient Boosting": HistGradientBoostingClassifier(),
}
param_grids = {
    "Random Forest": {
        'n_estimators': [100, 200, 500], 
        'max_features': ['sqrt', 'log2'],
        'max_depth': [10, 15, 20],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'n_jobs': [2]
    },
    "Hist Gradient Boosting": {
        'max_iter': [100, 200, 500, 1000, 3000],
        'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7]
        },
}
cv = KFold(n_splits=4, shuffle=True, random_state=7)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        verbose=1,
        n_jobs=4
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_params": grid_search.best_params_, "best_score": grid_search.best_score_}
    results.append(result)

Fitting 4 folds for each of 54 candidates, totalling 216 fits
Fitting 4 folds for each of 25 candidates, totalling 100 fits


In [4]:
fig = make_subplots(
    rows=1,
    cols=2,
    shared_yaxes=True,
    subplot_titles=["Train time vs score", "Predict time vs score"],
)
model_names = [result["model"] for result in results]
colors_list = colors.qualitative.Plotly * (
    len(model_names) // len(colors.qualitative.Plotly) + 1
)

for idx, result in enumerate(results):
    cv_results = result["cv_results"].round(3)
    model_name = result["model"]
    param_name = list(param_grids[model_name].keys())[0]
    cv_results[param_name] = cv_results["param_" + param_name]
    cv_results["model"] = model_name

    scatter_fig = px.scatter(
        cv_results,
        x="mean_fit_time",
        y="mean_test_score",
        error_x="std_fit_time",
        error_y="std_test_score",
        hover_data=param_name,
        color="model",
    )
    line_fig = px.line(
        cv_results,
        x="mean_fit_time",
        y="mean_test_score",
    )

    scatter_trace = scatter_fig["data"][0]
    line_trace = line_fig["data"][0]
    scatter_trace.update(marker=dict(color=colors_list[idx]))
    line_trace.update(line=dict(color=colors_list[idx]))
    fig.add_trace(scatter_trace, row=1, col=1)
    fig.add_trace(line_trace, row=1, col=1)

    scatter_fig = px.scatter(
        cv_results,
        x="mean_score_time",
        y="mean_test_score",
        error_x="std_score_time",
        error_y="std_test_score",
        hover_data=param_name,
    )
    line_fig = px.line(
        cv_results,
        x="mean_score_time",
        y="mean_test_score",
    )

    scatter_trace = scatter_fig["data"][0]
    line_trace = line_fig["data"][0]
    scatter_trace.update(marker=dict(color=colors_list[idx]))
    line_trace.update(line=dict(color=colors_list[idx]))
    fig.add_trace(scatter_trace, row=1, col=2)
    fig.add_trace(line_trace, row=1, col=2)

fig.update_layout(
    xaxis=dict(title="Train time (s) - lower is better"),
    yaxis=dict(title="Test R2 score - higher is better"),
    xaxis2=dict(title="Predict time (s) - lower is better"),
    legend=dict(x=0.72, y=0.05, traceorder="normal", borderwidth=1),
    title=dict(x=0.5, text="Speed-score trade-off of tree-based ensembles"),
)

In [5]:
# print scores of each model
for result in results:
    print(f"Model: {result['model']}")
    print(f" - Best score: {result['best_score']}")
    print(f" - Best params: {result['best_params']}\n")

Model: Random Forest
 - Best score: 0.8421851171765585
 - Best params: {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'n_estimators': 500, 'n_jobs': 2}

Model: Hist Gradient Boosting
 - Best score: 0.8382403286545704
 - Best params: {'learning_rate': 0.3, 'max_iter': 1000}



In [6]:
for result in results:
    model_name = result['model']
    
    if model_name == 'Random Forest':
        model = RandomForestClassifier(**result['best_params'])
    else:
        model = HistGradientBoostingClassifier(**result['best_params'])
    
    # Fit the model on the training set
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Output metrics
    print(f"Model: {model_name}")
    print(f" - Accuracy: {accuracy}")
    print(f" - Confusion Matrix:\n{conf_matrix}\n")
    print(f" - Classification Report:\n{report}\n")
    
    # Save
    joblib.dump(model, f"{model_name}.joblib")

Model: Random Forest
 - Accuracy: 0.8916256157635468
 - Confusion Matrix:
[[50  2  5]
 [ 2 69  7]
 [ 1  5 62]]

 - Classification Report:
              precision    recall  f1-score   support

   high risk       0.94      0.88      0.91        57
    low risk       0.91      0.88      0.90        78
    mid risk       0.84      0.91      0.87        68

    accuracy                           0.89       203
   macro avg       0.90      0.89      0.89       203
weighted avg       0.89      0.89      0.89       203


Model: Hist Gradient Boosting
 - Accuracy: 0.8719211822660099
 - Confusion Matrix:
[[50  3  4]
 [ 2 65 11]
 [ 1  5 62]]

 - Classification Report:
              precision    recall  f1-score   support

   high risk       0.94      0.88      0.91        57
    low risk       0.89      0.83      0.86        78
    mid risk       0.81      0.91      0.86        68

    accuracy                           0.87       203
   macro avg       0.88      0.87      0.88       203
weighte