In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

  from pandas.core.computation.check import NUMEXPR_INSTALLED


1. Load csv data

In [2]:
# 1. Load data from a CSV
df = pd.read_csv('./Maternal Health Risk Data Set.csv')
X = df.drop('RiskLevel', axis=1)
y = df['RiskLevel']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

X_train.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
127,55,140,95,19.0,98.0,77
491,23,120,90,7.9,98.0,70
420,60,120,80,6.8,98.0,77
993,25,120,90,15.0,98.0,80
995,32,140,90,18.0,98.0,88


2. Load model and hyperparameter optimisation

In [3]:
# Get a random forest classifier from sklearn
rf = RandomForestClassifier()

# 2. Perform a hyperparameter optimisation using GridSearchCV along with bootstrapping
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 15, 20],
    'criterion': ['gini', 'entropy', 'log_loss']
}

CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, verbose=1)
CV_rf.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [4]:
# Print the best hyperparameters found by GridSearchCV
print("Best hyperparameters:", CV_rf.best_params_)

Best hyperparameters: {'criterion': 'log_loss', 'max_depth': 15, 'max_features': 'log2', 'n_estimators': 100}


3. Train

In [5]:
# 3. Train the model using the derived hyperparameters
best_params = CV_rf.best_params_
rf_best = RandomForestClassifier(**best_params)
rf_best.fit(X_train, y_train)

4. Test

In [6]:
# 4. Test it on the test set. Report all necessary metrics: accuracy, confusion matrix, etc.
y_pred = rf_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"\nConfusion Matrix: \n{conf_matrix}")
print(f"\nClassification Report: \n{report}")

Accuracy: 0.8768472906403941

Confusion Matrix: 
[[50  3  4]
 [ 2 67  9]
 [ 1  6 61]]

Classification Report: 
              precision    recall  f1-score   support

   high risk       0.94      0.88      0.91        57
    low risk       0.88      0.86      0.87        78
    mid risk       0.82      0.90      0.86        68

    accuracy                           0.88       203
   macro avg       0.88      0.88      0.88       203
weighted avg       0.88      0.88      0.88       203



5. Save

In [7]:
# 5. Save the model to be used in the future
joblib.dump(rf_best, 'random_forest_model.pkl')

['random_forest_model.pkl']