In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

  from pandas.core.computation.check import NUMEXPR_INSTALLED


1. Load csv data

In [5]:
# 1. Load data from a CSV
df = pd.read_csv('./Maternal Health Risk Data Set.csv')
X = df.drop('RiskLevel', axis=1)
y = df['RiskLevel']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

X_train.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
127,55,140,95,19.0,98.0,77
491,23,120,90,7.9,98.0,70
420,60,120,80,6.8,98.0,77
993,25,120,90,15.0,98.0,80
995,32,140,90,18.0,98.0,88


2. Load model and hyperparameter optimisation

In [7]:
# Get a random forest classifier from sklearn
rf = RandomForestClassifier()

# 2. Perform a hyperparameter optimisation using GridSearchCV along with bootstrapping
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 15, 20],
    'criterion': ['gini', 'entropy', 'log_loss']
}

CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
CV_rf.fit(X_train, y_train)

90 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\z004w55w\AppData\Local\mambaforge\envs\lovelace\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\z004w55w\AppData\Local\mambaforge\envs\lovelace\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\z004w55w\AppData\Local\mambaforge\envs\lovelace\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\z004w55w\AppData\Local\mambaforge\envs\love

3. Train

In [None]:
# 3. Train the model using the derived hyperparameters
best_params = CV_rf.best_params_
rf_best = RandomForestClassifier(**best_params)
rf_best.fit(X_train, y_train)

4. Test

In [None]:
# 4. Test it on the test set. Report all necessary metrics: accuracy, confusion matrix, etc.
y_pred = rf_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix: \n{conf_matrix}")
print(f"Classification Report: \n{report}")

5. Save

In [None]:
# 5. Save the model to be used in the future
joblib.dump(rf_best, 'random_forest_model.pkl')