Random Forest, CatBoost

## 1. Imports

In [22]:
import pandas as pd
import numpy as np
import joblib
import os

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, confusion_matrix,roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import shap

## 2. Import Data

In [3]:
# Read the pre-split feature tables (X) and label tables (y) for the train and test sets.
train_X, train_y = map(pd.read_csv, ("../data/train/X_train.csv", "../data/train/y_train.csv"))
test_X, test_y = map(pd.read_csv, ("../data/test/X_test.csv", "../data/test/y_test.csv"))

## 3. Metrics functions

In [5]:
# Function to calculate specificity
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate): TN / (TN + FP).

    Label ``1`` is treated as the positive class, matching the default
    ``pos_label`` of the scikit-learn precision/recall scorers used next to
    this function; every other label counts as negative.

    Counting is done directly with NumPy instead of unpacking a confusion
    matrix, so the function also works when only one class is present in the
    inputs (a 1x1 confusion matrix cannot be unpacked into four values).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Flattened to 1-D, so a single-column DataFrame
        is accepted.
    y_pred : array-like
        Predicted labels, flattened to 1-D; must have as many elements as
        ``y_true``.

    Returns
    -------
    float
        Fraction of actual negatives that were predicted negative, or
        ``nan`` when ``y_true`` contains no negative samples (undefined).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {actual.shape[0]} and {predicted.shape[0]}"
        )

    is_negative = actual != 1
    n_negative = int(is_negative.sum())  # TN + FP
    if n_negative == 0:
        # No actual negatives: specificity is undefined.
        return float("nan")

    true_negatives = int((is_negative & (predicted != 1)).sum())
    return true_negatives / n_negative

# Function to evaluate metrics
def evaluate_metrics(y_true, y_pred, y_scores):
    """Compute the classification metrics reported for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels.
    y_scores : array-like
        Scores passed to ``roc_auc_score`` to compute the AUC.

    Returns
    -------
    tuple
        ``(precision, recall, specificity, sensitivity, auc)``. Sensitivity
        is the recall value repeated under its other name.
    """
    ppv = precision_score(y_true, y_pred)
    tpr = recall_score(y_true, y_pred)
    tnr = specificity_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_scores)
    # Recall and sensitivity are the same quantity, so tpr fills both slots.
    return ppv, tpr, tnr, tpr, roc_auc

## 4. Cross-Validation code

In [6]:
# 5-fold Cross-Validation
def cross_validate_model(model, train_X, train_y, metrics_fn, cv=5):
    """Run stratified k-fold cross-validation and average the fold metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        refitted in place on every fold, so when this function returns it is
        left fitted on the last fold's training split only.
    train_X, train_y : pandas objects
        Features and labels; rows are selected positionally with ``.iloc``.
    metrics_fn : callable
        Called as ``metrics_fn(y_val, y_pred, y_scores)``; its result becomes
        one row under the columns Precision, Recall, Specificity,
        Sensitivity, AUC.
    cv : int, default 5
        Number of folds passed to ``StratifiedKFold``.

    Returns
    -------
    pandas.Series
        Column-wise mean of the per-fold metrics.
    """
    splitter = StratifiedKFold(n_splits=cv)
    column_names = ["Precision", "Recall", "Specificity", "Sensitivity", "AUC"]

    def score_fold(fit_rows, holdout_rows):
        # Fit on this fold's training rows, then score the held-out rows.
        X_fit, X_holdout = train_X.iloc[fit_rows], train_X.iloc[holdout_rows]
        y_fit, y_holdout = train_y.iloc[fit_rows], train_y.iloc[holdout_rows]

        model.fit(X_fit, y_fit)
        predicted_labels = model.predict(X_holdout)
        # Column 1 of predict_proba is used as the score for the AUC.
        positive_scores = model.predict_proba(X_holdout)[:, 1]
        return metrics_fn(y_holdout, predicted_labels, positive_scores)

    fold_rows = [
        score_fold(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(train_X, train_y)
    ]
    return pd.DataFrame(fold_rows, columns=column_names).mean(axis=0)


## 5. Train RF Model

In [12]:
# Random forest with a fixed seed, capped tree depth and minimum split/leaf sizes,
# scored with the cross-validation helper.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    min_samples_split=10,
)
rf_metrics = cross_validate_model(
    model=rf_model,
    train_X=train_X,
    train_y=train_y,
    metrics_fn=evaluate_metrics,
)


In [13]:
# Show the fold-averaged random forest metrics returned by cross_validate_model.
rf_metrics

Precision      0.643556
Recall         0.051857
Specificity    0.994801
Sensitivity    0.051857
AUC            0.802507
dtype: float64

In [17]:
# cross_validate_model refits rf_model on every fold, so at this point it is
# fitted only on the last fold's training split. Refit on the full training
# set so the persisted model has seen all of the training data.
rf_model.fit(train_X, train_y)

model_dir = "../model"
# Create the output directory if it is missing so the dump cannot fail on it.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(rf_model, os.path.join(model_dir, "random_forest_model.pkl"))

['../model\\random_forest_model.pkl']

## 6. Train CatBoost

In [None]:
# Build the CatBoost classifier (regularized, fixed number of rounds) and cross-validate it
catboost_model = CatBoostClassifier(
    verbose=0,              # Silent output
    iterations=500,         # 500 boosting rounds
    learning_rate=0.1,      # Step size of each boosting round
    depth=6,                # Tree depth
    l2_leaf_reg=3,          # L2 regularization coefficient
)
catboost_metrics = cross_validate_model(catboost_model, train_X, train_y, evaluate_metrics)

# One print call; emits the same two-part output as printing header and metrics separately.
print("CatBoost Metrics:", catboost_metrics, sep="\n")

CatBoost Metrics:
Precision      0.561045
Recall         0.161672
Specificity    0.977127
Sensitivity    0.161672
AUC            0.812594
dtype: float64


In [18]:
# cross_validate_model refits catboost_model on every fold, leaving it fitted on
# the last fold's training split only. Retrain on the full training set before
# saving so the persisted model has seen all of the training data.
catboost_model.fit(train_X, train_y)
catboost_model.save_model(os.path.join(model_dir, "catboost_model.cbm"))

## 7. Load Model

In [19]:
# Reload both persisted models from model_dir.
# NOTE(review): joblib.load unpickles the file, so only load model files you trust.
loaded_rf_model = joblib.load(os.path.join(model_dir, "random_forest_model.pkl"))
# The CatBoost model is restored by loading the saved file into a fresh classifier instance.
loaded_catboost_model = CatBoostClassifier()
# Kept as the final bare expression: its return value is what the cell displays.
loaded_catboost_model.load_model(os.path.join(model_dir, "catboost_model.cbm"))

<catboost.core.CatBoostClassifier at 0x1d1849f9d88>