## Import Libraries

We import the necessary libraries for model selection, training, evaluation, and saving the trained models.

In [28]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, 
    classification_report, confusion_matrix
)
import joblib  # For saving models

## Load the Processed Dataset

We load the dataset from `data/processed/default_estimator_data.csv` after WoE binning and feature engineering. The target variable (`Target`) is separated from the feature set (`X`).

In [42]:
# Load the processed dataset
data_path = "../data/processed/default_estimator_data.csv"
df = pd.read_csv(data_path)

# Separate features and target variable
X = df.drop(columns=['Target', 'Risk_Label_x', 'Risk_Label_y', 'Risk_Label'])  # Exclude non-numeric columns
y = df['Target']

# Display the first few rows of X and y
print("Features (X):")
print(X.head())
print("\nTarget (y):")
print(y.head())

Features (X):
   CountryCode  FraudResult  TotalTransactionAmount  TransactionYear  \
0          256            0                0.557522             2018   
1          256            0                0.557522             2018   
2          256            0                0.556944             2018   
3          256            0                0.558153             2018   
4          256            0                0.558153             2018   

        SubscriptionId       CustomerId CurrencyCode     ProductId  \
0   SubscriptionId_887  CustomerId_4406          UGX  ProductId_10   
1  SubscriptionId_3829  CustomerId_4406          UGX   ProductId_6   
2   SubscriptionId_222  CustomerId_4683          UGX   ProductId_1   
3  SubscriptionId_2185   CustomerId_988          UGX  ProductId_21   
4  SubscriptionId_3829   CustomerId_988          UGX   ProductId_6   

   AverageTransactionAmount        TransactionId  ...  ProductCategory_tv_woe  \
0                  0.047184  TransactionId_76871  .

## Train-Test Split

We split the dataset into training (80%) and testing (20%) sets using stratified sampling to ensure balanced class distribution. Numerical features are scaled using `StandardScaler` for optimal model performance.

In [44]:
# Identify numeric features
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Filter the feature set to include only numeric features
X = X[numeric_features]


# Verify the updated feature set
print("Updated Features (X):")
print(X.head())

Updated Features (X):
   CountryCode  FraudResult  TotalTransactionAmount  TransactionYear  \
0          256            0                0.557522             2018   
1          256            0                0.557522             2018   
2          256            0                0.556944             2018   
3          256            0                0.558153             2018   
4          256            0                0.558153             2018   

   AverageTransactionAmount  TransactionMonth  StdDevTransactionAmount  \
0                  0.047184                11                 0.000919   
1                  0.047184                11                 0.000919   
2                  0.047137                11                 0.000000   
3                  0.047749                11                 0.005187   
4                  0.047749                11                 0.005187   

   TransactionDay  PricingStrategy  TransactionHour  ...  \
0              15                2      

In [45]:
# Save the scaler for later use
scaler_path = "../models/scaler.pkl"
joblib.dump(scaler, scaler_path)

print(f"Scaler saved to {scaler_path}")

Scaler saved to ../models/scaler.pkl


In [32]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Selection and Training

We train four models:
1. Logistic Regression
2. Decision Tree
3. Random Forest
4. Gradient Boosting

Each model is evaluated using metrics such as Accuracy, Precision, Recall, F1 Score, and ROC-AUC.

In [33]:
# Define a dictionary to store model results
model_results = {}

# Logistic Regression
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
model_results['Logistic Regression'] = {
    'Accuracy': accuracy_score(y_test, y_pred_logreg),
    'Precision': precision_score(y_test, y_pred_logreg),
    'Recall': recall_score(y_test, y_pred_logreg),
    'F1 Score': f1_score(y_test, y_pred_logreg),
    'ROC-AUC': roc_auc_score(y_test, y_pred_logreg)
}

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
model_results['Decision Tree'] = {
    'Accuracy': accuracy_score(y_test, y_pred_dt),
    'Precision': precision_score(y_test, y_pred_dt),
    'Recall': recall_score(y_test, y_pred_dt),
    'F1 Score': f1_score(y_test, y_pred_dt),
    'ROC-AUC': roc_auc_score(y_test, y_pred_dt)
}

# Random Forest
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
model_results['Random Forest'] = {
    'Accuracy': accuracy_score(y_test, y_pred_rf),
    'Precision': precision_score(y_test, y_pred_rf),
    'Recall': recall_score(y_test, y_pred_rf),
    'F1 Score': f1_score(y_test, y_pred_rf),
    'ROC-AUC': roc_auc_score(y_test, y_pred_rf)
}

# Gradient Boosting
gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(X_train, y_train)
y_pred_gbm = gbm.predict(X_test)
model_results['Gradient Boosting'] = {
    'Accuracy': accuracy_score(y_test, y_pred_gbm),
    'Precision': precision_score(y_test, y_pred_gbm),
    'Recall': recall_score(y_test, y_pred_gbm),
    'F1 Score': f1_score(y_test, y_pred_gbm),
    'ROC-AUC': roc_auc_score(y_test, y_pred_gbm)
}

In [34]:
# Convert model results to a DataFrame
results_df = pd.DataFrame(model_results).T
results_df = results_df[['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC']]
results_df.sort_values(by='ROC-AUC', ascending=False, inplace=True)

# Display the results
print("Model Performance:")
print(results_df)

Model Performance:
                     Accuracy  Precision    Recall  F1 Score   ROC-AUC
Decision Tree        0.993467   0.983040  0.976352  0.979685  0.986556
Random Forest        0.980871   0.974538  0.905086  0.938529  0.950268
Gradient Boosting    0.909842   0.852850  0.533204  0.656169  0.757752
Logistic Regression  0.861391   0.642810  0.317136  0.424729  0.641617


In [36]:
# Define hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10]  # Minimum number of samples required to split an internal node
}

In [37]:
# Initialize Random Forest classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)  # X_train and y_train should already be defined

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC Score:", grid_search.best_score_)

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best ROC-AUC Score: 0.995276478486252


In [38]:
# Retrieve the best estimator
best_rf = grid_search.best_estimator_

# Retrain on the full training set
best_rf.fit(X_train, y_train)

# Predict on the test set
y_pred_best_rf = best_rf.predict(X_test)
y_prob_best_rf = best_rf.predict_proba(X_test)[:, 1]  # Probability estimates for class 1 (High-Risk)

In [39]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_best_rf)
precision = precision_score(y_test, y_pred_best_rf)
recall = recall_score(y_test, y_pred_best_rf)
f1 = f1_score(y_test, y_pred_best_rf)
roc_auc = roc_auc_score(y_test, y_prob_best_rf)

# Display results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

Accuracy: 0.9823
Precision: 0.9787
Recall: 0.9099
F1 Score: 0.9431
ROC-AUC: 0.9978


In [40]:
# Save the tuned model
model_path = "../models/best_random_forest_model.pkl"
joblib.dump(best_rf, model_path)

print(f"Tuned Random Forest model saved to {model_path}")

Tuned Random Forest model saved to ../models/best_random_forest_model.pkl
