In [1]:
import logging
import os
import tempfile
import warnings

import mlflow
import mlflow.sklearn
import mlflow.xgboost
import numpy as np
import pandas as pd
from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report, roc_curve
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Emit log records at INFO level and above, and create this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings(action="ignore")


In [3]:
# MLflow configuration: a file-based tracking store under ./mlruns (a relative
# path) and the experiment that all runs in this notebook are recorded under.
TRACKING_URI = "./mlruns"
EXPERIMENT_NAME = "bati_bank_credit_scoring"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:c:/Users/reus/Desktop/Tenx/credit-risk-model/notebooks/mlruns/292577243299913946', creation_time=1765891775184, experiment_id='292577243299913946', last_update_time=1765891775184, lifecycle_stage='active', name='bati_bank_credit_scoring', tags={}>

In [4]:
# Reproducibility: one seed shared by NumPy's global RNG and, through
# RANDOM_STATE, by every estimator, splitter and search below.
RANDOM_STATE: int = 42
np.random.seed(seed=RANDOM_STATE)

In [5]:
# ------------------------------------------------------------------
# CELL 1: FULL SETUP - RUN THIS FIRST
# ------------------------------------------------------------------
# Reads the processed customer feature table, separates the features from the
# `is_high_risk` target, makes a stratified 80/20 train/test split, and builds
# standardized copies of both feature matrices.

import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

RANDOM_STATE = 42
DATA_PATH = "../data/processed/customer_features_with_target.csv"
TARGET_COLUMN = 'is_high_risk'
NON_FEATURE_COLUMNS = ['CustomerId', TARGET_COLUMN]

# --- Load ---------------------------------------------------------
print("Loading data...")
df = pd.read_csv(DATA_PATH)

# --- Features / target ---------------------------------------------
# The identifier and the label are removed from the feature matrix; a missing
# column is tolerated by `drop`, but the target lookup below still requires it.
# NOTE(review): the recorded runs of this notebook report a test ROC-AUC of
# about 1.0 for every model - verify that no remaining feature column is
# derived from `is_high_risk`.
X = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y = df[TARGET_COLUMN]

print(f"Dataset shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

# --- Stratified hold-out split ---------------------------------------
split = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = split

# --- Standardized copies (for scale-sensitive models) ---------------
# The scaler learns its statistics from the training split only and is then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Data prepared successfully!")
print("Variables available: X_train, X_test, X_train_scaled, X_test_scaled, y_train, y_test")

Loading data...
Dataset shape: (3742, 31)
Target distribution:
is_high_risk
0    0.613041
1    0.386959
Name: proportion, dtype: float64
‚úÖ Data prepared successfully!
Variables available: X_train, X_test, X_train_scaled, X_test_scaled, y_train, y_test


In [6]:
# Debug aid: list every name in the current namespace containing 'X_train'.
matching_names = [name for name in list(locals()) if 'X_train' in name]
print("Available variables:", matching_names)

Available variables: ['X_train', 'X_train_scaled']


In [11]:
def evaluate_model(y_true, y_pred, y_pred_proba):
    """Score a binary classifier's predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels.
        y_pred_proba: Scores for the positive class, used only for ROC-AUC.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``. Precision, recall and F1 are computed with
        ``zero_division=0``.
    """
    scores = {"accuracy": accuracy_score(y_true, y_pred)}

    # These three share the same call shape, so compute them in one pass.
    label_based_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in label_based_scorers:
        scores[metric_name] = scorer(y_true, y_pred, zero_division=0)

    scores["roc_auc"] = roc_auc_score(y_true, y_pred_proba)
    return scores

In [9]:
def train_and_log_model(model, model_name, params, X_train, X_test, y_train, y_test):
    """Fit a model, evaluate it on the test split, and log everything to MLflow.

    One MLflow run named ``model_name`` is opened. Inside it the function logs
    ``params``, fits ``model`` on the training data, logs the metrics returned
    by ``evaluate_model`` for the test data, logs the fitted model with an
    inferred signature, and attaches the classification report as a CSV
    artifact under ``reports/``.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        model_name: Run name. If it contains ``"XGB"`` the model is logged with
            the ``mlflow.xgboost`` flavor, otherwise with ``mlflow.sklearn``.
        params: Mapping of parameters to record on the run.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The ROC-AUC measured on the test split.
    """
    with mlflow.start_run(run_name=model_name):
        # Log parameters
        mlflow.log_params(params)

        # Train
        model.fit(X_train, y_train)

        # Predict (column 1 of predict_proba is used as the positive-class score)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Evaluate
        metrics = evaluate_model(y_test, y_pred, y_pred_proba)
        mlflow.log_metrics(metrics)

        # Log model with signature
        signature = infer_signature(X_train, y_pred)
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model", signature=signature)
        else:
            mlflow.sklearn.log_model(model, "model", signature=signature)

        # Log classification report as artifact. The CSV is staged in a
        # private temporary directory so the working directory is never
        # touched and the file is removed even if logging raises. The file
        # name is kept so the artifact path stays reports/temp_report.csv.
        report = classification_report(y_test, y_pred, output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        with tempfile.TemporaryDirectory() as staging_dir:
            report_path = os.path.join(staging_dir, "temp_report.csv")
            report_df.to_csv(report_path)
            mlflow.log_artifact(report_path, "reports")

        logger.info(f"{model_name} - ROC-AUC: {metrics['roc_auc']:.4f}")
        return metrics['roc_auc']

In [13]:
# Running record of the best model seen so far; updated after each training cell.
best_auc, best_run_id, best_model_name = 0, None, None

In [14]:
# ==============================
# 1. Logistic Regression (with GridSearch)
# ==============================
print("\n[1/4] Training Logistic Regression with GridSearch...")

# Scaling lives inside the estimator: each CV fold fits its own scaler on that
# fold's training part only, and the model logged to MLflow accepts raw
# (unscaled) features because the scaler travels with it.
lr = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
])

# `clf__` routes each hyperparameter to the LogisticRegression step.
# liblinear is used because it supports both the l1 and l2 penalties.
param_grid_lr = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear']
}

# Shared cross-validation splitter; the later search cells reuse `cv`.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE
)

lr_search = GridSearchCV(
    lr,
    param_grid_lr,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1
)

lr_search.fit(X_train, y_train)

# Strip the pipeline step prefix so the logged parameter names stay
# 'C', 'penalty' and 'solver'.
lr_best_params = {
    name.split("__", 1)[-1]: value
    for name, value in lr_search.best_params_.items()
}

auc_lr = train_and_log_model(
    lr_search.best_estimator_,
    "LogisticRegression_GridSearch",
    lr_best_params,
    X_train,
    X_test,
    y_train,
    y_test
)

if auc_lr > best_auc:
    best_auc = auc_lr
    best_model_name = "LogisticRegression_GridSearch"



[1/4] Training Logistic Regression with GridSearch...


INFO:__main__:LogisticRegression_GridSearch - ROC-AUC: 1.0000


In [15]:
# ==============================
# 2. Decision Tree (with RandomizedSearch)
# ==============================
print("\n[2/4] Training Decision Tree with RandomizedSearch...")

# Candidate values the randomized search samples from.
param_dist_dt = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    criterion=['gini', 'entropy'],
)

dt = DecisionTreeClassifier(random_state=RANDOM_STATE)

# 50 sampled configurations, scored by ROC-AUC on the `cv` splitter that the
# Logistic Regression cell defines.
dt_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_dist_dt,
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
dt_search.fit(X_train, y_train)

# Refit the winning configuration, score it on the test split, log it to MLflow.
auc_dt = train_and_log_model(
    model=dt_search.best_estimator_,
    model_name="DecisionTree_RandomSearch",
    params=dt_search.best_params_,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Keep the running best only when this model is strictly better.
if best_auc < auc_dt:
    best_auc, best_model_name = auc_dt, "DecisionTree_RandomSearch"



[2/4] Training Decision Tree with RandomizedSearch...


INFO:__main__:DecisionTree_RandomSearch - ROC-AUC: 0.9999


In [16]:
# ==============================
# 3. Random Forest (with RandomizedSearch)
# ==============================
print("\n[3/4] Training Random Forest with RandomizedSearch...")

# Candidate values the randomized search samples from.
param_dist_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Both the forest and the search below request all cores (n_jobs=-1).
rf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)

# 50 sampled configurations, scored by ROC-AUC on the `cv` splitter that the
# Logistic Regression cell defines.
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_rf,
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
rf_search.fit(X_train, y_train)

# Refit the winning configuration, score it on the test split, log it to MLflow.
auc_rf = train_and_log_model(
    model=rf_search.best_estimator_,
    model_name="RandomForest_RandomSearch",
    params=rf_search.best_params_,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Keep the running best only when this model is strictly better.
if best_auc < auc_rf:
    best_auc, best_model_name = auc_rf, "RandomForest_RandomSearch"



[3/4] Training Random Forest with RandomizedSearch...


INFO:__main__:RandomForest_RandomSearch - ROC-AUC: 0.9998


In [17]:
# ==============================
# 4. XGBoost (with RandomizedSearch)
# ==============================
print("\n[4/4] Training XGBoost with RandomizedSearch...")

# `use_label_encoder` is intentionally not passed: XGBoost has deprecated the
# argument, so it no longer belongs in the constructor call.
# NOTE(review): the recorded output of this cell is
# "TypeError: `_estimator_type` undefined". That looks like an
# xgboost / scikit-learn version mismatch rather than a problem in this code -
# confirm the installed versions are compatible and pin them.
xgb = XGBClassifier(
    random_state=RANDOM_STATE,
    eval_metric='logloss'
)

# Candidate values the randomized search samples from.
param_dist_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# 50 sampled configurations, scored by ROC-AUC on the `cv` splitter that the
# Logistic Regression cell defines.
xgb_search = RandomizedSearchCV(
    xgb,
    param_dist_xgb,
    n_iter=50,
    cv=cv,
    scoring='roc_auc',
    random_state=RANDOM_STATE,
    n_jobs=-1
)

xgb_search.fit(X_train, y_train)

# The "XGB" substring in the run name makes train_and_log_model log the model
# with the mlflow.xgboost flavor.
auc_xgb = train_and_log_model(
    xgb_search.best_estimator_,
    "XGBoost_RandomSearch",
    xgb_search.best_params_,
    X_train,
    X_test,
    y_train,
    y_test
)

if auc_xgb > best_auc:
    best_auc = auc_xgb
    best_model_name = "XGBoost_RandomSearch"



[4/4] Training XGBoost with RandomizedSearch...




TypeError: `_estimator_type` undefined.  Please use appropriate mixin to define estimator type.

In [None]:
# ==============================
# Identify and register best model
# ==============================
# Fail early with a clear message when no training cell has produced a model.
if best_model_name is None:
    raise RuntimeError(
        "No trained model to register: run the training cells first."
    )

print(f"\n‚úÖ Best model: {best_model_name} (ROC-AUC: {best_auc:.4f})")

# Register best model in MLflow Model Registry
client = mlflow.tracking.MlflowClient()

# Find run ID of best model
experiment = mlflow.get_experiment_by_name("bati_bank_credit_scoring")
if experiment is None:
    raise RuntimeError(
        "MLflow experiment 'bati_bank_credit_scoring' was not found in the "
        "configured tracking store."
    )

runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
if runs.empty or "tags.mlflow.runName" not in runs.columns:
    raise RuntimeError("The MLflow experiment contains no named runs.")

matching_runs = runs[runs["tags.mlflow.runName"] == best_model_name]
if matching_runs.empty:
    raise RuntimeError(f"No MLflow run named {best_model_name!r} was found.")

# Several runs can share a name across notebook executions; take the newest.
best_run = matching_runs.sort_values("start_time", ascending=False).iloc[0]
run_id = best_run.run_id

# Register model
model_uri = f"runs:/{run_id}/model"

# Registration is best-effort: a failure is reported but does not stop the
# notebook.
try:
    result = mlflow.register_model(model_uri, "BatiBank_CreditScoring_Model")

    # Add description
    client.update_model_version(
        name="BatiBank_CreditScoring_Model",
        version=result.version,
        description=f"Best model: {best_model_name} with ROC-AUC={best_auc:.4f}"
    )

    print(
        f"\n‚úÖ Registered model as 'BatiBank_CreditScoring_Model' "
        f"(Version {result.version})"
    )

except Exception as e:
    print(f"\n‚ö†Ô∏è Model registration failed: {e}")

print(
    "\nüìä Run `mlflow ui --backend-store-uri file://$(pwd)/mlruns` "
    "to compare experiments."
)
