<a href="https://www.kaggle.com/code/elijahnyasiando/creditriskmodel-me-fa?scriptVersionId=290095208" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Modules

In [None]:
import warnings
# warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.preprocessing import StandardScaler, QuantileTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, make_scorer
from statsmodels.distributions.empirical_distribution import ECDF

from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier

# Notebook-wide display defaults.
# Let pandas print up to 50 columns of a DataFrame before truncating the view.
pd.set_option('display.max_columns', 50)
# Apply seaborn's 'whitegrid' style to every plot drawn after this point.
sns.set_style('whitegrid')

# Creating Master Training Dataset

In [None]:
# Origination quarters to ingest: 2018Q1 .. 2019Q4, in chronological order.
COHORTS = [f'{year}Q{quarter}' for year in (2018, 2019) for quarter in range(1, 5)]

# Column names for the header-less origination files. They are applied
# positionally by pd.read_csv(names=...), so the order must not change.
ORIGINATION_COL_NAMES = [
    'CREDIT_SCORE',
    'FIRST_PAYMENT_DATE',
    'FIRST_TIME_HOMEBUYER_FLAG',
    'MATURITY_DATE',
    'METROPOLITAN_STATISTICAL_AREA',
    'MORTGAGE_INSURANCE_PERCENTAGE',
    'NUMBER_OF_UNITS',
    'OCCUPANCY_STATUS',
    'ORIGINAL_COMBINED_LOAN_TO_VALUE_CLTV',
    'ORIGINAL_DEBT_TO_INCOME_DTI_RATIO',
    'ORIGINAL_UPB',
    'ORIGINAL_LOAN_TO_VALUE_LTV',
    'ORIGINAL_INTEREST_RATE',
    'CHANNEL',
    'PREPAYMENT_PENALTY_MORTGAGE_FLAG',
    'AMORTIZATION_TYPE',
    'PROPERTY_STATE',
    'PROPERTY_TYPE',
    'POSTAL_CODE',
    'LOAN_SEQUENCE_NUMBER',
    'LOAN_PURPOSE',
    'ORIGINAL_LOAN_TERM',
    'NUMBER_OF_BORROWERS',
    'SELLER_NAME',
    'SERVICER_NAME',
    'SUPER_CONFORMING_FLAG',
    'PRE-RELIEF_REFINANCE_LOAN_SEQUENCE_NUMBER',
    'SPECIAL_ELIGIBILITY_PROGRAM',
    'RELIEF_REFINANCE_INDICATOR',
    'PROPERTY_VALUATION_METHOD',
    'INTEREST_ONLY_INDICATOR',
    'MI_CANCELLATION_INDICATOR',
]

# Column names for the header-less monthly performance files (also positional).
PERFORMANCE_COL_NAMES = [
    'LOAN_SEQUENCE_NUMBER',
    'MONTHLY_REPORTING_PERIOD',
    'CURRENT_ACTUAL_UPB',
    'CURRENT_LOAN_DELINQUENCY_STATUS',
    'LOAN_AGE',
    'REMAINING_MONTHS_TO_LEGAL_MATURITY',
    'DEFECT_SETTLEMENT_DATE',
    'MODIFICATION_FLAG',
    'ZERO_BALANCE_CODE',
    'ZERO_BALANCE_EFFECTIVE_DATE',
    'CURRENT_INTEREST_RATE',
    'CURRENT_NON-INTEREST_BEARING_UPB',
    'DUE_DATE_OF_LAST_PAID_INSTALLMENT_DDLPI',
    'MI_RECOVERIES',
    'NET_SALE_PROCEEDS',
    'NON_MI_RECOVERIES',
    'TOTAL_EXPENSES',
    'LEGAL_COSTS',
    'MAINTENANCE_AND_PRESERVATION_COSTS',
    'TAXES_AND_INSURANCE',
    'MISCELLANEOUS_EXPENSES',
    'ACTUAL_LOSS_CALCULATION',
    'CUMULATIVE_MODIFICATION_COST',
    'STEP_MODIFICATION_FLAG',
    'PAYMENT_DEFERRAL',
    'ESTIMATED_LOAN_TO_VALUE_ELTV',
    'ZERO_BALANCE_REMOVAL_UPB',
    'DELINQUENT_ACCRUED_INTEREST',
    'DELINQUENCY_DUE_TO_DISASTER',
    'BORROWER_ASSISTANCE_STATUS_CODE',
    'CURRENT_MONTH_MODIFICATION_COST',
    'INTEREST_BEARING_UPB',
]

# Subset of origination columns that is actually loaded: the loan key plus
# the covariates carried into the master training set.
COVARIATES_TO_KEEP = [
    'LOAN_SEQUENCE_NUMBER',
    'CREDIT_SCORE',
    'ORIGINAL_DEBT_TO_INCOME_DTI_RATIO',
    'ORIGINAL_LOAN_TO_VALUE_LTV',
    'ORIGINAL_UPB',
    'ORIGINAL_INTEREST_RATE',
    'NUMBER_OF_UNITS',
    'NUMBER_OF_BORROWERS',
    'LOAN_PURPOSE',
    'CHANNEL',
    'FIRST_TIME_HOMEBUYER_FLAG',
    'OCCUPANCY_STATUS',
    'PROPERTY_TYPE',
]

# Read the key and both status/code columns as strings so that non-numeric
# status values and zero-padded codes survive parsing unchanged.
perf_dtypes = dict.fromkeys(
    ['LOAN_SEQUENCE_NUMBER', 'CURRENT_LOAN_DELINQUENCY_STATUS', 'ZERO_BALANCE_CODE'],
    'string',
)

# Build the master training set: for every cohort, derive the set of loans
# that ever showed a default event in the performance file, then label the
# cohort's origination records with a binary Default_Flag.
all_cohort_data = []

for cohort in COHORTS:
    print(f"--> Processing cohort: {cohort}...")

    # --- Performance data: stream in chunks, keep only default evidence ---
    perf_file_path = f'/kaggle/input/2018-2019-small-loans-data/data/historical_data_time_{cohort}.txt'

    chunk_list = []
    # Read in chunks of 1 million rows to prevent OOM
    for chunk in pd.read_csv(perf_file_path, sep='|', header=None, names=PERFORMANCE_COL_NAMES,
                             usecols=['LOAN_SEQUENCE_NUMBER', 'CURRENT_LOAN_DELINQUENCY_STATUS', 'ZERO_BALANCE_CODE'],
                             dtype=perf_dtypes, chunksize=1_000_000):

        # Status values that are not numeric coerce to NaN and are then
        # counted as 0, i.e. they never trigger the 90+ DPD flag below.
        chunk['delinq_num'] = pd.to_numeric(chunk['CURRENT_LOAN_DELINQUENCY_STATUS'], errors='coerce').fillna(0)

        # Flag 1: status of 3 or more (treated by this notebook as 90+ days past due)
        chunk['is_90plus'] = (chunk['delinq_num'] >= 3)
        # Flag 2: zero-balance codes '03' / '09' (treated by this notebook as bad terminations)
        # NOTE(review): this matches the zero-padded string form only - confirm
        # the files store '03'/'09' rather than '3'/'9'.
        chunk['is_bad_term'] = chunk['ZERO_BALANCE_CODE'].isin(['03', '09'])

        # A loan's history spans chunks, so per-loan aggregation cannot be
        # finished here. Keep only the loan IDs with a default event and
        # de-duplicate them immediately: a loan that stays delinquent for many
        # months would otherwise add one row per month to chunk_list.
        bad_events = (
            chunk.loc[chunk['is_90plus'] | chunk['is_bad_term'], ['LOAN_SEQUENCE_NUMBER']]
            .drop_duplicates()
        )
        chunk_list.append(bad_events)

    # Concatenate the per-chunk candidates (a loan can still appear once per chunk)
    all_bad_loans = pd.concat(chunk_list)

    # Get unique set of Defaulters
    defaulters_set = set(all_bad_loans['LOAN_SEQUENCE_NUMBER'].unique())
    print(f"    Identified {len(defaulters_set)} unique defaulters in {cohort}.")

    # --- Process Origination Data ---
    print(f"    Reading origination data for {cohort}...")
    orig_file_path = f'/kaggle/input/2018-2019-small-loans-data/data/historical_data_{cohort}.txt'

    # Only the whitelisted covariates are loaded; the loan key is read as a
    # string so it compares equal to the keys collected above.
    df_orig = pd.read_csv(orig_file_path, sep='|', header=None, names=ORIGINATION_COL_NAMES,
                          usecols=COVARIATES_TO_KEEP, dtype={'LOAN_SEQUENCE_NUMBER': 'string'})

    # --- Map Targets ---
    # 1 if in defaulters_set, else 0
    df_orig['Default_Flag'] = df_orig['LOAN_SEQUENCE_NUMBER'].isin(defaulters_set).astype(int)

    all_cohort_data.append(df_orig)

# Final Concatenation
master_training_set = pd.concat(all_cohort_data, ignore_index=True)
master_training_set.to_parquet('master_training_set.parquet', index=False)

print("\nPipeline complete! The file is ready for your modeling notebook.")



 # Exploratory Data Analysis (EDA)
 Initial Overview

In [None]:

# --- CONFIGURATION ---
# Parquet file written by the cohort-assembly cell, and its binary label column.
INPUT_FILE = 'master_training_set.parquet'
TARGET_COL = 'Default_Flag'

# Feature whitelist for the EDA. Location fields (postal code, MSA) are left
# out on purpose to limit geographic bias.
NUMERIC_FEATURES = [
    'CREDIT_SCORE', 'ORIGINAL_DEBT_TO_INCOME_DTI_RATIO',
    'ORIGINAL_LOAN_TO_VALUE_LTV', 'ORIGINAL_UPB', 'ORIGINAL_INTEREST_RATE',
]

CATEGORICAL_FEATURES = [
    'CHANNEL', 'LOAN_PURPOSE', 'FIRST_TIME_HOMEBUYER_FLAG',
    'OCCUPANCY_STATUS', 'PROPERTY_TYPE',
]

def load_and_clean_data(file_path):
    """
    Load the master parquet file and apply the EDA cleaning rules.

    Steps, in order:
      1. Fill missing METROPOLITAN_STATISTICAL_AREA with 'Rural_Unknown'
         (only when that column is present).
      2. Drop rows whose CREDIT_SCORE is the 9999 sentinel, is below 300,
         or is missing.
      3. Turn the 999 "Not Available" sentinel in DTI and LTV into missing
         values so that it is imputed instead of being read as a real ratio.
      4. If any DTI value is missing, add DTI_MISSING_FLAG (1 = was missing)
         and fill the gaps with the median of the observed DTI values.
      5. Fill missing LTV with the median of the observed LTV values.

    Args:
        file_path: Path of the parquet file to read.

    Returns:
        A cleaned DataFrame that is an independent copy of the filtered rows.
        DTI_MISSING_FLAG is present only when at least one DTI was missing.
    """
    print(f"--> Loading dataset from {file_path}...")
    df = pd.read_parquet(file_path)

    # 1. MSA Logic: keep loans without an MSA instead of treating them as errors.
    if 'METROPOLITAN_STATISTICAL_AREA' in df.columns:
        df['METROPOLITAN_STATISTICAL_AREA'] = df['METROPOLITAN_STATISTICAL_AREA'].fillna('Rural_Unknown')

    # 2. Credit Score Logic: 9999 is a "not available" code and scores below
    #    300 are invalid. 300 itself is a valid score, so the bound is
    #    inclusive. Missing scores fail the >= test and are dropped as well.
    print("    Cleaning Credit Scores...")
    score = df['CREDIT_SCORE']
    # .copy() makes the column assignments below operate on an independent
    # frame instead of a filtered view (avoids SettingWithCopy warnings).
    df = df.loc[score.ne(9999) & score.ge(300)].copy()

    dti_col = 'ORIGINAL_DEBT_TO_INCOME_DTI_RATIO'
    ltv_col = 'ORIGINAL_LOAN_TO_VALUE_LTV'

    # 3. The source data dictionary encodes "Not Available" as 999 for DTI and
    #    LTV (the same convention as 9999 for the credit score). Convert it to
    #    NaN so it neither skews the median nor bypasses the missing-DTI flag.
    for col in (dti_col, ltv_col):
        df[col] = df[col].mask(df[col] == 999)

    # 4. DTI Logic: median-impute but remember which rows were missing,
    #    because missingness itself may carry risk information.
    dti_missing = df[dti_col].isnull()
    if dti_missing.any():
        print("    Imputing missing DTI and creating flagging feature...")
        df['DTI_MISSING_FLAG'] = dti_missing.astype(int)
        df[dti_col] = df[dti_col].fillna(df[dti_col].median())

    # 5. LTV Cleaning
    df[ltv_col] = df[ltv_col].fillna(df[ltv_col].median())

    print(f"    Data Cleaned. Rows remaining: {len(df)}")
    return df

def analyze_monotonicity(df):
    """
    Plot the observed default rate per credit-score band.

    Adds a categorical 'FICO_Band' column to ``df`` in place, averages
    TARGET_COL within each band and shows the result as a bar chart so the
    reader can check whether the default rate falls as the score rises.
    """
    print("\n--> 1. Checking Monotonicity (Risk Banding)...")

    # Six right-closed score bands between 300 and 850.
    band_edges = [300, 620, 660, 700, 740, 780, 850]
    band_names = ['<620', '620-660', '660-700', '700-740', '740-780', '780+']
    df['FICO_Band'] = pd.cut(df['CREDIT_SCORE'], bins=band_edges, labels=band_names)

    # Mean of the 0/1 target within a band is that band's default rate.
    band_rates = df.groupby('FICO_Band', observed=True)[TARGET_COL].mean()
    risk_profile = band_rates.reset_index().rename(columns={TARGET_COL: 'Default_Rate'})

    plt.figure(figsize=(10, 5))
    sns.barplot(data=risk_profile, x='FICO_Band', y='Default_Rate', palette='Reds_r')
    plt.title("Monotonicity Check: Default Rate by Credit Score Band")
    plt.ylabel("Probability of Default (PD)")
    plt.grid(axis='y', alpha=0.3)
    plt.show()

    print("    *Architect Note*: If the bars do not descend like a staircase, the model will be unstable.")

def visualize_tipping_points(df):
    """
    Draw side-by-side density plots of DTI and LTV, split by loan outcome.

    Each panel overlays the distribution of non-defaulted loans (green) and
    defaulted loans (red) and marks a reference threshold with a dashed line,
    making it easy to see where the two distributions pull apart.
    """
    print("\n--> 2. Visualizing Non-Linear Risk Cliffs (Capacity & Collateral)...")

    fig, axes = plt.subplots(1, 2, figsize=(18, 6))

    # (axis, column, title, x-limits, threshold, threshold label) for each panel.
    panels = [
        (axes[0], 'ORIGINAL_DEBT_TO_INCOME_DTI_RATIO', "Capacity Constraint: DTI Distribution",
         (10, 60), 43, 'QM Threshold (43%)'),
        (axes[1], 'ORIGINAL_LOAN_TO_VALUE_LTV', "Collateral Risk: LTV Distribution",
         (50, 105), 80, 'PMI Threshold (80%)'),
    ]
    # (target value, colour, legend label) for each overlaid density.
    outcomes = [(0, 'green', 'Good Loans'), (1, 'red', 'Defaulters')]

    for ax, column, title, x_range, threshold, threshold_label in panels:
        for flag, colour, legend_name in outcomes:
            sns.kdeplot(data=df[df[TARGET_COL] == flag], x=column,
                        color=colour, label=legend_name, ax=ax, fill=True, alpha=0.1)

        ax.set_title(title)
        ax.set_xlim(*x_range)
        ax.axvline(threshold, color='black', linestyle='--', label=threshold_label)
        ax.legend()

    plt.tight_layout()
    plt.show()

def correlation_matrix(df):
    """
    Show a heatmap of pairwise correlations between the numeric features and the target.
    """
    print("\n--> 3. Correlation Matrix...")
    corr_inputs = [*NUMERIC_FEATURES, TARGET_COL]
    pairwise_corr = df[corr_inputs].corr()

    plt.figure(figsize=(8, 6))
    sns.heatmap(pairwise_corr, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation of Risk Drivers")
    plt.show()

def finalize_features(df):
    """
    Return a copy of ``df`` restricted to the whitelisted features and the target.

    DTI_MISSING_FLAG is included with the numeric features when the cleaning
    step created it.
    """
    print("\n--> 4. Final Feature Selection...")

    # The engineered flag exists only if the cleaning step found missing DTI values.
    engineered = ['DTI_MISSING_FLAG'] if 'DTI_MISSING_FLAG' in df.columns else []
    numeric_cols = [*NUMERIC_FEATURES, *engineered]

    selected = numeric_cols + CATEGORICAL_FEATURES + [TARGET_COL]
    df_final = df[selected].copy()

    print(f"    Selected {len(numeric_cols)} Numeric & {len(CATEGORICAL_FEATURES)} Categorical features.")
    print("    Ready for Weight of Evidence (WoE) Transformation.")

    return df_final

# --- EXECUTION PIPELINE ---
# Run the EDA end to end: load and clean, draw the three diagnostics, then
# reduce the frame to the whitelisted features.
df_master = load_and_clean_data(INPUT_FILE)
# Adds a 'FICO_Band' column to df_master in place before plotting.
analyze_monotonicity(df_master)
visualize_tipping_points(df_master)
correlation_matrix(df_master)
# Column selection returns a copy, so df_master itself is left as-is.
df_model_ready = finalize_features(df_master)

# Display first few rows of the clean dataset
df_model_ready.head()

# Model Engineering and Financial Calibration

## 1. Architectural Objective
The objective of this phase was to transition from exploratory analysis to a production-grade Probability of Default (PD) classifier. Unlike standard machine learning tasks where "Accuracy" is the primary metric, this architecture prioritizes Financial Calibration and Regulatory Stability. The system is designed to minimize the economic impact of loan losses while adhering to Basel III standards for internal models.

## 2. Core Methodology: Cost-Sensitive Random Forest
We selected a Random Forest ensemble architecture due to its ability to capture non-linear interactions between key risk drivers (e.g., the compounding risk of low FICO scores combined with high LTV ratios) without requiring complex feature transformation.

### Key Engineering Decisions:

#### Data Integrity (No SMOTE):
We explicitly rejected synthetic oversampling techniques (such as SMOTE). Creating "fake" synthetic borrowers to balance classes introduces statistical noise and violates the Data Integrity principles required for auditability. Instead, we utilized the model's native class_weight='balanced' parameter to mathematically penalize the misclassification of defaulters.
#### Stability Constraints:
To prevent the model from learning "anecdotal" noise, we enforced a strict min_samples_leaf=50 constraint. This ensures that every decision rule (branch) in the forest is supported by a cohort of at least 50 actual historical loans, guaranteeing that our risk assessments are statistically significant and robust.
#### Stratified Validation:
All training and validation utilized Stratified K-Fold Cross-Validation to strictly maintain the population's natural default rate (approx. 1.5%) across all testing folds, preventing "lucky" splits that could mask model weakness.

## 3. Financial Calibration (The "Custom Scorer")
A generic model treats a False Positive (rejecting a good customer) and a False Negative (approving a defaulter) as equal errors. In banking, this is false.
We engineered a Custom Loss Function for the Hyperparameter Tuning grid:

Cost of False Negative ($10): Represents the Principal Loss (LGD) and workout costs.

Cost of False Positive ($1): Represents the Opportunity Cost (lost interest income).

The model optimization process (GridSearchCV) was directed to minimize this weighted financial loss rather than maximizing raw accuracy.

## 4. Regulatory Compliance & Ethics
Kenya Data Protection Act (2019): By using a tree-based model without "black-box" neural networks or synthetic data, we ensure the "Right to Explanation." Every prediction can be traced back to specific, intelligible financial thresholds (e.g., "DTI > 43%").
Fair Lending: The feature set was restricted strictly to Financial Capacity and Collateral metrics, explicitly excluding geographic and demographic proxies to prevent redlining or bias.

In [None]:
# --- CONFIGURATION ---
TARGET_COL = 'Default_Flag'

# "Financial Calibration" constants: relative unit costs used by the custom
# scorer and by the evaluation report.
COST_FALSE_NEGATIVE = 10  # cost of approving a loan that defaults
COST_FALSE_POSITIVE = 1   # cost of rejecting a loan that would have performed

# Model inputs. These rebind the feature lists used earlier in the notebook.
NUMERIC_FEATURES = [
    'CREDIT_SCORE', 'ORIGINAL_DEBT_TO_INCOME_DTI_RATIO',
    'ORIGINAL_LOAN_TO_VALUE_LTV', 'ORIGINAL_UPB', 'ORIGINAL_INTEREST_RATE',
    'NUMBER_OF_UNITS', 'NUMBER_OF_BORROWERS',
]

CATEGORICAL_FEATURES = ['CHANNEL', 'LOAN_PURPOSE']

def custom_financial_loss_score(y_true, y_pred):
    """
    Score predictions by their negated misclassification cost.

    Each false negative costs COST_FALSE_NEGATIVE and each false positive
    costs COST_FALSE_POSITIVE. The total is negated so that a higher score
    means a cheaper model, which is what GridSearchCV maximizes.

    Args:
        y_true: True binary labels (0 = good loan, 1 = default).
        y_pred: Predicted binary labels.

    Returns:
        The negative total cost (0 when there are no errors).
    """
    # Pin the label order: without it, a fold where only one class appears
    # in both y_true and y_pred produces a 1x1 matrix and the 4-way unpack
    # below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Total Cost Calculation
    total_cost = (fn * COST_FALSE_NEGATIVE) + (fp * COST_FALSE_POSITIVE)

    # We negate because GridSearchCV tries to maximize the score
    return -total_cost

def load_data():
    """
    Read INPUT_FILE and return the model inputs and target.

    Infinite values are converted to NaN, then every row with a NaN in any
    numeric feature is dropped.

    Returns:
        (X, y): the numeric + categorical feature frame and the target series.
    """
    print(f"--> Loading {INPUT_FILE}...")
    frame = pd.read_parquet(INPUT_FILE)

    # Treat +/-inf like missing data so the dropna below removes those rows too.
    frame = frame.replace([np.inf, -np.inf], np.nan)
    frame = frame.dropna(subset=NUMERIC_FEATURES)

    features = frame[NUMERIC_FEATURES + CATEGORICAL_FEATURES]
    target = frame[TARGET_COL]
    return features, target

def build_pipeline():
    """
    Assemble the preprocessing + random-forest pipeline (no synthetic sampling).

    Numeric features are standardized and categorical features are one-hot
    encoded (categories unseen during fit are ignored at predict time). Class
    imbalance is handled through the forest's class_weight='balanced' option.

    Returns:
        An unfitted Pipeline with steps 'preprocessor' and 'classifier'.
    """
    feature_prep = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERIC_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
        ]
    )

    # class_weight='balanced' weights classes inversely to their frequency;
    # the fixed random_state keeps runs repeatable.
    forest = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)

    return Pipeline(steps=[('preprocessor', feature_prep), ('classifier', forest)])

def execute_training_grid(X_train, y_train):
    """
    Tune the pipeline with a grid search scored by financial cost.

    Searches tree depth and minimum leaf size with 3-fold stratified
    cross-validation and returns the refitted best pipeline.

    Args:
        X_train: Training feature frame.
        y_train: Training target.

    Returns:
        The best estimator found by the grid search.
    """
    print("\n--> Starting Hyperparameter Tuning (GridSearchCV)...")
    print("    Prioritizing Model Stability (min_samples_leaf) over Granularity.")

    # Shallow trees with large leaves: each leaf must cover at least 50 loans.
    # The number of trees is held fixed to keep the search fast.
    search_space = {
        'classifier__n_estimators': [100],
        'classifier__max_depth': [8, 12],
        'classifier__min_samples_leaf': [50, 100],
    }

    # Stratified folds keep the default rate the same in every fold.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    tuner = GridSearchCV(
        estimator=build_pipeline(),
        param_grid=search_space,
        scoring=make_scorer(custom_financial_loss_score),  # cost, not accuracy
        cv=folds,
        verbose=2,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)

    print(f"\n    Best Parameters: {tuner.best_params_}")
    return tuner.best_estimator_

def evaluate_financial_impact(model, X_test, y_test):
    """
    Report the held-out performance of ``model`` in cost terms.

    Prints the confusion-matrix counts with their associated costs, the total
    cost and the risk-cost / opportunity-cost ratio, shows the confusion
    matrix as a heatmap, and prints the standard classification report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out feature frame.
        y_test: Held-out true labels (0 = good loan, 1 = default).
    """
    print("\n--> Generating Cost-Sensitive Evaluation...")

    y_pred = model.predict(X_test)

    # 1. Confusion Matrix
    # labels=[0, 1] guarantees a 2x2 matrix even if only one class shows up
    # in both y_test and y_pred; otherwise the 4-way unpack would fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # 2. Cost Calculation
    risk_cost = fn * COST_FALSE_NEGATIVE
    opp_cost = fp * COST_FALSE_POSITIVE
    financial_loss = risk_cost + opp_cost

    # 3. Report
    print("\n--- CONFUSION MATRIX (Raw Counts) ---")
    print(f"True Negatives (Good Loans Approved): {tn}")
    print(f"False Positives (Good Loans Rejected): {fp}  [Opp Cost: ${opp_cost}]")
    print(f"False Negatives (Defaults Missed):     {fn}  [Risk Cost: ${risk_cost}]")
    print(f"True Positives (Defaults Caught):      {tp}")

    print("\n--- FINANCIAL PERFORMANCE ---")
    print(f"Total Model Cost Score: {financial_loss}")
    # The ratio is undefined when nothing was wrongly rejected; say so
    # instead of dividing by zero.
    if opp_cost > 0:
        print(f"Ratio of Risk Cost vs Opp Cost: {risk_cost / opp_cost:.2f}x")
    else:
        print("Ratio of Risk Cost vs Opp Cost: n/a (no false positives)")

    # 4. Visual Matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='RdBu', cbar=False)
    plt.title(f"Confusion Matrix\n(Cost Penalty: FN=${COST_FALSE_NEGATIVE}, FP=${COST_FALSE_POSITIVE})")
    plt.ylabel('Actual Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # 5. Standard Metrics
    print("\n--- CLASSIFICATION REPORT ---")
    print(classification_report(y_test, y_pred))

# --- MAIN EXECUTION ---
# Runs the modeling workflow end to end: load, split, tune/train, evaluate.
if __name__ == "__main__":
    # 1. Load the feature frame X and target y from INPUT_FILE
    X, y = load_data()
    
    # 2. Split
    # Hold out 20% for testing. Stratifying on y keeps the default rate the
    # same in the train and test sets; random_state makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    # 3. Tune & Train (grid search runs on the training portion only)
    best_model = execute_training_grid(X_train, y_train)
    
    # 4. Evaluate the tuned model on the untouched test portion
    evaluate_financial_impact(best_model, X_test, y_test)

The objective has been partially achieved, but the business viability is currently low. We've built a safety net, but it is too wide.

### 1. The "Business Revolt" Scenario

*The Good News (Risk Safety):* The model achieved a Recall of 69%: it successfully caught nearly 70% of all defaults (19,514 out of ~28k). In the world of unsampled, highly imbalanced data (1.5% default rate), this is a strong starting point for a Random Forest.

The Bad News (Commercial Viability): The Precision is 9%. This means for every 100 loans it rejected to prevent a default, 91 of them were actually good customers.
##### The Verdict: The Business Unit (Sales/Origination) would reject this model. You are rejecting 197,484 good loans to catch 19,514 defaults (while still missing 8,627). While you avoided credit losses, you decimated the bank's revenue stream and market share.

### 2. Deep Dive: Why did this happen?

The issue lies in the Financial Calibration (10:1 Cost Ratio) combined with class_weight='balanced'.
Double Penalty: The model used class_weight='balanced' (which mathematically boosts the minority class error) AND it is optimized for a scorer that penalizes False Negatives 10x.

The Model's Logic: The model "learned" that the safest way to minimize cost is to aggressively label anything in the "Grey Zone" as a Default. It decided: "It is cheaper to incorrectly reject a decent borrower (\$1 cost) than to risk approving a bad one (\$10 cost)."

The Outcome: The model became extremely risk-averse. It successfully minimized the expensive errors (FN), but it accumulated so many cheap errors (FP) that the sheer volume of rejected interest income (\$197k) is now more than double the risk cost of the defaults it still misses (\$86k).

### 3. Next Steps: Refining the Model

To fix this, we must move from "avoiding loss" to "optimizing profit." We need to improve Precision without destroying Recall.

#### Step A: Probability Threshold Tuning (The "Quick Win")
Currently, the model uses a default threshold of 0.5 to decide Yes/No. Because of class_weight='balanced', the probabilities are skewed. The optimal decision boundary is likely much higher (e.g., 0.65 or 0.70).

##### Action: Plot the Precision-Recall Curve. Find the threshold that yields Precision > 20% while keeping Recall > 50%.
Code Implementation: Do not just predict classes (predict()); predict probabilities (predict_proba()) and apply a custom threshold.

#### Step B: Interaction Features (The "Signal Booster")
The model is struggling to distinguish "High Risk" from "Medium Risk" (hence the low precision). We need sharper features.

##### Action: Create Interaction Features to isolate specific toxic combinations.
LTV_x_DTI: High debt + Low equity is worse than the sum of its parts.
FICO_per_Unit_LTV: (FICO Score) / (LTV).

#### Step C: Adjust the Cost Function
The 10:1 ratio might be too aggressive for this specific dataset distribution.

##### Action: Retune the grid search with a 5:1 ratio or use a "Profit Scorer" (Interest Income from Good Loans minus Principal Loss from Defaults) instead of a pure "Cost Scorer."

for the Next Step (Threshold Tuning & Interactions)