In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
import json
from sklearn.metrics import f1_score

print("Loading model...")
model = xgb.Booster()
model.load_model("loan_model.json")

with open("training_columns.json", "r") as f:
    training_cols = json.load(f)

df = pd.read_csv("balanced_dataset.csv")
print(f"Loaded {len(df)} loans")

# Convert Yes and No to 1/0
for col in ['HasMortgage', 'HasDependents', 'HasCoSigner']:
    if col in df.columns:
        df[col] = (df[col] == 'Yes').astype(int)


def prep_data(row_dict):
    row = pd.DataFrame([row_dict])
    
    # Convert to dummies
    row = pd.get_dummies(row, columns=['Education', 'EmploymentType', 'MaritalStatus', 'LoanPurpose'], dtype=int)
    
    # Match training columns
    for col in training_cols:
        if col not in row.columns:
            row[col] = 0
    
    return row[training_cols]

# Add streamlit risk modifications
def adjust_prob(row, prob):
    loan_to_income = row['LoanAmount'] / row['Income']
    monthly_payment = (row['LoanAmount'] * (row['InterestRate']/100/12)) / (1 - (1 + row['InterestRate']/100/12)**(-row['LoanTerm']))
    monthly_income = row['Income'] / 12

    LoanAmount = row['LoanAmount']
    Income = row['Income']
    
    
    reject_mult = 1
    accept_mult = 1
    # Reject mult
    if row['LoanPurpose'] != "Home":
        if  LoanAmount / Income > 2:
            reject_mult *= 3
        elif monthly_payment > monthly_income * 0.4:
            reject_mult *= 3.5
    else:
        if (LoanAmount / Income > 10):
            reject_mult *= 3
    if row['Age'] + (row['LoanTerm'] / 12) > 75:
        reject_mult *= 2
    # Accept mult
    if LoanAmount / Income < 0.1:
        accept_mult *= 0.5
    if monthly_payment < monthly_income * 0.1:
        accept_mult *= 0.5
    
    return prob * reject_mult * accept_mult

# Get predictions
print("\nGetting predictions...")
X = df.drop(['Default', 'LoanID'], axis=1, errors='ignore')
y_true = df['Default'].values

predictions = []
for i, row in X.iterrows():
    if i % 1000 == 0:
        print(f"Checked {i}/{len(X)} cases...")
    
    processed = prep_data(row.to_dict())
    dmatrix = xgb.DMatrix(processed)
    base_prob = float(model.predict(dmatrix)[0])
    final_prob = adjust_prob(row, base_prob)
    predictions.append(final_prob)

predictions = np.array(predictions)
print("Done!\n")

# Find best thresholds
def calc_f1(threshold):
    y_pred = (predictions >= threshold).astype(int)
    return f1_score(y_true, y_pred)

def get_approval_rate(threshold):
    y_pred = (predictions >= threshold).astype(int)
    return np.mean(y_pred == 0)

# Test different thresholds
thresholds = np.linspace(0.01, 0.99, 500)

# Moderate Risk Profile 
# Best F1
print("Finding MODERATE threshold (best F1)...")
f1_scores = [calc_f1(t) for t in thresholds]
moderate_idx = np.argmax(f1_scores)
moderate_threshold = thresholds[moderate_idx]
moderate_f1 = f1_scores[moderate_idx]
moderate_approval = get_approval_rate(moderate_threshold)

# Conservative Risk Profile 
# Minimize bad loans but can't reject more than 40% extra
print("Finding CONSERVATIVE threshold...")
min_approval = moderate_approval * 0.60  # can reject up to 40% more

conservative_threshold = moderate_threshold
conservative_f1 = moderate_f1

for t in thresholds:
    approval = get_approval_rate(t)
    if approval >= min_approval:  # meets constraint
        y_pred = (predictions >= t).astype(int)
        fn = np.sum((y_true == 1) & (y_pred == 0))  # bad loans approved
        
        current_fn = np.sum((y_true == 1) & ((predictions >= conservative_threshold).astype(int) == 0))
        if fn < current_fn:  # fewer bad loans
            conservative_threshold = t
            conservative_f1 = calc_f1(t)

# Aggressive Risk Profile
# Maximize true approvals but don't approve too many 
print("Finding AGGRESSIVE threshold...")
max_approval = moderate_approval * 1.40

aggressive_threshold = moderate_threshold
aggressive_f1 = moderate_f1

for t in thresholds:
    approval = get_approval_rate(t)
    if approval <= max_approval:  # meets constraint
        if approval > get_approval_rate(aggressive_threshold):
            aggressive_threshold = t
            aggressive_f1 = calc_f1(t)

# Print the stuffs
print(f"\nCONSERVATIVE:")
print(f"  Threshold: {conservative_threshold:.4f}")
print(f"  F1 Score:  {conservative_f1:.4f}")

print(f"\nMODERATE:")
print(f"  Threshold: {moderate_threshold:.4f}")
print(f"  F1 Score:  {moderate_f1:.4f}")

print(f"\nAGGRESSIVE:")
print(f"  Threshold: {aggressive_threshold:.4f}")
print(f"  F1 Score:  {aggressive_f1:.4f}")

Loading model...
Loaded 59306 loans

Getting predictions...
Checked 0/59306 cases...
Checked 1000/59306 cases...
Checked 2000/59306 cases...
Checked 3000/59306 cases...
Checked 4000/59306 cases...
Checked 5000/59306 cases...
Checked 6000/59306 cases...
Checked 7000/59306 cases...
Checked 8000/59306 cases...
Checked 9000/59306 cases...
Checked 10000/59306 cases...
Checked 11000/59306 cases...
Checked 12000/59306 cases...
Checked 13000/59306 cases...
Checked 14000/59306 cases...
Checked 15000/59306 cases...
Checked 16000/59306 cases...
Checked 17000/59306 cases...
Checked 18000/59306 cases...
Checked 19000/59306 cases...
Checked 20000/59306 cases...
Checked 21000/59306 cases...
Checked 22000/59306 cases...
Checked 23000/59306 cases...
Checked 24000/59306 cases...
Checked 25000/59306 cases...
Checked 26000/59306 cases...
Checked 27000/59306 cases...
Checked 28000/59306 cases...
Checked 29000/59306 cases...
Checked 30000/59306 cases...
Checked 31000/59306 cases...
Checked 32000/59306 cases