In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.onnx
from sklearn.preprocessing import StandardScaler

df_wa = pd.read_csv(r"hmda_2016_wa_all-records_labels.csv", low_memory=False)
df_ak = pd.read_csv(r"hmda_2016_ak_all-records_labels.csv", low_memory=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LogisticRegression(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

cols_to_use = [
    'loan_type_name', 'loan_purpose_name', 'loan_amount_000s',
    'applicant_income_000s', 'property_type_name', 'purchaser_type_name',
    'owner_occupancy_name', 'applicant_ethnicity_name', 'preapproval_name',
    'lien_status_name', 'sequence_number',
    'number_of_owner_occupied_units', 'number_of_1_to_4_family_units',
    'hud_median_family_income', 'tract_to_msamd_income',
    'applicant_race_name_1', 'applicant_sex_name', 'action_taken'
]

train_df = df_wa[cols_to_use].copy()
test_df = df_ak[cols_to_use].copy()

num_cols = [
    'applicant_income_000s', 'number_of_owner_occupied_units',
    'number_of_1_to_4_family_units', 'hud_median_family_income',
    'tract_to_msamd_income'
]
for col in num_cols:
    med_train = train_df[col].median()
    med_test = test_df[col].median()
    train_df[col] = train_df[col].fillna(med_train)
    test_df[col] = test_df[col].fillna(med_test)

categorical_cols = [
    'loan_type_name', 'loan_purpose_name', 'property_type_name',
    'purchaser_type_name', 'owner_occupancy_name',
    'applicant_ethnicity_name', 'preapproval_name',
    'lien_status_name',
    'applicant_race_name_1', 'applicant_sex_name'
]
numerical_cols = [
    'loan_amount_000s', 'applicant_income_000s', 'sequence_number',
    'number_of_owner_occupied_units', 'number_of_1_to_4_family_units',
    'hud_median_family_income', 'tract_to_msamd_income'
]
target = 'action_taken'

train_ohe = pd.get_dummies(train_df, columns=categorical_cols, drop_first=False)
test_ohe = pd.get_dummies(test_df, columns=categorical_cols, drop_first=False)

train_ohe, test_ohe = train_ohe.align(test_ohe, join='left', axis=1, fill_value=0)

def to_binary(code):
    approved = {1,2,6,8}
    return 1 if code in approved else 0

y_train = train_ohe[target].apply(to_binary).values
X_train = train_ohe.drop(columns=[target]).values
y_test = test_ohe[target].apply(to_binary).values
X_test = test_ohe.drop(columns=[target]).values

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.reshape(-1,1), dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.reshape(-1,1), dtype=torch.float32).to(device)

model = LogisticRegression(X_train_tensor.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(1, 101):
    model.train()
    preds = model(X_train_tensor)
    loss = criterion(preds, y_train_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    with torch.no_grad():
        acc = ((preds>=0.5).float()==y_train_tensor).float().mean().item()
    print(f"Epoch {epoch:03d}: Loss={loss.item():.4f}, Train Acc={acc:.4f}")

model.eval()
with torch.no_grad():
    prob_test = model(X_test_tensor)
    acc_test = ((prob_test>=0.5).float()==y_test_tensor).float().mean().item()
    print(f"Test Accuracy: {acc_test:.4f}")

weights = model.linear.weight.detach().cpu().flatten().numpy()
bias = model.linear.bias.detach().item()
feature_names = train_ohe.drop(columns=[target]).columns.tolist()

def explain_rejection(x, top_k=3, include_bias=False):
    x_np = x.cpu().numpy().flatten()
    contrib = weights * x_np
    names = feature_names.copy()
    if include_bias:
        contrib = np.append(contrib, bias)
        names.append('BIAS')
    worst = np.argsort(contrib)[:top_k]
    return [(names[i], contrib[i]) for i in worst]

print("\nSample explanations:")
with torch.no_grad():
    for i in range(5):
        p = model(X_test_tensor[i].unsqueeze(0)).item()
        label = "APPROVED" if p>=0.5 else "REJECTED"
        print(f"Sample {i} ({label}, p={p:.2f}):", explain_rejection(X_test_tensor[i]))

model_cpu = model.cpu().eval()
dummy_in = torch.randn(1, X_train_tensor.shape[1], dtype=torch.float32)
torch.onnx.export(
    model_cpu, dummy_in, "logistic_regression_model.onnx",
    export_params=True, opset_version=11, do_constant_folding=True,
    input_names=['input'], output_names=['output'],
    dynamic_axes={'input':{0:'batch_size'}, 'output':{0:'batch_size'}}
)
print("\nONNX model saved to logistic_regression_model.onnx")


Epoch 001: Loss=0.6657, Train Acc=0.5873
Epoch 002: Loss=0.6484, Train Acc=0.6090
Epoch 003: Loss=0.6332, Train Acc=0.6267
Epoch 004: Loss=0.6198, Train Acc=0.6425
Epoch 005: Loss=0.6078, Train Acc=0.6564
Epoch 006: Loss=0.5969, Train Acc=0.6698
Epoch 007: Loss=0.5869, Train Acc=0.6825
Epoch 008: Loss=0.5776, Train Acc=0.6953
Epoch 009: Loss=0.5689, Train Acc=0.7077
Epoch 010: Loss=0.5606, Train Acc=0.7207
Epoch 011: Loss=0.5528, Train Acc=0.7337
Epoch 012: Loss=0.5453, Train Acc=0.7458
Epoch 013: Loss=0.5380, Train Acc=0.7574
Epoch 014: Loss=0.5310, Train Acc=0.7676
Epoch 015: Loss=0.5243, Train Acc=0.7774
Epoch 016: Loss=0.5178, Train Acc=0.7870
Epoch 017: Loss=0.5115, Train Acc=0.7960
Epoch 018: Loss=0.5055, Train Acc=0.8037
Epoch 019: Loss=0.4998, Train Acc=0.8100
Epoch 020: Loss=0.4943, Train Acc=0.8151
Epoch 021: Loss=0.4890, Train Acc=0.8192
Epoch 022: Loss=0.4839, Train Acc=0.8231
Epoch 023: Loss=0.4791, Train Acc=0.8263
Epoch 024: Loss=0.4744, Train Acc=0.8290
Epoch 025: Loss=

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.onnx
from sklearn.preprocessing import StandardScaler
import json
import pickle

import shap
import lime.lime_tabular

# [Previous data loading and model training code remains the same...]
# Load data
df_wa = pd.read_csv(r"hmda_2016_wa_all-records_labels.csv", low_memory=False)
df_ak = pd.read_csv(r"hmda_2016_ak_all-records_labels.csv", low_memory=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LogisticRegression(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

cols_to_use = [
    'loan_type_name', 'loan_purpose_name', 'loan_amount_000s',
    'applicant_income_000s', 'property_type_name', 'purchaser_type_name',
    'owner_occupancy_name', 'applicant_ethnicity_name', 'preapproval_name',
    'lien_status_name', 'sequence_number',
    'number_of_owner_occupied_units', 'number_of_1_to_4_family_units',
    'hud_median_family_income', 'tract_to_msamd_income',
    'applicant_race_name_1', 'applicant_sex_name', 'action_taken'
]

train_df = df_wa[cols_to_use].copy()
test_df = df_ak[cols_to_use].copy()

# [Data preprocessing code...]
num_cols = [
    'applicant_income_000s', 'number_of_owner_occupied_units',
    'number_of_1_to_4_family_units', 'hud_median_family_income',
    'tract_to_msamd_income'
]
for col in num_cols:
    med_train = train_df[col].median()
    med_test = test_df[col].median()
    train_df[col] = train_df[col].fillna(med_train)
    test_df[col] = test_df[col].fillna(med_test)

categorical_cols = [
    'loan_type_name', 'loan_purpose_name', 'property_type_name',
    'purchaser_type_name', 'owner_occupancy_name',
    'applicant_ethnicity_name', 'preapproval_name',
    'lien_status_name', 'applicant_race_name_1', 'applicant_sex_name'
]
numerical_cols = [
    'loan_amount_000s', 'applicant_income_000s', 'sequence_number',
    'number_of_owner_occupied_units', 'number_of_1_to_4_family_units',
    'hud_median_family_income', 'tract_to_msamd_income'
]
target = 'action_taken'

train_ohe = pd.get_dummies(train_df, columns=categorical_cols, drop_first=False)
test_ohe = pd.get_dummies(test_df, columns=categorical_cols, drop_first=False)
train_ohe, test_ohe = train_ohe.align(test_ohe, join='left', axis=1, fill_value=0)

def to_binary(code):
    approved = {1,2,6,8}
    return 1 if code in approved else 0

y_train = train_ohe[target].apply(to_binary).values
X_train = train_ohe.drop(columns=[target]).values
y_test = test_ohe[target].apply(to_binary).values
X_test = test_ohe.drop(columns=[target]).values

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.reshape(-1,1), dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.reshape(-1,1), dtype=torch.float32).to(device)

# Train model
model = LogisticRegression(X_train_tensor.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(1, 101):
    model.train()
    preds = model(X_train_tensor)
    loss = criterion(preds, y_train_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        with torch.no_grad():
            acc = ((preds>=0.5).float()==y_train_tensor).float().mean().item()
        print(f"Epoch {epoch:03d}: Loss={loss.item():.4f}, Train Acc={acc:.4f}")

model.eval()
with torch.no_grad():
    prob_test = model(X_test_tensor)
    acc_test = ((prob_test>=0.5).float()==y_test_tensor).float().mean().item()
    print(f"Test Accuracy: {acc_test:.4f}")

feature_names = train_ohe.drop(columns=[target]).columns.tolist()

# Save model weights and metadata ONLY (no stored explanations)
weights = model.linear.weight.detach().cpu().flatten().numpy().tolist()
bias = model.linear.bias.detach().cpu().item()
scaler_mean = scaler.mean_.tolist()
scaler_scale = scaler.scale_.tolist()

# Save background data for dynamic SHAP explanations
background_data = X_train_scaled[:100].tolist()  # Small background sample

# Clean model data - NO STORED EXPLANATIONS
model_data = {
    "weights": weights,
    "bias": bias,
    "scaler_mean": scaler_mean,
    "scaler_scale": scaler_scale,
    "feature_names": feature_names,
    "background_data": background_data,  # For dynamic SHAP
    "feature_categories": {
        "numerical": numerical_cols,
        "categorical": categorical_cols
    },
    "dynamic_explanations": True  # Flag indicating dynamic explanations
}

# Save clean model data (no stored explanations)
with open('model_data.json', 'w') as f:
    json.dump(model_data, f, indent=2)

print("✅ Model data saved for DYNAMIC explanations!")
print("🚫 No stored explanations - all explanations computed on-demand")
print(f"📊 Features: {len(feature_names)}")
print(f"🎯 Background samples for SHAP: {len(background_data)}")

# Export ONNX model
model_cpu = model.cpu().eval()
dummy_in = torch.randn(1, X_train_tensor.shape[1], dtype=torch.float32)
torch.onnx.export(
    model_cpu, dummy_in, "logistic_regression_model.onnx",
    export_params=True, opset_version=11, do_constant_folding=True,
    input_names=['input'], output_names=['output'],
    dynamic_axes={'input':{0:'batch_size'}, 'output':{0:'batch_size'}}
)
print("✅ ONNX model saved")


Epoch 020: Loss=0.5006, Train Acc=0.8058
Epoch 040: Loss=0.4128, Train Acc=0.8590
Epoch 060: Loss=0.3691, Train Acc=0.8576
Epoch 080: Loss=0.3439, Train Acc=0.8608
Epoch 100: Loss=0.3273, Train Acc=0.8623
Test Accuracy: 0.9135
✅ Model data saved for DYNAMIC explanations!
🚫 No stored explanations - all explanations computed on-demand
📊 Features: 52
🎯 Background samples for SHAP: 100
✅ ONNX model saved
