In [17]:
# =============================================
# Cell 0: 从 Supabase 获取数据
# =============================================
# !pip install supabase pandas scikit-learn

from supabase import create_client, Client
import pandas as pd
import time

url = "https://ptukzshzuloxipzwycte.supabase.co"
key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InB0dWt6c2h6dWxveGlwend5Y3RlIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTIxNjg0OTMsImV4cCI6MjA2Nzc0NDQ5M30.MAnlnrt0traaFjE-QV3jSKETU6woZJ8LcVIqjrAIiQ4"
supabase: Client = create_client(url, key)

table_name = "freddie_mac_delinquency_30_model_2013_2025"
rows = []
batch_size = 1000
offset = 0
max_rows = 40000

while offset < max_rows:
    print(f"Get rows {offset} - {offset + batch_size - 1} ...")
    try:
        res = supabase.table(table_name).select("*").range(offset, offset + batch_size - 1).execute()
        if not res.data:
            print("End data reading.")
            break
        rows.extend(res.data)
        offset += batch_size
        print(f"Progress：{offset}/{max_rows} ({(offset/max_rows)*100:.1f}%)")
    except Exception as e:
        print(f"Request fail：{e}，retry after 5 seconds...")
        time.sleep(5)
        continue

df = pd.DataFrame(rows)
csv_name = "freddie_mac_delinquency_balanced.csv"
df.to_csv(csv_name, index=False)
print("Raw file saved：", csv_name)
print("Total rows：", df.shape[0])
print("Headings count：", df.shape[1])
print("Headings：", df.columns.tolist())


Get rows 0 - 999 ...
Progress：1000/40000 (2.5%)
Get rows 1000 - 1999 ...
Progress：2000/40000 (5.0%)
Get rows 2000 - 2999 ...
Progress：3000/40000 (7.5%)
Get rows 3000 - 3999 ...
Progress：4000/40000 (10.0%)
Get rows 4000 - 4999 ...
Progress：5000/40000 (12.5%)
Get rows 5000 - 5999 ...
Progress：6000/40000 (15.0%)
Get rows 6000 - 6999 ...
Progress：7000/40000 (17.5%)
Get rows 7000 - 7999 ...
Progress：8000/40000 (20.0%)
Get rows 8000 - 8999 ...
Progress：9000/40000 (22.5%)
Get rows 9000 - 9999 ...
Progress：10000/40000 (25.0%)
Get rows 10000 - 10999 ...
Progress：11000/40000 (27.5%)
Get rows 11000 - 11999 ...
Progress：12000/40000 (30.0%)
Get rows 12000 - 12999 ...
Progress：13000/40000 (32.5%)
Get rows 13000 - 13999 ...
Progress：14000/40000 (35.0%)
Get rows 14000 - 14999 ...
Progress：15000/40000 (37.5%)
Get rows 15000 - 15999 ...
Progress：16000/40000 (40.0%)
Get rows 16000 - 16999 ...
Progress：17000/40000 (42.5%)
Get rows 17000 - 17999 ...
Progress：18000/40000 (45.0%)
Get rows 18000 - 18999 ...
P

In [18]:
# =============================================
# Cell 1: 全局时间划分 + Loan ID 隔离
# =============================================
# 修正后的方案：
# 1. 按全局时间划分（不是按每个 loan 内部划分）
# 2. 训练集: 2013-2021，验证集: 2022-2023，测试集: 2024-2025
# 3. 测试集的 loan 不能出现在训练集/验证集中（避免数据泄漏）
# 4. 同一个 loan 的所有记录保持在一起

import numpy as np
import pandas as pd

# 确保必要的列存在
assert 'loan_identifier' in df.columns, "loan_identifier 列不存在！"
assert 'period' in df.columns, "period 列不存在！"

target_col = "delinquency_30d_label"
assert target_col in df.columns, f"{target_col} 列不存在！"

# 转换 period 为字符串格式，并提取年份
df['period'] = df['period'].astype(str)
df['period_year'] = df['period'].str[:4].astype(int)

print("=" * 60)
print("数据时间分布")
print("=" * 60)
print(df['period_year'].value_counts().sort_index())

# =============================================
# 步骤 1: 按全局时间划分
# =============================================
# 训练集: 2013-2021 (用于训练模型)
# 验证集: 2022-2023 (用于调参和概率校准)
# 测试集: 2024-2025 (用于最终评估 - 真正的时间外推)

train_years = list(range(2013, 2022))  # 2013-2021
val_years = [2022, 2023]               # 2022-2023
test_years = [2024, 2025]              # 2024-2025

print("\n" + "=" * 60)
print("步骤 1: 按全局时间划分")
print("=" * 60)
print(f"训练集年份: {train_years}")
print(f"验证集年份: {val_years}")
print(f"测试集年份: {test_years}")

# 初步按时间划分
df_train_raw = df[df['period_year'].isin(train_years)].copy()
df_val_raw = df[df['period_year'].isin(val_years)].copy()
df_test_raw = df[df['period_year'].isin(test_years)].copy()

print(f"\n初步划分结果:")
print(f"训练集: {len(df_train_raw):,} 条记录")
print(f"验证集: {len(df_val_raw):,} 条记录")
print(f"测试集: {len(df_test_raw):,} 条记录")

# =============================================
# 步骤 2: Loan ID 隔离（避免数据泄漏）
# =============================================
print("\n" + "=" * 60)
print("步骤 2: Loan ID 隔离")
print("=" * 60)

# 找出各时间段的 loan
train_loans = set(df_train_raw['loan_identifier'].unique())
val_loans = set(df_val_raw['loan_identifier'].unique())
test_loans_raw = set(df_test_raw['loan_identifier'].unique())

print(f"训练集 Loan 数量: {len(train_loans):,}")
print(f"验证集 Loan 数量: {len(val_loans):,}")
print(f"测试集 Loan 数量: {len(test_loans_raw):,}")

# 检查重叠
train_val_overlap = train_loans & val_loans
train_test_overlap = train_loans & test_loans_raw
val_test_overlap = val_loans & test_loans_raw

print(f"\n重叠情况:")
print(f"训练集与验证集重叠 Loan: {len(train_val_overlap):,}")
print(f"训练集与测试集重叠 Loan: {len(train_test_overlap):,}")
print(f"验证集与测试集重叠 Loan: {len(val_test_overlap):,}")

# 从测试集中移除在训练集/验证集中出现过的 loan
contaminated_loans = train_loans | val_loans
df_test = df_test_raw[~df_test_raw['loan_identifier'].isin(contaminated_loans)].copy()

df_val = df_val_raw.copy()
df_train = df_train_raw.copy()

print(f"\n隔离后:")
print(f"测试集 Loan 数量: {df_test['loan_identifier'].nunique():,}")

# =============================================
# 步骤 3: 最终数据集统计
# =============================================
print("\n" + "=" * 60)
print("步骤 3: 最终数据集统计")
print("=" * 60)

print(f"训练集: {len(df_train):,} 条记录, {df_train['loan_identifier'].nunique():,} 个 loan")
print(f"验证集: {len(df_val):,} 条记录, {df_val['loan_identifier'].nunique():,} 个 loan")
print(f"测试集: {len(df_test):,} 条记录, {df_test['loan_identifier'].nunique():,} 个 loan")

# =============================================
# 步骤 4: 类别分布检查
# =============================================
print("\n" + "=" * 60)
print("步骤 4: 类别分布检查")
print("=" * 60)

train_pos_rate = df_train[target_col].mean()
val_pos_rate = df_val[target_col].mean()
test_pos_rate = df_test[target_col].mean() if len(df_test) > 0 else 0

print(f"训练集 - 违约比例: {train_pos_rate:.4f} ({df_train[target_col].sum():,} / {len(df_train):,})")
print(f"验证集 - 违约比例: {val_pos_rate:.4f} ({df_val[target_col].sum():,} / {len(df_val):,})")
print(f"测试集 - 违约比例: {test_pos_rate:.4f} ({df_test[target_col].sum():,} / {len(df_test):,})")

train_imbalance = (1 - train_pos_rate) / train_pos_rate if train_pos_rate > 0 else float('inf')
print(f"\n训练集不平衡比例: {train_imbalance:.2f}:1 (正常:违约)")

# =============================================
# 步骤 5: 数据隔离验证
# =============================================
print("\n" + "=" * 60)
print("步骤 5: 数据隔离验证")
print("=" * 60)

final_train_loans = set(df_train['loan_identifier'].unique())
final_val_loans = set(df_val['loan_identifier'].unique())
final_test_loans = set(df_test['loan_identifier'].unique()) if len(df_test) > 0 else set()

train_test_final = len(final_train_loans & final_test_loans)
val_test_final = len(final_val_loans & final_test_loans)

print(f"训练集与测试集 Loan 重叠: {train_test_final} (应该 = 0)")
print(f"验证集与测试集 Loan 重叠: {val_test_final} (应该 = 0)")

if train_test_final == 0 and val_test_final == 0:
    print("\n✅ 数据隔离验证通过！")
else:
    print("\n❌ 警告：存在数据泄漏！")

# =============================================
# 步骤 6: 时间范围检查
# =============================================
print("\n" + "=" * 60)
print("步骤 6: 时间范围检查")
print("=" * 60)
print(f"训练集时间范围: {df_train['period'].min()} - {df_train['period'].max()}")
print(f"验证集时间范围: {df_val['period'].min()} - {df_val['period'].max()}")
if len(df_test) > 0:
    print(f"测试集时间范围: {df_test['period'].min()} - {df_test['period'].max()}")
else:
    print("测试集: 无数据（所有 loan 都在训练/验证集中出现过）")

print("\n" + "=" * 60)
print("时间外推验证说明")
print("=" * 60)
print("""
✓ 训练集 (2013-2021): 用于训练模型
✓ 验证集 (2022-2023): 用于调参和概率校准
✓ 测试集 (2024-2025): 用于最终评估（真正的时间外推）

评估维度:
1. 验证集性能 → 模型对"近期数据"的预测能力
2. 测试集性能 → 模型对"未来新客户"的预测能力（最重要！）
""")


数据时间分布
period_year
2013    1140
2014    2729
2015    5361
2016    2422
2017    3585
2018    3223
2019    3729
2020    2425
2021    3501
2022    2655
2023    1101
2024    4540
2025    3589
Name: count, dtype: int64

步骤 1: 按全局时间划分
训练集年份: [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
验证集年份: [2022, 2023]
测试集年份: [2024, 2025]

初步划分结果:
训练集: 28,115 条记录
验证集: 3,756 条记录
测试集: 8,129 条记录

步骤 2: Loan ID 隔离
训练集 Loan 数量: 25,643
验证集 Loan 数量: 3,609
测试集 Loan 数量: 7,872

重叠情况:
训练集与验证集重叠 Loan: 49
训练集与测试集重叠 Loan: 0
验证集与测试集重叠 Loan: 59

隔离后:
测试集 Loan 数量: 7,813

步骤 3: 最终数据集统计
训练集: 28,115 条记录, 25,643 个 loan
验证集: 3,756 条记录, 3,609 个 loan
测试集: 8,066 条记录, 7,813 个 loan

步骤 4: 类别分布检查
训练集 - 违约比例: 0.5013 (14,095 / 28,115)
验证集 - 违约比例: 0.4712 (1,770 / 3,756)
测试集 - 违约比例: 0.5055 (4,077 / 8,066)

训练集不平衡比例: 0.99:1 (正常:违约)

步骤 5: 数据隔离验证
训练集与测试集 Loan 重叠: 0 (应该 = 0)
验证集与测试集 Loan 重叠: 0 (应该 = 0)

✅ 数据隔离验证通过！

步骤 6: 时间范围检查
训练集时间范围: 201307 - 202112
验证集时间范围: 202201 - 202312
测试集时间范围: 202401 - 202506

时间外推验证说明

✓ 训练集 (2013-2021

In [None]:
# =============================================
# Cell 2: 数据清洗和特征工程（分别处理三个数据集）
# =============================================
# 原则：
# 1. 清洗逻辑对三个数据集分别应用
# 2. LabelEncoder 只用训练集 fit，验证/测试集只 transform
# 3. 缺失值填充用训练集的统计量

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold

target_col = "delinquency_30d_label"

# =============================================
# 定义需要保留和删除的列
# =============================================
must_keep_cols = [
    # "period_year",  # 移除！会导致时间外推问题（验证集有训练集没见过的年份）
    "period_month",  # 月份可以保留（1-12 循环）
    "credit_score", "original_loan_to_value_ltv",
    "original_debt_to_income_dti_ratio", "current_interest_rate",
    "loan_age_years", "interest_rate_diff"
]

# 这些列是潜在的数据泄漏或不适合作为特征
leakage_cols = [
    "loan_identifier",  # ID 列，划分后需要删除
    "first_payment_date", "maturity_date",
    "loan_age", "remaining_months_to_legal_maturity",
    "current_loan_delinquency_status", "payment_history",  # ⚠️ 直接泄漏！
    "loan_to_value_ratio_bucket", "credit_score_bucket",
    "high_dti_flag", "loan_size_bucket", "interest_rate_bucket",
    "seasonality_flag",
    "period_year",  # 时间外推问题
    # ⚠️ 新增可疑泄漏特征
    "recent_delinquency_flag",  # 直接泄漏！这个特征可能基于目标变量计算
    "distressed_principal_balance_flag",  # 可能泄漏
    "interest_rate_diff",  # 如果是当前利率与原始利率的差，可能OK；但需要确认
]

# =============================================
# 数据清洗函数
# =============================================
def clean_dataset(df, name, drop_leakage=True):
    """对单个数据集进行基本清洗"""
    df = df.copy()
    
    # 删除潜在泄漏列（但保留 loan_identifier 用于后续检查）
    if drop_leakage:
        cols_to_drop = [col for col in leakage_cols if col in df.columns and col != 'loan_identifier']
        df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
    
    # 转换数值类型
    for col in df.columns:
        if df[col].dtype == object and col not in [target_col, 'loan_identifier', 'period']:
            try:
                df[col] = pd.to_numeric(df[col])
            except:
                pass
    
    print(f"{name} 清洗后: {df.shape}")
    return df

# 对三个数据集分别清洗
df_train_clean = clean_dataset(df_train, "训练集")
df_val_clean = clean_dataset(df_val, "验证集")
df_test_clean = clean_dataset(df_test, "测试集")

# =============================================
# 基于训练集确定要删除的列
# =============================================
print("\n" + "=" * 50)
print("基于训练集确定特征选择")
print("=" * 50)

# 1. 高缺失率列（基于训练集）
missing_rate = df_train_clean.isnull().mean()
high_missing_cols = missing_rate[missing_rate > 0.4].index.tolist()
high_missing_cols = [col for col in high_missing_cols if col not in must_keep_cols + [target_col, 'loan_identifier', 'period']]
print(f"高缺失率列 (>40%): {high_missing_cols}")

# 2. 低方差列（基于训练集）
numeric_cols = [col for col in df_train_clean.select_dtypes(include=[np.number]).columns 
                if col not in [target_col, 'loan_identifier']]
if numeric_cols:
    selector = VarianceThreshold(threshold=0.01)
    try:
        selector.fit(df_train_clean[numeric_cols].fillna(0))
        low_var_cols = [col for col, var in zip(numeric_cols, selector.variances_) 
                        if var < 0.01 and col not in must_keep_cols]
    except:
        low_var_cols = []
else:
    low_var_cols = []
print(f"低方差列: {low_var_cols}")

# 3. 高相关列（基于训练集）
numeric_cols_for_corr = [col for col in df_train_clean.select_dtypes(include=[np.number]).columns 
                          if col not in [target_col, 'loan_identifier'] + high_missing_cols + low_var_cols]
if len(numeric_cols_for_corr) > 1:
    corr_matrix = df_train_clean[numeric_cols_for_corr].corr()
    drop_corr = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > 0.9:
                if corr_matrix.columns[i] not in must_keep_cols:
                    drop_corr.add(corr_matrix.columns[i])
    drop_corr = list(drop_corr)
else:
    drop_corr = []
print(f"高相关列 (>0.9): {drop_corr}")

# 合并所有要删除的列
cols_to_remove = list(set(high_missing_cols + low_var_cols + drop_corr))
print(f"\n总共删除列数: {len(cols_to_remove)}")

# 对三个数据集应用相同的列删除
for df_clean, name in [(df_train_clean, "训练集"), (df_val_clean, "验证集"), (df_test_clean, "测试集")]:
    df_clean.drop(columns=[c for c in cols_to_remove if c in df_clean.columns], inplace=True, errors='ignore')
    print(f"{name} 删除后: {df_clean.shape}")

# =============================================
# LabelEncoder：只用训练集 fit
# =============================================
print("\n" + "=" * 50)
print("LabelEncoder 编码（只用训练集 fit）")
print("=" * 50)

# 找出需要编码的列（排除 ID 和 period）
obj_cols = [col for col in df_train_clean.select_dtypes(include=['object']).columns 
            if col not in [target_col, 'loan_identifier', 'period']]
print(f"需要编码的列: {obj_cols}")

label_encoders = {}
for col in obj_cols:
    le = LabelEncoder()
    
    # 只用训练集 fit
    le.fit(df_train_clean[col].astype(str))
    
    # 处理验证集和测试集中可能出现的未知类别
    def safe_transform(series, encoder):
        """安全转换，未知类别用 -1 表示"""
        result = []
        for val in series.astype(str):
            if val in encoder.classes_:
                result.append(encoder.transform([val])[0])
            else:
                result.append(-1)  # 未知类别
        return result
    
    # 转换三个数据集
    df_train_clean[col] = le.transform(df_train_clean[col].astype(str))
    df_val_clean[col] = safe_transform(df_val_clean[col], le)
    df_test_clean[col] = safe_transform(df_test_clean[col], le)
    
    label_encoders[col] = le

print(f"编码完成，共 {len(label_encoders)} 个编码器")

# =============================================
# 缺失值填充：用训练集的统计量
# =============================================
print("\n" + "=" * 50)
print("缺失值填充（用训练集统计量）")
print("=" * 50)

# 计算训练集的填充值
fill_values = {}
for col in df_train_clean.columns:
    if col in [target_col, 'loan_identifier', 'period']:
        continue
    if df_train_clean[col].dtype == 'object':
        fill_values[col] = df_train_clean[col].mode()[0] if len(df_train_clean[col].mode()) > 0 else 'unknown'
    else:
        fill_values[col] = df_train_clean[col].median()

# 用训练集的统计量填充三个数据集
for df_clean, name in [(df_train_clean, "训练集"), (df_val_clean, "验证集"), (df_test_clean, "测试集")]:
    for col, fill_val in fill_values.items():
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna(fill_val)
    missing_count = df_clean.isnull().sum().sum()
    print(f"{name} 填充后缺失值: {missing_count}")

# =============================================
# 删除 loan_identifier 和 period（不作为特征）
# =============================================
for df_clean in [df_train_clean, df_val_clean, df_test_clean]:
    df_clean.drop(columns=['loan_identifier', 'period'], inplace=True, errors='ignore')

# =============================================
# 最终数据集信息
# =============================================
print("\n" + "=" * 50)
print("最终数据集")
print("=" * 50)
print(f"训练集: {df_train_clean.shape}")
print(f"验证集: {df_val_clean.shape}")
print(f"测试集: {df_test_clean.shape}")
print(f"\n特征列: {[c for c in df_train_clean.columns if c != target_col]}")


训练集 清洗后: (28115, 39)
验证集 清洗后: (3756, 39)
测试集 清洗后: (8066, 39)

基于训练集确定特征选择
高缺失率列 (>40%): ['modification_flag', 'delinquency_due_to_disaster', 'bankruptcy_flag', 'number_of_modifications', 'modification_debt_to_income_ratio', 'interest_rate_step_indicator', 'property_valuation_method', 'borrower_assistance_plan', 'payment_deferral_flag', 'state_default_rate', 'msa_default_rate', 'modification_history_flag']
低方差列: ['number_of_modifications', 'state_default_rate', 'msa_default_rate', 'modification_history_flag']
高相关列 (>0.9): []

总共删除列数: 12
训练集 删除后: (28115, 27)
验证集 删除后: (3756, 27)
测试集 删除后: (8066, 27)

LabelEncoder 编码（只用训练集 fit）
需要编码的列: ['amortization_type', 'seller_name', 'property_state', 'loan_purpose', 'channel', 'property_type', 'occupancy_status', 'first_time_homebuyer_indicator', 'distressed_principal_balance_flag']
编码完成，共 9 个编码器

缺失值填充（用训练集统计量）
训练集 填充后缺失值: 0
验证集 填充后缺失值: 0
测试集 填充后缺失值: 0

最终数据集
训练集: (28115, 25)
验证集: (3756, 25)
测试集: (8066, 25)

特征列: ['period_month', 'amortization_type',

In [23]:
# =============================================
# Cell 3: 准备特征矩阵和目标变量
# =============================================
import numpy as np

target_col = "delinquency_30d_label"

# =============================================
# 准备 X 和 y
# =============================================
feature_cols = [c for c in df_train_clean.columns if c != target_col]

X_train = df_train_clean[feature_cols].values
y_train = df_train_clean[target_col].astype(int).values

X_val = df_val_clean[feature_cols].values
y_val = df_val_clean[target_col].astype(int).values

X_test = df_test_clean[feature_cols].values
y_test = df_test_clean[target_col].astype(int).values

print("=" * 50)
print("特征矩阵和目标变量")
print("=" * 50)
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# =============================================
# 计算样本权重（基于训练集类别分布）
# =============================================
print("\n" + "=" * 50)
print("样本权重计算")
print("=" * 50)

class_counts = np.bincount(y_train)
w0 = class_counts.sum() / (2.0 * class_counts[0]) if class_counts[0] > 0 else 1.0
w1 = class_counts.sum() / (2.0 * class_counts[1]) if class_counts[1] > 0 else 1.0

print(f"类别 0 (正常) 数量: {class_counts[0]}, 权重: {w0:.4f}")
print(f"类别 1 (违约) 数量: {class_counts[1]}, 权重: {w1:.4f}")

# 为训练集分配样本权重
sw_train = np.where(y_train == 1, w1, w0).astype(float)

print(f"\n样本权重分配完成")
print(f"训练集样本权重范围: [{sw_train.min():.4f}, {sw_train.max():.4f}]")

# =============================================
# 保存特征列名（用于后续分析）
# =============================================
# 明确指定分类特征（这些特征的类别在训练集和验证/测试集中应该一致）
# 注意：不要把 period_year 等时间相关特征作为分类特征！
explicit_cat_cols = [
    'amortization_type', 'loan_purpose', 'channel', 'property_type',
    'occupancy_status', 'first_time_homebuyer_indicator',
    'number_of_units', 'distressed_principal_balance_flag',
    'period_month'  # 月份 1-12，在所有数据集中都有
]

# 识别分类特征和连续特征
cat_cols = [col for col in feature_cols if col in explicit_cat_cols]
cont_cols = [col for col in feature_cols if col not in cat_cols]

print(f"\n特征类型:")
print(f"分类特征 ({len(cat_cols)}): {cat_cols}")
print(f"连续特征 ({len(cont_cols)}): {cont_cols}")

# 检查分类特征的类别是否在训练集中都出现过
print("\n分类特征类别检查:")
for col in cat_cols:
    train_cats = set(df_train_clean[col].unique())
    val_cats = set(df_val_clean[col].unique())
    test_cats = set(df_test_clean[col].unique()) if len(df_test_clean) > 0 else set()
    
    val_new = val_cats - train_cats
    test_new = test_cats - train_cats
    
    if val_new or test_new:
        print(f"  ⚠️ {col}: 验证集新类别={val_new}, 测试集新类别={test_new}")
    else:
        print(f"  ✓ {col}: OK")


特征矩阵和目标变量
X_train: (28115, 24), y_train: (28115,)
X_val: (3756, 24), y_val: (3756,)
X_test: (8066, 24), y_test: (8066,)

样本权重计算
类别 0 (正常) 数量: 14020, 权重: 1.0027
类别 1 (违约) 数量: 14095, 权重: 0.9973

样本权重分配完成
训练集样本权重范围: [0.9973, 1.0027]

特征类型:
分类特征 (9): ['period_month', 'amortization_type', 'loan_purpose', 'channel', 'property_type', 'number_of_units', 'occupancy_status', 'first_time_homebuyer_indicator', 'distressed_principal_balance_flag']
连续特征 (15): ['seller_name', 'property_state', 'msa', 'original_loan_term', 'original_interest_rate', 'original_upb', 'credit_score', 'original_loan_to_value_ltv', 'original_debt_to_income_dti_ratio', 'mortgage_insurance_percentage_mi_percent', 'current_interest_rate', 'current_actual_upb', 'loan_age_years', 'interest_rate_diff', 'recent_delinquency_flag']

分类特征类别检查:
  ✓ period_month: OK
  ✓ amortization_type: OK
  ✓ loan_purpose: OK
  ✓ channel: OK
  ✓ property_type: OK
  ✓ number_of_units: OK
  ✓ occupancy_status: OK
  ✓ first_time_homebuyer_indicator: OK

In [24]:
# =============================================
# Cell 4: LogisticGAM 模型训练
# =============================================
from pygam import LogisticGAM, s, f
from sklearn.metrics import roc_auc_score, f1_score, classification_report, log_loss, brier_score_loss
from sklearn.isotonic import IsotonicRegression
import numpy as np

# =============================================
# 构建 GAM terms
# =============================================
def build_gam_terms(feature_cols, cat_cols):
    """根据特征类型构建 GAM terms"""
    terms = None
    for i, col in enumerate(feature_cols):
        if col in cat_cols:
            t = f(i)  # 分类特征用 factor term
        else:
            t = s(i, n_splines=8, spline_order=3)  # 连续特征用 spline term
        terms = t if terms is None else terms + t
    return terms

terms = build_gam_terms(feature_cols, cat_cols)
print("GAM terms 构建完成")

# =============================================
# 超参数搜索（使用验证集）
# =============================================
print("\n" + "=" * 50)
print("超参数搜索")
print("=" * 50)

best_lam, best_score, best_model = None, np.inf, None
lam_values = [10, 20, 40, 80, 120, 160, 240, 320, 480]

for lam_val in lam_values:
    try:
        model = LogisticGAM(terms, lam=lam_val).fit(X_train, y_train, weights=sw_train)
        proba = np.clip(model.predict_proba(X_val), 1e-6, 1-1e-6)
        loss = log_loss(y_val, proba)
        print(f"  lam={lam_val:3d} -> val_log_loss={loss:.4f}")
        
        if loss < best_score:
            best_score = loss
            best_lam = lam_val
            best_model = model
    except Exception as e:
        print(f"  lam={lam_val:3d} -> 训练失败: {e}")

print(f"\n最佳 lambda: {best_lam}, 最佳验证集 log_loss: {best_score:.4f}")

# 检查是否找到了有效的模型
if best_model is None:
    raise ValueError("所有 lambda 值都训练失败！请检查数据和特征。")

# =============================================
# 概率校准（Isotonic Regression）
# =============================================
print("\n" + "=" * 50)
print("概率校准")
print("=" * 50)

# 用验证集进行概率校准
val_proba_raw = np.clip(best_model.predict_proba(X_val), 1e-6, 1-1e-6)
iso_reg = IsotonicRegression(out_of_bounds='clip')
iso_reg.fit(val_proba_raw, y_val)

print("Isotonic Regression 校准完成")

# =============================================
# 在测试集上评估
# =============================================
print("\n" + "=" * 50)
print("测试集评估")
print("=" * 50)

# 原始概率
test_proba_raw = np.clip(best_model.predict_proba(X_test), 1e-6, 1-1e-6)

# 校准后概率
test_proba_cal = np.clip(iso_reg.predict(test_proba_raw), 1e-6, 1-1e-6)

# 评估指标
auc_raw = roc_auc_score(y_test, test_proba_raw)
auc_cal = roc_auc_score(y_test, test_proba_cal)

brier_raw = brier_score_loss(y_test, test_proba_raw)
brier_cal = brier_score_loss(y_test, test_proba_cal)

logloss_raw = log_loss(y_test, test_proba_raw)
logloss_cal = log_loss(y_test, test_proba_cal)

print(f"{'指标':<15} {'原始概率':<12} {'校准后概率':<12}")
print("-" * 40)
print(f"{'AUC-ROC':<15} {auc_raw:<12.4f} {auc_cal:<12.4f}")
print(f"{'Brier Score':<15} {brier_raw:<12.4f} {brier_cal:<12.4f}")
print(f"{'Log Loss':<15} {logloss_raw:<12.4f} {logloss_cal:<12.4f}")

# 使用最优阈值计算 F1
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test, test_proba_cal)
f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
best_threshold = thresholds[np.argmax(f1_scores)]
best_f1 = np.max(f1_scores)

print(f"\n最优阈值: {best_threshold:.4f}")
print(f"最优 F1 Score: {best_f1:.4f}")

# 分类报告
y_pred = (test_proba_cal >= best_threshold).astype(int)
print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=['正常', '违约']))


GAM terms 构建完成

超参数搜索


  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T


  lam= 10 -> val_log_loss=0.0221


  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T


  lam= 20 -> val_log_loss=0.0219


  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T


  lam= 40 -> val_log_loss=0.0218


  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T


  lam= 80 -> val_log_loss=0.0217


  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T


  lam=120 -> val_log_loss=0.0217


  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T


  lam=160 -> val_log_loss=0.0217


  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T


  lam=240 -> val_log_loss=0.0217


  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T


  lam=320 -> val_log_loss=0.0217
  lam=480 -> val_log_loss=0.0217

最佳 lambda: 480, 最佳验证集 log_loss: 0.0217

概率校准
Isotonic Regression 校准完成

测试集评估
指标              原始概率         校准后概率       
----------------------------------------
AUC-ROC         0.9972       0.9971      
Brier Score     0.0029       0.0028      
Log Loss        0.0189       0.0184      

最优阈值: 0.9362
最优 F1 Score: 0.9972

分类报告:
              precision    recall  f1-score   support

          正常       1.00      0.99      1.00      3989
          违约       0.99      1.00      1.00      4077

    accuracy                           1.00      8066
   macro avg       1.00      1.00      1.00      8066
weighted avg       1.00      1.00      1.00      8066



  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
  B = (u @ vh[:rank]).conj().T
