# Objective 3: Covid Status Age Regression
*Notebook 04_O3_CovidAgePrediction*

## 1. 导入库

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib

## 2. 加载与预处理

In [12]:
# 2.1 加载数据
df = pd.read_csv('../data/custom_covid19.csv')
df.replace([97,98,99], np.nan, inplace=True)
# 2.2 标记死亡
df['DIED'] = (df['DATE_DIED'] != '9999-99-99').astype(int)
# 2.3 二元特征编码 + 缺失值填补
binary_cols = [
    'USMER','MEDICAL_UNIT','SEX','PATIENT_TYPE','INTUBED','PNEUMONIA',
    'PREGNANT','DIABETES','COPD','ASTHMA','INMSUPR','HYPERTENSION',
    'OTHER_DISEASE','CARDIOVASCULAR','OBESITY','RENAL_CHRONIC','TOBACCO',
    'TEST_RESULT','ICU'
]
for c in binary_cols:
    df[c] = df[c].map({1:1,2:0})
imp = SimpleImputer(strategy='median')
df[binary_cols] = imp.fit_transform(df[binary_cols])

## 3. 子集筛选：仅死亡人群 & 按 Covid 状态拆分

In [13]:
# 3.1 仅保留死亡者
df_died = df[df['DIED']==1].copy()
# 3.2 Covid 阳性 vs 阴性
df_pos = df_died[df_died['TEST_RESULT']<=3]
df_neg = df_died[df_died['TEST_RESULT']>=4]
print('Died total:', len(df_died), 'Positive:', len(df_pos), 'Negative:', len(df_neg))

Died total: 7338 Positive: 7338 Negative: 0


## 4. 训练与评估函数

In [14]:
def train_and_eval(df_sub, label):
    # 特征与目标
    y = df_sub['AGE']
    X = df_sub.drop(columns=['AGE','DATE_DIED','DIED','TEST_RESULT'])
    # 丢弃标签缺失
    mask = y.notna()
    X, y = X[mask], y[mask]
    # 划分
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    # 基线：线性回归
    lr = LinearRegression().fit(X_tr, y_tr)
    y_pred_lr = lr.predict(X_val)
    # 基线：决策树回归
    dt = DecisionTreeRegressor(random_state=42).fit(X_tr, y_tr)
    y_pred_dt = dt.predict(X_val)
    # 调优：随机森林回归
    param_rf = {'n_estimators':[100,200], 'max_depth':[5,10,15]}
    gs_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_rf, cv=5,
                         scoring='neg_mean_squared_error', n_jobs=-1)
    gs_rf.fit(X_tr, y_tr)
    rf = gs_rf.best_estimator_
    y_pred_rf = rf.predict(X_val)
    # 评估函数
    def eval_reg(name, y_true, y_pred):
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae  = mean_absolute_error(y_true, y_pred)
        r2   = r2_score(y_true, y_pred)
        print(f"--- {label} {name} ---")
        print(f"R2: {r2:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}\n")
    # 输出基线 & 调优结果
    eval_reg('LinearRegression', y_val, y_pred_lr)
    eval_reg('DecisionTree', y_val, y_pred_dt)
    print('Best RF params:', gs_rf.best_params_)
    eval_reg('RandomForest', y_val, y_pred_rf)
    # 保存模型
    joblib.dump(best_rf, 'best_age_model.pkl')
    print("Best age model saved to best_age_model.pkl")
    return rf

## 5. 训练并评估两个子集

In [15]:
# Covid 阳性死亡人群
rf_pos = train_and_eval(df_pos, 'covid_pos')
# Covid 阴性死亡人群
rf_neg = train_and_eval(df_neg, 'covid_neg')

--- covid_pos LinearRegression ---
R2: 0.0990, RMSE: 14.6120, MAE: 11.3320

--- covid_pos DecisionTree ---
R2: -0.1232, RMSE: 16.3146, MAE: 12.4136

Best RF params: {'max_depth': 5, 'n_estimators': 200}
--- covid_pos RandomForest ---
R2: 0.1017, RMSE: 14.5902, MAE: 11.2847



NameError: name 'best_rf' is not defined

## 6. 误差分布可视化

In [None]:
for label, model, df_sub in [('covid_pos', rf_pos, df_pos), ('covid_neg', rf_neg, df_neg)]:
    X_eval = df_sub.drop(columns=['AGE','DATE_DIED','DIED','TEST_RESULT'])
    errors = df_sub.loc[X_eval.index, 'AGE'] - model.predict(X_eval)
    plt.figure(figsize=(5,3))
    sns.histplot(errors, bins=20, kde=True)
    plt.title(f'Error Distribution ({label})')
    plt.xlabel('Error (Age)')
    plt.ylabel('Frequency')
    plt.show()