### labs_first_day_lgbm

In [1]:
import psycopg2
import pandas as pd
import joblib
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# ================== Database configuration ==================
import os

# Connection to the local MIMIC-III PostgreSQL instance.
# The password can be supplied through the PGPASSWORD environment variable so
# that credentials do not have to live in the notebook; the previous literal
# is kept as the fallback so existing local setups keep working unchanged.
con = psycopg2.connect(
    database="mimiciii",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "123456"),
    host="localhost",
    port="5433"
)

# ================== SQL queries ==================
# Positive samples: first-day lab rows of admissions that carry a diagnosis
# whose long title mentions renal/kidney/liver/spleen failure (match_flag = 1).
query_positive = '''
SELECT DISTINCT lfd.*,
                p.gender,
                1 AS match_flag
FROM labs_first_day lfd
LEFT JOIN diagnoses_icd d ON d.hadm_id = lfd.hadm_id
LEFT JOIN d_icd_diagnoses di ON di.icd9_code = d.icd9_code
LEFT JOIN patients p ON p.subject_id = d.subject_id
WHERE (di.long_title ILIKE '% renal fail%'
    OR di.long_title ILIKE '%kidney fail%'
    OR di.long_title ILIKE '%liver fail%'
    OR di.long_title ILIKE '%spleen fail%')
'''

# Negative samples: randomly ordered first-day lab rows of admissions that are
# NOT in positive_hadm (match_flag = 0).
# NOTE: the text 'SELECT COUNT(*) FROM positive_hadm' inside the LIMIT clause is
# replaced at run time with the number of positive rows actually fetched, so it
# must stay textually intact.
query_negative = '''
WITH positive_hadm AS (
    SELECT DISTINCT d.hadm_id
    FROM diagnoses_icd d
    JOIN d_icd_diagnoses di ON di.icd9_code = d.icd9_code
    WHERE di.long_title ILIKE '% renal fail%'
        OR di.long_title ILIKE '%kidney fail%'
        OR di.long_title ILIKE '%liver fail%'
        OR di.long_title ILIKE '%spleen fail%'
)
SELECT
    lfd.*,
    p.gender,
    0 AS match_flag
FROM labs_first_day lfd
LEFT JOIN patients p ON p.subject_id = lfd.subject_id
WHERE NOT EXISTS (
    SELECT 1
    FROM positive_hadm ph
    WHERE ph.hadm_id = lfd.hadm_id
)
ORDER BY RANDOM()
LIMIT (SELECT COUNT(*) FROM positive_hadm);
'''


#================== 数据获取部分 ==================
def get_dynamic_keep_columns(cursor_description):
    """Build the list of columns to keep from a cursor-style description.

    :param cursor_description: iterable of sequences whose first element is a
        column name (the shape of a DB-API ``cursor.description``).
    :return: the column names in their original order, minus the excluded
        identifier columns.
    :raises ValueError: if ``hadm_id`` or ``match_flag`` is not present.
    """
    names = [entry[0] for entry in cursor_description]

    # Both key columns are mandatory; fail on the first one that is missing.
    for mandatory in ('hadm_id', 'match_flag'):
        if mandatory not in names:
            raise ValueError(f"关键列 {mandatory} 不在查询结果中！")

    # Identifier columns that are dropped from the result (adjust as needed).
    dropped = ('row_id', 'subject_id', 'icustay_id')
    return list(filter(lambda name: name not in dropped, names))


import os

# Fetch positive and negative samples, merge them and write the raw extract.
# `cur` is bound before the try so the finally clause can never hit an unbound
# name when con.cursor() itself fails.
cur = None
try:
    cur = con.cursor()

    # Fetch positive samples.
    print("正在获取阳性样本...")
    cur.execute(query_positive)
    positive_columns = [desc[0] for desc in cur.description]  # column names
    positive_df = pd.DataFrame(cur.fetchall(), columns=positive_columns)
    print(f"阳性样本获取完成，共 {len(positive_df)} 条")

    # Size the negative sample to the number of positive rows. A separate
    # variable is used so the query template is not overwritten and this step
    # stays correct when re-run.
    # NOTE: the limit is a row count; the later de-duplication on hadm_id can
    # therefore leave the two classes with different sizes.
    negative_sql = query_negative.replace('SELECT COUNT(*) FROM positive_hadm', str(len(positive_df)))

    # Fetch negative samples.
    print("正在获取阴性样本...")
    cur.execute(negative_sql)
    negative_columns = [desc[0] for desc in cur.description]  # column names
    negative_df = pd.DataFrame(cur.fetchall(), columns=negative_columns)
    print(f"阴性样本获取完成，共 {len(negative_df)} 条")

    # Merge both classes into one frame.
    combined_df = pd.concat([positive_df, negative_df], ignore_index=True)

    # Derive the kept columns from the merged frame itself rather than from
    # whichever query the cursor happened to run last.
    keep_columns = get_dynamic_keep_columns([(col,) for col in combined_df.columns])
    print(f"动态保留的列：{keep_columns}")

    # Keep only the required columns.
    combined_df = combined_df[keep_columns]

    # Save the raw extract, creating the target directory if necessary.
    raw_output = "./data/labs_first_day_raw.csv"
    os.makedirs(os.path.dirname(raw_output), exist_ok=True)
    combined_df.to_csv(raw_output, index=False)
    print(f"原始数据已保存到 {raw_output}")

except Exception as e:
    print(f"数据库操作失败：{str(e)}")
    # Re-raise: continuing would make the preprocessing step read a stale or
    # missing CSV file.
    raise
finally:
    if cur is not None:
        cur.close()
    if con:
        con.close()

# ================== Data preprocessing ==================
import os

# Make sure the model directory exists before the expensive imputation runs,
# so that saving the fitted objects cannot fail afterwards.
os.makedirs("./models", exist_ok=True)

# Load the raw extract.
df = pd.read_csv('./data/labs_first_day_raw.csv')

# 1. Re-apply the column filter to the columns found in the CSV.
keep_columns = get_dynamic_keep_columns([(col,) for col in df.columns])  # mimics cursor.description
df = df[keep_columns]

# 2. Keep one row per admission (hadm_id). The explicit copy lets the column
#    assignments below operate on an independent frame.
df = df.drop_duplicates(subset=['hadm_id'], keep='first').copy()

# 3. Encode gender: M -> 1, F -> 0, anything else -> -1.
df['gender'] = df['gender'].map({'M': 1, 'F': 0}).fillna(-1).astype(int)

# 4. Impute missing values in every column except hadm_id and the label.
impute_cols = [col for col in df.columns if col not in ['hadm_id', 'match_flag']]
imputer = IterativeImputer(
    estimator=LGBMRegressor(n_estimators=50, random_state=42),
    max_iter=10,
    random_state=42
)
df[impute_cols] = imputer.fit_transform(df[impute_cols])
# Persist the fitted imputer.
joblib.dump(imputer, "./models/labs_first_day_lgbm.pkl")
print("插值模型已保存！")

# 5. Standardize the features (hadm_id and the label are excluded).
features = df.drop(columns=['hadm_id', 'match_flag'])
scaler = StandardScaler().fit(features)
# Persist the fitted scaler.
scaler_path = "./models/labs_first_day_lgbm_standard_scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"标准化模型已保存至 {scaler_path}")

features_scaled = scaler.transform(features)

# Reassemble the final frame: hadm_id, scaled features, label. Indexes are
# reset so the three parts align row by row.
df_final = pd.DataFrame(features_scaled, columns=features.columns)
df_final = pd.concat([
    df[['hadm_id']].reset_index(drop=True),
    df_final,
    df['match_flag'].reset_index(drop=True)
], axis=1)

# Save the final data set.
final_output = './data/cleaned_labs_first_day_lgbm.csv'
df_final.to_csv(final_output, index=False)
print(f"处理后的数据已保存到 {final_output}")
print("数据特征分布：\n", df_final.describe())

# ================== Validation ==================
# Check that hadm_id survived and report the class balance.
assert 'hadm_id' in df_final.columns, "hadm_id列丢失！"
print("\n验证结果：")
print(f"总样本数：{len(df_final)}")
print(f"阳性样本数：{df_final.match_flag.sum()}")
print(f"阴性样本数：{len(df_final) - df_final.match_flag.sum()}")
print(f"唯一hadm_id数量：{df_final.hadm_id.nunique()}")

正在获取阳性样本...
阳性样本获取完成，共 12888 条
正在获取阴性样本...
阴性样本获取完成，共 12888 条
动态保留的列：['hadm_id', 'aniongap_min', 'aniongap_max', 'albumin_min', 'albumin_max', 'bands_min', 'bands_max', 'bicarbonate_min', 'bicarbonate_max', 'bilirubin_min', 'bilirubin_max', 'creatinine_min', 'creatinine_max', 'chloride_min', 'chloride_max', 'glucose_min', 'glucose_max', 'hematocrit_min', 'hematocrit_max', 'hemoglobin_min', 'hemoglobin_max', 'lactate_min', 'lactate_max', 'platelet_min', 'platelet_max', 'potassium_min', 'potassium_max', 'ptt_min', 'ptt_max', 'inr_min', 'inr_max', 'pt_min', 'pt_max', 'sodium_min', 'sodium_max', 'bun_min', 'bun_max', 'wbc_min', 'wbc_max', 'gender', 'match_flag']
原始数据已保存到 ./data/labs_first_day_raw.csv
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5839
[LightGBM] [Info] Number of data points in the train set: 24120, number of used features: 38
[Ligh



插值模型已保存！
标准化模型已保存至 ./models/labs_first_day_lgbm_standard_scaler.pkl
处理后的数据已保存到 ./data/cleaned_labs_first_day_lgbm.csv
数据特征分布：
              hadm_id  aniongap_min  aniongap_max   albumin_min   albumin_max  \
count   24120.000000  2.412000e+04  2.412000e+04  2.412000e+04  2.412000e+04   
mean   150082.362687  4.713385e-17  2.262425e-16  1.885354e-17  2.262425e-16   
std     28885.723621  1.000021e+00  1.000021e+00  1.000021e+00  1.000021e+00   
min    100001.000000 -3.552508e+00 -2.897971e+00 -4.344937e+00 -4.602457e+00   
25%    125097.250000 -7.233269e-01 -7.037127e-01 -4.138343e-01 -3.857830e-01   
50%    150201.500000 -1.574907e-01 -1.052786e-01  7.140714e-02 -1.006574e-02   
75%    175010.000000  4.828870e-01  3.276020e-01  5.078758e-01  5.649661e-01   
max    199999.000000  8.612970e+00  8.272799e+00  5.158488e+00  5.112298e+00   

          bands_min     bands_max  bicarbonate_min  bicarbonate_max  \
count  2.412000e+04  2.412000e+04     2.412000e+04     2.412000e+04   
mean   1.1

### microbiologyevents

In [2]:
import psycopg2
import pandas as pd
import joblib
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# ================== Database configuration ==================
import os

# Connection to the local MIMIC-III PostgreSQL instance.
# The password can be supplied through the PGPASSWORD environment variable so
# that credentials do not have to live in the notebook; the previous literal
# is kept as the fallback so existing local setups keep working unchanged.
con = psycopg2.connect(
    database="mimiciii",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "123456"),
    host="localhost",
    port="5433"
)

# ================== SQL queries ==================
# Positive samples: microbiology rows (plus first-day urine output and gender)
# of admissions with a renal/kidney/liver/spleen failure diagnosis that also
# have a labevents row with itemid 50912 and valuenum <= 150 (match_flag = 1).
# NOTE(review): itemid 50912 is presumably creatinine - confirm against d_labitems.
query_positive = '''
select  distinct m.hadm_id,
				m.spec_itemid,
				m.org_itemid,
				m.isolate_num,
				m.ab_itemid,
				m.dilution_text,
				m.dilution_value,
    			uofd.urineoutput,
				p.gender,
				1 AS match_flag
from microbiologyevents m
left join labevents l on l.hadm_id=m.hadm_id
left join urine_output_first_day uofd on uofd.hadm_id = m.hadm_id
left join diagnoses_icd d on d.hadm_id=m.hadm_id
left join d_icd_diagnoses di on di.icd9_code=d.icd9_code
left join patients p  on p.subject_id = d.subject_id
where l.itemid = 50912
	AND l.valuenum <= 150
	and(di.long_title ilike '% renal fail%' 
	or di.long_title ilike '%kidney fail%'
	or di.long_title ilike '%liver fail%'
	or di.long_title ilike '%spleen fail%')
'''

# Negative samples: randomly ordered distinct microbiology rows of admissions
# that are NOT in positive_hadm (match_flag = 0). positive_hadm is defined by
# diagnosis alone; it does not apply the labevents condition used above.
# NOTE: the text 'SELECT COUNT(*) FROM positive_hadm' inside the LIMIT clause is
# replaced at run time with the number of positive rows actually fetched, so it
# must stay textually intact.
query_negative = '''
WITH positive_hadm AS (
    SELECT DISTINCT d.hadm_id
    FROM diagnoses_icd d
    JOIN d_icd_diagnoses di ON di.icd9_code = d.icd9_code
    WHERE di.long_title ILIKE '% renal fail%' 
        OR di.long_title ILIKE '%kidney fail%'
        OR di.long_title ILIKE '%liver fail%'
        OR di.long_title ILIKE '%spleen fail%'
)
SELECT * FROM (
    SELECT 
        DISTINCT m.hadm_id,
        m.spec_itemid,
        m.org_itemid,
        m.isolate_num,
        m.ab_itemid,
        m.dilution_text,
        m.dilution_value,
        uofd.urineoutput,
        p.gender,
        0 AS match_flag 
    FROM microbiologyevents m
    left join urine_output_first_day uofd on uofd.hadm_id = m.hadm_id
    LEFT JOIN patients p ON p.subject_id = m.subject_id
    WHERE NOT EXISTS (
        SELECT 1 
        FROM positive_hadm ph 
        WHERE ph.hadm_id = m.hadm_id
    )
) AS subquery
ORDER BY RANDOM()
LIMIT (SELECT COUNT(*) FROM positive_hadm);

'''

# ================== 数据获取部分 ==================
def get_dynamic_keep_columns(cursor_description):
    """Build the list of columns to keep from a cursor-style description.

    :param cursor_description: iterable of sequences whose first element is a
        column name (the shape of a DB-API ``cursor.description``).
    :return: the column names in their original order, minus the excluded
        identifier columns.
    :raises ValueError: if ``hadm_id`` or ``match_flag`` is not present.
    """
    names = [entry[0] for entry in cursor_description]

    # Both key columns are mandatory; fail on the first one that is missing.
    for mandatory in ('hadm_id', 'match_flag'):
        if mandatory not in names:
            raise ValueError(f"关键列 {mandatory} 不在查询结果中！")

    # Identifier columns that are dropped from the result (adjust as needed).
    dropped = ('row_id', 'subject_id', 'icustay_id')
    return list(filter(lambda name: name not in dropped, names))

import os

# Fetch positive and negative samples, merge them and write the raw extract.
# `cur` is bound before the try so the finally clause can never hit an unbound
# name when con.cursor() itself fails.
cur = None
try:
    cur = con.cursor()

    # Fetch positive samples.
    print("正在获取阳性样本...")
    cur.execute(query_positive)
    positive_columns = [desc[0] for desc in cur.description]  # column names
    positive_df = pd.DataFrame(cur.fetchall(), columns=positive_columns)
    print(f"阳性样本获取完成，共 {len(positive_df)} 条")

    # Size the negative sample to the number of positive rows. A separate
    # variable is used so the query template is not overwritten and this step
    # stays correct when re-run.
    # NOTE: the limit is a row count; the later de-duplication on hadm_id can
    # therefore leave the two classes with different sizes.
    negative_sql = query_negative.replace('SELECT COUNT(*) FROM positive_hadm', str(len(positive_df)))

    # Fetch negative samples.
    print("正在获取阴性样本...")
    cur.execute(negative_sql)
    negative_columns = [desc[0] for desc in cur.description]  # column names
    negative_df = pd.DataFrame(cur.fetchall(), columns=negative_columns)
    print(f"阴性样本获取完成，共 {len(negative_df)} 条")

    # Merge both classes into one frame.
    combined_df = pd.concat([positive_df, negative_df], ignore_index=True)

    # Derive the kept columns from the merged frame itself rather than from
    # whichever query the cursor happened to run last.
    keep_columns = get_dynamic_keep_columns([(col,) for col in combined_df.columns])
    print(f"动态保留的列：{keep_columns}")

    # Keep only the required columns.
    combined_df = combined_df[keep_columns]

    # Save the raw extract, creating the target directory if necessary.
    raw_output = "./data/microbiologyevents_plus_raw.csv"
    os.makedirs(os.path.dirname(raw_output), exist_ok=True)
    combined_df.to_csv(raw_output, index=False)
    print(f"原始数据已保存到 {raw_output}")

except Exception as e:
    print(f"数据库操作失败：{str(e)}")
    # Re-raise: continuing would make the preprocessing step read a stale or
    # missing CSV file.
    raise
finally:
    if cur is not None:
        cur.close()
    if con:
        con.close()

# ================== Data preprocessing ==================
import os

from lightgbm import LGBMRegressor  # used below but not imported by this cell
from sklearn.preprocessing import LabelEncoder

# Make sure the model directory exists before the expensive imputation runs,
# so that saving the fitted objects cannot fail afterwards.
os.makedirs("./models", exist_ok=True)

# Load the raw extract.
df = pd.read_csv('./data/microbiologyevents_plus_raw.csv')

# 1. Re-apply the column filter to the columns found in the CSV.
keep_columns = get_dynamic_keep_columns([(col,) for col in df.columns])  # mimics cursor.description
df = df[keep_columns]

# Reduce dilution_text to its first run of digits/dots and convert it to a
# number. expand=False yields a Series instead of a one-column DataFrame, and
# errors='coerce' turns a malformed match (e.g. a lone '.') into NaN rather
# than aborting; NaN values are imputed in step 4.
df['dilution_text'] = pd.to_numeric(
    df['dilution_text'].str.extract(r'([\d\.]+)', expand=False),
    errors='coerce',
)

# 2. Keep one row per admission (hadm_id). The explicit copy lets the column
#    assignments below operate on an independent frame.
df = df.drop_duplicates(subset=['hadm_id'], keep='first').copy()

# 3. Encode gender: M -> 1, F -> 0, anything else -> -1.
df['gender'] = df['gender'].map({'M': 1, 'F': 0}).fillna(-1).astype(int)

# 4. Impute missing values in every column except hadm_id and the label.
#    NOTE(review): 'dilution_comparison' is not among the columns selected by
#    this cell's queries, so excluding it here currently has no effect.
impute_cols = [col for col in df.columns if col not in ['hadm_id', 'match_flag','dilution_comparison']]
imputer = IterativeImputer(
    estimator=LGBMRegressor(n_estimators=50, random_state=42),
    max_iter=10,
    random_state=42
)
df[impute_cols] = imputer.fit_transform(df[impute_cols])

# Persist the fitted imputer.
joblib.dump(imputer, "./models/microbiologyevents_plus_lgbm.pkl")
print("插值模型已保存！")

# 5. Standardize the features (hadm_id and the label are excluded).
features = df.drop(columns=['hadm_id', 'match_flag'])
scaler = StandardScaler().fit(features)
# Persist the fitted scaler.
scaler_path = "./models/microbiologyevents_plus_standard_scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"标准化模型已保存至 {scaler_path}")

features_scaled = scaler.transform(features)

# Reassemble the final frame: hadm_id, scaled features, label. Indexes are
# reset so the three parts align row by row.
df_final = pd.DataFrame(features_scaled, columns=features.columns)
df_final = pd.concat([
    df[['hadm_id']].reset_index(drop=True),
    df_final,
    df['match_flag'].reset_index(drop=True)
], axis=1)

# Save the final data set.
final_output = './data/cleaned_microbiologyevents_plus.csv'
df_final.to_csv(final_output, index=False)
print(f"处理后的数据已保存到 {final_output}")
print("数据特征分布：\n", df_final.describe())

# ================== Validation ==================
# Check that hadm_id survived and report the class balance.
assert 'hadm_id' in df_final.columns, "hadm_id列丢失！"
print("\n验证结果：")
print(f"总样本数：{len(df_final)}")
print(f"阳性样本数：{df_final.match_flag.sum()}")
print(f"阴性样本数：{len(df_final) - df_final.match_flag.sum()}")
print(f"唯一hadm_id数量：{df_final.hadm_id.nunique()}")

正在获取阳性样本...
阳性样本获取完成，共 175293 条
正在获取阴性样本...
阴性样本获取完成，共 175293 条
动态保留的列：['hadm_id', 'spec_itemid', 'org_itemid', 'isolate_num', 'ab_itemid', 'dilution_text', 'dilution_value', 'urineoutput', 'gender', 'match_flag']
原始数据已保存到 ./data/microbiologyevents_plus_raw.csv
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 41682, number of used features: 7
[LightGBM] [Info] Start training from score 0.547047
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 41680, number of 



插值模型已保存！
标准化模型已保存至 ./models/microbiologyevents_plus_standard_scaler.pkl
处理后的数据已保存到 ./data/cleaned_microbiologyevents_plus.csv
数据特征分布：
              hadm_id   spec_itemid    org_itemid   isolate_num     ab_itemid  \
count   41682.000000  4.168200e+04  4.168200e+04  4.168200e+04  4.168200e+04   
mean   150072.721127 -1.932249e-14 -7.546665e-14 -7.531255e-16  1.004380e-12   
std     28829.557101  1.000012e+00  1.000012e+00  1.000012e+00  1.000012e+00   
min    100001.000000 -1.315455e+00 -1.724758e+00 -9.450496e-01 -2.138009e+00   
25%    125230.750000 -1.011345e+00 -8.927180e-01 -2.831002e-01 -7.789612e-01   
50%    150106.000000  2.262750e-02 -2.381719e-03 -2.244500e-01 -4.241566e-02   
75%    174960.250000  1.026189e+00  9.020176e-01 -4.798164e-02  7.493887e-01   
max    199999.000000  1.451943e+00  1.808563e+00  2.390816e+01  2.902726e+00   

       dilution_text  dilution_value   urineoutput        gender    match_flag  
count   4.168200e+04    4.168200e+04  4.168200e+04  4.168200e+0

### vitals_first_day

In [3]:
import psycopg2
import pandas as pd
import numpy as np
import joblib
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# ================== Database configuration ==================
import os

# Connection to the local MIMIC-III PostgreSQL instance.
# The password can be supplied through the PGPASSWORD environment variable so
# that credentials do not have to live in the notebook; the previous literal
# is kept as the fallback so existing local setups keep working unchanged.
con = psycopg2.connect(
    database="mimiciii",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "123456"),
    host="localhost",
    port="5433"
)

# ================== SQL queries ==================
# Positive samples: first-day vital-sign rows (plus gender) of admissions with
# a renal/kidney/liver/spleen failure diagnosis that also have a labevents row
# with itemid 50912 and valuenum <= 150 (match_flag = 1).
# NOTE(review): itemid 50912 is presumably creatinine - confirm against d_labitems.
query_positive = '''
select  distinct vfd.hadm_id,
				vfd.heartrate_min,vfd.heartrate_max,vfd.heartrate_mean,
				vfd.sysbp_min,vfd.sysbp_max,vfd.sysbp_mean,
				vfd.diasbp_min,vfd.diasbp_max,vfd.diasbp_mean,
				vfd.meanbp_min,vfd.meanbp_max,vfd.meanbp_mean,
				vfd.resprate_min,vfd.resprate_max,vfd.resprate_mean,
				vfd.tempc_min,vfd.tempc_max,vfd.tempc_mean,
				vfd.spo2_min,vfd.spo2_max,vfd.spo2_mean,
				vfd.glucose_min,vfd.glucose_max,vfd.glucose_mean,
				p.gender,
				1 AS match_flag
from vitals_first_day vfd
left join labevents l on l.hadm_id=vfd.hadm_id
left join diagnoses_icd d on d.hadm_id=vfd.hadm_id
left join d_icd_diagnoses di on di.icd9_code=d.icd9_code
left join patients p  on p.subject_id = d.subject_id
where l.itemid = 50912
	AND l.valuenum <= 150
	and(di.long_title ilike '% renal fail%' 
	or di.long_title ilike '%kidney fail%'
	or di.long_title ilike '%liver fail%'
	or di.long_title ilike '%spleen fail%')
'''

# Negative samples: randomly ordered distinct first-day vital-sign rows of
# admissions that are NOT in positive_hadm (match_flag = 0). positive_hadm is
# defined by diagnosis alone; it does not apply the labevents condition above.
# NOTE: the text 'SELECT COUNT(*) FROM positive_hadm' inside the LIMIT clause is
# replaced at run time with the number of positive rows actually fetched, so it
# must stay textually intact.
query_negative = '''
WITH positive_hadm AS (
    SELECT DISTINCT d.hadm_id
    FROM diagnoses_icd d
    JOIN d_icd_diagnoses di ON di.icd9_code = d.icd9_code
    WHERE di.long_title ILIKE '% renal fail%' 
        OR di.long_title ILIKE '%kidney fail%'
        OR di.long_title ILIKE '%liver fail%'
        OR di.long_title ILIKE '%spleen fail%'
)
SELECT * FROM (
select  distinct vfd.hadm_id,
				vfd.heartrate_min,vfd.heartrate_max,vfd.heartrate_mean,
				vfd.sysbp_min,vfd.sysbp_max,vfd.sysbp_mean,
				vfd.diasbp_min,vfd.diasbp_max,vfd.diasbp_mean,
				vfd.meanbp_min,vfd.meanbp_max,vfd.meanbp_mean,
				vfd.resprate_min,vfd.resprate_max,vfd.resprate_mean,
				vfd.tempc_min,vfd.tempc_max,vfd.tempc_mean,
				vfd.spo2_min,vfd.spo2_max,vfd.spo2_mean,
				vfd.glucose_min,vfd.glucose_max,vfd.glucose_mean,
				p.gender,
				0 AS match_flag
from vitals_first_day vfd
    LEFT JOIN patients p ON p.subject_id = vfd.subject_id
    WHERE NOT EXISTS (
        SELECT 1 
        FROM positive_hadm ph 
        WHERE ph.hadm_id = vfd.hadm_id
    )
) AS subquery
ORDER BY RANDOM()
LIMIT (SELECT COUNT(*) FROM positive_hadm);

'''

# ================== 数据获取部分 ==================
def get_dynamic_keep_columns(cursor_description):
    """Build the list of columns to keep from a cursor-style description.

    :param cursor_description: iterable of sequences whose first element is a
        column name (the shape of a DB-API ``cursor.description``).
    :return: the column names in their original order, minus the excluded
        identifier columns.
    :raises ValueError: if ``hadm_id`` or ``match_flag`` is not present.
    """
    names = [entry[0] for entry in cursor_description]

    # Both key columns are mandatory; fail on the first one that is missing.
    for mandatory in ('hadm_id', 'match_flag'):
        if mandatory not in names:
            raise ValueError(f"关键列 {mandatory} 不在查询结果中！")

    # Identifier columns that are dropped from the result (adjust as needed).
    dropped = ('row_id', 'subject_id', 'icustay_id')
    return list(filter(lambda name: name not in dropped, names))

import os

# Fetch positive and negative samples, merge them and write the raw extract.
# `cur` is bound before the try so the finally clause can never hit an unbound
# name when con.cursor() itself fails.
cur = None
try:
    cur = con.cursor()

    # Fetch positive samples.
    print("正在获取阳性样本...")
    cur.execute(query_positive)
    positive_columns = [desc[0] for desc in cur.description]  # column names
    positive_df = pd.DataFrame(cur.fetchall(), columns=positive_columns)
    print(f"阳性样本获取完成，共 {len(positive_df)} 条")

    # Size the negative sample to the number of positive rows. A separate
    # variable is used so the query template is not overwritten and this step
    # stays correct when re-run.
    # NOTE: the limit is a row count; the later de-duplication on hadm_id can
    # therefore leave the two classes with different sizes.
    negative_sql = query_negative.replace('SELECT COUNT(*) FROM positive_hadm', str(len(positive_df)))

    # Fetch negative samples.
    print("正在获取阴性样本...")
    cur.execute(negative_sql)
    negative_columns = [desc[0] for desc in cur.description]  # column names
    negative_df = pd.DataFrame(cur.fetchall(), columns=negative_columns)
    print(f"阴性样本获取完成，共 {len(negative_df)} 条")

    # Merge both classes into one frame.
    combined_df = pd.concat([positive_df, negative_df], ignore_index=True)

    # Derive the kept columns from the merged frame itself rather than from
    # whichever query the cursor happened to run last.
    keep_columns = get_dynamic_keep_columns([(col,) for col in combined_df.columns])
    print(f"动态保留的列：{keep_columns}")

    # Keep only the required columns.
    combined_df = combined_df[keep_columns]

    # Save the raw extract, creating the target directory if necessary.
    raw_output = "./data/vitals_first_day_raw.csv"
    os.makedirs(os.path.dirname(raw_output), exist_ok=True)
    combined_df.to_csv(raw_output, index=False)
    print(f"原始数据已保存到 {raw_output}")

except Exception as e:
    print(f"数据库操作失败：{str(e)}")
    # Re-raise: continuing would make the preprocessing step read a stale or
    # missing CSV file.
    raise
finally:
    if cur is not None:
        cur.close()
    if con:
        con.close()

# ================== Data preprocessing ==================
import os

from lightgbm import LGBMRegressor  # used below but not imported by this cell
from sklearn.preprocessing import LabelEncoder

# Make sure the model directory exists before the expensive imputation runs,
# so that saving the fitted objects cannot fail afterwards.
os.makedirs("./models", exist_ok=True)

# Load the raw extract.
df = pd.read_csv('./data/vitals_first_day_raw.csv')

# 1. Re-apply the column filter to the columns found in the CSV.
keep_columns = get_dynamic_keep_columns([(col,) for col in df.columns])  # mimics cursor.description
df = df[keep_columns]

# 2. Keep one row per admission (hadm_id). The explicit copy lets the column
#    assignments below operate on an independent frame.
df = df.drop_duplicates(subset=['hadm_id'], keep='first').copy()

# 3. Encode gender: M -> 1, F -> 0, anything else -> -1.
df['gender'] = df['gender'].map({'M': 1, 'F': 0}).fillna(-1).astype(int)

# 4. Impute missing values in every column except hadm_id and the label.
impute_cols = [col for col in df.columns if col not in ['hadm_id', 'match_flag']]
imputer = IterativeImputer(
    estimator=LGBMRegressor(n_estimators=50, random_state=42),
    max_iter=10,
    random_state=42
)
df[impute_cols] = imputer.fit_transform(df[impute_cols])

# Persist the fitted imputer.
# NOTE(review): the file name says "lgmb" while the other cells use "lgbm".
# It is left unchanged because other code may load this exact path - confirm
# before renaming.
joblib.dump(imputer, "./models/vitals_first_day_lgmb.pkl")
print("插值模型已保存！")

# 5. Standardize the features (hadm_id and the label are excluded).
features = df.drop(columns=['hadm_id', 'match_flag'])
scaler = StandardScaler().fit(features)
# Persist the fitted scaler.
scaler_path = "./models/vitals_first_day_standard_scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"标准化模型已保存至 {scaler_path}")

features_scaled = scaler.transform(features)

# Reassemble the final frame: hadm_id, scaled features, label. Indexes are
# reset so the three parts align row by row.
df_final = pd.DataFrame(features_scaled, columns=features.columns)
df_final = pd.concat([
    df[['hadm_id']].reset_index(drop=True),
    df_final,
    df['match_flag'].reset_index(drop=True)
], axis=1)

# Save the final data set.
final_output = './data/cleaned_vitals_first_day.csv'
df_final.to_csv(final_output, index=False)
print(f"处理后的数据已保存到 {final_output}")
print("数据特征分布：\n", df_final.describe())

# ================== Validation ==================
# Check that hadm_id survived and report the class balance.
assert 'hadm_id' in df_final.columns, "hadm_id列丢失！"
print("\n验证结果：")
print(f"总样本数：{len(df_final)}")
print(f"阳性样本数：{df_final.match_flag.sum()}")
print(f"阴性样本数：{len(df_final) - df_final.match_flag.sum()}")
print(f"唯一hadm_id数量：{df_final.hadm_id.nunique()}")

正在获取阳性样本...
阳性样本获取完成，共 12637 条
正在获取阴性样本...
阴性样本获取完成，共 12637 条
动态保留的列：['hadm_id', 'heartrate_min', 'heartrate_max', 'heartrate_mean', 'sysbp_min', 'sysbp_max', 'sysbp_mean', 'diasbp_min', 'diasbp_max', 'diasbp_mean', 'meanbp_min', 'meanbp_max', 'meanbp_mean', 'resprate_min', 'resprate_max', 'resprate_mean', 'tempc_min', 'tempc_max', 'tempc_mean', 'spo2_min', 'spo2_max', 'spo2_mean', 'glucose_min', 'glucose_max', 'glucose_mean', 'gender', 'match_flag']
原始数据已保存到 ./data/vitals_first_day_raw.csv
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4550
[LightGBM] [Info] Number of data points in the train set: 23673, number of used features: 24
[LightGBM] [Info] Start training from score 0.565792
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_col_wise=true` to remove the overhead.
