In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, classification_report
#定义数据路径和输出文件路径
dpath = 'D:\\Python_study\\5组代码\\卒中发病预测\\'
outfile = dpath + 'output__new/Stroke__panel/Stroke_hierarchy.csv'
df = pd.read_csv("D:\\课题\\bio_people448749.csv")
# 确保 data_imputed 是 DataFrame
data_imputed_df = pd.read_csv("D:\\Python_study\\5组代码\\卒中发病预测\\bio_imputed_data448749(mean).csv")

In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
import time
import joblib
# 筛选来自英格兰的参与者
england_indices = df[df['Region'] == 'England'].index
X_england = data_imputed_df.loc[england_indices]
y_england = df.loc[england_indices, 'status']

# 定义5折交叉验证
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# 存储每折的评估结果
accuracy_scores = []
auc_scores = []
training_times = []
models = []
# 计算类别权重（需要在循环内计算，因为每折的分布可能不同）
# 注意：这里我们会在循环内计算，因为y_train在每折不同

# 固定参数（从你的网格中取第一个值）
params = {
    'alpha': 1.5,          # L1 正则化
    'lambda': 1.5,         # L2 正则化
    'gamma': 0.2,          # 分裂最小损失下降
    'colsample_bytree': 0.75,  # 特征采样比例
    'subsample': 0.85,     # 数据采样比例
    'learning_rate': 0.007, # 学习率
    'max_depth': 9,        # 树的最大深度
    'n_estimators': 5200,  # 树的数量
    'min_child_weight': 3, # 叶子节点最小样本权重
    'eval_metric':'auc'
}
# 开始交叉验证
for fold, (train_idx, val_idx) in enumerate(kf.split(X_england), 1):
    try:
        print(f"\n===== Fold {fold} =====\n")
        X_train, X_val = X_england.iloc[train_idx], X_england.iloc[val_idx]
        y_train, y_val = y_england.iloc[train_idx], y_england.iloc[val_idx]

        # 计算当前fold的类别权重
        scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
        current_params = params.copy()
        current_params['scale_pos_weight'] = scale_pos_weight

        # 初始化模型
        model = XGBClassifier(**current_params)

        # 训练模型并计时
        start_time = time.time()
        print("Training started...")
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=10  # 每10轮显示一次进度
        )
        training_time = time.time() - start_time
        training_times.append(training_time)
        print(f"Training completed in {training_time:.2f} seconds")

        # 预测
        y_pred = model.predict(X_val)
        y_pred_proba = model.predict_proba(X_val)[:, 1]  # 获取正类的概率

        # 评估
        accuracy = accuracy_score(y_val, y_pred)
        auc = roc_auc_score(y_val, y_pred_proba)

        accuracy_scores.append(accuracy)
        auc_scores.append(auc)
        models.append(model)  # 保存当前fold的模型
        print(f"Fold {fold} Accuracy: {accuracy:.4f}")
        print(f"Fold {fold} AUC: {auc:.4f}")

    except KeyboardInterrupt:
        print("\nTraining interrupted by user")
        break
    except Exception as e:
        print(f"\nError occurred during fold {fold}: {str(e)}")
        continue

# 输出结果
if accuracy_scores:
    print("\n=== Final Results ===")
    print(f"Average Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
    print(f"Average AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
    print(f"Average Training Time: {np.mean(training_times):.2f} seconds")

    # 找到AUC最高的fold
    best_fold = np.argmax(auc_scores)
    best_model = models[best_fold]
    best_auc = auc_scores[best_fold]

    print(f"\nBest model from Fold {best_fold + 1} with AUC: {best_auc:.4f}")

    # 保存最佳模型
    model_filename = f"best_xgboost_model_fold{best_fold + 1}_auc{best_auc:.4f}.joblib"
    # joblib.dump(best_model, model_filename)
    print(f"Best model saved as {model_filename}")
else:
    print("\nNo valid results were obtained")


===== Fold 1 =====

Training started...
[0]	validation_0-auc:0.60352
[10]	validation_0-auc:0.65387
[20]	validation_0-auc:0.65968
[30]	validation_0-auc:0.66249
[40]	validation_0-auc:0.66416
[50]	validation_0-auc:0.66532
[60]	validation_0-auc:0.66625
[70]	validation_0-auc:0.66790
[80]	validation_0-auc:0.66788
[90]	validation_0-auc:0.66775
[100]	validation_0-auc:0.66764
[110]	validation_0-auc:0.66837
[120]	validation_0-auc:0.66931
[130]	validation_0-auc:0.66949
[140]	validation_0-auc:0.66922
[150]	validation_0-auc:0.66898
[160]	validation_0-auc:0.66916
[170]	validation_0-auc:0.66921
[180]	validation_0-auc:0.66950
[190]	validation_0-auc:0.66962
[200]	validation_0-auc:0.66988
[210]	validation_0-auc:0.66936
[220]	validation_0-auc:0.66946
[230]	validation_0-auc:0.66958
[240]	validation_0-auc:0.66968
[250]	validation_0-auc:0.66960
[260]	validation_0-auc:0.66955
[270]	validation_0-auc:0.66960
[280]	validation_0-auc:0.66945
[290]	validation_0-auc:0.66909
[300]	validation_0-auc:0.66898
[310]	val

In [3]:
external_indices = df[df['Region'].isin(['Scotland', 'Wales'])].index
# 获取苏格兰和威尔士参与者的数据
X_external = data_imputed_df.loc[external_indices]  # 特征
y_external = df.loc[external_indices, 'status']  # 标签
print("外部验证集形状:", X_external.shape)
X_external
# 预测类别（0/1）
y_pred = best_model.predict(X_external)
# 预测概率（用于 AUC 等指标）
y_pred_proba = best_model.predict_proba(X_external)[:, 1]  # 正类的概率
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# 计算准确率和 AUC
accuracy = accuracy_score(y_external, y_pred)
auc = roc_auc_score(y_external, y_pred_proba)

print(f"测试集准确率: {accuracy:.4f}")
print(f"测试集 AUC: {auc:.4f}")

外部验证集形状: (50653, 62)
测试集准确率: 0.9637
测试集 AUC: 0.6084


In [5]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
import time
import joblib
# 筛选来自英格兰的参与者
england_indices = df[df['Region'] == 'England'].index
X_england = data_imputed_df.loc[england_indices]
y_england = df.loc[england_indices, 'status']

# 定义5折交叉验证
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# 存储每折的评估结果
accuracy_scores = []
auc_scores = []
training_times = []
models = []
# 计算类别权重（需要在循环内计算，因为每折的分布可能不同）
# 注意：这里我们会在循环内计算，因为y_train在每折不同

# 固定参数（从你的网格中取第一个值）
params = {
    'alpha': 1.5,          # L1 正则化
    'lambda': 1.5,         # L2 正则化
    'colsample_bytree': 0.75,  # 特征采样比例
    'n_estimators': 400,
    'learning_rate': 0.01,
    'max_depth': 3,
    'subsample': 0.7,
    'min_child_weight': 3,
    'gamma': 0.9,
    'eval_metric':'auc'
}
# 开始交叉验证
for fold, (train_idx, val_idx) in enumerate(kf.split(X_england), 1):
    try:
        print(f"\n===== Fold {fold} =====\n")
        X_train, X_val = X_england.iloc[train_idx], X_england.iloc[val_idx]
        y_train, y_val = y_england.iloc[train_idx], y_england.iloc[val_idx]

        # 计算当前fold的类别权重
        scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
        current_params = params.copy()
        current_params['scale_pos_weight'] = scale_pos_weight

        # 初始化模型
        model = XGBClassifier(**current_params)

        # 训练模型并计时
        start_time = time.time()
        print("Training started...")
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=10  # 每10轮显示一次进度
        )
        training_time = time.time() - start_time
        training_times.append(training_time)
        print(f"Training completed in {training_time:.2f} seconds")

        # 预测
        y_pred = model.predict(X_val)
        y_pred_proba = model.predict_proba(X_val)[:, 1]  # 获取正类的概率

        # 评估
        accuracy = accuracy_score(y_val, y_pred)
        auc = roc_auc_score(y_val, y_pred_proba)

        accuracy_scores.append(accuracy)
        auc_scores.append(auc)
        models.append(model)  # 保存当前fold的模型
        print(f"Fold {fold} Accuracy: {accuracy:.4f}")
        print(f"Fold {fold} AUC: {auc:.4f}")

    except KeyboardInterrupt:
        print("\nTraining interrupted by user")
        break
    except Exception as e:
        print(f"\nError occurred during fold {fold}: {str(e)}")
        continue

# 输出结果
if accuracy_scores:
    print("\n=== Final Results ===")
    print(f"Average Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
    print(f"Average AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
    print(f"Average Training Time: {np.mean(training_times):.2f} seconds")

    # 找到AUC最高的fold
    best_fold = np.argmax(auc_scores)
    best_model = models[best_fold]
    best_auc = auc_scores[best_fold]

    print(f"\nBest model from Fold {best_fold + 1} with AUC: {best_auc:.4f}")

    # 保存最佳模型
    model_filename = f"best_xgboost_model_fold{best_fold + 1}_auc{best_auc:.4f}.joblib"
    # joblib.dump(best_model, model_filename)
    print(f"Best model saved as {model_filename}")
else:
    print("\nNo valid results were obtained")


===== Fold 1 =====

Training started...
[0]	validation_0-auc:0.63862
[10]	validation_0-auc:0.65990
[20]	validation_0-auc:0.66114
[30]	validation_0-auc:0.66283
[40]	validation_0-auc:0.66297
[50]	validation_0-auc:0.66462
[60]	validation_0-auc:0.66497
[70]	validation_0-auc:0.66587
[80]	validation_0-auc:0.66567
[90]	validation_0-auc:0.66638
[100]	validation_0-auc:0.66708
[110]	validation_0-auc:0.66768
[120]	validation_0-auc:0.66829
[130]	validation_0-auc:0.66908
[140]	validation_0-auc:0.66957
[150]	validation_0-auc:0.67014
[160]	validation_0-auc:0.67124
[170]	validation_0-auc:0.67196
[180]	validation_0-auc:0.67272
[190]	validation_0-auc:0.67346
[200]	validation_0-auc:0.67407
[210]	validation_0-auc:0.67483
[220]	validation_0-auc:0.67528
[230]	validation_0-auc:0.67605
[240]	validation_0-auc:0.67642
[250]	validation_0-auc:0.67704
[260]	validation_0-auc:0.67748
[270]	validation_0-auc:0.67797
[280]	validation_0-auc:0.67843
[290]	validation_0-auc:0.67890
[300]	validation_0-auc:0.67922
[310]	val

In [6]:
external_indices = df[df['Region'].isin(['Scotland', 'Wales'])].index
# 获取苏格兰和威尔士参与者的数据
X_external = data_imputed_df.loc[external_indices]  # 特征
y_external = df.loc[external_indices, 'status']  # 标签
print("外部验证集形状:", X_external.shape)
X_external
# 预测类别（0/1）
y_pred = best_model.predict(X_external)
# 预测概率（用于 AUC 等指标）
y_pred_proba = best_model.predict_proba(X_external)[:, 1]  # 正类的概率
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# 计算准确率和 AUC
accuracy = accuracy_score(y_external, y_pred)
auc = roc_auc_score(y_external, y_pred_proba)

print(f"测试集准确率: {accuracy:.4f}")
print(f"测试集 AUC: {auc:.4f}")

外部验证集形状: (50653, 62)
测试集准确率: 0.6152
测试集 AUC: 0.6975


In [4]:
from sklearn.metrics import roc_curve, auc, classification_report

# 用最佳模型预测
y_proba = best_model.predict_proba(X_test)[:, 1]
# 计算并绘制 ROC 曲线
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('Best XGBoost ROC Curve(internal)')
plt.legend()
plt.show()
#绘制混淆矩阵
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# 使用最佳模型进行预测
y_pred = best_model.predict(X_test)

# 计算混淆矩阵
cm = confusion_matrix(y_test, y_pred)

# 可视化混淆矩阵
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=['No Stroke', 'Stroke'])
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix for Stroke Prediction(internal)')
plt.show()
# 筛选来自苏格兰和威尔士的参与者
external_indices = df[df['Region'].isin(['Scotland', 'Wales'])].index
# 获取苏格兰和威尔士参与者的数据
X_external = data_imputed_df.loc[external_indices]  # 特征
y_external = df.loc[external_indices, 'status']  # 标签
print("外部验证集形状:", X_external.shape)
#外部 4. 用最佳模型预测
y_proba = best_model.predict_proba(X_external)[:, 1]
# 5. 计算并绘制 ROC 曲线
fpr, tpr, _ = roc_curve(y_external, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('Best XGBoost ROC Curve(external)')
plt.legend()
plt.show()
#绘制混淆矩阵
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# 使用最佳模型进行预测
y_pred = best_model.predict(X_external)

# 计算混淆矩阵
cm = confusion_matrix(y_external, y_pred)

# 可视化混淆矩阵
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=['No Stroke', 'Stroke'])
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix for Stroke Prediction(external)')
plt.show()


TypeError: 'numpy.float64' object is not callable