In [9]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, roc_auc_score, mean_squared_error, r2_score)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

# ==================== Configuration ====================
input_path = "归一化数据-20251120.xlsx"  # path of the input Excel workbook
output_dir = "lgbm_model_ncv"     # directory that receives the model, plots and summary
target_column = "SO2 tolerance"     # name of the target column
outer_test_size = 0.5               # fraction of the data held out as the outer test set
inner_test_size=0.2                 # inner test-set fraction; NOTE(review): not referenced elsewhere in the visible code
random_state = 42                   # random seed
n_splits = 5                        # number of cross-validation folds
model_path = os.path.join(output_dir, "lgbm_model.pkl")  # where the trained model is saved

# ==================== 函数定义 ====================
def prepare_data(input_path, target_column):
    """Load the Excel sheet and split it into numeric features and target.

    Args:
        input_path: Path of the workbook handed to ``pd.read_excel``.
        target_column: Name of the column holding the target variable.

    Returns:
        A ``(X, y)`` tuple. ``X`` contains only the numeric columns that
        remain after removing the target; ``y`` is the target column.

    Raises:
        ValueError: If ``target_column`` is not a column of the sheet.
    """
    frame = pd.read_excel(input_path)

    if target_column in frame.columns:
        target = frame[target_column]
        features = frame.drop(columns=[target_column])
        # Non-numeric feature columns are dropped, not encoded.
        numeric_only = features.select_dtypes(include=['number'])
        return features[numeric_only.columns.tolist()], target

    raise ValueError(f"目标列 {target_column} 不存在于数据中")

def determine_problem_type(y):
    """Guess the learning task from the number of distinct target values.

    Args:
        y: Target values; must provide ``nunique()`` (e.g. a pandas Series).

    Returns:
        A ``(objective, problem_type)`` tuple. More than 10 distinct values
        gives ``('regression', 'regression')``; otherwise the problem type is
        ``'classification'`` and the objective is ``'binary'`` for exactly two
        distinct values, ``'multiclass'`` for any other count.
    """
    distinct = y.nunique()

    # Guard clause: many distinct values -> continuous target.
    if distinct > 10:
        return ('regression', "regression")

    objective = 'binary' if distinct == 2 else 'multiclass'
    return (objective, "classification")

def get_model_params(objective, random_state, num_class=None):
    """Build the LightGBM parameter dictionary for the given objective.

    Args:
        objective: LightGBM objective name. ``'binary'`` and ``'multiclass'``
            select the matching log-loss metric; anything else uses ``'rmse'``.
        random_state: Seed stored under ``'random_state'``.
        num_class: Number of classes; only consulted when ``objective`` is
            ``'multiclass'``. Optional so that existing two-argument calls
            keep working.

    Returns:
        A new dict of training parameters. ``'num_class'`` is present only
        for the multiclass objective.

    Raises:
        ValueError: If ``objective`` is ``'multiclass'`` and the class count
            can be determined neither from ``num_class`` nor from a
            module-level ``y``.
    """
    metric_by_objective = {
        'binary': 'binary_logloss',
        'multiclass': 'multi_logloss',
    }
    params = {
        'objective': objective,
        'metric': metric_by_objective.get(objective, 'rmse'),
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.8,
        'random_state': random_state,
        'n_jobs': -1,
        'verbose': -1,
    }

    if objective == 'multiclass':
        if num_class is None:
            # The previous implementation read an undeclared name ``y`` here,
            # which only worked when a notebook session happened to define it
            # at module level. Keep that as a fallback for such sessions, but
            # fail with a clear message instead of a NameError otherwise.
            legacy_target = globals().get('y')
            if legacy_target is None:
                raise ValueError("多分类任务需要通过 num_class 参数指定类别数")
            num_class = legacy_target.nunique()
        params['num_class'] = int(num_class)

    return params

def train_and_evaluate(X_train, y_train, X_test, y_test, params, problem_type):
    """Fit a LightGBM booster and score it on both splits.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        params: LightGBM parameter dict; ``params['objective']`` is read to
            decide how classification outputs are turned into labels.
        problem_type: ``"classification"`` selects accuracy / macro-F1;
            any other value selects R2 / RMSE.

    Returns:
        ``(model, metrics)`` where ``metrics`` maps metric names
        (``'Train ...'`` / ``'Test ...'``) to floats.

    NOTE(review): the evaluation split is also passed in ``valid_sets`` with
    an early-stopping callback, so it appears to influence the number of
    boosting rounds -- confirm that this is intended before treating the
    "Test" scores as fully held-out.
    """
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_test, label=y_test, reference=dtrain)

    stop_early = lgb.early_stopping(stopping_rounds=50, verbose=False)
    mute_log = lgb.log_evaluation(period=0)  # period=0 turns logging off
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dvalid],
        callbacks=[stop_early, mute_log],
    )

    # Raw model outputs for both splits; post-processing depends on the task.
    raw_train = model.predict(X_train)
    raw_test = model.predict(X_test)

    if problem_type != "classification":
        regression_scores = {
            'Train R2': r2_score(y_train, raw_train),
            'Test R2': r2_score(y_test, raw_test),
            'Train RMSE': np.sqrt(mean_squared_error(y_train, raw_train)),
            'Test RMSE': np.sqrt(mean_squared_error(y_test, raw_test)),
        }
        return model, regression_scores

    if params['objective'] == 'binary':
        # Single output per row: round to the nearest label.
        label_train = np.round(raw_train)
        label_test = np.round(raw_test)
    else:
        # One output column per class: pick the highest-scoring column.
        label_train = np.argmax(raw_train, axis=1)
        label_test = np.argmax(raw_test, axis=1)

    classification_scores = {
        'Train Accuracy': accuracy_score(y_train, label_train),
        'Test Accuracy': accuracy_score(y_test, label_test),
        'Train F1': f1_score(y_train, label_train, average='macro'),
        'Test F1': f1_score(y_test, label_test, average='macro'),
    }
    return model, classification_scores

def plot_feature_importance(model, output_dir):
    """Save a horizontal bar chart of split-based feature importance.

    Bars are sorted ascending by importance and coloured from a fixed
    per-feature palette; features missing from the palette use a default
    colour. The figure is written to ``feature_importance.png`` inside
    ``output_dir`` and then closed.

    Args:
        model: Trained LightGBM booster exposing ``feature_importance`` and
            ``feature_name``.
        output_dir: Directory that receives the PNG file.
    """
    # Global font settings (these stay in effect after the call).
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['font.weight'] = 'bold'

    # Palette grouped by colour, then flattened into a feature -> colour map.
    palette_groups = {
        '#82bbf0': ('EN.', 'I.R.', 'I.E.', 'M.P.', 'T.C.', 'Density'),
        '#dff1ff': ('P.Tim. ', 'P.TEM.', 'SO2'),
        '#FDB3CC': ('S.A.', 'P.D.', 'P.V.', 'C.Tem.', 'C.Tim.'),
        '#ff78A8': ('GHSV', 'H2O', 'NH3', 'O2', 'NO'),
    }
    colour_of = {
        feature: colour
        for colour, members in palette_groups.items()
        for feature in members
    }
    fallback_colour = '#1f77b4'

    # Figure size and margins.
    fig, ax = plt.subplots(figsize=(14, 10))
    fig.subplots_adjust(left=0.2, right=0.95, top=0.9, bottom=0.1)

    # Pair names with importances and order them from least to most important.
    split_counts = model.feature_importance(importance_type='split')
    names = model.feature_name()
    ranked = sorted(zip(names, split_counts), key=lambda pair: pair[1])
    feature_names = [name for name, _ in ranked]
    feature_imp = [score for _, score in ranked]

    bar_colours = [colour_of.get(name, fallback_colour) for name in feature_names]
    positions = range(len(feature_names))

    # Horizontal bars, one per feature.
    bars = ax.barh(positions, feature_imp, height=0.8, color=bar_colours)

    ax.set_yticks(positions)
    ax.set_yticklabels(feature_names)

    # Axis labels (the y label is intentionally a single blank).
    ax.set_xlabel(
        "Importance",
        fontdict={'fontname': 'Arial', 'fontsize': 26, 'fontweight': 'bold'},
        labelpad=10,
    )
    ax.set_ylabel(
        ' ',
        fontdict={'fontname': 'Arial', 'fontsize': 22, 'fontweight': 'bold'},
        labelpad=10,
    )

    # Tick label font.
    for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
        tick_label.set_fontname('Arial')
        tick_label.set_fontsize(22)

    # Keep all four spines, drawn thick and black.
    for side in ax.spines.values():
        side.set_visible(True)
        side.set_linewidth(3)
        side.set_color('black')

    # Numeric value just to the right of each bar.
    text_offset = max(feature_imp) * 0.01 if feature_imp else 0
    for rect in bars:
        value = rect.get_width()
        ax.text(
            value + text_offset,
            rect.get_y() + rect.get_height() / 2,
            f'{value:.1f}',
            ha='left',
            va='center',
            fontname='Arial',
            fontsize=18,
            color='black',
        )

    # Write the image and release the figure.
    target_file = os.path.join(output_dir, "feature_importance.png")
    fig.savefig(target_file, dpi=300, bbox_inches='tight', transparent=False)
    plt.close(fig)

def plot_actual_vs_predicted(X_train, y_train, X_test, y_test, model, output_dir):
    """Save an actual-vs-predicted joint plot for a regression model.

    The joint axes show training and testing points, a regression line fitted
    to the testing points and the dashed identity line. The marginal axes show
    KDEs of the actual values (top) and of the predictions (right).

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Testing features and targets.
        model: Fitted model exposing ``predict``.
        output_dir: Directory that receives ``actual_vs_predicted.png``.

    Returns:
        Path of the saved PNG file.
    """
    # Global font settings (these stay in effect after the call).
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['font.weight'] = 'bold'

    pp_tr = model.predict(X_train)
    y_predicted = model.predict(X_test)

    # JointGrid creates its own figure. The extra ``plt.figure(...)`` that
    # used to precede it was never drawn on or closed, and surfaced as a stray
    # "<Figure ... with 0 Axes>" in notebook output.
    g = sns.JointGrid(x=y_test, y=y_predicted, height=8, space=0)
    fig = g.ax_joint.figure

    # Scatter points for both splits plus a regression line on the test split.
    sns.scatterplot(x=y_train, y=pp_tr, s=100, color='#DB7987', ax=g.ax_joint, label='Training')
    sns.scatterplot(x=y_test, y=y_predicted, s=100, color='#82BBF0', ax=g.ax_joint, label='Testing')
    sns.regplot(x=y_test, y=y_predicted, ax=g.ax_joint, scatter=False, color='gray')

    # Tick label / tick mark styling on all three axes.
    for ax in [g.ax_joint, g.ax_marg_x, g.ax_marg_y]:
        ax.tick_params(
            axis='both',
            which='major',
            labelsize=28,
            colors='black',
            width=3,
            length=8
        )

    # Legend styling: frameless with a transparent background.
    g.ax_joint.legend(
        title_fontsize=28,
        fontsize=28,
        frameon=False,
        framealpha=1,
        edgecolor='white',
        facecolor='NONE'
    )

    # Closed frame around the joint axes.
    for spine in g.ax_joint.spines.values():
        spine.set_visible(True)
        spine.set_edgecolor('#333333')
        spine.set_linewidth(3)

    # Common value range over actual and predicted values of both splits.
    all_values = np.concatenate([y_train, y_test, pp_tr, y_predicted])
    min_val = np.min(all_values)
    max_val = np.max(all_values)
    buffer = (max_val - min_val) * 0.15  # padding so edge points are not clipped

    g.ax_joint.set_xlim(min_val - buffer, max_val + buffer)
    g.ax_joint.set_ylim(min_val - buffer, max_val + buffer)

    # Identity line: perfect prediction.
    g.ax_joint.plot([min_val, max_val], [min_val, max_val], '--', color='gray', alpha=0.5)

    # Marginal KDEs: actual values on top, predictions on the right.
    kde_clip = (min_val - buffer, max_val + buffer)
    sns.kdeplot(
        x=y_train, ax=g.ax_marg_x, color='#DB7987', alpha=0.5,
        fill=True, linewidth=0, label='Training', bw_adjust=1.5,
        clip=kde_clip
    )
    sns.kdeplot(
        x=y_test, ax=g.ax_marg_x, color='#82BBF0', alpha=0.5,
        fill=True, linewidth=0, label='Testing', bw_adjust=1.5,
        clip=kde_clip
    )
    sns.kdeplot(
        y=pp_tr, ax=g.ax_marg_y, color='#DB7987', alpha=0.5,
        fill=True, linewidth=0, label='Training', bw_adjust=1.5,
        clip=kde_clip
    )
    sns.kdeplot(
        y=y_predicted, ax=g.ax_marg_y, color='#82BBF0', alpha=0.5,
        fill=True, linewidth=0, label='Testing', bw_adjust=1.5,
        clip=kde_clip
    )

    # Hide the legends of the marginal axes.
    legend_x = g.ax_marg_x.legend()
    legend_x.set_visible(False)
    legend_y = g.ax_marg_y.legend()
    legend_y.set_visible(False)

    # Spine width of the marginal axes.
    for ax in [g.ax_marg_x, g.ax_marg_y]:
        for spine in ax.spines.values():
            spine.set_linewidth(2)

    # Keep the marginal axes aligned with the joint axes.
    g.ax_marg_x.set_xlim(g.ax_joint.get_xlim())
    g.ax_marg_y.set_ylim(g.ax_joint.get_ylim())

    # Axis labels are set last so earlier plotting calls cannot overwrite them.
    g.ax_joint.set_xlabel("Actual value", fontsize=32, fontname='Arial', fontweight='bold')
    g.ax_joint.set_ylabel("Predicted value", fontsize=32, fontname='Arial', fontweight='bold')

    # Operate on the grid's own figure rather than on "the current figure".
    fig.subplots_adjust(top=0.92, bottom=0.08, left=0.12, right=0.95)

    plot_path = os.path.join(output_dir, "actual_vs_predicted.png")
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    return plot_path



# ==================== 主程序 ====================
def main():
    """Run the full pipeline: load data, train, cross-validate, plot, report.

    Steps:
        1. Load the data and derive the objective / parameters.
        2. Split into outer train / test sets.
        3. Train on the outer training set, evaluate, and save the model.
        4. Run K-fold cross-validation inside the outer training set.
        5. Print per-fold and aggregated (mean / std) metrics.
        6. Save the feature-importance plot and, for regression, the
           actual-vs-predicted plot.
        7. Write a short text summary to ``results_summary.txt``.

    All settings come from the module-level configuration constants.
    """
    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # 1. Data preparation
    X, y = prepare_data(input_path, target_column)
    objective, problem_type = determine_problem_type(y)
    params = get_model_params(objective, random_state)

    # Headline metric: R2 for regression, accuracy for classification. The
    # label is derived here so classification runs are not reported as "R2".
    if problem_type == "regression":
        score_key, score_label = 'Test R2', 'R2'
    else:
        score_key, score_label = 'Test Accuracy', 'Accuracy'

    # 2. Outer split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=outer_test_size, random_state=random_state)

    # 3. Outer model training and evaluation
    print("外层模型训练与评估...")
    outer_model, outer_metrics = train_and_evaluate(
        X_train, y_train, X_test, y_test, params, problem_type)
    joblib.dump(outer_model, model_path)
    print(f"\n外层模型已保存至：{model_path}")

    print("\n外层模型性能：")
    print(pd.DataFrame([outer_metrics], index=['Value']))

    # 4. K-fold cross-validation on the outer training set
    print(f"\n开始内层{n_splits}折交叉验证...")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_metrics = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
        print(f"\n正在处理第 {fold} 折...")
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        _, metrics = train_and_evaluate(X_tr, y_tr, X_val, y_val, params, problem_type)
        cv_metrics.append(metrics)
        print(f"第 {fold} 折完成 - 验证集{score_label}: {metrics[score_key]:.4f}")

    # 5. Cross-validation summary
    cv_results = pd.DataFrame(cv_metrics)
    print("\n交叉验证结果汇总：")
    print(cv_results)

    mean_metrics = cv_results.mean().to_dict()
    std_metrics = cv_results.std().to_dict()

    print("\n交叉验证平均性能：")
    for metric in mean_metrics:
        print(f"{metric}: {mean_metrics[metric]:.4f} ± {std_metrics[metric]:.4f}")

    # 6. Plots
    plot_feature_importance(outer_model, output_dir)

    if problem_type == "regression":
        plot_path = plot_actual_vs_predicted(X_train, y_train, X_test, y_test,
                                             outer_model, output_dir)
        print(f"\n实际值 vs 预测值图已保存至：{plot_path}")

    # 7. Text summary. The encoding is explicit because the summary contains
    # non-ASCII text and the platform default encoding may not be able to
    # represent it.
    summary_path = os.path.join(output_dir, "results_summary.txt")
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("模型评估结果汇总:\n")
        f.write(f"外层测试集{score_label}: {outer_metrics[score_key]:.4f}\n")
        f.write(
            f"交叉验证平均{score_label}: "
            f"{mean_metrics[score_key]:.4f} ± {std_metrics[score_key]:.4f}\n"
        )

    print("\n评估结果已保存至 results_summary.txt")
    print("\n嵌套交叉验证完成！所有结果保存在目录：", output_dir)

if __name__ == "__main__":
    main()

外层模型训练与评估...

外层模型已保存至：lgbm_model_ncv\lgbm_model.pkl

外层模型性能：
       Train R2   Test R2  Train RMSE  Test RMSE
Value  0.683754  0.238706    0.055434   0.083224

开始内层5折交叉验证...

正在处理第 1 折...
第 1 折完成 - 验证集R2: 0.3646

正在处理第 2 折...
第 2 折完成 - 验证集R2: 0.5505

正在处理第 3 折...
第 3 折完成 - 验证集R2: 0.1027

正在处理第 4 折...
第 4 折完成 - 验证集R2: -0.1421

正在处理第 5 折...
第 5 折完成 - 验证集R2: -0.0031

交叉验证结果汇总：
   Train R2   Test R2  Train RMSE  Test RMSE
0  0.694595  0.364567    0.057985   0.054827
1  0.825644  0.550540    0.042615   0.053227
2  0.872095  0.102723    0.025833   0.153665
3  0.064302 -0.142096    0.099794   0.078213
4  0.084064 -0.003095    0.101118   0.060651

交叉验证平均性能：
Train R2: 0.5081 ± 0.4015
Test R2: 0.1745 ± 0.2804
Train RMSE: 0.0655 ± 0.0339
Test RMSE: 0.0801 ± 0.0423

实际值 vs 预测值图已保存至：lgbm_model_ncv\actual_vs_predicted.png

评估结果已保存至 results_summary.txt

嵌套交叉验证完成！所有结果保存在目录： lgbm_model_ncv


<Figure size 1000x800 with 0 Axes>