# H6-2 基线测试：回归 & 分类


In [1]:
# 确保src目录在Python路径中
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append(os.path.abspath("../"))

# 导入模块
from src.data_utils import identify_attributes, parse_petrel_file

# Directory that receives every artefact written by this notebook.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)


# Configure matplotlib for Chinese text: SimHei provides the CJK glyphs, and
# turning off the Unicode minus keeps negative tick labels readable.
plt.rcParams.update(
    {
        "font.family": "SimHei",
        "axes.unicode_minus": False,
    }
)

## 导入地震数据


In [3]:
# Parse the Petrel attribute export for horizon H6-2. Later cells index the
# result by column name ("X", "Y", "Z" and the attribute names), so it is
# used as a pandas DataFrame.
data_H6_2_attr = parse_petrel_file("../data/H6-2_attr")

正在解析文件: ../data/H6-2_attr
正在识别文件属性: ../data/H6-2_attr
识别到 END ATTRIBUTES 位于第 31 行
识别到 14 个属性:
  - Average envelope
  - Average instantaneous frequency
  - Average instantaneous phase
  - Average peak value
  - Geometric mean
  - Half energy
  - Harmonic mean
  - Maximum amplitude
  - Mean amplitude
  - Minimum amplitude
  - Most of
  - RMS amplitude
  - Sum of amplitudes
  - Sum of energy
解析到数据有 20 列
总列数: 20, 其中:
  - 3 列为坐标 (X, Y, Z)
  - 3 列为占位符
  - 14 列为属性


  df = pd.read_csv(


成功读取数据，共 51714 行


## 导入井震数据

使用 xlsx / csv 数据，注意表名为 Sheet1，注意数据需包含表头

请检查 Excel 表头与下面代码中引用的列名（`Surface`、`Thickness of facies(1: Fine sand)`）是否一致


In [4]:
# Load the well/seismic table from the first worksheet.
file_H6_2_well = "../data/well_processed.xlsx"
data_H6_2_well = pd.read_excel(file_H6_2_well, sheet_name="Sheet1")

# Keep only the rows of surface H6-2, turn the -999 placeholder into NaN,
# drop rows without a sand-thickness value and renumber the index from 0.
_h6_2_mask = data_H6_2_well["Surface"] == "H6-2"
_h6_2_rows = data_H6_2_well.loc[_h6_2_mask]
_h6_2_rows = _h6_2_rows.replace(-999, np.nan)
_h6_2_rows = _h6_2_rows.dropna(subset=["Thickness of facies(1: Fine sand)"])
data_H6_2_well_selected = _h6_2_rows.reset_index(drop=True)

data_H6_2_well_selected.head()

Unnamed: 0,X,Y,Z,Surface,Well,Thickness of facies(1: Fine sand),facies(1: Fine sand),Average energy,Average envelope,Average instantaneous frequency,...,Average peak value,Half energy,Harmonic mean,Maximum amplitude,Mean amplitude,Minimum amplitude,Most of,RMS amplitude,Sum of amplitudes,Sum of energy
0,686325.6,3217019.1,-2649.7,H6-2,A1,0.0,0.0,89001976.0,11011.2,21.9,...,,5.2,-4573.6,-1763.2,-8415.1,-14124.2,-13431.1,9511.8,-66162.7,718160960
1,686616.5,3217415.2,-2633.0,H6-2,A10,7.87,45.82,76951152.0,11667.5,25.9,...,,4.0,-43402.7,4934.6,-4832.2,-15217.6,-13626.4,8764.0,-36986.6,724726848
2,686278.0,3217627.9,-2650.4,H6-2,A11,0.0,0.0,6199530.0,2127.4,49.7,...,,7.2,-1264.8,4551.3,623.9,-2246.4,-556.0,2459.4,2576.6,26151754
3,686149.5,3216665.5,-2642.5,H6-2,A2,0.75,5.16,88260688.0,12334.4,21.0,...,,4.0,-7747.0,1608.4,-7710.8,-15313.4,-14014.2,9393.7,-57804.2,667979712
4,685921.1,3216986.2,-2644.7,H6-2,A4,0.0,0.0,31338386.0,5349.9,25.9,...,,7.0,3644.1,7899.5,4837.1,624.7,4900.9,5553.1,15000.3,97357456


## 提取共同属性


In [6]:
# Attribute names declared in the Petrel export
seismic_attr, _ = identify_attributes("../data/H6-2_attr")

# Attribute names in the Excel sheet: every column from the 8th onwards
well_seismic_attr = data_H6_2_well.columns[7:].tolist()

# Attributes present in both sources. A plain set intersection would come out
# in a different order on every run (string hashing is randomised), so the
# order of the seismic file is kept and duplicates are dropped instead.
_well_attr_lookup = set(well_seismic_attr)
common_attributes = list(dict.fromkeys(attr for attr in seismic_attr if attr in _well_attr_lookup))

# Report the counts and the shared names
print(f"地震属性数量: {len(seismic_attr)}")
print(f"Excel属性数量: {len(well_seismic_attr)}")
print(f"共同属性数量: {len(common_attributes)}")
print("\n共同属性列表:")
for attr in common_attributes:
    print(f"- {attr}")

正在识别文件属性: ../data/H6-2_attr
识别到 END ATTRIBUTES 位于第 31 行
识别到 14 个属性:
  - Average envelope
  - Average instantaneous frequency
  - Average instantaneous phase
  - Average peak value
  - Geometric mean
  - Half energy
  - Harmonic mean
  - Maximum amplitude
  - Mean amplitude
  - Minimum amplitude
  - Most of
  - RMS amplitude
  - Sum of amplitudes
  - Sum of energy
地震属性数量: 14
Excel属性数量: 14
共同属性数量: 13

共同属性列表:
- Average instantaneous frequency
- Mean amplitude
- Average envelope
- Most of
- Average peak value
- Sum of amplitudes
- Sum of energy
- RMS amplitude
- Half energy
- Minimum amplitude
- Average instantaneous phase
- Harmonic mean
- Maximum amplitude


## ✅ 总结建议（适用于你的砂厚预测任务）：

- **快速筛选** → 用 **相关性 + 方差过滤**
- **建模前重要性分析** → 用 **随机森林 / XGBoost 重要性**
- **模型训练后解释特征贡献** → 用 **SHAP 排序 + 可视化**
- **高维 + 稀疏特征** → 可以用 **LASSO 或 RFE 进一步压缩特征维度**


## 函数：分析砂厚与地震属性的相关系数 & 可视化


In [None]:
def analyze_correlations(
    df,
    target_col="Thickness of LITHOLOGIES(1: sand)",
    start_col_idx=7,
    method="spearman",
    top_n=None,
):
    """
    Correlate ``target_col`` with every column from ``start_col_idx`` onwards.

    Args:
        df: DataFrame holding the target column and the candidate columns.
        target_col: Name of the target column.
        start_col_idx: Positional index of the first candidate column.
        method: ``'spearman'`` or ``'pearson'`` (case-insensitive). Any other
            value prints a notice and falls back to Spearman.
        top_n: If given and at least one attribute is significant (p < 0.05),
            only the first ``top_n`` significant rows are returned.

    Returns:
        DataFrame indexed by attribute name, sorted by descending absolute
        coefficient, with the coefficient, ``p-value``, a significance marker
        (``**`` for p < 0.05, ``*`` for p < 0.1) and a strength label.
        When ``top_n`` is given but nothing is significant, the full table is
        returned. An empty DataFrame is returned when no column could be
        evaluated.
    """
    # Imported here so the function does not depend on a notebook-level
    # import; no visible cell imports spearmanr, which made the default
    # method fail with NameError for every column.
    from scipy.stats import pearsonr, spearmanr

    # Validate the method argument
    if method.lower() not in ["spearman", "pearson"]:
        print(f"不支持的相关系数方法: {method}，使用默认的Spearman")
        method = "spearman"

    # Report which coefficient is used
    print(f"使用 {method.capitalize()} 相关系数计算")

    corr_func = spearmanr if method.lower() == "spearman" else pearsonr
    coef_col = f"{method.capitalize()} 相关系数"

    results = []
    for col in df.columns[start_col_idx:]:
        # Correlating the target with itself is meaningless and would produce
        # a frame with duplicated column names.
        if col == target_col:
            continue

        try:
            # Only rows where both values are present can be used
            valid_data = df[[target_col, col]].dropna()

            if len(valid_data) < 3:
                print(f"警告: '{col}' 与 '{target_col}' 的有效数据点少于3个，跳过计算")
                continue

            coef, pval = corr_func(valid_data[target_col], valid_data[col])

            results.append(
                {
                    "属性名称": col,
                    coef_col: coef,
                    "p-value": pval,
                    "显著性": "**" if pval < 0.05 else ("*" if pval < 0.1 else ""),
                }
            )
        except Exception as e:
            # Deliberate best-effort: a column that cannot be evaluated (for
            # example non-numeric data) is reported and skipped.
            print(f"计算 '{col}' 列的相关系数时出错: {str(e)}")

    if not results:
        print("警告: 没有找到可以计算相关系数的有效列")
        return pd.DataFrame()

    # Rank by absolute coefficient, strongest first
    result_df = pd.DataFrame(results).set_index("属性名称")
    result_df = result_df.sort_values(coef_col, key=abs, ascending=False)

    def correlation_strength(r):
        """Map the absolute coefficient to a qualitative strength label."""
        r_abs = abs(r)
        if r_abs >= 0.8:
            return "极强"
        elif r_abs >= 0.6:
            return "强"
        elif r_abs >= 0.4:
            return "中等"
        elif r_abs >= 0.2:
            return "弱"
        else:
            return "极弱或无"

    result_df["相关强度"] = result_df[coef_col].apply(correlation_strength)

    # Rows that are significant at the 5% level
    significant_results = result_df[result_df["p-value"] < 0.05].copy()

    # With top_n, return only the leading significant attributes (if any)
    if top_n is not None and len(significant_results) > 0:
        return significant_results.head(top_n)

    return result_df


# Run the correlation analysis and plot the strongest attributes
def analyze_and_visualize(
    df,
    target_col="Thickness of LITHOLOGIES(1: sand)",
    start_col_idx=7,
    method="spearman",
    top_n=10,
):
    """
    Run ``analyze_correlations`` and draw a horizontal bar chart of the result.

    Args:
        df: DataFrame holding the target column and the candidate columns.
        target_col: Name of the target column.
        start_col_idx: Positional index of the first candidate column.
        method: ``'spearman'`` or ``'pearson'``.
        top_n: Maximum number of attributes shown in the chart.

    Returns:
        The full correlation table returned by ``analyze_correlations``
        (not only the plotted rows). When the table is empty a warning is
        printed and no figure is drawn.
    """
    correlation_results = analyze_correlations(df, target_col=target_col, start_col_idx=start_col_idx, method=method)

    # Nothing to plot: indexing the coefficient column would raise KeyError
    if correlation_results.empty:
        print("警告: 没有可供可视化的相关性结果，跳过绘图")
        return correlation_results

    # analyze_correlations falls back to Spearman for unsupported methods;
    # mirror that here so the column name below matches its output.
    if method.lower() not in ["spearman", "pearson"]:
        method = "spearman"
    coef_col = f"{method.capitalize()} 相关系数"

    top_attributes = correlation_results.head(top_n)
    plt.figure(figsize=(15, 9))

    # Horizontal bars; the coefficient in [-1, 1] is mapped onto [0, 1] for
    # the red-yellow-green colormap.
    bars = plt.barh(
        top_attributes.index,
        top_attributes[coef_col],
        color=[plt.cm.RdYlGn(0.5 * (x + 1)) for x in top_attributes[coef_col]],
    )

    # Zero reference line
    plt.axvline(x=0, color="gray", linestyle="-", alpha=0.3)
    plt.xlabel(coef_col)
    # Report the number of bars actually drawn, which may be below top_n
    plt.title(f"砂厚与前{len(top_attributes)}个最相关地震属性 ({method.capitalize()})")
    plt.grid(axis="x", linestyle="--", alpha=0.6)

    # Axis span, used to offset the labels from the bar ends
    x_min, x_max = plt.xlim()

    # Annotate each bar with its coefficient and significance marker
    for i, bar in enumerate(bars):
        attr = top_attributes.index[i]
        coef = top_attributes.loc[attr, coef_col]
        sig = top_attributes.loc[attr, "显著性"]
        width = bar.get_width()

        # Place the label just outside the bar, on the side the bar points to
        x_pos = width + np.sign(width) * 0.01 * (x_max - x_min)
        ha_value = "left" if width >= 0 else "right"

        plt.text(
            x_pos,
            i,
            f"{coef:.3f}{sig}",
            va="center",
            ha=ha_value,
            color="black",  # labels sit outside the bars, so black is always readable
            fontweight="bold",
            fontsize=10,
        )

    plt.tight_layout()
    plt.show()

    return correlation_results

### 测试


In [None]:
# Run the default Spearman analysis and plot it.
# The target must be a column of the loaded well table; the sand-thickness
# column that the loading cell filters on is "Thickness of facies(1: Fine sand)".
correlation_results_spearman = analyze_and_visualize(
    data_H6_2_well_selected, target_col="Thickness of facies(1: Fine sand)", top_n=10
)

In [None]:
# Names of the ten highest-ranked attributes from the correlation table
top_n_attr = list(correlation_results_spearman.index[:10])
top_n_attr

## AutoFeat 生成组合特征


In [None]:
# No visible cell imports these two names, so import them where they are used.
import joblib
from autofeat import AutoFeatRegressor

# Training data: only the well locations carry a measured sand thickness.
# The target has to be a column of the loaded well table; the thickness
# column that the loading cell filters on is "Thickness of facies(1: Fine sand)".
X = data_H6_2_well_selected[top_n_attr].copy()
y = data_H6_2_well_selected["Thickness of facies(1: Fine sand)"].copy()

# Fill any remaining NaNs with the column mean
X = X.fillna(X.mean())
y = y.fillna(y.mean())

print(f"输入特征数量: {X.shape[1]}")
print(f"训练样本数量: {X.shape[0]}")

# Generate combined features with AutoFeat
print("\n开始使用AutoFeat生成组合特征...")
autofeat_model = AutoFeatRegressor(
    categorical_cols=[],  # no categorical features
    feateng_cols=X.columns.tolist(),  # every seismic attribute feeds the feature engineering
    feateng_steps=3,  # number of feature-engineering steps
    n_jobs=-1,  # use all CPU cores
    verbose=1,  # print progress
)

# Fit on the well data and transform it in one go
X_new = autofeat_model.fit_transform(X, y)

# Persist the fitted AutoFeat model so it can be applied to the full-area data
joblib.dump(autofeat_model, "output/autofeat_pipeline_seismic.pkl")

# Report the generated features
print("\n\n生成的组合特征列表:")
print(X_new.columns.tolist())
print(f"生成的组合特征数量: {X_new.shape[1]}")

# Save the generated feature names to an Excel file
pd.Series(X_new.columns).to_excel("output/AutoFeat_生成特征列名.xlsx", index=False)

## Baseline 模型测试


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost as xgb
import time

# Fix the random seed so results are reproducible; this seeds NumPy's global
# generator and is also passed to train_test_split below.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


def _build_search_space(model_name, n_features):
    """Return ``(pipeline, param_grid)`` for the model key 'rf', 'xgb' or 'svr'."""
    # Candidate values for SelectKBest's k. Values above the number of
    # available features are dropped so narrow inputs do not produce an
    # invalid (or merely duplicated) search point.
    k_candidates = [3, 5, "all"] if n_features > 5 else [3, "all"]
    k_candidates = [k for k in k_candidates if k == "all" or k <= n_features]

    model_key = model_name.lower()
    if model_key == "rf":
        estimator = RandomForestRegressor(random_state=42)
        estimator_grid = {
            "model__n_estimators": [100, 200],
            "model__max_depth": [None, 10, 20],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2],
        }
    elif model_key == "xgb":
        estimator = xgb.XGBRegressor(random_state=42)
        estimator_grid = {
            "model__n_estimators": [100, 200],
            "model__learning_rate": [0.01, 0.1],
            "model__max_depth": [3, 6],
            "model__colsample_bytree": [0.7, 1.0],
        }
    elif model_key == "svr":
        estimator = SVR()
        estimator_grid = {
            "model__kernel": ["linear", "rbf"],
            "model__C": [0.1, 1, 10, 100],
            "model__gamma": ["scale", "auto", 0.1],
        }
    else:
        # Previously an unknown key surfaced later as an UnboundLocalError
        raise ValueError(f"不支持的模型类型: {model_name}（可选: 'rf', 'xgb', 'svr'）")

    pipeline = Pipeline([("feature_selection", SelectKBest(f_regression)), ("model", estimator)])
    param_grid = {"feature_selection__k": k_candidates, **estimator_grid}
    return pipeline, param_grid


def train_model_with_grid_search(X_train, X_test, y_train, y_test, model_name, feature_names, is_classification=False):
    """
    Grid-search a SelectKBest + regressor pipeline and evaluate it on the test split.

    Args:
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test targets.
        model_name: 'rf', 'xgb' or 'svr' (case-insensitive).
        feature_names: Feature names, in the column order of ``X_train``.
        is_classification: Accepted for interface compatibility but not used:
            every supported model is a regressor and the reported metrics are
            regression metrics.

    Returns:
        ``(best_model, selected_features, metrics)`` where ``best_model`` is
        the fitted pipeline (feature selection + estimator),
        ``selected_features`` lists the names kept by the feature-selection
        step and ``metrics`` has the keys ``mse``, ``rmse``, ``mae``, ``r2``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported keys.

    Notes:
        The returned pipeline expects ALL training columns as input (its
        first step performs the selection). For 'svr' it was fitted on inputs
        standardised with a StandardScaler fitted on ``X_train``; that scaler
        is not part of the returned pipeline, so callers must apply the same
        standardisation before calling ``predict``.
    """
    print(f"\n{'=' * 20} 训练 {model_name.upper()} 模型 {'=' * 20}")
    start_time = time.time()

    pipeline, param_grid = _build_search_space(model_name, X_train.shape[1])

    # Only SVR is trained on standardised inputs; the tree models use the raw
    # features, so the scaler is fitted only when it is actually needed.
    if model_name.lower() == "svr":
        scaler = StandardScaler()
        fit_inputs = scaler.fit_transform(X_train)
        eval_inputs = scaler.transform(X_test)
    else:
        fit_inputs, eval_inputs = X_train, X_test

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=3 if X_train.shape[0] >= 10 else 2,  # fewer folds for very small samples
        scoring="neg_mean_squared_error",
        verbose=1,
        n_jobs=-1,
    )
    grid_search.fit(fit_inputs, y_train)

    best_params = grid_search.best_params_
    print(f"\n最佳参数: {best_params}")

    # The best pipeline is refitted on the full training split, so its
    # selector already holds the mask of retained features.
    best_model = grid_search.best_estimator_
    kept_indices = best_model.named_steps["feature_selection"].get_support(indices=True)
    selected_features = [feature_names[i] for i in kept_indices]

    print(f"\n选择的特征 ({len(selected_features)}个): {selected_features}")

    # Evaluate on the held-out split
    y_pred = best_model.predict(eval_inputs)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{model_name.upper()} 评估结果:")
    print(f"  MSE:  {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE:  {mae:.4f}")
    print(f"  R²:   {r2:.4f}\n")

    # Elapsed time covers the search and the evaluation above
    run_time = time.time() - start_time
    print(f"网格搜索运行时间: {run_time:.2f} 秒")

    # Scatter of actual vs. predicted values with the 1:1 reference line
    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, y_pred, alpha=0.6)
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    plt.plot([min_val, max_val], [min_val, max_val], "r--")
    plt.xlabel("实际值")
    plt.ylabel("预测值")
    plt.title(f"{model_name.upper()}: 实际值 vs. 预测值")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    metrics = {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2}
    return best_model, selected_features, metrics

In [None]:
# --------------------------
# Optimise models and feature selection with grid search
# --------------------------

print("准备使用网格搜索优化模型...")

# Features: prefer the AutoFeat output when that cell has been run
if "X_new" in globals():
    X_data = X_new.copy()
    print(f"使用AutoFeat生成的特征 ({X_data.shape[1]} 个特征)")
else:
    X_data = data_H6_2_well_selected[top_n_attr].copy()
    print(f"使用原始特征 ({X_data.shape[1]} 个特征)")

# Task type and target. The regression target has to be a column of the
# loaded well table; the sand-thickness column that the loading cell filters
# on is "Thickness of facies(1: Fine sand)".
is_classification = False
target_column = "Thickness of facies(1: Fine sand)"
if "标签" in data_H6_2_well_selected.columns:
    y_data = data_H6_2_well_selected["标签"].copy()
    is_classification = True
    target_column = "标签"
else:
    y_data = data_H6_2_well_selected[target_column].copy()

# Fill missing values: column means for features; mean (regression) or
# mode (classification) for the target
X_data = X_data.fillna(X_data.mean())
y_data = y_data.fillna(y_data.mean() if not is_classification else y_data.mode()[0])

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=RANDOM_SEED)

print(f"训练集样本数: {X_train.shape[0]}")
print(f"测试集样本数: {X_test.shape[0]}")

# Train and evaluate each model; results are keyed by display name
gs_model_results = {}
for _display_name, _model_key in (("Random Forest", "rf"), ("XGBoost", "xgb"), ("SVR", "svr")):
    _fitted_model, _kept_features, _scores = train_model_with_grid_search(
        X_train,
        X_test,
        y_train,
        y_test,
        model_name=_model_key,
        feature_names=X_train.columns.tolist(),
        is_classification=is_classification,
    )
    gs_model_results[_display_name] = {
        "model": _fitted_model,
        "selected_features": _kept_features,
        "metrics": _scores,
    }

# Keep the per-model variables this cell has always exposed
# (dict values come back in insertion order: model, selected_features, metrics).
rf_best_model, rf_selected_features, rf_metrics = gs_model_results["Random Forest"].values()
xgb_best_model, xgb_selected_features, xgb_metrics = gs_model_results["XGBoost"].values()
svr_best_model, svr_selected_features, svr_metrics = gs_model_results["SVR"].values()

## 目标属性预测


In [None]:
# --------------------------
# Target-attribute prediction: apply the best model to the whole area
# --------------------------

print("\n========== 使用最佳模型进行全区域预测 ==========")


def _impute_with_column_means(frame, header):
    """Fill NaNs column by column with the column mean (0 if the mean is NaN).

    Plain assignment is used instead of ``frame[col].fillna(..., inplace=True)``:
    the chained in-place form is deprecated in pandas and has no effect once
    Copy-on-Write is enabled.
    """
    nan_cols = frame.columns[frame.isna().any()].tolist()
    if not nan_cols:
        return frame
    print(f"{header}: {nan_cols}")
    for col in nan_cols:
        col_mean = frame[col].mean()
        if pd.isna(col_mean):
            # The whole column is NaN, so there is no mean to fall back on
            frame[col] = frame[col].fillna(0)
            print(f"  - 列 '{col}' 的所有值为NaN，用0填充")
        else:
            frame[col] = frame[col].fillna(col_mean)
            print(f"  - 列 '{col}' 的缺失值已用均值 {col_mean:.4f} 填充")
    return frame


# Reset on every run so a frame left over from an earlier execution of this
# cell can never be mistaken for freshly prepared features.
X_seismic = None
prediction_ready = False

# 1. Pick the best model
if "gs_model_results" in globals():
    # Grid-search results take precedence
    model_results = gs_model_results
else:
    # Otherwise fall back to a model_results global, if one exists
    model_results = globals().get("model_results", {})

if not model_results:
    print("错误：未找到训练好的模型结果。请先运行模型训练代码。")
else:
    # One score per model: R² for regression, accuracy for classification
    performance_metrics = {}
    for model_name, result in model_results.items():
        metrics = result["metrics"]
        if "r2" in metrics:
            performance_metrics[model_name] = metrics["r2"]
        elif "accuracy" in metrics:
            performance_metrics[model_name] = metrics["accuracy"]

    if not performance_metrics:
        # max() on an empty mapping would raise ValueError
        print("错误: 模型结果中没有可用于比较的 r2 或 accuracy 指标")
    else:
        best_model_name = max(performance_metrics, key=performance_metrics.get)
        best_model = model_results[best_model_name]["model"]
        best_features = model_results[best_model_name]["selected_features"]

        print(f"\n选择的最佳模型: {best_model_name}")
        print(f"R² 或准确率: {performance_metrics[best_model_name]:.4f}")
        print(f"使用特征数量: {len(best_features)}")
        print(f"使用的特征: {best_features}")
        prediction_ready = True

# 2. Make sure the full-area seismic attributes are loaded
if prediction_ready:
    print("\n准备全区域数据进行预测...")
    if "data_H6_2_attr" not in globals() or data_H6_2_attr is None:
        print("未找到地震属性数据，重新加载...")
        data_H6_2_attr = parse_petrel_file("../data/H6-2_attr")
    if data_H6_2_attr is None:
        print("错误: 无法加载地震属性数据")
        prediction_ready = False

# 3. Build the model input for the whole area
if prediction_ready:
    print(f"地震属性数据加载成功，共 {len(data_H6_2_attr)} 个点位")

    # The stored model is a pipeline whose first step performs the feature
    # selection, so it must receive every column it was trained on, not only
    # the selected subset in best_features.
    model_input_features = getattr(best_model, "feature_names_in_", None)
    if model_input_features is not None:
        model_input_features = list(model_input_features)
    elif "X_train" in globals() and hasattr(X_train, "columns"):
        # Models fitted on plain arrays (the SVR path) carry no feature names
        model_input_features = list(X_train.columns)
    else:
        model_input_features = list(best_features)

    needs_autofeat = "autofeat_model" in globals() and any(
        f not in data_H6_2_attr.columns for f in model_input_features
    )

    if needs_autofeat:
        print("检测到使用了AutoFeat生成的特征，应用AutoFeat转换...")

        if "top_n_attr" in globals():
            # Raw attributes first, with missing values filled
            X_seismic_orig = _impute_with_column_means(
                data_H6_2_attr[top_n_attr].copy(), "原始特征中发现以下列存在缺失值"
            )

            # Apply the fitted AutoFeat transformation
            engineered = autofeat_model.transform(X_seismic_orig)
            print(f"AutoFeat转换后的特征数量: {engineered.shape[1]}")

            # The transformation itself can introduce NaNs
            post_transform_missing = engineered.isna().sum().sum()
            if post_transform_missing > 0:
                print(f"AutoFeat转换后发现 {post_transform_missing} 个缺失值，进行处理...")
                engineered = _impute_with_column_means(engineered, "以下转换后的特征存在缺失值")

            missing_features = [f for f in model_input_features if f not in engineered.columns]
            if missing_features:
                print(f"警告: 有 {len(missing_features)} 个模型所需特征在转换后的数据中不存在:")
                print(f"  缺失的特征: {set(missing_features)}")
                print("错误: 缺少模型训练时使用的特征，无法进行预测")
            else:
                # Same columns, same order as during training
                X_seismic = engineered[model_input_features]
        else:
            print("错误: 无法找到原始特征列表，无法应用AutoFeat转换")
            print("致命错误: 无法构建预测所需的特征数据")
    else:
        # Raw seismic attributes only
        seismic_features = [f for f in model_input_features if f in data_H6_2_attr.columns]
        for feature in model_input_features:
            if feature not in data_H6_2_attr.columns:
                print(f"警告: 特征 '{feature}' 在地震属性数据中不存在")

        if len(seismic_features) == 0:
            print("错误: 无法在地震属性数据中找到模型所需的特征")
        else:
            print(f"在地震属性数据中找到 {len(seismic_features)}/{len(model_input_features)} 个特征")
            if len(seismic_features) < len(model_input_features):
                # Predicting on a subset of the training columns cannot work
                print("错误: 缺少模型训练时使用的特征，无法进行预测")
            else:
                X_seismic = _impute_with_column_means(
                    data_H6_2_attr[seismic_features].copy(), "发现以下列存在缺失值"
                )

if prediction_ready and (X_seismic is None or X_seismic.empty):
    print("错误: 无法构建有效的特征数据集")
    prediction_ready = False

if prediction_ready:
    # 4. Predict the target attribute
    print("\n使用最佳模型进行预测...")

    model_input = X_seismic
    if "SVR" in str(best_model.__class__) or best_model_name == "SVR":
        print("检测到SVR模型，进行数据标准化...")
        # The SVR pipeline was trained on inputs standardised with the
        # training-set statistics, so the same statistics must be used here.
        # Refitting a StandardScaler on X_train reproduces that scaler exactly.
        if "X_train" in globals() and list(getattr(X_train, "columns", [])) == list(X_seismic.columns):
            scaler = StandardScaler().fit(X_train)
        else:
            print("警告: 未找到匹配的训练集，只能用全区域数据拟合标准化参数，预测结果可能存在偏差")
            scaler = StandardScaler().fit(X_seismic)
        # Passed as a plain array, which is how the SVR pipeline was fitted
        model_input = scaler.transform(X_seismic)

    y_pred = best_model.predict(model_input)

    # 5. Attach the prediction to the seismic attribute table
    target_col = "预测_标签" if is_classification else "预测_砂厚"
    data_H6_2_attr[target_col] = y_pred
    print(f"成功预测 {len(data_H6_2_attr)} 个点的{'类别' if is_classification else '砂厚'}")

    # 6. Save coordinates plus prediction
    output_file = f"output/预测结果_{best_model_name}.csv"
    result_df = data_H6_2_attr[["X", "Y", "Z", target_col]]
    result_df.to_csv(output_file, index=False)
    print(f"预测结果已保存到 {output_file}")

    # 7. Distribution of the predicted values
    plt.figure(figsize=(10, 6))

    if is_classification:
        sns.countplot(x="预测_标签", data=data_H6_2_attr)
        plt.title(f"预测标签分布 (使用{best_model_name})")
        plt.xlabel("预测标签")
        plt.ylabel("数量")
    else:
        sns.histplot(data_H6_2_attr["预测_砂厚"], kde=True, bins=30)
        plt.title(f"预测砂厚分布 (使用{best_model_name})")
        plt.xlabel("预测砂厚")
        plt.ylabel("频率")

        # Summary statistics shown in the top-right corner
        mean_val = data_H6_2_attr["预测_砂厚"].mean()
        median_val = data_H6_2_attr["预测_砂厚"].median()
        min_val = data_H6_2_attr["预测_砂厚"].min()
        max_val = data_H6_2_attr["预测_砂厚"].max()

        stats_text = (
            f"平均值: {mean_val:.2f}\n中位数: {median_val:.2f}\n最小值: {min_val:.2f}\n最大值: {max_val:.2f}"
        )
        plt.text(
            0.95,
            0.95,
            stats_text,
            transform=plt.gca().transAxes,
            verticalalignment="top",
            horizontalalignment="right",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
        )

    plt.tight_layout()
    plt.savefig(f"output/预测分布_{best_model_name}.png", dpi=300)
    plt.show()

    # 8. Map view of the prediction
    plt.figure(figsize=(12, 10))

    if is_classification:
        # Class labels need no colour-range clipping
        map_color_kwargs = {"cmap": "Set1"}
    else:
        # Clip the colour range to the 5th-95th percentile so outliers do not
        # wash out the map
        vmin = np.percentile(data_H6_2_attr[target_col], 5)
        vmax = np.percentile(data_H6_2_attr[target_col], 95)
        print(f"色棒范围限制: 5%分位数={vmin:.2f}, 95%分位数={vmax:.2f}")
        map_color_kwargs = {"cmap": "viridis", "vmin": vmin, "vmax": vmax}

    scatter = plt.scatter(
        data_H6_2_attr["X"],
        data_H6_2_attr["Y"],
        c=data_H6_2_attr[target_col],
        s=1,
        alpha=0.7,
        **map_color_kwargs,
    )

    cbar = plt.colorbar(scatter)
    cbar.set_label("预测标签" if is_classification else "预测砂厚")

    plt.title(f"预测结果的空间分布 ({best_model_name})")
    plt.xlabel("X坐标")
    plt.ylabel("Y坐标")
    plt.tight_layout()

    plt.savefig(f"output/预测空间分布_{best_model_name}.png", dpi=300)
    plt.show()

    # 9. Prediction map with the measured well values on top
    target_well = "标签" if is_classification else target_column
    show_wells = "data_H6_2_well_selected" in globals() and len(data_H6_2_well_selected) > 0
    if show_wells and target_well not in data_H6_2_well_selected.columns:
        print(f"警告: 井数据中不存在列 '{target_well}'，跳过井点对比图")
        show_wells = False

    if show_wells:
        plt.figure(figsize=(12, 10))

        if is_classification:
            overlay_color_kwargs = {"cmap": "Set1"}
        else:
            # Background and wells share ONE colour range (widened to include
            # the measured values); otherwise the same colour would stand for
            # different thicknesses and the single colourbar would mislead.
            overlay_color_kwargs = {
                "cmap": "viridis",
                "vmin": min(vmin, data_H6_2_well_selected[target_well].min()),
                "vmax": max(vmax, data_H6_2_well_selected[target_well].max()),
            }

        # Faint prediction background
        scatter = plt.scatter(
            data_H6_2_attr["X"],
            data_H6_2_attr["Y"],
            c=data_H6_2_attr[target_col],
            s=1,
            alpha=0.3,
            **overlay_color_kwargs,
        )

        # Wells, coloured by the measured value
        well_scatter = plt.scatter(
            data_H6_2_well_selected["X"],
            data_H6_2_well_selected["Y"],
            c=data_H6_2_well_selected[target_well],
            s=50,
            edgecolor="black",
            linewidth=1,
            **overlay_color_kwargs,
        )

        cbar = plt.colorbar(scatter)
        cbar.set_label("预测值" if is_classification else "预测砂厚")

        # Label every well with its name
        for well_x, well_y, well_name in zip(
            data_H6_2_well_selected["X"], data_H6_2_well_selected["Y"], data_H6_2_well_selected["Well"]
        ):
            plt.text(well_x, well_y, well_name, fontsize=8)

        plt.title("预测结果与井点实际值对比")
        plt.xlabel("X坐标")
        plt.ylabel("Y坐标")
        plt.tight_layout()

        plt.savefig(f"output/井点对比_{best_model_name}.png", dpi=300)
        plt.show()