# 伪样本 + SVR 进行预测

In [None]:
# 确保src目录在Python路径中
import os
import random
import sys
import time
import warnings
from itertools import combinations
from math import comb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Add the parent directory to the import path; this must run before the
# `from src...` imports below so that the `src` package can be resolved.
sys.path.append(os.path.abspath("../"))

# 导入模块
from src.data_utils import (
    extract_seismic_attributes_for_wells,
    identify_attributes,
    parse_petrel_file,
    preprocess_features,
)
from src.feature_selection import group_features_by_correlation, select_features_from_groups
from src.visualization import (
    visualize_attribute_map,
)

# Input/output locations used by the rest of the notebook.
# Built with os.path.join so the relative path works on every platform
# (on Windows this still evaluates to "..\\data").
data_dir = os.path.join("..", "data")
data_tmp_dir = "data_tmp"
output_dir = "H5_2_ps_output"

# Create the working directories; exist_ok avoids the check-then-create race.
os.makedirs(data_tmp_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)


# Matplotlib font setup so that Chinese labels render correctly.
plt.rcParams["font.family"] = "SimHei"  # SimHei ships CJK glyphs
plt.rcParams["axes.unicode_minus"] = False  # render the minus sign correctly

## 导入地震数据

修改下方代码中的 data_seismic_path = os.path.join(data_dir, "H5-2") 这一行可以更换数据

In [None]:
# Location of the seismic attribute export for horizon "H5-2" inside data_dir.
data_seismic_path = os.path.join(data_dir, "H5-2")

# Parse the export with the project helper.
# NOTE(review): the result is later indexed with ["X", "Y"], so it is
# presumably a DataFrame with coordinate columns — confirm against src.data_utils.
data_seismic_attr = parse_petrel_file(data_seismic_path)

## 处理属性缺失值

In [None]:
# Discover the seismic attribute names present in the export.
attribute_names, _ = identify_attributes(data_seismic_path)

# Clean the attribute columns: -999 is treated as missing, columns are filtered
# with a 0.6 missing-ratio threshold, and IQR outliers are clipped to the bounds.
preprocess_options = {
    "missing_values": [-999],
    "missing_threshold": 0.6,
    "outlier_method": "iqr",
    "outlier_threshold": 2.0,
    "outlier_treatment": "clip",
    "verbose": True,
}
processed_features, stats, report = preprocess_features(
    data=data_seismic_attr,
    attribute_columns=attribute_names,
    **preprocess_options,
)

# Attribute names that survived preprocessing.
attribute_names_filtered = list(processed_features.columns)

# Re-attach the cleaned attributes to the original X/Y coordinates, column by column.
processed_seismic_full = data_seismic_attr[["X", "Y"]].copy()  # type: ignore
for attribute_name, attribute_values in processed_features.items():
    processed_seismic_full[attribute_name] = attribute_values

## 导入真实井、虚拟井

In [None]:
data_well_path = os.path.join(data_dir, "well_without_attr.xlsx")

data_well_position = pd.read_excel(data_well_path)

# Keep only the rows of the target horizon, turn the -999 sentinel into NaN,
# drop wells without a sand thickness, and renumber the remaining rows.
target_surface_rows = data_well_position[data_well_position["Surface"] == "H5-2"]
target_surface_rows = target_surface_rows.replace(-999, np.nan)
target_surface_rows = target_surface_rows.dropna(subset=["Sand Thickness"])
data_well_purpose_surface_position = target_surface_rows.reset_index(drop=True)

print(f"井点数据导入完成，共 {len(data_well_purpose_surface_position)} 口井")
data_well_purpose_surface_position.head()

In [None]:
# Extract seismic attributes at the well locations using the project helper.
# NOTE(review): max_distance / num_points are presumably the search radius and
# the number of neighbouring seismic points used per well — confirm in src.data_utils.
well_attr = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_position,
    seismic_data=processed_seismic_full,
    max_distance=50,
    num_points=5,
)

print(f"井点地震属性提取完成，共 {len(well_attr)} 口井")

# Persist the per-well attributes to the temporary data directory.
well_attr.to_excel(os.path.join(data_tmp_dir, "wells_attr.xlsx"), index=False)
print("井点的地震属性已保存到 wells_attr.xlsx")

In [None]:
pseudo_wells_path = os.path.join(data_tmp_dir, "optimized_pseudo_wells.csv")

# Fail early when the pseudo-well file has not been generated yet.
if not os.path.exists(pseudo_wells_path):
    raise FileNotFoundError(f"未找到虚拟井数据文件: {pseudo_wells_path}")

pseudo_wells = pd.read_csv(pseudo_wells_path)
print(f"虚拟井数据导入完成，共 {len(pseudo_wells)} 个虚拟井点")

# Resolve the sand-thickness column: try the two known names first, then fall
# back to the first column whose name contains "thick" or "pred".
for known_name in ("Predicted_Sand_Thickness", "Mean_Pred"):
    if known_name in pseudo_wells.columns:
        pseudo_thickness_col = known_name
        break
else:
    possible_cols = [
        name for name in pseudo_wells.columns if "thick" in name.lower() or "pred" in name.lower()
    ]
    if not possible_cols:
        raise ValueError("无法找到虚拟井砂厚列")
    pseudo_thickness_col = possible_cols[0]
    print(f"使用列 '{pseudo_thickness_col}' 作为虚拟井砂厚")

# Quick summary of the pseudo-well thickness values.
pseudo_thickness_values = pseudo_wells[pseudo_thickness_col]
print(f"虚拟井砂厚统计:")
print(f"  - 范围: {pseudo_thickness_values.min():.2f} - {pseudo_thickness_values.max():.2f} m")
print(f"  - 平均值: {pseudo_thickness_values.mean():.2f} m")

# Preview the first rows (display is provided by the notebook environment).
print("\n虚拟井数据预览:")
display(pseudo_wells.head())

## 根据自相关性对特征进行分组

经验公式：1个特征对应10个样本。现在有30个真实井，所以最多选3个特征。

先根据自相关性对特征进行分组，然后再遍历**从多个组中选三个组**的特征组合（如果组里只有一个特征，选这个特征即可；有多个特征则随机选择）

In [None]:
# Group the filtered attributes with the project helper, using a 0.9
# correlation threshold; the helper also returns the correlation matrix.
feature_groups, correlation_matrix = group_features_by_correlation(
    data=processed_seismic_full,
    feature_columns=attribute_names_filtered,
    correlation_threshold=0.9,
    verbose=True,
)

print(f"\n特征分组完成，共分成 {len(feature_groups)} 组")

In [None]:
# Number of feature groups available and number of groups picked per model.
n_groups = len(feature_groups)
n_select = 3  # pick features from 3 groups

# Fall back to using every group when fewer than n_select exist.
if n_groups < n_select:
    print(f"警告: 特征组数 ({n_groups}) 少于需要选择的组数 ({n_select})")
    print("将使用所有组进行建模")
    n_select = n_groups


total_combinations = comb(n_groups, n_select)
print(f"\n=== SVR模型训练计划 ===")
print(f"特征组数: {n_groups}")
print(f"每次选择组数: {n_select}")
print(f"总组合数: C({n_groups}, {n_select}) = {total_combinations}")
print(f"预计训练 {total_combinations} 个SVR模型")

# Sample budget: real wells plus pseudo wells.
real_samples = len(well_attr)
pseudo_samples = len(pseudo_wells)
total_samples = real_samples + pseudo_samples
print(f"\n训练样本数: {real_samples} 个真实井 + {pseudo_samples} 个虚拟井 = {total_samples} 个样本")

# SVR hyper-parameter grids (small C values to limit over-fitting).
# The linear kernel has no gamma parameter, so it gets its own grid.
rbf_grid = {
    "C": [0.01, 0.1, 1],
    "gamma": ["scale", 0.001, 0.01, 0.1],
    "epsilon": [0.1, 0.2, 0.5],
    "kernel": ["rbf"],
}
linear_grid = {
    "C": [0.01, 0.1, 1, 10],
    "epsilon": [0.1, 0.2, 0.5],
    "kernel": ["linear"],
}
param_grid = [rbf_grid, linear_grid]

print(f"\nSVR参数网格设置:")
print(f"  RBF核: C={rbf_grid['C']}, gamma={rbf_grid['gamma']}, epsilon={rbf_grid['epsilon']}")
print(f"  线性核: C={linear_grid['C']}, epsilon={linear_grid['epsilon']}")

# Grid sizes, taken from the grids themselves.
rbf_grid_size = len(rbf_grid["C"]) * len(rbf_grid["gamma"]) * len(rbf_grid["epsilon"])
linear_grid_size = len(linear_grid["C"]) * len(linear_grid["epsilon"])
total_grid_size = rbf_grid_size + linear_grid_size

print(f"RBF核参数组合数: {rbf_grid_size}")
print(f"线性核参数组合数: {linear_grid_size}")
print(f"总参数组合数: {total_grid_size}")
print(f"总计需要训练: {total_combinations} × {total_grid_size} = {total_combinations * total_grid_size} 个SVR模型")

## 数据增强 & 权重设置

In [None]:
def dynamic_data_augmentation(X_real, y_real, X_pseudo, y_pseudo, target_samples_per_bin=10, noise_factor=0.05):
    """
    Top up under-populated sand-thickness bins with synthetic samples.

    Thickness values are split into the bins [0, 1), [1, 10), [10, 20) and
    [20, inf). For every bin holding fewer than ``target_samples_per_bin``
    real samples the shortfall is filled as follows:

    * if the bin has real samples, randomly chosen real samples are copied with
      Gaussian feature noise and a small thickness jitter clipped to the bin;
    * otherwise, if the bin has pseudo samples, pseudo samples are drawn
      (with replacement only when the pool is smaller than the shortfall) and
      copied with a smaller feature noise and an unchanged thickness;
    * otherwise the bin is skipped.

    Parameters:
        X_real: 2-D array of real-well features (n_real, n_features).
        y_real: 1-D array of real-well sand thickness.
        X_pseudo: 2-D array of pseudo-well features (n_pseudo, n_features).
        y_pseudo: 1-D array of pseudo-well sand thickness.
        target_samples_per_bin: minimum number of samples wanted per bin.
        noise_factor: feature-noise scale relative to the per-feature standard
            deviation computed over real and pseudo samples together.

    Returns:
        (X_augmented, y_augmented, augmentation_sources) where the first
        ``len(y_real)`` rows are the untouched real samples and
        ``augmentation_sources[k]`` is ``"real"``, ``"real_augmented"`` or
        ``"pseudo_sampled"`` for row ``k``. New rows are appended bin by bin,
        so the two synthetic kinds may be interleaved.

    Uses the global NumPy random state; seed it beforehand for reproducibility.
    """
    print("\n=== 动态数据增强策略 ===")

    X_real = np.asarray(X_real)
    y_real = np.asarray(y_real)
    X_pseudo = np.asarray(X_pseudo)
    y_pseudo = np.asarray(y_pseudo)

    # Sand-thickness bins and their display labels.
    thickness_bins = [0, 1, 10, 20, np.inf]
    bin_labels = ["0-1m", "1-10m", "10-20m", ">20m"]
    bin_edges = list(zip(thickness_bins[:-1], thickness_bins[1:]))
    # Per bin: (thickness jitter half-width, lower clip, upper clip).
    # The upper clips stay just below the next bin edge so a jittered sample
    # never leaves its bin.
    thickness_jitter = [(0.1, 0, 0.99), (0.5, 1, 9.99), (1.0, 10, 19.99), (2.0, 20, np.inf)]

    # Distribution of the real samples over the bins.
    print("真实井样本分布分析:")
    real_bin_masks = [(y_real >= low) & (y_real < high) for low, high in bin_edges]
    real_bin_counts = [int(np.sum(mask)) for mask in real_bin_masks]
    for label, count in zip(bin_labels, real_bin_counts):
        print(f"  {label}: {count} 个真实样本")

    # Distribution of the pseudo samples over the bins.
    print("\n虚拟井样本分布分析:")
    pseudo_bin_masks = [(y_pseudo >= low) & (y_pseudo < high) for low, high in bin_edges]
    pseudo_bin_counts = [int(np.sum(mask)) for mask in pseudo_bin_masks]
    for label, count in zip(bin_labels, pseudo_bin_counts):
        print(f"  {label}: {count} 个虚拟样本")

    # New rows are collected in lists and stacked once at the end, instead of
    # re-allocating the whole array for every added sample.
    new_rows = []
    new_targets = []
    augmentation_sources = ["real"] * len(y_real)  # origin of every output row

    # Per-feature standard deviation over all samples, used to scale the noise.
    feature_stds = np.std(np.vstack([X_real, X_pseudo]), axis=0)

    print(f"\n动态增强决策（阈值: {target_samples_per_bin}）:")

    for i, label in enumerate(bin_labels):
        real_count = real_bin_counts[i]
        pseudo_count = pseudo_bin_counts[i]

        print(f"\n{label} 区间:")
        print(f"  真实样本: {real_count}, 虚拟样本: {pseudo_count}")

        if real_count >= target_samples_per_bin:
            print(f"  ✓ 真实样本充足（>={target_samples_per_bin}），无需增强")
            continue

        samples_needed = target_samples_per_bin - real_count
        print(f"  需要增强 {samples_needed} 个样本")

        if real_count > 0:
            # Strategy 1: perturb real samples of this bin.
            print(f"  策略: 基于 {real_count} 个真实样本进行噪声增强")

            X_real_bin = X_real[real_bin_masks[i]]
            y_real_bin = y_real[real_bin_masks[i]]
            half_width, clip_low, clip_high = thickness_jitter[i]

            for _ in range(samples_needed):
                base_idx = np.random.randint(0, len(X_real_bin))
                base_y = y_real_bin[base_idx]

                # Feature noise grows with thickness: thin < medium < thick.
                if base_y <= 1:
                    adaptive_noise_factor = noise_factor * 0.5
                elif base_y <= 10:
                    adaptive_noise_factor = noise_factor * 1.0
                else:
                    adaptive_noise_factor = noise_factor * 1.5

                noise = np.random.normal(0, feature_stds * adaptive_noise_factor)
                thickness_noise = np.random.uniform(-half_width, half_width)

                new_rows.append(X_real_bin[base_idx] + noise)
                new_targets.append(np.clip(base_y + thickness_noise, clip_low, clip_high))
                augmentation_sources.append("real_augmented")

        elif pseudo_count > 0:
            # Strategy 2: no real sample in this bin, borrow pseudo samples.
            print(f"  策略: 从 {pseudo_count} 个虚拟样本中采样")

            X_pseudo_bin = X_pseudo[pseudo_bin_masks[i]]
            y_pseudo_bin = y_pseudo[pseudo_bin_masks[i]]

            # Draw exactly the shortfall; fall back to sampling with
            # replacement only when the pool is too small to do otherwise.
            pool_size = len(X_pseudo_bin)
            sample_indices = np.random.choice(
                pool_size,
                size=samples_needed,
                replace=samples_needed > pool_size,
            )

            for idx in sample_indices:
                # A small feature noise keeps repeated draws from being exact copies.
                noise = np.random.normal(0, feature_stds * noise_factor * 0.3)
                new_rows.append(X_pseudo_bin[idx] + noise)
                new_targets.append(y_pseudo_bin[idx])
                augmentation_sources.append("pseudo_sampled")

        else:
            print(f"  警告: 该区间既无真实样本也无虚拟样本，跳过增强")

    if new_rows:
        X_augmented = np.vstack([X_real, np.vstack(new_rows)])
        y_augmented = np.concatenate([y_real, np.asarray(new_targets, dtype=float)])
    else:
        X_augmented = X_real.copy()
        y_augmented = y_real.copy()

    # Summary of the augmentation.
    original_count = len(y_real)
    augmented_count = len(y_augmented)

    print(f"\n=== 数据增强完成 ===")
    print(f"原始真实井样本: {original_count}")
    print(f"增强后总样本: {augmented_count}")
    print(f"新增样本: {augmented_count - original_count}")

    # Per-origin counts, printed in a fixed order.
    print(f"\n样本来源统计:")
    source_titles = (
        ("real", "原始真实井"),
        ("real_augmented", "真实井增强"),
        ("pseudo_sampled", "虚拟井采样"),
    )
    for source, title in source_titles:
        count = augmentation_sources.count(source)
        if count:
            print(f"  {title}: {count}")

    # Final per-bin distribution of the returned samples.
    print(f"\n增强后各区间真实样本分布:")
    for label, (low, high) in zip(bin_labels, bin_edges):
        count = np.sum((y_augmented >= low) & (y_augmented < high))
        print(f"  {label}: {count} 个样本")

    return X_augmented, y_augmented, augmentation_sources

In [None]:
# Assemble the training set from real wells, augmented samples and pseudo wells.
print("=== 准备训练数据 ===")

# Only attributes that exist in the pseudo-well table can be used for training.
pseudo_columns = set(pseudo_wells.columns)
common_features = [col for col in attribute_names_filtered if col in pseudo_columns]
missing_features = [col for col in attribute_names_filtered if col not in pseudo_columns]

if missing_features:
    print(f"警告: 虚拟井数据中缺少以下特征: {missing_features}")
    print("将只使用两个数据集中都存在的特征")
    # Prune the feature groups to the shared features and drop emptied groups.
    shared = set(common_features)
    pruned_groups = ([f for f in group if f in shared] for group in feature_groups)
    feature_groups = [group for group in pruned_groups if group]
    print(f"更新后的特征组数: {len(feature_groups)}")

    # Pruning may leave fewer groups than n_select; without this clamp the
    # training step would enumerate zero combinations.
    if len(feature_groups) < n_select:
        print(f"警告: 特征组数 ({len(feature_groups)}) 少于需要选择的组数 ({n_select})")
        print("将使用所有组进行建模")
        n_select = len(feature_groups)

# Real-well features and target.
X_real = well_attr[common_features].values
y_real = well_attr["Sand Thickness"].values

# Pseudo-well features and target.
X_pseudo = pseudo_wells[common_features].values  # type: ignore
y_pseudo = pseudo_wells[pseudo_thickness_col].values

# The augmentation draws from NumPy's global random state, so seed it here to
# make the generated samples reproducible between runs.
np.random.seed(42)

# Run the dynamic augmentation on the real wells.
X_real_augmented, y_real_augmented, aug_sources = dynamic_data_augmentation(
    X_real,
    y_real,
    X_pseudo,
    y_pseudo,
    target_samples_per_bin=10,  # per-bin threshold
    noise_factor=0.03,
)

# Final training matrix: (real + augmented) rows followed by all pseudo wells.
X_combined_aug = np.vstack([X_real_augmented, X_pseudo])
y_combined_aug = np.concatenate([y_real_augmented, y_pseudo])  # type: ignore

# Counts per sample origin (used for the report below).
original_real_count = len(X_real)
real_augmented_count = aug_sources.count("real_augmented")
pseudo_sampled_count = aug_sources.count("pseudo_sampled")

# Weight each row by its origin. The augmentation appends rows bin by bin, so
# "real_augmented" and "pseudo_sampled" rows can be interleaved; the weights
# are therefore looked up per row from aug_sources rather than laid out as
# contiguous blocks, which would attach weights to the wrong rows.
source_weights = {"real": 15.0, "real_augmented": 8.0, "pseudo_sampled": 3.0}
augmented_row_weights = np.array([source_weights[source] for source in aug_sources], dtype=float)
sample_weights_aug = np.concatenate(
    [
        augmented_row_weights,
        # Original pseudo wells get the lowest weight.
        np.ones(len(X_pseudo)) * 1.0,
    ]
)

if len(sample_weights_aug) != len(X_combined_aug):
    raise ValueError(
        f"sample weight count ({len(sample_weights_aug)}) does not match sample count ({len(X_combined_aug)})"
    )

print(f"\n动态权重分配:")
print(f"  - 原始真实井权重: 15.0 (共{original_real_count}个)")
print(f"  - 真实井增强权重: 8.0 (共{real_augmented_count}个)")
print(f"  - 虚拟井采样权重: 3.0 (共{pseudo_sampled_count}个)")
print(f"  - 原始虚拟井权重: 1.0 (共{len(X_pseudo)}个)")

# Names consumed by the training step.
X_combined = X_combined_aug
y_combined = y_combined_aug
sample_weights = sample_weights_aug

print(f"\n最终训练数据:")
print(f"  - 总样本数: {len(X_combined)}")
print(f"  - 目标变量范围: {y_combined.min():.2f} - {y_combined.max():.2f}")

## 遍历所有特征组合训练SVR模型

In [None]:
# Train one grid-searched SVR per combination of feature groups.
# KFold is imported here because only this cell needs it.
from sklearn.model_selection import KFold

all_combinations = list(combinations(range(len(feature_groups)), n_select))
print(f"=== 开始训练 {len(all_combinations)} 个SVR模型 ===")

# Results of every successfully trained model.
model_results = []
best_models = []  # kept for compatibility; not filled in this cell

# Fix the random seeds for reproducibility.
np.random.seed(42)
random.seed(42)

start_time = time.time()

for i, combination in enumerate(all_combinations):
    print(f"\n训练模型 {i + 1}/{len(all_combinations)}: 组合 {combination}")

    # Pick one feature per selected group (seed varies per combination).
    selected_features = select_features_from_groups(feature_groups, combination, random_seed=42 + i)
    print(f"  选择的特征: {selected_features}")

    # Column positions of the selected features inside X_combined.
    feature_indices = [common_features.index(f) for f in selected_features if f in common_features]

    if len(feature_indices) != len(selected_features):
        print(f"  警告: 部分特征不在数据中，跳过此组合")
        continue

    X_train = X_combined[:, feature_indices]

    # Standardise the features; the fitted scaler is stored with the model so
    # the same transform can be applied at prediction time.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    try:
        svr = SVR(kernel="rbf")

        # Use at most 3 folds and never fewer than 2.
        cv_folds = max(2, min(3, len(X_train) // 3))

        # The training rows are ordered by origin (real wells, augmented
        # samples, pseudo wells). An integer cv would use unshuffled KFold and
        # therefore split the folds by origin, so shuffle with a fixed seed.
        cv_splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

        grid_search = GridSearchCV(
            svr,
            param_grid,
            cv=cv_splitter,
            scoring="r2",
            n_jobs=-1,
            return_train_score=True,  # keep train scores for over-fitting analysis
        )

        print(f"  使用{cv_folds}折交叉验证")

        grid_search.fit(X_train_scaled, y_combined, sample_weight=sample_weights)

        best_svr = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_cv_score = grid_search.best_score_

        # Fit quality on the full training set.
        train_pred = best_svr.predict(X_train_scaled)
        train_r2 = r2_score(y_combined, train_pred)
        train_rmse = np.sqrt(mean_squared_error(y_combined, train_pred))

        # A large gap between train and CV R^2 signals over-fitting.
        overfitting_score = train_r2 - best_cv_score
        if overfitting_score > 0.5:
            print(f"  警告: 检测到严重过拟合 (差异: {overfitting_score:.3f})")

        result = {
            "combination_index": i,
            "group_combination": combination,
            "selected_features": selected_features,
            "best_params": best_params,
            "cv_r2": best_cv_score,
            "train_r2": train_r2,
            "train_rmse": train_rmse,
            "overfitting_score": overfitting_score,
            "model": best_svr,
            "scaler": scaler,
            "feature_indices": feature_indices,
        }

        model_results.append(result)
        print(f"  最佳参数: {best_params}")
        print(f"  CV R^2: {best_cv_score:.4f}")
        print(f"  训练集 R^2: {train_r2:.4f}")
        print(f"  训练集 RMSE: {train_rmse:.4f}")

    except Exception as e:
        # Best-effort batch run: report the failure and move to the next combination.
        print(f"  训练失败: {str(e)}")
        continue

    # Progress report every 10 models.
    if (i + 1) % 10 == 0:
        elapsed_time = time.time() - start_time
        avg_time_per_model = elapsed_time / (i + 1)
        remaining_time = avg_time_per_model * (len(all_combinations) - i - 1)
        print(f"\n进度: {i + 1}/{len(all_combinations)}, 已用时: {elapsed_time:.1f}s, 预计剩余: {remaining_time:.1f}s")

total_time = time.time() - start_time
print(f"\n=== 所有模型训练完成 ===")
print(f"成功训练: {len(model_results)}/{len(all_combinations)} 个模型")
print(f"总用时: {total_time:.1f} 秒")

## 选择最佳模型并进行全区预测

In [None]:
# Rank the models by CV R^2, predict the whole survey with the top five,
# average them into an ensemble and export everything.

# Without any trained model the ensemble would silently become NaN and the
# exported files would be meaningless, so stop here with a clear error.
if not model_results:
    raise RuntimeError("没有成功训练的SVR模型，无法进行全区预测")

model_results.sort(key=lambda x: x["cv_r2"], reverse=True)
top_models = model_results[:5]

print("=== 前5个最佳模型 ===")
for i, result in enumerate(top_models):
    print(f"\n模型 {i + 1}:")
    print(f"  特征组合: {result['group_combination']}")
    print(f"  选择特征: {result['selected_features']}")
    print(f"  最佳参数: {result['best_params']}")
    print(f"  CV R^2: {result['cv_r2']:.4f}")
    print(f"  训练集 R^2: {result['train_r2']:.4f}")
    print(f"  训练集 RMSE: {result['train_rmse']:.4f}")

print("\n=== 开始全区预测 ===")

# Survey-wide feature table; remaining gaps are filled with the column mean.
seismic_data = processed_seismic_full.copy()
seismic_features = seismic_data[common_features].fillna(seismic_data[common_features].mean())

# One prediction column per top model.
for i, result in enumerate(top_models):
    print(f"\n使用模型 {i + 1} 进行全区预测...")

    # feature_indices are positions within common_features, which is exactly
    # the column order of seismic_features.
    X_seismic = seismic_features.iloc[:, result["feature_indices"]].values

    # Apply the scaler fitted during training, then predict.
    X_seismic_scaled = result["scaler"].transform(X_seismic)
    predictions = result["model"].predict(X_seismic_scaled)

    # Sand thickness cannot be negative.
    predictions = np.maximum(predictions, 0)

    column_name = f"SVR_Model_{i + 1}_Prediction"
    seismic_data[column_name] = predictions

    print(f"  预测完成，预测值范围: {predictions.min():.2f} - {predictions.max():.2f}")
    print(f"  平均预测值: {predictions.mean():.2f}")

# Ensemble = point-wise mean of the top-model predictions.
ensemble_predictions = np.mean(
    [seismic_data[f"SVR_Model_{i + 1}_Prediction"].values for i in range(len(top_models))],  # type: ignore
    axis=0,
)

seismic_data["SVR_Ensemble_Prediction"] = ensemble_predictions

print(f"\n集成预测完成:")
print(f"  预测值范围: {ensemble_predictions.min():.2f} - {ensemble_predictions.max():.2f}")
print(f"  平均预测值: {ensemble_predictions.mean():.2f}")

# Full table with every prediction column.
output_file = os.path.join(output_dir, "svr_predictions_all_models.csv")
seismic_data.to_csv(output_file, index=False)
print(f"\n全区预测结果已保存到: {output_file}")

print("\n=== 生成Petrel格式文件 ===")

base_coords = seismic_data[["X", "Y"]].copy()

# Header written to the space-separated files. The thickness column is
# exported as "Sand_Thickness": a name containing a space would be quoted by
# the CSV writer and would not match the "X Y Sand_Thickness" format
# announced below.
petrel_header = ["X", "Y", "Sand_Thickness"]

# One file per top model.
for i, result in enumerate(top_models):
    model_num = i + 1
    column_name = f"SVR_Model_{model_num}_Prediction"

    petrel_data = base_coords.copy()
    petrel_data["Sand Thickness"] = seismic_data[column_name]

    txt_filename = f"SVR_Model_{model_num}_Prediction.txt"
    txt_filepath = os.path.join(output_dir, txt_filename)

    petrel_data.to_csv(
        txt_filepath,
        sep=" ",  # space separated
        index=False,  # no index column
        header=petrel_header,  # column aliases without spaces
        float_format="%.6f",  # 6 decimals
    )

    print(f"  模型{model_num}已保存: {txt_filename}")
    print(f"    预测点数: {len(petrel_data):,}")
    print(f"    砂厚范围: {petrel_data['Sand Thickness'].min():.2f} - {petrel_data['Sand Thickness'].max():.2f} m")

# Ensemble file, same layout.
ensemble_petrel_data = base_coords.copy()
ensemble_petrel_data["Sand Thickness"] = ensemble_predictions

ensemble_txt_filename = "SVR_Ensemble_Prediction.txt"
ensemble_txt_filepath = os.path.join(output_dir, ensemble_txt_filename)

ensemble_petrel_data.to_csv(
    ensemble_txt_filepath,
    sep=" ",
    index=False,
    header=petrel_header,
    float_format="%.6f",
)

print(f"  集成模型已保存: {ensemble_txt_filename}")
print(f"    预测点数: {len(ensemble_petrel_data):,}")
print(f"    砂厚范围: {ensemble_predictions.min():.2f} - {ensemble_predictions.max():.2f} m")

print(f"\n=== Petrel格式文件生成完成 ===")
print(f"生成的文件列表:")
for i in range(len(top_models)):
    print(f"  • SVR_Model_{i + 1}_Prediction.txt")
print(f"  • SVR_Ensemble_Prediction.txt")
print(f"\n所有文件均保存在目录: {output_dir}")
print(f"文件格式: X Y Sand_Thickness (空格分隔)")
print(f"可直接导入Petrel进行可视化和分析")

In [None]:
# Plot the prediction maps: one per top model, then the ensemble.

print("\n=== 开始生成预测结果可视化图 ===")

# Plot settings.
class_thresholds = [1.0, 10.0]  # sand-thickness class boundaries; adjust as needed
figsize = (14, 14)
point_size = 10
well_size = 60
# Fixed colour range of 0-30 m (a data-driven upper bound such as
# max(ensemble_predictions.max(), 15) is a possible alternative).
vrange = (0, 30)

# Keyword arguments shared by every map.
common_map_kwargs = {
    "real_wells": well_attr,  # overlay the real wells
    "pseudo_wells": None,
    "target_column": "Sand Thickness",
    "output_dir": output_dir,
    "class_thresholds": class_thresholds,
    "figsize": figsize,
    "point_size": point_size,
    "well_size": well_size,
    "vrange": vrange,
    "cmap": "viridis",
}

# One map per top model, labelled with its CV score and features.
for model_num, result in enumerate(top_models, start=1):
    column_name = f"SVR_Model_{model_num}_Prediction"

    print(f"\n生成模型 {model_num} 预测结果可视化图...")

    features_str = ", ".join(result["selected_features"])
    cv_r2 = result["cv_r2"]

    visualize_attribute_map(
        data_points=seismic_data,
        attribute_name=column_name,
        attribute_label=f"模型{model_num}砂厚预测值(m)\nCV R^2={cv_r2:.3f}\n特征: {features_str}",
        filename_prefix=f"svr_model_{model_num}_prediction",
        **common_map_kwargs,
    )

# Ensemble map.
print(f"\n生成集成预测结果可视化图...")

# Mean CV R^2 of the models that make up the ensemble.
avg_cv_r2 = np.mean([result["cv_r2"] for result in top_models])

visualize_attribute_map(
    data_points=seismic_data,
    attribute_name="SVR_Ensemble_Prediction",
    attribute_label=f"SVR集成砂厚预测值(m)\n平均CV R^2={avg_cv_r2:.3f}\n(前5个最佳模型平均)",
    filename_prefix="svr_ensemble_prediction",
    **common_map_kwargs,
)

print("\n=== 预测结果可视化完成 ===")

In [None]:
# Write the per-model summary table (model_results is already sorted by CV R^2).
results_summary = [
    {
        "model_rank": rank,
        "combination_index": entry["combination_index"],
        "group_combination": str(entry["group_combination"]),
        "selected_features": str(entry["selected_features"]),
        "cv_r2": entry["cv_r2"],
        "train_r2": entry["train_r2"],
        "train_rmse": entry["train_rmse"],
        "best_params": str(entry["best_params"]),
    }
    for rank, entry in enumerate(model_results, start=1)
]

summary_df = pd.DataFrame(results_summary)
summary_file = os.path.join(output_dir, "svr_model_results_summary.csv")
summary_df.to_csv(summary_file, index=False)
print(f"模型结果摘要已保存到: {summary_file}")

# Statistics of the predicted thickness, per top model and for the ensemble.
print("\n=== 预测结果统计报告 ===")

prediction_stats = []
for model_num, entry in enumerate(top_models, start=1):
    predictions = seismic_data[f"SVR_Model_{model_num}_Prediction"]
    prediction_stats.append(
        {
            "模型": f"模型{model_num}",
            "特征组合": str(entry["group_combination"]),
            "选择特征": str(entry["selected_features"]),
            "CV_R2": entry["cv_r2"],
            "预测最小值": predictions.min(),
            "预测最大值": predictions.max(),
            "预测平均值": predictions.mean(),
            "预测标准差": predictions.std(),
            "预测中位数": predictions.median(),
            "大于1m的点数": (predictions > 1.0).sum(),
            "大于5m的点数": (predictions > 5.0).sum(),
            "大于10m的点数": (predictions > 10.0).sum(),
        }
    )

# Ensemble row, appended after the individual models.
ensemble_stats = {
    "模型": "集成预测",
    "特征组合": "前5个模型平均",
    "选择特征": "多模型组合",
    "CV_R2": avg_cv_r2,
    "预测最小值": ensemble_predictions.min(),
    "预测最大值": ensemble_predictions.max(),
    "预测平均值": ensemble_predictions.mean(),
    "预测标准差": ensemble_predictions.std(),
    "预测中位数": np.median(ensemble_predictions),
    "大于1m的点数": (ensemble_predictions > 1.0).sum(),
    "大于5m的点数": (ensemble_predictions > 5.0).sum(),
    "大于10m的点数": (ensemble_predictions > 10.0).sum(),
}
prediction_stats.append(ensemble_stats)

# Save the statistics table (utf-8-sig is used for this file's encoding).
stats_df = pd.DataFrame(prediction_stats)
stats_file = os.path.join(output_dir, "prediction_statistics.csv")
stats_df.to_csv(stats_file, index=False, encoding="utf-8-sig")
print(f"预测统计信息已保存到: {stats_file}")

# Console digest; the CV score line is printed for the individual models only.
print("\n预测结果统计摘要:")
total_points = len(seismic_data)
individual_model_count = len(top_models)
for position, row in enumerate(prediction_stats):
    print(f"\n{row['模型']}:")
    if position < individual_model_count:
        print(f"  CV R^2: {row['CV_R2']:.4f}")
    print(f"  预测范围: {row['预测最小值']:.2f} - {row['预测最大值']:.2f} m")
    print(f"  平均值: {row['预测平均值']:.2f} m")
    print(f"  标准差: {row['预测标准差']:.2f} m")
    print(f"  厚度>1m的点数: {row['大于1m的点数']} ({row['大于1m的点数'] / total_points * 100:.1f}%)")
    print(f"  厚度>5m的点数: {row['大于5m的点数']} ({row['大于5m的点数'] / total_points * 100:.1f}%)")

print(f"\n=== 所有结果已保存到目录: {output_dir} ===")