# PCA + Sigmoid 生成伪样本

In [None]:
# 确保src目录在Python路径中
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
sys.path.append(os.path.abspath("../"))

# 导入模块
from src.data_utils import (
    extract_seismic_attributes_for_wells,
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    filter_outlier_wells,
    filter_seismic_by_wells,
    identify_attributes,
    parse_petrel_file,
    preprocess_features,
)
from src.gmm_clustering import evaluate_gmm_clusters, perform_gmm_clustering
from src.pca_analysis import perform_pca_analysis
from src.sigmoid import SigmoidModel
from src.visualization import (
    visualize_attribute_map,
    visualize_feature_distribution,
    visualize_gmm_clustering,
    visualize_pca_clustering,
)

# Input data lives one level above the notebook. The path is built with
# os.path.join so it is valid on POSIX as well as Windows (a literal "..\\data"
# is only interpreted as a relative path on Windows).
data_dir = os.path.join("..", "data")
data_tmp_dir = "data_tmp"  # intermediate tables written by this notebook
output_dir = "H5_2_ps_output"  # figures and result files

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of os.path.exists() followed by os.makedirs().
for _work_dir in (data_tmp_dir, output_dir):
    os.makedirs(_work_dir, exist_ok=True)


# Configure a font with Chinese glyph support so the Chinese plot labels render
plt.rcParams["font.family"] = "SimHei"  # SimHei (a Hei-style typeface) supports Chinese characters
plt.rcParams["axes.unicode_minus"] = False  # display the minus sign correctly

## 导入地震数据

修改 data_seismic_url = os.path.join(data_dir, "H5-2") 这一行可以更换数据

In [None]:
# Location of the seismic attribute export for the "H5-2" dataset, relative to data_dir
data_seismic_url = os.path.join(data_dir, "H5-2")

# Parse the file into the seismic attribute table used by the rest of the notebook
# (later cells read its "X" and "Y" columns).
data_seismic_attr = parse_petrel_file(data_seismic_url)

## 导入井点位置

修改 data_well_position[data_well_position["Surface"] == "H5-2"] 这一行更换对应层位

In [None]:
# Load the well table (one row per well and surface).
well_table_path = os.path.join(data_dir, "well_without_attr.xlsx")
data_well_position = pd.read_excel(well_table_path)

# Keep only the rows of the target surface, treat -999 as missing, and drop
# wells that have no sand thickness value.
on_target_surface = data_well_position["Surface"] == "H5-2"
surface_wells = data_well_position[on_target_surface]
surface_wells = surface_wells.replace(-999, np.nan)  # -999 is the missing-value sentinel
surface_wells = surface_wells.dropna(subset=["Sand Thickness"])
data_well_purpose_surface_position = surface_wells.reset_index(drop=True)
data_well_purpose_surface_position.head()

## 筛除离群井

只在井点密集的区域设置虚拟井

In [None]:
# Remove outlier wells (method="iqr") and compare the well layout before and after.
data_well_purpose_surface_filtered = filter_outlier_wells(data_well_purpose_surface_position, method="iqr")

# Report how many wells survive the filter
print(f"筛选前井点数量: {len(data_well_purpose_surface_position)}")
print(f"筛选后井点数量: {len(data_well_purpose_surface_filtered)}")

plt.figure(figsize=(12, 6))

# Both panels share axis limits derived from the unfiltered wells, so the
# two scatter plots are directly comparable.
all_x = data_well_purpose_surface_position["X"]
all_y = data_well_purpose_surface_position["Y"]
x_min, x_max = all_x.min(), all_x.max()
y_min, y_max = all_y.min(), all_y.max()

# Pad the limits by 5% of the data extent on every side
margin = 0.05
x_range = x_max - x_min
y_range = y_max - y_min
x_min -= x_range * margin
x_max += x_range * margin
y_min -= y_range * margin
y_max += y_range * margin

# Left panel: wells before filtering; right panel: wells after filtering
panels = (
    (data_well_purpose_surface_position, "blue", "筛选前井点分布"),
    (data_well_purpose_surface_filtered, "red", "筛选后井点分布"),
)
for panel_no, (panel_wells, panel_colour, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.scatter(panel_wells["X"], panel_wells["Y"], c=panel_colour)
    plt.title(panel_title)
    plt.xlabel("X坐标")
    plt.ylabel("Y坐标")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "well_filtering_comparison.png"))
plt.show()

## 处理属性缺失值

In [None]:
# Discover the seismic attribute column names from the source file
attribute_names, _ = identify_attributes(data_seismic_url)

# Cleaning options: -999 marks a missing value; outliers are detected with the
# IQR method (threshold 2.0) and clipped to the boundary rather than dropped.
cleaning_options = {
    "missing_values": [-999],
    "missing_threshold": 0.6,
    "outlier_method": "iqr",
    "outlier_threshold": 2.0,
    "outlier_treatment": "clip",
    "verbose": True,
}
processed_features, stats, report = preprocess_features(
    data=data_seismic_attr,
    attribute_columns=attribute_names,
    **cleaning_options,
)

# Attributes that survived preprocessing
attribute_names_filtered = list(processed_features.columns)

# Re-attach the cleaned attribute columns to the original X/Y coordinates
processed_seismic_full = data_seismic_attr[["X", "Y"]].copy()  # type: ignore
for col in attribute_names_filtered:
    processed_seismic_full[col] = processed_features[col]

## 根据井点分布，缩小工区范围

大工区 → 小工区

In [None]:
# Restrict the survey to the area around the outlier-filtered wells
seismic_attr_filtered, area_bounds = filter_seismic_by_wells(
    seismic_data=processed_seismic_full,
    well_data=data_well_purpose_surface_filtered,
    expansion_factor=2,  # original note says "expand by 100%"; exact semantics live in filter_seismic_by_wells — TODO confirm
    plot=True,
    output_dir=output_dir,
)

# The boundary information in area_bounds can be reused directly by later steps
print("区域边界信息:")
for key, value in area_bounds.items():
    print(f"  {key}: {value}")

## 提取井点处地震属性

In [None]:
# Extract seismic attributes at the outlier-filtered well locations.
# NOTE(review): max_distance / num_points presumably bound the neighbour search
# around each well — confirm against extract_seismic_attributes_for_wells.
well_attr_filtered = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_filtered, seismic_data=processed_seismic_full, max_distance=50, num_points=5
)

# Save the result to the temporary data directory
well_attr_filtered.to_excel(os.path.join(data_tmp_dir, "wells_attr_filtered.xlsx"), index=False)
print("筛选后井点的地震属性已保存到 wells_attr_filtered.xlsx")

## 生成统计摘要

只应用于小工区范围内的井点

In [None]:
# Select the good-quality attributes. The call receives both the reduced-area
# seismic data and the attributes extracted at the wells.
# NOTE(review): presumably the two are compared statistically per attribute — confirm in filter_anomalous_attributes.
good_attributes, anomalous_attributes, attribute_stats = filter_anomalous_attributes(
    seismic_data=seismic_attr_filtered,
    well_data=well_attr_filtered,
    common_attributes=attribute_names_filtered,
    ratio_threshold=5.0,  # threshold on the ratio of means
    range_ratio_threshold=10.0,  # threshold on the ratio of value ranges
    std_ratio_threshold=10.0,  # threshold on the ratio of standard deviations
    output_dir=None,
    verbose=True,
)

# List the attributes that were kept
print("\n筛选后保留的质量良好属性:")
for attr in good_attributes:
    print(f"- {attr}")

## PCA 降维

只应用于小工区范围

In [None]:
# Run PCA on the good-quality attributes of the reduced survey area.
# Later cells read these keys from pca_results: "features_pca", "coords_clean",
# "features_clean", "features_scaled", "scaler", "pca" and "explained_variance_ratio".
# NOTE(review): variance_threshold=0.75 presumably keeps enough components to
# explain 75% of the variance — confirm in perform_pca_analysis.
pca_results = perform_pca_analysis(
    data=seismic_attr_filtered,
    attribute_columns=good_attributes,
    variance_threshold=0.75,
    output_dir=output_dir,
)

## GMM 聚类

只应用于小工区范围内的地震数据（聚类输入为上一步的 PCA 特征，井点仅用于叠加显示）

In [None]:
# 首先评估最佳聚类数
# gmm_evaluation = evaluate_gmm_clusters(features_pca=pca_results["features_pca"], max_clusters=10, output_dir=output_dir)

# 使用不同的聚类数执行GMM聚类
# 根据BIC/AIC结果选择的最佳聚类数
# best_n = gmm_evaluation["best_n_components"]

In [None]:
best_n = 2  # number of clusters

# 1. Run GMM clustering on the PCA features and save the per-point result table
gmm_results = perform_gmm_clustering(
    features=pca_results["features_pca"],
    coords=pca_results["coords_clean"],
    n_clusters=best_n,
)
gmm_results["result_df"].to_csv(os.path.join(output_dir, "gmm_best_clusters.csv"), index=False)

# 2. For the PCA-space plot the well data must be projected into PCA space.
# First take the attribute columns at the wells (same columns as pca_results["features_clean"])
well_features = well_attr_filtered[pca_results["features_clean"].columns].values
# Transform the well data with the same scaler and PCA model as the seismic data
# NOTE(review): assumes well_features contains no NaN — verify against the extraction step
well_features_scaled = pca_results["scaler"].transform(well_features)
well_pca_features = pca_results["pca"].transform(well_features_scaled)

# 3. Visualize the clustering result in PCA space
visualize_pca_clustering(
    clustering_results=gmm_results,
    pca_results=pca_results,
    n_clusters=best_n,
    output_dir=output_dir,
    prefix="pca",
    well_data=data_well_purpose_surface_filtered,
    well_pca_features=well_pca_features,
    target_column="Sand Thickness",
    class_thresholds=[0.1, 10],
)

# 4. Visualize the clustering result in geographic (X/Y) space
visualize_gmm_clustering(
    clustering_results=gmm_results,
    output_dir=output_dir,
    prefix="pca",
    well_data=data_well_purpose_surface_filtered,
    target_column="Sand Thickness",
    class_thresholds=[0.1, 10],
    point_size=10,
    well_size=50,
)

## 通过 PCA 进行 sigmoid 拟合

In [None]:
# Step 0: assemble the modelling table (PCA features at the wells + sand thickness)
print("=== 开始 Sigmoid 建模 ===")

# Use at most the first three principal components as candidate features
n_components = min(3, well_pca_features.shape[1])
sigmoid_data = pd.DataFrame(
    {f"PC{k + 1}": well_pca_features[:, k] for k in range(n_components)}
)

# Target variable: measured sand thickness at the outlier-filtered wells
sigmoid_data["Sand Thickness"] = data_well_purpose_surface_filtered["Sand Thickness"].values

# Names of the principal-component columns that are available as features
pc_columns = [name for name in sigmoid_data.columns if name.startswith("PC")]

print(f"Sigmoid建模数据形状: {sigmoid_data.shape}")
print(f"可用的PCA特征: {pc_columns}")
print("\n数据预览:")
print(sigmoid_data.head())

# Create the sigmoid model on top of the modelling table
sigmoid_model = SigmoidModel(data=sigmoid_data, feature_columns=pc_columns, target_column="Sand Thickness")

In [None]:
# Step 1: visualize the original sample distribution (PC1 against sand thickness)
print("\n=== 步骤1: 可视化原始样本分布 ===")

fig1 = visualize_feature_distribution(
    data=sigmoid_data,
    x_feature="PC1",
    y_feature="Sand Thickness",
    figsize=(10, 6),
    point_size=100,
    alpha=0.7,
    colormap="viridis",
    title="样本分布: PC1 vs Sand Thickness",
    save_path=os.path.join(output_dir, "sigmoid_original_distribution.png"),
)
plt.show()

# Summary statistics of the data; the fitting cell uses them for its bounds and initial guess
pc1_min, pc1_max = sigmoid_data["PC1"].min(), sigmoid_data["PC1"].max()
pc1_median = sigmoid_data["PC1"].median()
sand_thickness_max = sigmoid_data["Sand Thickness"].max()
sand_thickness_min = sigmoid_data["Sand Thickness"].min()

print(f"\n数据特征分析:")
print(f"PC1范围: {pc1_min:.2f} 到 {pc1_max:.2f}")
print(f"PC1中位数: {pc1_median:.2f}")
print(f"砂厚范围: {sand_thickness_min:.2f} 到 {sand_thickness_max:.2f} m")
print(f"样本数量: {len(sigmoid_data)}")

In [None]:
# Step 2: configure virtual points and run the sigmoid fit.
# Several alternative virtual-point configurations are kept below as commented-out
# templates; only virtual_config_sand_only is active.
print(f"\n=== 步骤2: Sigmoid拟合 ===")

# 1. Manually specify the mudstone interval
# virtual_config_manual_mud = {
#     "mud_range": (2, 4),  # only place mudstone virtual points
#     "sand_range": None,  # no sandstone virtual points
#     "n_points": 10,
#     "noise_factor": 0.1,
# }

# 2. Manually specify the sandstone interval (active configuration)
virtual_config_sand_only = {
    "mud_range": None,  # no mudstone virtual points
    "sand_range": (4, 5),  # only place sandstone virtual points
    "n_points": 10,
    "noise_factor": 0.05,
}

# 3. Improved conservative automatic strategy (recommended by the original author)
# virtual_config_conservative = {
#     "placement_strategy": "conservative",  # place points conservatively inside the data range
#     "n_points": 10,
#     "noise_factor": 0.05,
#     "auto_detect": True,
# }

# 4. Traditional extension strategy
# virtual_config_extended = {
#     "placement_strategy": "extended",  # place points extended beyond the data range
#     "n_points": 10,
#     "noise_factor": 0.05,
#     "auto_detect": True,
# }

# 5. Manually specify both the sandstone and the mudstone intervals
# virtual_config_manual_both = {
#     "mud_range": (2, 3),      # range for mudstone virtual points
#     "sand_range": (-6, -5),      # range for sandstone virtual points
#     "n_points": 10,
#     "noise_factor": 3.0
# }

# 6. No virtual points (fit on the original data only)
# virtual_config_none = None

# print(f"智能虚拟点配置:")
# print(f"  模式: 智能自动检测")
# print(f"  每侧点数: {virtual_config_conservative['n_points']}")
# print(f"  噪音因子: {virtual_config_conservative['noise_factor']}")

# Run the fit on PC1 only. The three bound/initial-guess entries are reported
# later as the parameters "L", "k" and "x0":
#   L  bounded to 0.2x .. 3x the maximum observed sand thickness,
#   k  bounded to -10 .. 10,
#   x0 bounded to the PC1 range widened by one full range on each side.
fit_result = sigmoid_model.fit(
    use_features=["PC1"],
    virtual_points_config=virtual_config_sand_only,
    bounds=(
        [sand_thickness_max * 0.2, -10, pc1_min - (pc1_max - pc1_min)],  # lower bounds
        [sand_thickness_max * 3.0, 10, pc1_max + (pc1_max - pc1_min)],  # upper bounds
    ),
    initial_guess=[sand_thickness_max * 0.7, 1.0, pc1_median],
    max_iterations=3000,
)

In [None]:
# Step 3: report and visualize the single-feature fit
if not fit_result["success"]:
    print("\n=== 拟合失败 ===")
    print(f"错误信息: {fit_result['error']}")
else:
    print("\n=== 拟合成功! ===")

    # Plot the fitted curve and save the figure
    fit_figure_path = os.path.join(output_dir, "sigmoid_fit_result.png")
    fig2 = sigmoid_model.visualize_fit(fit_result, figsize=(15, 6), save_path=fit_figure_path)
    plt.show()

    # Fitted parameters with their error estimates
    print("\n拟合参数:")
    params = fit_result["params"]
    param_errors = fit_result["param_errors"]
    for param in ("L", "k", "x0"):
        print(f"  {param}: {params[param]:.4f} ± {param_errors[param + '_err']:.4f}")
    print(f"  R^2 score: {fit_result['r2_score']:.4f}")

    # Plain-language reading of each parameter
    growth_direction = "正向增长" if params["k"] > 0 else "负向增长"
    print("\n参数解释:")
    print(f"  L = {params['L']:.2f}: 最大砂厚渐近值 (m)")
    print(f"  k = {params['k']:.3f}: 增长率 ({growth_direction})")
    print(f"  x₀ = {params['x0']:.2f}: 中点位置（PC1值）")

    # Goodness-of-fit metrics computed from the residuals of the fit
    fit_residuals = fit_result["y"] - fit_result["y_pred"]
    rmse = np.sqrt(np.mean(fit_residuals**2))
    mae = np.mean(np.abs(fit_residuals))
    print("\n拟合质量:")
    print(f"  RMSE: {rmse:.3f} m")
    print(f"  MAE: {mae:.3f} m")

In [None]:
# Step 4: try a PC1+PC2 fit (when PC2 exists) and pick the better model.
# The single-feature fit is the fallback in every case where the multi-feature
# fit is not attempted, fails, or does not improve R^2.
use_multi = False

if "PC2" in sigmoid_data.columns and fit_result["success"]:
    print("\n=== 步骤4: 多特征组合拟合 ===")

    explained_ratio = pca_results["explained_variance_ratio"]
    if len(explained_ratio) >= 2:
        # Weight each component by its share of the variance explained by PC1 and PC2
        var_pc1, var_pc2 = explained_ratio[0], explained_ratio[1]
        total_var = var_pc1 + var_pc2
        pc1_weight = var_pc1 / total_var
        pc2_weight = var_pc2 / total_var

        print("使用PC1+PC2组合:")
        print(f"  PC1权重: {pc1_weight:.3f} (方差贡献: {var_pc1:.3f})")
        print(f"  PC2权重: {pc2_weight:.3f} (方差贡献: {var_pc2:.3f})")

        # Multi-feature fit, using the same sand-only virtual-point configuration
        # as the single-feature fit
        fit_result_multi = sigmoid_model.fit(
            use_features=["PC1", "PC2"],
            feature_weights=[pc1_weight, pc2_weight],
            virtual_points_config=virtual_config_sand_only,
            bounds=([sand_thickness_max * 0.2, -10, -5], [sand_thickness_max * 3.0, 10, 5]),
            max_iterations=3000,
        )

        if not fit_result_multi["success"]:
            print(f"多特征拟合失败: {fit_result_multi['error']}")
        else:
            print("多特征拟合成功!")
            multi_figure_path = os.path.join(output_dir, "sigmoid_multi_feature_fit.png")
            fig3 = sigmoid_model.visualize_fit(fit_result_multi, figsize=(15, 6), save_path=multi_figure_path)
            plt.show()

            # Compare the single-feature and multi-feature results
            r2_single = fit_result["r2_score"]
            r2_multi = fit_result_multi["r2_score"]
            print("\n性能比较:")
            print(f"  单特征(PC1) R^2: {r2_single:.4f}")
            print(f"  多特征(PC1+PC2) R^2: {r2_multi:.4f}")
            print(f"  R^2提升: {r2_multi - r2_single:.4f}")

            # The multi-feature model wins only with a strictly higher R^2
            use_multi = r2_multi > r2_single
            print("  → 多特征模型表现更好" if use_multi else "  → 单特征模型表现更好")

if use_multi:
    best_fit = fit_result_multi
    best_model_name = "多特征(PC1+PC2)"
else:
    best_fit = fit_result
    best_model_name = "单特征(PC1)"

In [None]:
# Step 5: save the model summary and predict sand thickness for the PCA input points
if not best_fit["success"]:
    print("\n模型拟合失败，无法进行后续预测")
else:
    print("\n=== 步骤5: 保存结果并预测全工区 ===")

    # One-row summary of the selected model: configuration, parameters, errors and metrics
    fit_residuals = best_fit["y"] - best_fit["y_pred"]
    fit_summary = {
        "model_type": "sigmoid",
        "best_model": best_model_name,
        "features_used": str(best_fit["use_features"]),
        "feature_weights": str(best_fit.get("feature_weights", "None")),
        "n_samples": len(sigmoid_data),
        "n_virtual_points": len(sigmoid_model.current_data) - len(sigmoid_data),  # type: ignore
        "virtual_config": str(virtual_config_sand_only),
        **best_fit["params"],
        **best_fit["param_errors"],
        "r2_score": best_fit["r2_score"],
        "rmse": np.sqrt(np.mean(fit_residuals**2)),
        "mae": np.mean(np.abs(fit_residuals)),
    }

    # Write the summary to CSV
    summary_path = os.path.join(output_dir, "sigmoid_model_summary.csv")
    summary_df = pd.DataFrame([fit_summary])
    summary_df.to_csv(summary_path, index=False)
    print(f"模型摘要已保存到: {summary_path}")

    print("\n对全工区进行砂厚预测...")

    # Project the scaled seismic features into PCA space and keep as many leading
    # components as the selected model uses
    seismic_pca_features = pca_results["pca"].transform(pca_results["features_scaled"])
    max_components = len(best_fit["use_features"])
    seismic_pca_df = pd.DataFrame(
        {f"PC{k + 1}": seismic_pca_features[:, k] for k in range(max_components)}
    )

    # Predict with the selected model
    predicted_thickness = sigmoid_model.predict(
        seismic_pca_df,
        use_features=best_fit["use_features"],
        feature_weights=best_fit.get("feature_weights"),
    )

    # Attach the prediction and model metadata to the point coordinates
    prediction_results = pca_results["coords_clean"].copy()
    prediction_results["Predicted_Sand_Thickness"] = predicted_thickness
    prediction_results["Model_Type"] = best_model_name
    prediction_results["Model_R2"] = best_fit["r2_score"]

    # Write the predictions to CSV
    prediction_path = os.path.join(output_dir, "predicted_sand_thickness.csv")
    prediction_results.to_csv(prediction_path, index=False)
    print(f"预测结果已保存到: {prediction_path}")

    # Prediction statistics
    print("\n预测结果统计:")
    print(f"  预测样本数: {len(prediction_results)}")
    print(f"  预测砂厚范围: {predicted_thickness.min():.2f} - {predicted_thickness.max():.2f} m")
    print(f"  预测砂厚均值: {predicted_thickness.mean():.2f} m")
    print(f"  预测砂厚标准差: {predicted_thickness.std():.2f} m")
    print(f"  使用模型: {best_model_name}")
    print(f"  模型R^2: {best_fit['r2_score']:.4f}")

    print("\n=== Sigmoid建模完成! ===")
    print(f"最佳模型: {best_model_name} (R^2 = {best_fit['r2_score']:.4f})")

In [None]:
# Step 6: visualize the prediction with visualize_attribute_map and build a
# four-panel diagnostic figure comparing predictions with the well measurements
if best_fit["success"]:
    print(f"\n=== 步骤6: 可视化预测结果 ===")

    # 1. Map of the predicted sand thickness with the real wells overlaid
    print("生成预测砂厚空间分布图...")
    visualize_attribute_map(
        data_points=prediction_results,
        attribute_name="Predicted_Sand_Thickness",
        attribute_label="预测砂厚 (m)",
        real_wells=data_well_purpose_surface_filtered,
        pseudo_wells=None,  # no pseudo wells at this stage
        target_column="Sand Thickness",
        output_dir=output_dir,
        filename_prefix="sigmoid_prediction",
        class_thresholds=[1, 10],
        figsize=(14, 10),
        dpi=300,
        cmap="viridis",
        point_size=150,
        well_size=200,
        vrange=None,  # use the data's own value range
    )

    # 2. Detailed prediction analysis figure
    plt.figure(figsize=(15, 10))

    # Panel 1: predicted vs. measured sand thickness at the wells
    plt.subplot(2, 2, 1)
    # Look up the prediction at each well location for comparison
    well_coords = data_well_purpose_surface_filtered[["X", "Y"]].values
    pred_coords = prediction_results[["X", "Y"]].values

    # Find the nearest prediction point for every well
    from scipy.spatial.distance import cdist

    # Full wells x prediction-points distance matrix; the argmin per row is the nearest point
    distances = cdist(well_coords, pred_coords)  # type: ignore
    closest_indices = np.argmin(distances, axis=1)
    # NOTE(review): positional indexing assumes predicted_thickness is a NumPy array — confirm
    well_predictions = predicted_thickness[closest_indices]

    plt.scatter(sigmoid_data["Sand Thickness"], well_predictions, alpha=0.7, s=80, edgecolors="black")

    # Add the 1:1 reference line
    min_val = min(sigmoid_data["Sand Thickness"].min(), well_predictions.min())
    max_val = max(sigmoid_data["Sand Thickness"].max(), well_predictions.max())
    plt.plot([min_val, max_val], [min_val, max_val], "r--", alpha=0.8, linewidth=2)

    plt.xlabel("真实砂厚 (m)")
    plt.ylabel("预测砂厚 (m)")
    plt.title("预测 vs 真实砂厚")
    plt.grid(True, alpha=0.3)

    # Compute and display the correlation coefficient
    correlation = np.corrcoef(sigmoid_data["Sand Thickness"], well_predictions)[0, 1]
    plt.text(
        0.05,
        0.95,
        f"相关系数: {correlation:.3f}",
        transform=plt.gca().transAxes,
        bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8),
    )

    # Panel 2: histogram of predicted thickness against the well measurements
    plt.subplot(2, 2, 2)
    plt.hist(predicted_thickness, bins=50, alpha=0.7, color="skyblue", label=f"预测砂厚 (n={len(predicted_thickness)})")
    plt.hist(
        sigmoid_data["Sand Thickness"], bins=20, alpha=0.7, color="orange", label=f"井点砂厚 (n={len(sigmoid_data)})"
    )
    plt.xlabel("砂厚 (m)")
    plt.ylabel("频数")
    plt.title("砂厚分布对比")
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 3: residual distribution at the wells (measured - predicted)
    plt.subplot(2, 2, 3)
    residuals_well = sigmoid_data["Sand Thickness"].values - well_predictions
    plt.hist(residuals_well, bins=15, alpha=0.7, color="lightcoral")
    plt.axvline(x=0, color="red", linestyle="--", alpha=0.8)
    plt.xlabel("残差 (真实 - 预测)")
    plt.ylabel("频数")
    plt.title("井点处残差分布")
    plt.grid(True, alpha=0.3)

    # Add residual statistics (mean and standard deviation)
    residual_stats = f"均值: {np.mean(residuals_well):.3f}\n标准差: {np.std(residuals_well):.3f}"
    plt.text(
        0.05,
        0.95,
        residual_stats,
        transform=plt.gca().transAxes,
        verticalalignment="top",
        bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8),
    )

    # Panel 4: text-only model performance summary
    plt.subplot(2, 2, 4)
    plt.axis("off")  # hide the axes

    # Build the summary text (RMSE/MAE come from the fit itself, not from the nearest-point lookup)
    performance_text = f"""
模型性能摘要

最佳模型: {best_model_name}
R^2 评分: {best_fit["r2_score"]:.4f}
RMSE: {np.sqrt(np.mean((best_fit["y"] - best_fit["y_pred"]) ** 2)):.3f} m
MAE: {np.mean(np.abs(best_fit["y"] - best_fit["y_pred"])):.3f} m

预测结果统计:
1. 预测样本数: {len(prediction_results):,}
2. 预测砂厚范围: {predicted_thickness.min():.2f} - {predicted_thickness.max():.2f} m
3. 预测砂厚均值: {predicted_thickness.mean():.2f} m
4. 预测砂厚标准差: {predicted_thickness.std():.2f} m

井点对比:
1. 井点砂厚范围: {sigmoid_data["Sand Thickness"].min():.2f} - {sigmoid_data["Sand Thickness"].max():.2f} m
2. 井点砂厚均值: {sigmoid_data["Sand Thickness"].mean():.2f} m
3. 预测-实际相关系数: {correlation:.3f}
    """

    plt.text(
        0.1,
        0.9,
        performance_text,
        transform=plt.gca().transAxes,
        fontsize=11,
        verticalalignment="top",
        bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.8),
    )

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "sigmoid_prediction_detailed_analysis.png"), dpi=300, bbox_inches="tight")
    plt.show()

    # NOTE(review): the first two file names below are presumably what
    # visualize_attribute_map writes for this prefix — confirm.
    print("预测结果可视化已完成")
    print(f"  - 空间分布图: sigmoid_prediction_map_with_wells.png")
    print(f"  - 属性直方图: sigmoid_prediction_attribute_histogram.png")
    print(f"  - 详细分析图: sigmoid_prediction_detailed_analysis.png")

## 设置虚拟井

In [None]:
# Extract candidate sample points in preparation for placing pseudo wells
print("=== 提取样本，准备设置虚拟井 ===")

# Extract evenly spaced samples from the reduced seismic area
# NOTE(review): n_rows x n_cols presumably defines a 40 x 40 sampling grid — confirm in extract_uniform_seismic_samples
seismic_samples = extract_uniform_seismic_samples(
    seismic_data=seismic_attr_filtered,
    n_rows=40,
    n_cols=40,
    area_bounds=area_bounds,  # reuse the area bounds computed earlier
)

# Plot the real wells together with the sample points
plt.figure(figsize=(15, 10))

# Draw the seismic points as a background, subsampled to about 5000 points at most.
# NOTE: sample() is called without random_state, so the background differs between runs.
sample_ratio = min(1.0, 5000 / len(seismic_attr_filtered))
seismic_sample = seismic_attr_filtered.sample(frac=sample_ratio)
plt.scatter(seismic_sample["X"], seismic_sample["Y"], color="lightgray", alpha=0.3, s=10, label="地震数据(抽样)")

# Draw the real well locations
plt.scatter(
    data_well_purpose_surface_filtered["X"],
    data_well_purpose_surface_filtered["Y"],
    color="red",
    s=100,
    marker="^",
    label="真实井点",
)

# Draw the evenly spaced sample locations
plt.scatter(seismic_samples["X"], seismic_samples["Y"], color="blue", s=50, marker="o", label="等间距采样点")

# Title, axis labels and legend
plt.title("真实井点与等间距采样点分布", fontsize=16)
plt.xlabel("X坐标", fontsize=14)
plt.ylabel("Y坐标", fontsize=14)
plt.legend(loc="upper right")
plt.grid(True, linestyle="--", alpha=0.7)

# Save the figure
plt.savefig(os.path.join(output_dir, "real_wells_and_seismic_samples.png"), dpi=300, bbox_inches="tight")
plt.show()

# Save the extracted sample data
seismic_samples.to_csv(os.path.join(data_tmp_dir, "seismic_samples.csv"), index=False)
print(f"等间距地震样本数据已保存至 {os.path.join(data_tmp_dir, 'seismic_samples.csv')}")

print(f"提取的样本数量: {len(seismic_samples)}")
print(
    f"样本分布区域: X({seismic_samples['X'].min():.1f} - {seismic_samples['X'].max():.1f}), "
    f"Y({seismic_samples['Y'].min():.1f} - {seismic_samples['Y'].max():.1f})"
)

In [None]:
# Predict sand thickness at the pseudo-well candidates with the sigmoid model
print("=== 使用Sigmoid模型预测虚拟井砂厚 ===")

if best_fit["success"]:
    # Take the attribute columns of the sample points (same columns as pca_results["features_clean"])
    sample_features = seismic_samples[pca_results["features_clean"].columns].values

    # Transform the samples with the same scaler and PCA model
    sample_features_scaled = pca_results["scaler"].transform(sample_features)
    sample_pca_features = pca_results["pca"].transform(sample_features_scaled)

    # Build the PCA feature table, keeping as many leading components as the best model uses
    sample_pca_df = pd.DataFrame()
    max_components = len(best_fit["use_features"])
    for i in range(max_components):
        sample_pca_df[f"PC{i + 1}"] = sample_pca_features[:, i]

    # Predict with the best sigmoid model
    predicted_sample_thickness = sigmoid_model.predict(
        sample_pca_df, use_features=best_fit["use_features"], feature_weights=best_fit.get("feature_weights")
    )

    # Attach the prediction to the sample table (adds a column to seismic_samples in place)
    seismic_samples["Predicted_Sand_Thickness"] = predicted_sample_thickness

    # Replace negative predictions with 0
    negative_count = (predicted_sample_thickness < 0).sum()
    if negative_count > 0:
        print(f"注意: {negative_count} 个负的砂厚预测值已被替换为0")
        seismic_samples["Predicted_Sand_Thickness"] = seismic_samples["Predicted_Sand_Thickness"].clip(lower=0)

    # Prediction statistics
    print(f"\n虚拟井砂厚预测统计:")
    print(f"  样本数量: {len(seismic_samples)}")
    print(
        f"  预测砂厚范围: {seismic_samples['Predicted_Sand_Thickness'].min():.2f} - {seismic_samples['Predicted_Sand_Thickness'].max():.2f} m"
    )
    print(f"  预测砂厚均值: {seismic_samples['Predicted_Sand_Thickness'].mean():.2f} m")
    print(f"  预测砂厚标准差: {seismic_samples['Predicted_Sand_Thickness'].std():.2f} m")

    # Saving the pseudo wells with predictions is currently disabled
    # seismic_samples.to_csv(os.path.join(data_tmp_dir, "virtual_wells_with_predictions.csv"), index=False)
    # print(f"虚拟井预测结果已保存至 {os.path.join(data_tmp_dir, 'virtual_wells_with_predictions.csv')}")

else:
    print("Sigmoid模型拟合失败，无法生成虚拟井预测")

In [None]:
# Pseudo-well selection in three filtering layers:
#   1. drop candidates that lie near a real well but disagree with its sand thickness,
#   2. greedily enforce a minimum spacing between the remaining candidates,
#   3. cap the number of candidates per sand-thickness bin.
# The result is stored in optimized_pseudo_wells and written to CSV.

# Imported here so the cell runs even when no earlier cell has imported cdist
# (for example after a kernel restart or when an earlier cell was skipped).
from scipy.spatial.distance import cdist

print("=== 虚拟井优化选择（基于距离和砂厚分布）===")

if best_fit["success"]:
    # Work on copies so the source tables are not modified
    pseudo_wells_data = seismic_samples.copy()
    real_wells_data = data_well_purpose_surface_filtered.copy()

    print(f"开始优化筛选，初始虚拟井数量: {len(pseudo_wells_data)}")

    # === Layer 1: drop candidates close to a real well whose thickness differs too much ===
    print("\n第一层筛选：排除靠近真实井点且砂厚差异大的点...")

    proximity_radius = 200  # exclusion radius (original comment gives metres)
    max_thickness_diff = 5.0  # largest allowed sand-thickness difference (original comment gives metres)

    # Coordinates and measured thickness of the real wells
    real_coords = real_wells_data[["X", "Y"]].values
    real_thickness = real_wells_data["Sand Thickness"].values

    # Coordinates and predicted thickness of the pseudo-well candidates
    pseudo_coords = pseudo_wells_data[["X", "Y"]].values
    pseudo_thickness = pseudo_wells_data["Predicted_Sand_Thickness"].values

    # Distance from every candidate to every real well, and the nearest well per candidate
    distances = cdist(pseudo_coords, real_coords)  # type: ignore
    min_distances = np.min(distances, axis=1)
    closest_well_indices = np.argmin(distances, axis=1)

    # A candidate is excluded when its nearest real well is within the radius AND
    # the predicted thickness differs from that well's thickness by more than the limit.
    # This is the vectorized form of the former per-row loop.
    thickness_diff = np.abs(pseudo_thickness - real_thickness[closest_well_indices])
    exclude_mask = (min_distances <= proximity_radius) & (thickness_diff > max_thickness_diff)
    excluded_count = int(exclude_mask.sum())

    layer1_filtered = pseudo_wells_data[~exclude_mask].copy().reset_index(drop=True)
    print(f"第一层筛选完成：排除了 {excluded_count} 个点，剩余 {len(layer1_filtered)} 个点")

    # === Layer 2: greedy selection with a minimum spacing between pseudo wells ===
    print("\n第二层筛选：基于距离的贪心选择...")

    min_pseudo_distance = 200  # minimum distance between pseudo wells (original comment gives metres)

    # Pairwise distances between the remaining candidates
    layer1_coords = layer1_filtered[["X", "Y"]].values
    pseudo_distances = cdist(layer1_coords, layer1_coords)

    # Visit the candidates in ascending order of predicted thickness
    thickness_values = layer1_filtered["Predicted_Sand_Thickness"].values
    thickness_order = np.argsort(thickness_values)  # type: ignore

    selected_indices = []
    for idx in thickness_order:
        # Keep the candidate unless it is closer than the minimum spacing to one already kept
        too_close = bool(selected_indices) and bool(
            (pseudo_distances[idx, selected_indices] < min_pseudo_distance).any()
        )
        if not too_close:
            selected_indices.append(idx)

    layer2_filtered = layer1_filtered.iloc[selected_indices].copy().reset_index(drop=True)
    print(f"第二层筛选完成：选择了 {len(layer2_filtered)} 个距离合适的点")

    # === Layer 3: balance the selection across sand-thickness bins ===
    print("\n第三层筛选：基于砂厚分布的均衡选择...")

    # Half-open bins [lower, upper) over the predicted thickness
    thickness_bins = [0, 1.5, 10, 20, np.inf]
    bin_labels = ["0-1.5m", "1.5-10m", "10-20m", ">20m"]
    bin_edges = list(zip(thickness_bins[:-1], thickness_bins[1:], bin_labels))

    # Maximum number of pseudo wells kept per bin
    max_samples_per_bin = 20

    final_selected_indices = []
    layer2_thickness = layer2_filtered["Predicted_Sand_Thickness"]
    for lower, upper, label in bin_edges:
        bin_mask = (layer2_thickness >= lower) & (layer2_thickness < upper)
        bin_indices = layer2_filtered.index[bin_mask].tolist()

        if not bin_indices:
            print(f"  区间 {label}: 无可用样本")
            continue

        # Over-full bins are randomly subsampled without replacement.
        # NOTE: np.random.choice uses the global NumPy random state, so the
        # selection varies between runs unless that state is seeded elsewhere.
        if len(bin_indices) > max_samples_per_bin:
            selected_bin_indices = np.random.choice(bin_indices, max_samples_per_bin, replace=False).tolist()
        else:
            selected_bin_indices = bin_indices

        final_selected_indices.extend(selected_bin_indices)
        print(f"  区间 {label}: 从 {len(bin_indices)} 个中选择了 {len(selected_bin_indices)} 个")

    # Final optimized pseudo-well table
    optimized_pseudo_wells = layer2_filtered.loc[final_selected_indices].copy().reset_index(drop=True)

    print("\n虚拟井优化筛选结果:")
    print(f"  原始虚拟井数量: {len(pseudo_wells_data)}")
    print(f"  第一层筛选后: {len(layer1_filtered)}")
    print(f"  第二层筛选后: {len(layer2_filtered)}")
    print(f"  最终优化数量: {len(optimized_pseudo_wells)}")

    # Save the optimized pseudo wells
    optimized_pseudo_wells.to_csv(os.path.join(data_tmp_dir, "optimized_pseudo_wells.csv"), index=False)

    # Final distribution of the pseudo wells across the thickness bins
    print("\n最终砂厚分布:")
    final_thickness = optimized_pseudo_wells["Predicted_Sand_Thickness"]
    for lower, upper, label in bin_edges:
        bin_mask = (final_thickness >= lower) & (final_thickness < upper)
        bin_count = bin_mask.sum()
        # Guard against division by zero when nothing was selected
        bin_percent = bin_count / len(optimized_pseudo_wells) * 100 if len(optimized_pseudo_wells) > 0 else 0
        print(f"  {label}: {bin_count} 个 ({bin_percent:.1f}%)")

else:
    print("Sigmoid模型拟合失败，无法进行虚拟井优化")

In [None]:
# Display and analyse the pseudo wells
print("=== 虚拟井展示和分析 ===")

if best_fit["success"]:
    # 1. Predict sand thickness for every point of the reduced seismic area
    print("为整个地震数据区域预测砂厚...")

    # Take the attribute columns of the seismic data (same columns as pca_results["features_clean"])
    seismic_features = seismic_attr_filtered[pca_results["features_clean"].columns].values

    # Transform the seismic data with the same scaler and PCA model
    seismic_features_scaled = pca_results["scaler"].transform(seismic_features)
    seismic_pca_features = pca_results["pca"].transform(seismic_features_scaled)

    # Build the PCA feature table, keeping as many leading components as the best model uses
    seismic_pca_df = pd.DataFrame()
    max_components = len(best_fit["use_features"])
    for i in range(max_components):
        seismic_pca_df[f"PC{i + 1}"] = seismic_pca_features[:, i]

    # Predict sand thickness over the whole reduced area with the best sigmoid model
    seismic_predicted_thickness = sigmoid_model.predict(
        seismic_pca_df, use_features=best_fit["use_features"], feature_weights=best_fit.get("feature_weights")
    )

    # Replace negative predictions with 0
    seismic_predicted_thickness = np.maximum(seismic_predicted_thickness, 0)

    # Attach the prediction to a copy of the seismic data
    seismic_attr_filtered_with_pred = seismic_attr_filtered.copy()
    seismic_attr_filtered_with_pred["Predicted_Sand_Thickness"] = seismic_predicted_thickness

    print(f"整个地震数据区域预测完成，预测点数: {len(seismic_attr_filtered_with_pred)}")

    # 2. Visualize using the optimized pseudo wells
    print("生成虚拟井砂厚分布图...")

    # Real well data
    real_wells = data_well_purpose_surface_filtered.copy()

    # Use the optimized pseudo wells produced by the selection cell directly
    pseudo_wells = optimized_pseudo_wells.copy()
    print(f"使用优化后的虚拟井数据: {len(pseudo_wells)} 个")

    # Map of predicted sand thickness with real and pseudo wells overlaid
    visualize_attribute_map(
        data_points=seismic_attr_filtered_with_pred,
        attribute_name="Predicted_Sand_Thickness",
        attribute_label="预测砂厚 (m)",
        real_wells=real_wells,
        pseudo_wells=pseudo_wells,
        target_column="Sand Thickness",
        output_dir=output_dir,
        filename_prefix="pseudo_wells_optimized",
        class_thresholds=[1.5, 10],
        figsize=(16, 14),
        dpi=300,
        cmap="viridis",
        point_size=140,
        well_size=200,
    )

    # 3. 创建真实井和虚拟井砂厚分布对比
#     print("创建真实井和虚拟井砂厚分布对比...")

#     # 提取真实井和虚拟井的砂厚数据
#     real_thickness = real_wells["Sand Thickness"].values
#     pseudo_thickness = pseudo_wells["Predicted_Sand_Thickness"].values

#     # 设置砂厚区间
#     max_thickness = max(np.max(real_thickness), np.max(pseudo_thickness))  # type: ignore
#     thickness_bins = [0, 1.5, 10, 20, max_thickness + 1]
#     thickness_labels = ["0-1.5", "1.5-10", "10-20", f">20"]

#     # 计算各区间的井点数量
#     real_hist, _ = np.histogram(real_thickness, bins=thickness_bins)  # type: ignore
#     pseudo_hist, _ = np.histogram(pseudo_thickness, bins=thickness_bins)  # type: ignore

#     # 计算百分比
#     real_percent = real_hist / len(real_thickness) * 100
#     pseudo_percent = pseudo_hist / len(pseudo_thickness) * 100

#     # 创建直方图
#     plt.figure(figsize=(12, 8))

#     # 设置柱状图位置
#     bar_width = 0.35
#     r1 = np.arange(len(thickness_labels))
#     r2 = [x + bar_width for x in r1]

#     # 绘制真实井砂厚分布
#     plt.bar(r1, real_percent, width=bar_width, color="crimson", alpha=0.7, label="真实井点砂厚")

#     # 绘制虚拟井砂厚分布
#     plt.bar(r2, pseudo_percent, width=bar_width, color="royalblue", alpha=0.7, label="虚拟井点砂厚")

#     # 添加数据标签
#     for i, v in enumerate(real_percent):
#         plt.text(r1[i], v + 1, f"{v:.1f}%", ha="center", va="bottom", fontweight="bold", color="crimson")

#     for i, v in enumerate(pseudo_percent):
#         plt.text(r2[i], v + 1, f"{v:.1f}%", ha="center", va="bottom", fontweight="bold", color="royalblue")

#     # 设置图表属性
#     plt.xlabel("砂厚区间(米)", fontsize=14)
#     plt.ylabel("百分比(%)", fontsize=14)
#     plt.title("真实井点与虚拟井点砂厚分布对比", fontsize=16)
#     plt.xticks([r + bar_width / 2 for r in range(len(thickness_labels))], thickness_labels)
#     plt.ylim(0, max(max(real_percent), max(pseudo_percent)) * 1.2)

#     # 添加图例和网格
#     plt.legend(loc="upper right", fontsize=12)
#     plt.grid(True, linestyle="--", alpha=0.3)

#     # 保存图表
#     plt.tight_layout()
#     plt.savefig(os.path.join(output_dir, "real_vs_pseudo_thickness_histogram.png"), dpi=300, bbox_inches="tight")
#     plt.show()

#     print("虚拟井分析完成，所有结果已保存到输出目录")
#     print(f"  - 虚拟井空间分布图: pseudo_wells_optimized_map_with_wells.png")
#     print(f"  - 砂厚分布对比图: real_vs_pseudo_thickness_histogram.png")
#     print(f"  - 虚拟井数据: optimized_pseudo_wells.csv")

# else:
#     print("Sigmoid模型拟合失败，无法进行虚拟井分析")

## Bonus：使用当前模型进行全区范围砂厚预测

In [None]:
# Bonus: predict sand thickness over the whole survey area with the fitted model.
#
# Reads names defined by earlier cells: best_fit, best_model_name, sigmoid_model,
# sigmoid_data, pca_results, processed_seismic_full, data_well_purpose_surface_position,
# output_dir. Writes one CSV and one PNG into output_dir (plus whatever
# visualize_attribute_map saves) and leaves the full_region_* variables in the
# notebook namespace.
print("=== Bonus: 全区范围砂厚预测 ===")

if best_fit["success"]:
    # 1. Project the full-region seismic data into the previously fitted PCA space
    print("为全区地震数据提取PCA特征...")

    # Select the same feature columns that pca_results["features_clean"] holds
    full_region_features = processed_seismic_full[pca_results["features_clean"].columns].values

    # Reuse the already-fitted scaler and PCA model (transform only, no re-fit)
    full_region_features_scaled = pca_results["scaler"].transform(full_region_features)
    full_region_pca_features = pca_results["pca"].transform(full_region_features_scaled)

    # Build the PC1..PCk frame in a single constructor call.
    # NOTE(review): this assumes best_fit["use_features"] names exactly the first
    # k principal components (PC1..PCk); confirm against the model-selection cell.
    max_components = len(best_fit["use_features"])
    full_region_pca_df = pd.DataFrame(
        full_region_pca_features[:, :max_components],
        columns=[f"PC{i + 1}" for i in range(max_components)],
    )

    print(f"全区地震数据点数: {len(processed_seismic_full)}")

    # 2. Predict sand thickness for every seismic sample with the Sigmoid model
    print("使用Sigmoid模型预测全区砂厚...")

    full_region_predicted_thickness = sigmoid_model.predict(
        full_region_pca_df, use_features=best_fit["use_features"], feature_weights=best_fit.get("feature_weights")
    )

    # Clamp negative predictions to zero
    full_region_predicted_thickness = np.maximum(full_region_predicted_thickness, 0)
    n_pred = len(full_region_predicted_thickness)

    # 3. Assemble the result table: coordinates, prediction and model metadata
    full_region_results = processed_seismic_full[["X", "Y"]].copy()
    full_region_results["Predicted_Sand_Thickness"] = full_region_predicted_thickness
    full_region_results["Model_Type"] = best_model_name
    full_region_results["Model_R2"] = best_fit["r2_score"]

    # 4. Persist the full-region prediction
    full_region_results.to_csv(os.path.join(output_dir, "full_region_predicted_sand_thickness.csv"), index=False)
    print("全区预测结果已保存: full_region_predicted_sand_thickness.csv")

    # 5. Print summary statistics of the prediction
    print("\n全区预测统计:")
    print(f"  预测点数: {len(full_region_results):,}")
    print(
        f"  预测砂厚范围: {full_region_predicted_thickness.min():.2f} - {full_region_predicted_thickness.max():.2f} m"
    )
    print(f"  预测砂厚均值: {full_region_predicted_thickness.mean():.2f} m")
    print(f"  预测砂厚标准差: {full_region_predicted_thickness.std():.2f} m")

    # 6. Map of the predicted thickness
    print("生成全区砂厚分布图...")

    # Use every well of the target surface, including the outlier wells that
    # were filtered out before training
    all_wells = data_well_purpose_surface_position.copy()
    # Measured thickness with missing values removed; computed once so that the
    # plotted data, the legend count and the percentage denominator all agree
    well_thickness = all_wells["Sand Thickness"].dropna()

    # Plot the full-region prediction together with the real wells
    visualize_attribute_map(
        data_points=full_region_results,
        attribute_name="Predicted_Sand_Thickness",
        attribute_label="预测砂厚 (m)",
        real_wells=all_wells,
        pseudo_wells=None,
        target_column="Sand Thickness",
        output_dir=output_dir,
        filename_prefix="full_region_prediction",
        class_thresholds=[1, 10],
        figsize=(16, 14),
        dpi=300,
        cmap="viridis",
        point_size=50,
        well_size=150,
    )

    # 7. Four-panel comparison figure
    plt.figure(figsize=(15, 10))

    # Panel 1: histogram of the full-region prediction
    plt.subplot(2, 2, 1)
    plt.hist(full_region_predicted_thickness, bins=100, alpha=0.7, color="skyblue", edgecolor="black")
    plt.xlabel("预测砂厚 (m)")
    plt.ylabel("频数")
    plt.title("全区预测砂厚分布")
    plt.grid(True, alpha=0.3)

    # Statistics box drawn in axes coordinates
    stats_text = f"总点数: {n_pred:,}\n"
    stats_text += f"均值: {full_region_predicted_thickness.mean():.2f} m\n"
    stats_text += f"标准差: {full_region_predicted_thickness.std():.2f} m\n"
    stats_text += f"最大值: {full_region_predicted_thickness.max():.2f} m"

    plt.text(
        0.6,
        0.8,
        stats_text,
        transform=plt.gca().transAxes,
        bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8),
    )

    # Panel 2: measured well thickness vs. full-region prediction
    # NOTE(review): both histograms share one raw-count axis; if the prediction
    # has far more samples than there are wells, the well bars may be hard to
    # see. Consider density=True if that turns out to be the case.
    plt.subplot(2, 2, 2)
    plt.hist(
        well_thickness,
        bins=20,
        alpha=0.7,
        color="orange",
        label=f"井点砂厚 (n={len(well_thickness)})",
    )
    plt.hist(
        full_region_predicted_thickness,
        bins=100,
        alpha=0.5,
        color="skyblue",
        label=f"全区预测 (n={n_pred:,})",
    )
    plt.xlabel("砂厚 (m)")
    plt.ylabel("频数")
    plt.title("井点 vs 全区砂厚分布对比")
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 3: share of samples per thickness class
    plt.subplot(2, 2, 3)
    thickness_bins = [0, 1, 10, 20, np.inf]
    thickness_labels = ["0-1m", "1-10m", "10-20m", ">20m"]

    # Counts per class
    full_region_hist, _ = np.histogram(full_region_predicted_thickness, bins=thickness_bins)
    well_hist, _ = np.histogram(well_thickness, bins=thickness_bins)

    # Convert counts to percentages of each population
    full_region_percent = full_region_hist / n_pred * 100
    well_percent = well_hist / len(well_thickness) * 100

    x = np.arange(len(thickness_labels))
    width = 0.35

    plt.bar(x - width / 2, well_percent, width, label="井点分布", color="orange", alpha=0.7)
    plt.bar(x + width / 2, full_region_percent, width, label="全区预测", color="skyblue", alpha=0.7)

    plt.xlabel("砂厚区间")
    plt.ylabel("百分比 (%)")
    plt.title("砂厚分级统计对比")
    plt.xticks(x, thickness_labels)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 4: text summary of the model and the prediction
    plt.subplot(2, 2, 4)
    plt.axis("off")

    # Number of predicted samples above the 10 m threshold shown in the summary
    favorable_count = (full_region_predicted_thickness > 10).sum()

    summary_text = f"""
全区砂厚预测摘要

模型信息:
1. 最佳模型: {best_model_name}
2. R^2 评分: {best_fit["r2_score"]:.4f}
3. 使用特征: {", ".join(best_fit["use_features"])}

预测区域:
1. 全区点数: {n_pred:,}
2. 工区范围: X({processed_seismic_full["X"].min():.0f}-{processed_seismic_full["X"].max():.0f}),
           Y({processed_seismic_full["Y"].min():.0f}-{processed_seismic_full["Y"].max():.0f})

预测结果:
1. 砂厚范围: {full_region_predicted_thickness.min():.2f} - {full_region_predicted_thickness.max():.2f} m
2. 平均砂厚: {full_region_predicted_thickness.mean():.2f} m
3. 有利区域(>10m): {favorable_count:,} 个点
  ({favorable_count / n_pred * 100:.1f}%)

训练数据对比:
1. 井点数量: {len(all_wells)}
2. 井点砂厚均值: {all_wells["Sand Thickness"].mean():.2f} m
3. 预测-实际相关性: 基于{len(sigmoid_data)}个有效井点训练
    """

    plt.text(
        0.05,
        0.95,
        summary_text,
        transform=plt.gca().transAxes,
        fontsize=10,
        verticalalignment="top",
        bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.8),
    )

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "full_region_prediction_analysis.png"), dpi=300, bbox_inches="tight")
    plt.show()

    print("\n=== 全区预测完成 ===")
    print("输出文件:")
    print("  • 预测数据: full_region_predicted_sand_thickness.csv")
    print("  • 空间分布图: full_region_prediction_map_with_wells.png")
    print("  • 属性直方图: full_region_prediction_attribute_histogram.png")
    print("  • 详细分析图: full_region_prediction_analysis.png")

else:
    # The Sigmoid fit did not succeed, so there is no model to predict with
    print("Sigmoid模型拟合失败，无法进行全区预测")