# PCA + GMM 再测试


In [None]:
# 确保src目录在Python路径中
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

warnings.filterwarnings("ignore")
sys.path.append(os.path.abspath("../"))

# 导入模块
from src.data_utils import (
    extract_seismic_attributes_for_wells,
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    filter_outlier_wells,
    filter_seismic_by_wells,
    identify_attributes,
    parse_petrel_file,
    preprocess_features,
)
from src.feature_selection import select_best_features
from src.gmm_clustering import evaluate_gmm_clusters, perform_gmm_clustering
from src.pca_analysis import perform_pca_analysis
from src.visualization import visualize_attribute_map, visualize_gmm_clustering, visualize_pca_clustering

# Build the data path with os.path.join so the notebook also runs on POSIX
# systems; a literal "..\\data" is only a valid relative path on Windows.
data_dir = os.path.join("..", "data")
output_dir = "output"
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and re-running the cell is a no-op.
os.makedirs(output_dir, exist_ok=True)


# Configure matplotlib so the Chinese titles and labels used below can render
plt.rcParams["font.family"] = "SimHei"  # SimHei is a typeface that supports Chinese glyphs
plt.rcParams["axes.unicode_minus"] = False  # display the minus sign correctly

## 导入地震数据


In [None]:
# Location of the seismic attribute file: the entry named "6.2" under data_dir
data_seismic_url = os.path.join(data_dir, "6.2")

# Parse the file with the project helper; the result is later indexed with
# ["X", "Y"] and copied, i.e. it is used like a pandas DataFrame.
data_seismic_attr = parse_petrel_file(data_seismic_url)

## 导入井点位置


In [None]:
# Load the well table from Excel
data_well_position = pd.read_excel(os.path.join(data_dir, "well_without_attr.xlsx"))

# Keep the rows of the target surface and discard rows whose sand thickness is NaN
data_well_purpose_surface_position = (
    data_well_position[data_well_position["Surface"] == "H6-2"]
    .replace(-999, np.nan)  # -999 is replaced by NaN (applies to every column)
    .dropna(subset=["Sand Thickness"])  # drop rows whose sand thickness is NaN
    .reset_index(drop=True)  # reset the index after filtering
)
data_well_purpose_surface_position.head()

## 筛除离群井


In [None]:
# Remove outlier wells; method="iqr" selects the detector inside filter_outlier_wells
data_well_purpose_surface_filtered = filter_outlier_wells(data_well_purpose_surface_position, method="iqr")

# Report the number of wells before and after filtering
print(f"筛选前井点数量: {len(data_well_purpose_surface_position)}")
print(f"筛选后井点数量: {len(data_well_purpose_surface_filtered)}")

# Axis limits are derived from ALL wells (unfiltered) so both panels share one extent
x_min, x_max = (
    data_well_purpose_surface_position["X"].min(),
    data_well_purpose_surface_position["X"].max(),
)
y_min, y_max = (
    data_well_purpose_surface_position["Y"].min(),
    data_well_purpose_surface_position["Y"].max(),
)

# Pad the extent by 5% of the coordinate range on each side
margin = 0.05
x_range = x_max - x_min
y_range = y_max - y_min
x_min, x_max = x_min - x_range * margin, x_max + x_range * margin
y_min, y_max = y_min - y_range * margin, y_max + y_range * margin

# Side-by-side scatter plots: left = before filtering, right = after filtering
plt.figure(figsize=(12, 6))
panels = (
    (data_well_purpose_surface_position, "blue", "筛选前井点分布"),
    (data_well_purpose_surface_filtered, "red", "筛选后井点分布"),
)
for panel_number, (panel_wells, panel_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.scatter(panel_wells["X"], panel_wells["Y"], c=panel_color)
    plt.title(panel_title)
    plt.xlabel("X坐标")
    plt.ylabel("Y坐标")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "well_filtering_comparison.png"))
plt.show()

## 处理属性缺失值


In [None]:
# Attribute names are read from the seismic file; the second return value is unused here
attribute_names, _ = identify_attributes(data_seismic_url)

# Clean the seismic attribute columns with the project helper
processed_seismic, attr_stats = preprocess_features(
    data=data_seismic_attr,
    attribute_columns=attribute_names,
    missing_values=[-999],
    missing_threshold=0.6,  # per the original note: columns with more than 60% missing values are dropped
    outlier_method="iqr",
    outlier_threshold=1.5,
    verbose=True,
)

# Names of the columns that survived preprocessing
attribute_names_filtered = list(processed_seismic.columns)

# Re-attach the cleaned attributes to the original X/Y coordinates, one column at a time
processed_seismic_full = data_seismic_attr[["X", "Y"]].copy()
for col in attribute_names_filtered:
    processed_seismic_full[col] = processed_seismic[col]

## 根据井点分布，缩小工区范围


In [None]:
# Restrict the survey area to the neighbourhood of the outlier-filtered wells
seismic_attr_filtered, area_bounds = filter_seismic_by_wells(
    seismic_data=processed_seismic_full,
    well_data=data_well_purpose_surface_filtered,
    expansion_factor=1.5,  # original note: "expand by 50%" -- TODO confirm how the helper interprets this factor
    plot=True,
    output_dir=output_dir,
)

# area_bounds is iterated as a mapping; its boundary entries can be reused later
print("区域边界信息:")
for key, value in area_bounds.items():
    print(f"  {key}: {value}")

## 提取井点处地震属性


In [None]:
# Extract seismic attributes for the wells BEFORE outlier filtering
well_attr = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_position,
    seismic_data=processed_seismic_full,
    max_distance=50,
    num_points=5,
)

# Extract seismic attributes for the wells AFTER outlier filtering (same settings)
well_attr_filtered = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_filtered, seismic_data=processed_seismic_full, max_distance=50, num_points=5
)

# Save both tables; note they are written into data_dir, not output_dir
well_attr.to_excel(os.path.join(data_dir, "wells_attr.xlsx"), index=False)
print("筛选前井点的地震属性已保存到 wells_attr.xlsx")
well_attr_filtered.to_excel(os.path.join(data_dir, "wells_attr_filtered.xlsx"), index=False)
print("筛选后井点的地震属性已保存到 wells_attr_filtered.xlsx")

## 生成统计摘要


In [None]:
# Select the attributes of good quality by comparing the seismic samples
# with the values extracted at the (filtered) wells
good_attributes, anomalous_attributes, attribute_stats = filter_anomalous_attributes(
    seismic_data=seismic_attr_filtered,
    well_data=well_attr_filtered,
    common_attributes=attribute_names_filtered,
    ratio_threshold=5.0,  # threshold on the ratio of means
    range_ratio_threshold=10.0,  # threshold on the ratio of value ranges
    std_ratio_threshold=10.0,  # threshold on the ratio of standard deviations
    output_dir=None,  # directory for output plots (None is passed here)
    verbose=True,  # print detailed information
)

# List the attributes that were kept
print("\n筛选后保留的质量良好属性:")
for attr in good_attributes:
    print(f"- {attr}")

## PCA 降维


In [None]:
# Run PCA on the retained attributes of the area-restricted seismic data.
# The returned mapping is read below through the keys "features_pca",
# "coords_clean", "features_clean", "scaler" and "pca".
pca_results = perform_pca_analysis(
    data=seismic_attr_filtered,
    attribute_columns=good_attributes,
    variance_threshold=0.75,  # presumably the cumulative explained-variance target -- confirm in perform_pca_analysis
    output_dir=output_dir,
)

## GMM 聚类


In [None]:
# 首先评估最佳聚类数
# gmm_evaluation = evaluate_gmm_clusters(features_pca=pca_results["features_pca"], max_clusters=10, output_dir=output_dir)

# 使用不同的聚类数执行GMM聚类
# 根据BIC/AIC结果选择的最佳聚类数
# best_n = gmm_evaluation["best_n_components"]

In [None]:
best_n = 3  # number of clusters (fixed by hand; the BIC/AIC evaluation cell above is commented out)

# 1. Run GMM clustering on the PCA features
gmm_results = perform_gmm_clustering(
    features=pca_results["features_pca"],
    coords=pca_results["coords_clean"],
    n_clusters=best_n,
)
gmm_results["result_df"].to_csv(os.path.join(output_dir, "gmm_best_clusters.csv"), index=False)

# 2. PCA visualisation requires the well data projected into PCA space
# First take the attribute columns of the wells (same columns the PCA was fitted on)
well_features = well_attr_filtered[pca_results["features_clean"].columns].values
# Transform the well data with the same scaler and PCA model
well_features_scaled = pca_results["scaler"].transform(well_features)
well_pca_features = pca_results["pca"].transform(well_features_scaled)

# 3. Visualise the clustering result in PCA space
visualize_pca_clustering(
    clustering_results=gmm_results,
    pca_results=pca_results,
    n_clusters=best_n,
    output_dir=output_dir,
    prefix="pca",
    well_data=data_well_purpose_surface_filtered,
    well_pca_features=well_pca_features,
    target_column="Sand Thickness",
    class_thresholds=[0.1, 10],
)

# 4. Visualise the clustering result in geographic space
visualize_gmm_clustering(
    clustering_results=gmm_results,
    output_dir=output_dir,
    prefix="pca",
    well_data=data_well_purpose_surface_filtered,
    target_column="Sand Thickness",
    class_thresholds=[0.1, 10],
    point_size=120,
    well_size=200,
)

## 依靠 PCA 进行 sigmoid 拟合

In [None]:
def visualize_feature_distribution(
    data,
    x_feature,
    y_feature,
    figsize=(10, 6),
    point_size=50,
    alpha=0.6,
    colormap="viridis",
    title=None,
    save_path=None,
):
    """
    Scatter one column against another, colouring the points by the y column.

    Parameters:
    -----------
    data : pd.DataFrame
        DataFrame holding the feature columns
    x_feature : str
        Column plotted on the x axis
    y_feature : str
        Column plotted on the y axis; it also drives the colour mapping
    figsize : tuple, default=(10, 6)
        Figure size
    point_size : int, default=50
        Marker size
    alpha : float, default=0.6
        Marker transparency
    colormap : str, default="viridis"
        Name of the colormap
    title : str, optional
        Plot title; generated from the two column names when None
    save_path : str, optional
        When truthy, the figure is saved there (dpi=300, tight bounding box)

    Returns:
    --------
    matplotlib.figure.Figure : the figure that was drawn
    """
    x_values = data[x_feature]
    y_values = data[y_feature]

    fig, ax = plt.subplots(figsize=figsize)

    # Scatter plot whose colour encodes the y column
    points = ax.scatter(
        x_values,
        y_values,
        c=y_values,
        s=point_size,
        alpha=alpha,
        cmap=colormap,
        edgecolors="black",
        linewidth=0.5,
    )
    fig.colorbar(points, ax=ax, label=f"{y_feature}")

    ax.set_xlabel(f"{x_feature}")
    ax.set_ylabel(f"{y_feature}")
    ax.set_title(title if title is not None else f"特征分布: {x_feature} vs {y_feature}")
    ax.grid(True, alpha=0.3)

    # Summary box in the top-left corner: sample count and the two value ranges
    summary_lines = [
        f"样本数: {len(data)}",
        f"{y_feature}范围: {y_values.min():.2f} - {y_values.max():.2f}",
        f"{x_feature}范围: {x_values.min():.2f} - {x_values.max():.2f}",
    ]
    ax.text(
        0.02,
        0.98,
        "\n".join(summary_lines),
        transform=ax.transAxes,
        verticalalignment="top",
        bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8),
    )

    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")

    return fig


In [None]:
class SigmoidModel:
    """
    智能Sigmoid拟合模型

    支持自动检测PC值与地质类型关系，智能添加虚拟点稳定拟合过程。

    Attributes:
    -----------
    data : pd.DataFrame
        原始输入数据
    feature_columns : list
        特征列名列表
    target_column : str
        目标变量列名
    fit_params : np.array or None
        拟合参数 [L, k, x0]
    r2_score : float or None
        模型R²评分
    current_data : pd.DataFrame or None
        包含虚拟点的当前工作数据
    """

    def __init__(self, data, feature_columns, target_column):
        """
        初始化Sigmoid模型

        Parameters:
        -----------
        data : pd.DataFrame
            输入数据，必须包含特征列和目标列
        feature_columns : list
            特征列名列表，通常为PCA组件['PC1', 'PC2', ...]
        target_column : str
            目标变量列名，如'Sand Thickness'

        Raises:
        -------
        ValueError
            当数据中缺少必要列时抛出异常
        """
        self.data = data.copy()
        self.feature_columns = feature_columns
        self.target_column = target_column
        self.fit_params = None
        self.r2_score = None
        self.current_data = None  # 添加虚拟点后的数据

        # 检查必要的列是否存在
        missing_cols = [col for col in feature_columns + [target_column] if col not in data.columns]
        if missing_cols:
            raise ValueError(f"数据中缺少以下列: {missing_cols}")

    @staticmethod
    def sigmoid(x, L, k, x0):
        """
        标准三参数Sigmoid函数

        Parameters:
        -----------
        x : array-like
            输入变量
        L : float
            最大渐近值，表示砂厚的理论上限
        k : float
            增长率，正值表示正向增长，负值表示负向增长
        x0 : float
            中点位置，Sigmoid函数的拐点

        Returns:
        --------
        array-like
            Sigmoid函数值，范围在[0, L]之间

        Notes:
        ------
        函数形式: f(x) = L / (1 + exp(-k * (x - x0)))
        """
        return L / (1 + np.exp(-k * (x - x0)))

    def auto_detect_pc_geology_relationship(self, primary_feature="PC1", threshold_percentile=25):
        """
        自动检测PC值与地质类型的关系

        通过分析PC值的分布与砂厚的关系，自动判断低PC值和高PC值分别对应
        泥岩还是砂岩，避免虚拟点添加错误。

        Parameters:
        -----------
        primary_feature : str, default="PC1"
            用于分析的主要特征名称
        threshold_percentile : float, default=25
            用于划分低值和高值区间的百分位数阈值

        Returns:
        --------
        dict
            包含关系映射的字典
            - 'low_pc_type': str, 低PC值对应的地质类型 ('mud' 或 'sand')
            - 'high_pc_type': str, 高PC值对应的地质类型 ('mud' 或 'sand')
            - 'low_threshold': float, 低值区间阈值
            - 'high_threshold': float, 高值区间阈值
            - 'low_avg_thickness': float, 低PC值区间平均砂厚
            - 'high_avg_thickness': float, 高PC值区间平均砂厚

        Notes:
        ------
        分析逻辑：
        1. 计算指定百分位数的PC值阈值
        2. 比较低PC值区间和高PC值区间的平均砂厚
        3. 砂厚较小的区间判定为泥岩，砂厚较大的区间判定为砂岩
        """
        # 计算低PC1值和高PC1值区间的平均砂厚
        pc_values = self.data[primary_feature]
        sand_thickness = self.data[self.target_column]

        low_threshold = np.percentile(pc_values, threshold_percentile)
        high_threshold = np.percentile(pc_values, 100 - threshold_percentile)

        # 低PC1区间的平均砂厚
        low_pc_mask = pc_values <= low_threshold
        low_pc_avg_thickness = sand_thickness[low_pc_mask].mean()

        # 高PC1区间的平均砂厚
        high_pc_mask = pc_values >= high_threshold
        high_pc_avg_thickness = sand_thickness[high_pc_mask].mean()

        print(f"PC值与地质类型关系分析({primary_feature}):")
        print(f"  低PC值区间({primary_feature} ≤ {low_threshold:.2f}): 平均砂厚 {low_pc_avg_thickness:.2f}m")
        print(f"  高PC值区间({primary_feature} ≥ {high_threshold:.2f}): 平均砂厚 {high_pc_avg_thickness:.2f}m")

        # 判断关系
        if low_pc_avg_thickness < high_pc_avg_thickness:
            # 标准关系：低PC1=泥岩，高PC1=砂岩
            relationship = {"low_pc_type": "mud", "high_pc_type": "sand", "relationship_type": "standard"}
            print("  → 检测到标准关系：低PC值=泥岩，高PC值=砂岩")
        else:
            # 反向关系：低PC1=砂岩，高PC1=泥岩
            relationship = {"low_pc_type": "sand", "high_pc_type": "mud", "relationship_type": "reversed"}
            print("  → 检测到反向关系：低PC值=砂岩，高PC值=泥岩")

        # 添加统计信息
        relationship.update(
            {
                "low_threshold": low_threshold,
                "high_threshold": high_threshold,
                "low_avg_thickness": low_pc_avg_thickness,
                "high_avg_thickness": high_pc_avg_thickness,
            }
        )

        return relationship

    def add_virtual_points_smart(
        self,
        mud_range=None,  # 手动指定泥岩区间 (start, end)
        sand_range=None,  # 手动指定砂岩区间 (start, end)
        n_points=20,
        noise_factor=0.1,
        auto_detect=True,
        primary_feature=None,
        placement_strategy="conservative",  # "conservative" 或 "extended"
    ):
        """
        智能添加虚拟点，支持手动设置和自动策略

        Parameters:
        -----------
        mud_range : tuple or None, default=None
            手动指定泥岩虚拟点范围 (start, end)
            例如: (-2, 0) 表示在PC1值-2到0之间添加泥岩虚拟点
        sand_range : tuple or None, default=None
            手动指定砂岩虚拟点范围 (start, end)
            例如: (2, 4) 表示在PC1值2到4之间添加砂岩虚拟点
        n_points : int, default=20
            每个区间生成的虚拟点数量
        noise_factor : float, default=0.1
            噪音因子，用于为虚拟点添加随机变化
        auto_detect : bool, default=True
            是否自动检测PC值与地质类型的关系
        primary_feature : str or None, default=None
            用于添加虚拟点的主要特征，如果为None则使用第一个特征
        placement_strategy : str, default="conservative"
            自动放置策略（仅在未手动指定范围时生效）:
            - "conservative": 在数据范围内侧保守放置（推荐）
            - "extended": 在数据范围外侧延伸放置

        Returns:
        --------
        tuple
            (enhanced_data, pc_geology_relationship)
            - enhanced_data: pd.DataFrame, 包含虚拟点的增强数据集
            - pc_geology_relationship: dict, PC值与地质类型的关系信息

        Examples:
        ---------
        # 手动指定泥岩区间
        data, relationship = model.add_virtual_points_smart(mud_range=(-5, 0), n_points=10)

        # 手动指定砂岩和泥岩区间
        data, relationship = model.add_virtual_points_smart(
            mud_range=(-2, 0), sand_range=(2, 5), n_points=10
        )

        # 使用保守的自动策略
        data, relationship = model.add_virtual_points_smart(
            placement_strategy="conservative", n_points=15
        )
        """

        if primary_feature is None:
            primary_feature = self.feature_columns[0]

        # 自动检测PC值与地质类型的关系
        if auto_detect:
            pc_geology_relationship = self.auto_detect_pc_geology_relationship(primary_feature)
        else:
            # 使用默认关系
            pc_geology_relationship = {"low_pc_type": "mud", "high_pc_type": "sand", "relationship_type": "default"}
            print(f"使用默认PC值关系：低PC值=泥岩，高PC值=砂岩")

        feature_min = self.data[primary_feature].min()
        feature_max = self.data[primary_feature].max()
        feature_range = feature_max - feature_min
        max_target = self.data[self.target_column].max()

        virtual_data = []

        print(f"虚拟点生成配置:")
        print(f"  主要特征: {primary_feature}")
        print(f"  数据范围: [{feature_min:.2f}, {feature_max:.2f}]")
        print(f"  每个区间点数: {n_points}")
        print(f"  噪音因子: {noise_factor}")

        # === 处理泥岩虚拟点 ===
        if mud_range is not None:
            # 手动指定泥岩区间
            print(f"  手动设置泥岩虚拟点范围: {mud_range}")
            mud_start, mud_end = mud_range

            # 生成泥岩虚拟点
            mud_x_values = np.linspace(mud_start, mud_end, n_points)
            for x_val in mud_x_values:
                virtual_point = {col: 0 for col in self.feature_columns}
                virtual_point[primary_feature] = x_val
                virtual_point[self.target_column] = abs(np.random.normal(0, noise_factor))
                virtual_point["is_virtual"] = True
                virtual_point["virtual_type"] = "mud"
                virtual_data.append(virtual_point)

        else:
            # 自动策略：根据PC-地质关系自动设置泥岩区间
            if placement_strategy == "conservative":
                # 保守策略：在数据范围内侧放置
                margin = feature_range * 0.15  # 15%的内缩边距

                if pc_geology_relationship["low_pc_type"] == "mud":
                    # 低PC值对应泥岩：在最小值右侧设置泥岩虚拟点
                    mud_start = feature_min
                    mud_end = feature_min + margin
                    print(f"  自动设置泥岩虚拟点（低PC=泥岩）: [{mud_start:.2f}, {mud_end:.2f}]")
                else:
                    # 高PC值对应泥岩：在最大值左侧设置泥岩虚拟点
                    mud_start = feature_max - margin
                    mud_end = feature_max
                    print(f"  自动设置泥岩虚拟点（高PC=泥岩）: [{mud_start:.2f}, {mud_end:.2f}]")

            else:  # extended strategy
                # 延伸策略：在数据范围外侧放置
                expansion = feature_range * 0.2  # 20%的外延

                if pc_geology_relationship["low_pc_type"] == "mud":
                    # 低PC值对应泥岩：在最小值左侧延伸
                    mud_start = feature_min - expansion
                    mud_end = feature_min
                    print(f"  自动设置泥岩虚拟点（低PC=泥岩，延伸）: [{mud_start:.2f}, {mud_end:.2f}]")
                else:
                    # 高PC值对应泥岩：在最大值右侧延伸
                    mud_start = feature_max
                    mud_end = feature_max + expansion
                    print(f"  自动设置泥岩虚拟点（高PC=泥岩，延伸）: [{mud_start:.2f}, {mud_end:.2f}]")

            # 生成泥岩虚拟点
            mud_x_values = np.linspace(mud_start, mud_end, n_points)
            for x_val in mud_x_values:
                virtual_point = {col: 0 for col in self.feature_columns}
                virtual_point[primary_feature] = x_val
                virtual_point[self.target_column] = abs(np.random.normal(0, noise_factor))
                virtual_point["is_virtual"] = True
                virtual_point["virtual_type"] = "mud"
                virtual_data.append(virtual_point)

        # === 处理砂岩虚拟点 ===
        if sand_range is not None:
            # 手动指定砂岩区间
            print(f"  手动设置砂岩虚拟点范围: {sand_range}")
            sand_start, sand_end = sand_range

            # 生成砂岩虚拟点
            sand_x_values = np.linspace(sand_start, sand_end, n_points)
            for x_val in sand_x_values:
                virtual_point = {col: 0 for col in self.feature_columns}
                virtual_point[primary_feature] = x_val
                virtual_point[self.target_column] = max_target + abs(np.random.normal(max_target * 0.1, noise_factor))
                virtual_point["is_virtual"] = True
                virtual_point["virtual_type"] = "sand"
                virtual_data.append(virtual_point)

        else:
            # 自动策略：根据PC-地质关系自动设置砂岩区间
            if placement_strategy == "conservative":
                # 保守策略：在数据范围内侧放置
                margin = feature_range * 0.15  # 15%的内缩边距

                if pc_geology_relationship["high_pc_type"] == "sand":
                    # 高PC值对应砂岩：在最大值左侧设置砂岩虚拟点
                    sand_start = feature_max - margin
                    sand_end = feature_max
                    print(f"  自动设置砂岩虚拟点（高PC=砂岩）: [{sand_start:.2f}, {sand_end:.2f}]")
                else:
                    # 低PC值对应砂岩：在最小值右侧设置砂岩虚拟点
                    sand_start = feature_min
                    sand_end = feature_min + margin
                    print(f"  自动设置砂岩虚拟点（低PC=砂岩）: [{sand_start:.2f}, {sand_end:.2f}]")

            else:  # extended strategy
                # 延伸策略：在数据范围外侧放置
                expansion = feature_range * 0.2  # 20%的外延

                if pc_geology_relationship["high_pc_type"] == "sand":
                    # 高PC值对应砂岩：在最大值右侧延伸
                    sand_start = feature_max
                    sand_end = feature_max + expansion
                    print(f"  自动设置砂岩虚拟点（高PC=砂岩，延伸）: [{sand_start:.2f}, {sand_end:.2f}]")
                else:
                    # 低PC值对应砂岩：在最小值左侧延伸
                    sand_start = feature_min - expansion
                    sand_end = feature_min
                    print(f"  自动设置砂岩虚拟点（低PC=砂岩，延伸）: [{sand_start:.2f}, {sand_end:.2f}]")

            # 生成砂岩虚拟点
            sand_x_values = np.linspace(sand_start, sand_end, n_points)
            for x_val in sand_x_values:
                virtual_point = {col: 0 for col in self.feature_columns}
                virtual_point[primary_feature] = x_val
                virtual_point[self.target_column] = max_target + abs(np.random.normal(max_target * 0.1, noise_factor))
                virtual_point["is_virtual"] = True
                virtual_point["virtual_type"] = "sand"
                virtual_data.append(virtual_point)

        # 合并数据
        enhanced_data = self.data.copy()
        enhanced_data["is_virtual"] = False
        enhanced_data["virtual_type"] = "real"

        if virtual_data:
            virtual_df = pd.DataFrame(virtual_data)
            enhanced_data = pd.concat([enhanced_data, virtual_df], ignore_index=True)
            print(f"  成功添加 {len(virtual_data)} 个虚拟点")

            # 统计虚拟点分布
            mud_count = sum(1 for vp in virtual_data if vp["virtual_type"] == "mud")
            sand_count = sum(1 for vp in virtual_data if vp["virtual_type"] == "sand")
            print(f"    - 泥岩虚拟点: {mud_count}")
            print(f"    - 砂岩虚拟点: {sand_count}")

        return enhanced_data, pc_geology_relationship

    def prepare_features(self, data, use_features=None, feature_weights=None):
        """
        准备特征，支持多维特征组合

        将多个PCA特征线性组合为单一输入特征，用于Sigmoid拟合。

        Parameters:
        -----------
        data : pd.DataFrame
            包含特征的数据源
        use_features : list or None, optional
            使用的特征列表，如['PC1', 'PC2']
            如果为None，则使用第一个特征
        feature_weights : list or None, optional
            特征权重列表，与use_features对应
            如果为None，则使用等权重

        Returns:
        --------
        np.array
            组合后的1D特征数组

        Notes:
        ------
        多特征组合公式：
        combined_feature = w1*PC1 + w2*PC2 + ... + wn*PCn
        其中 wi 为权重，通常基于PCA的方差贡献比设置
        """
        if use_features is None:
            use_features = [self.feature_columns[0]]

        if len(use_features) == 1:
            return data[use_features[0]].values

        # 多维特征线性组合
        if feature_weights is None:
            feature_weights = [1.0 / len(use_features)] * len(use_features)

        combined_features = np.zeros(len(data))
        for i, feature in enumerate(use_features):
            combined_features += feature_weights[i] * data[feature].values

        return combined_features

    def fit(
        self,
        use_features=None,
        feature_weights=None,
        virtual_points_config=None,
        bounds=None,
        initial_guess=None,
        max_iterations=2000,
    ):
        """
        拟合Sigmoid函数

        使用非线性最小二乘法拟合三参数Sigmoid函数到数据。

        Parameters:
        -----------
        use_features : list or None, optional
            使用的特征列表，如['PC1']或['PC1', 'PC2']
        feature_weights : list or None, optional
            特征权重，与use_features对应
        virtual_points_config : dict or None, optional
            虚拟点配置，支持两种模式：
            1. 智能模式: {'smart': True, 'n_points': int, 'noise_factor': float}
            2. 传统模式: {'x_mud': value, 'x_sand': value, 'n_points': int}
        bounds : tuple or None, optional
            参数边界 ((L_min, k_min, x0_min), (L_max, k_max, x0_max))
        initial_guess : tuple or None, optional
            初始参数猜测 (L, k, x0)
        max_iterations : int, default=2000
            优化算法最大迭代次数

        Returns:
        --------
        dict
            拟合结果字典，包含以下键：
            - 'success': bool, 拟合是否成功
            - 'params': dict, 拟合参数 {'L': float, 'k': float, 'x0': float}
            - 'param_errors': dict, 参数标准误差
            - 'r2_score': float, 决定系数
            - 'X': np.array, 输入特征
            - 'y': np.array, 目标值
            - 'y_pred': np.array, 预测值
            - 'use_features': list, 使用的特征
            - 'feature_weights': list, 特征权重
            如果失败，包含 'error': str

        Notes:
        ------
        拟合流程：
        1. 数据准备和虚拟点添加
        2. 特征组合和参数设置
        3. 非线性最小二乘拟合
        4. 结果评估和误差计算
        """
        # 准备数据
        working_data = self.data.copy()

        # 添加虚拟点
        if virtual_points_config:
            if virtual_points_config.get("smart", False):
                # 使用智能模式
                config = virtual_points_config.copy()
                config.pop("smart")  # 移除smart标志
                working_data, pc_relationship = self.add_virtual_points_smart(**config)
            else:
                # 使用传统模式
                working_data = self.add_virtual_points(**virtual_points_config)

        # 保存当前工作数据
        self.current_data = working_data

        # 准备特征
        X = self.prepare_features(working_data, use_features, feature_weights)
        y = working_data[self.target_column].values

        # 设置默认参数
        y_max = y.max()
        x_min, x_max = X.min(), X.max()
        x_range = x_max - x_min

        if bounds is None:
            bounds = (
                [y_max * 0.5, -10, x_min - x_range],  # 下界
                [y_max * 2.0, 10, x_max + x_range],  # 上界
            )

        if initial_guess is None:
            initial_guess = [y_max, 1.0, np.median(X)]

        try:
            # 拟合sigmoid函数
            self.fit_params, covariance = curve_fit(
                self.sigmoid, X, y, p0=initial_guess, bounds=bounds, maxfev=max_iterations
            )

            # 计算拟合质量
            y_pred = self.sigmoid(X, *self.fit_params)
            self.r2_score = r2_score(y, y_pred)

            # 计算参数标准误差
            param_errors = np.sqrt(np.diag(covariance))

            return {
                "success": True,
                "params": dict(zip(["L", "k", "x0"], self.fit_params)),
                "param_errors": dict(zip(["L_err", "k_err", "x0_err"], param_errors)),
                "r2_score": self.r2_score,
                "X": X,
                "y": y,
                "y_pred": y_pred,
                "use_features": use_features or [self.feature_columns[0]],
                "feature_weights": feature_weights,
            }

        except Exception as e:
            return {"success": False, "error": str(e), "X": X, "y": y}

    def predict(self, new_data, use_features=None, feature_weights=None):
        """
        使用拟合的模型进行预测

        对新数据应用已拟合的Sigmoid模型进行砂厚预测。

        Parameters:
        -----------
        new_data : pd.DataFrame or np.array
            新的输入数据
            - 如果是DataFrame，必须包含use_features中指定的列
            - 如果是numpy数组，视为已处理的1D特征
        use_features : list or None, optional
            使用的特征列表，应与拟合时一致
        feature_weights : list or None, optional
            特征权重，应与拟合时一致

        Returns:
        --------
        np.array
            预测的砂厚值数组

        Raises:
        -------
        ValueError
            当模型尚未拟合时抛出异常

        Notes:
        ------
        预测流程：
        1. 检查模型是否已拟合
        2. 特征准备和组合
        3. 应用Sigmoid函数
        """
        if self.fit_params is None:
            raise ValueError("模型尚未拟合，请先调用fit方法")

        if isinstance(new_data, pd.DataFrame):
            X_new = self.prepare_features(new_data, use_features, feature_weights)
        else:
            X_new = new_data

        return self.sigmoid(X_new, *self.fit_params)

    def visualize_fit(self, fit_result, figsize=(15, 8), save_path=None):
        """
        可视化拟合结果

        生成包含拟合曲线、残差分析和模型信息的综合可视化图表。

        Parameters:
        -----------
        fit_result : dict
            fit方法返回的拟合结果字典
        figsize : tuple, default=(15, 8)
            图形大小 (width, height)
        save_path : str or None, optional
            图片保存路径，如果为None则不保存

        Returns:
        --------
        matplotlib.figure.Figure or None
            生成的图形对象，如果拟合失败则返回None

        Notes:
        ------
        可视化内容：
        1. 左图：散点图 + Sigmoid拟合曲线 + 虚拟点标识
        2. 右图：残差分析图
        3. 模型参数和质量指标文本框
        """
        if not fit_result["success"]:
            print(f"拟合失败: {fit_result['error']}")
            return None

        # 提取数据
        X = fit_result["X"]
        y = fit_result["y"]
        y_pred = fit_result["y_pred"]
        params = fit_result["params"]
        r2_score_val = fit_result["r2_score"]

        # 创建图形
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

        # 左图：拟合结果
        if self.current_data is not None and "is_virtual" in self.current_data.columns:
            # 区分真实点和虚拟点
            real_mask = ~self.current_data["is_virtual"]
            virtual_mask = self.current_data["is_virtual"]

            # 真实点
            ax1.scatter(
                X[real_mask],
                y[real_mask],
                c="blue",
                alpha=0.7,
                s=60,
                label="真实样本",
                edgecolors="black",
                linewidth=0.5,
            )

            # 虚拟点
            if virtual_mask.any():
                mud_mask = self.current_data["virtual_type"] == "mud"
                sand_mask = self.current_data["virtual_type"] == "sand"

                if mud_mask.any():
                    ax1.scatter(X[mud_mask], y[mud_mask], c="brown", alpha=0.5, s=30, marker="^", label="虚拟点(泥岩)")
                if sand_mask.any():
                    ax1.scatter(
                        X[sand_mask], y[sand_mask], c="orange", alpha=0.5, s=30, marker="v", label="虚拟点(砂岩)"
                    )
        else:
            ax1.scatter(X, y, c="blue", alpha=0.7, s=60, label="样本点", edgecolors="black", linewidth=0.5)

        # 绘制拟合曲线
        X_curve = np.linspace(X.min(), X.max(), 300)
        y_curve = self.sigmoid(X_curve, *self.fit_params)
        ax1.plot(X_curve, y_curve, "red", linewidth=2, label="Sigmoid拟合")

        # 添加模型信息
        param_text = f"L = {params['L']:.2f}\n"
        param_text += f"k = {params['k']:.3f}\n"
        param_text += f"x₀ = {params['x0']:.2f}\n"
        param_text += f"R² = {r2_score_val:.3f}"

        ax1.text(
            0.02,
            0.98,
            param_text,
            transform=ax1.transAxes,
            verticalalignment="top",
            bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8),
        )

        ax1.set_xlabel("特征值")
        ax1.set_ylabel("砂厚 (m)")
        ax1.set_title("Sigmoid函数拟合结果")
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 右图：残差分析
        residuals = y - y_pred
        ax2.scatter(y_pred, residuals, alpha=0.6, c="green", s=40)
        ax2.axhline(y=0, color="red", linestyle="--", alpha=0.8)
        ax2.set_xlabel("预测值 (m)")
        ax2.set_ylabel("残差 (m)")
        ax2.set_title("残差分析")
        ax2.grid(True, alpha=0.3)

        # 添加残差统计
        residual_stats = f"残差均值: {np.mean(residuals):.3f}\n"
        residual_stats += f"残差标准差: {np.std(residuals):.3f}\n"
        residual_stats += f"RMSE: {np.sqrt(np.mean(residuals**2)):.3f}"

        ax2.text(
            0.02,
            0.98,
            residual_stats,
            transform=ax2.transAxes,
            verticalalignment="top",
            bbox=dict(boxstyle="round", facecolor="lightgreen", alpha=0.8),
        )

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")

        return fig

In [None]:
# The former prepare_sigmoid_data helper is inlined here; the plotting helper is reused

print("=== 开始 Sigmoid 建模 ===")

# 1. Assemble the modelling table: up to three principal components per well ...
n_components = min(3, well_pca_features.shape[1])
sigmoid_data = pd.DataFrame({f"PC{i + 1}": well_pca_features[:, i] for i in range(n_components)})

# ... plus the sand thickness, taken positionally (.values) from the filtered wells
sigmoid_data["Sand Thickness"] = data_well_purpose_surface_filtered["Sand Thickness"].values

# Names of the principal-component columns that are available
pc_columns = [col for col in sigmoid_data.columns if col.startswith("PC")]

print(f"Sigmoid建模数据形状: {sigmoid_data.shape}")
print(f"可用的PCA特征: {pc_columns}")
print("\n数据预览:")
print(sigmoid_data.head())

# 2. Create the sigmoid model on those columns
sigmoid_model = SigmoidModel(data=sigmoid_data, feature_columns=pc_columns, target_column="Sand Thickness")

In [None]:
# Step 1: inspect the raw samples before any fitting.
print("\n=== 步骤1: 可视化原始样本分布 ===")

# PC1 against sand thickness, drawn by the visualize_feature_distribution helper
# and written to the output directory.
distribution_plot_path = os.path.join(output_dir, "sigmoid_original_distribution.png")
fig1 = visualize_feature_distribution(
    data=sigmoid_data,
    x_feature="PC1",
    y_feature="Sand Thickness",
    figsize=(10, 6),
    point_size=100,
    alpha=0.7,
    colormap="viridis",
    title="样本分布: PC1 vs Sand Thickness",
    save_path=distribution_plot_path,
)
plt.show()

# Descriptive statistics; the step-2 cell uses them for the fit bounds and initial guess.
pc1_column = sigmoid_data["PC1"]
sand_column = sigmoid_data["Sand Thickness"]

pc1_min = pc1_column.min()
pc1_max = pc1_column.max()
pc1_median = pc1_column.median()
sand_thickness_max = sand_column.max()
sand_thickness_min = sand_column.min()

print("\n数据特征分析:")
print(f"PC1范围: {pc1_min:.2f} 到 {pc1_max:.2f}")
print(f"PC1中位数: {pc1_median:.2f}")
print(f"砂厚范围: {sand_thickness_min:.2f} 到 {sand_thickness_max:.2f} m")
print(f"样本数量: {len(sigmoid_data)}")

In [None]:
# Step 2: configure the "smart" virtual points and run the single-feature sigmoid fit.
print("\n=== 步骤2: Sigmoid拟合（智能虚拟点稳定器） ===")

# Alternative configurations kept for reference (all disabled):
#
# 1. Manual mudstone interval only (10 virtual mudstone points for PC1 in [-5, 0]):
# virtual_config_manual_mud = {
#     "smart": True,
#     "mud_range": (-5, 0),      # mudstone virtual points only
#     "sand_range": None,        # no sandstone virtual points
#     "n_points": 10,
#     "noise_factor": 0.05
# }
#
# 2. Manual mudstone and sandstone intervals:
# virtual_config_manual_both = {
#     "smart": True,
#     "mud_range": (-2, 0),      # mudstone virtual-point range
#     "sand_range": (2, 5),      # sandstone virtual-point range
#     "n_points": 10,
#     "noise_factor": 0.05
# }
#
# 4. Classic "extended" strategy (points placed outside the data range):
# virtual_config_extended = {
#     "smart": True,
#     "placement_strategy": "extended",
#     "n_points": 10,
#     "noise_factor": 0.05,
#     "auto_detect": True
# }
#
# 5. Sandstone virtual points only:
# virtual_config_sand_only = {
#     "smart": True,
#     "mud_range": None,         # no mudstone virtual points
#     "sand_range": (2, 4),      # sandstone virtual points only
#     "n_points": 10,
#     "noise_factor": 0.05
# }

# 3. Active configuration: conservative automatic placement (the recommended option),
#    i.e. virtual points placed on the inner side of the data range.
virtual_config_conservative = dict(
    smart=True,
    placement_strategy="conservative",
    n_points=15,
    noise_factor=0.05,
    auto_detect=True,
)

print("智能虚拟点配置:")
print("  模式: 智能自动检测")
print(f"  每侧点数: {virtual_config_conservative['n_points']}")
print(f"  噪音因子: {virtual_config_conservative['noise_factor']}")

# Parameter bounds; presumably ordered [L, k, x0] like the initial guess below
# (confirm against SigmoidModel.fit). The third bound is allowed to drift one
# full PC1 span beyond either end of the observed PC1 range.
pc1_span = pc1_max - pc1_min
lower_bounds = [sand_thickness_max * 0.2, -10, pc1_min - pc1_span]
upper_bounds = [sand_thickness_max * 3.0, 10, pc1_max + pc1_span]

# Run the fit on PC1 only.
fit_result = sigmoid_model.fit(
    use_features=["PC1"],
    virtual_points_config=virtual_config_conservative,
    bounds=(lower_bounds, upper_bounds),
    initial_guess=[sand_thickness_max * 0.7, 1.0, pc1_median],
    max_iterations=3000,
)

In [None]:
# Step 3: plot and report the single-feature fit (or report why it failed).
if not fit_result["success"]:
    print("\n=== 拟合失败 ===")
    print(f"错误信息: {fit_result['error']}")
else:
    print("\n=== 拟合成功! ===")

    # Fit figure produced by the model's own plotting method, saved to output_dir.
    fit_plot_path = os.path.join(output_dir, "sigmoid_fit_result.png")
    fig2 = sigmoid_model.visualize_fit(fit_result, figsize=(15, 6), save_path=fit_plot_path)
    plt.show()

    # Fitted parameters with their reported uncertainties.
    params = fit_result["params"]
    param_errors = fit_result["param_errors"]

    print("\n拟合参数:")
    for param in ("L", "k", "x0"):
        print(f"  {param}: {params[param]:.4f} ± {param_errors[param + '_err']:.4f}")
    print(f"  R² score: {fit_result['r2_score']:.4f}")

    # Plain-language reading of each parameter.
    growth_direction = "正向增长" if params["k"] > 0 else "负向增长"
    print("\n参数解释:")
    print(f"  L = {params['L']:.2f}: 最大砂厚渐近值 (m)")
    print(f"  k = {params['k']:.3f}: 增长率 ({growth_direction})")
    print(f"  x₀ = {params['x0']:.2f}: 中点位置（PC1值）")

    # Error metrics computed from the y / y_pred arrays returned by the fit.
    fit_errors = fit_result["y"] - fit_result["y_pred"]
    rmse = np.sqrt(np.mean(fit_errors**2))
    mae = np.mean(np.abs(fit_errors))
    print("\n拟合质量:")
    print(f"  RMSE: {rmse:.3f} m")
    print(f"  MAE: {mae:.3f} m")

In [None]:
# Step 4: when PC2 is available and the PC1 fit succeeded, try a weighted PC1+PC2 fit
# and keep whichever model has the higher R². The single-feature fit is the fallback
# in every other situation.
multi_feature_wins = False

if "PC2" in sigmoid_data.columns and fit_result["success"]:
    print("\n=== 步骤4: 多特征组合拟合 ===")

    explained_ratio = pca_results["explained_variance_ratio"]
    if len(explained_ratio) >= 2:
        # Weights proportional to each component's share of the explained variance.
        total_var = explained_ratio[0] + explained_ratio[1]
        pc1_weight = explained_ratio[0] / total_var
        pc2_weight = explained_ratio[1] / total_var

        print("使用PC1+PC2组合:")
        print(f"  PC1权重: {pc1_weight:.3f} (方差贡献: {explained_ratio[0]:.3f})")
        print(f"  PC2权重: {pc2_weight:.3f} (方差贡献: {explained_ratio[1]:.3f})")

        # Same virtual-point configuration as the single-feature fit.
        fit_result_multi = sigmoid_model.fit(
            use_features=["PC1", "PC2"],
            feature_weights=[pc1_weight, pc2_weight],
            virtual_points_config=virtual_config_conservative,
            bounds=(
                [sand_thickness_max * 0.2, -10, -5],
                [sand_thickness_max * 3.0, 10, 5],
            ),
            max_iterations=3000,
        )

        if not fit_result_multi["success"]:
            print(f"多特征拟合失败: {fit_result_multi['error']}")
        else:
            print("多特征拟合成功!")
            multi_plot_path = os.path.join(output_dir, "sigmoid_multi_feature_fit.png")
            fig3 = sigmoid_model.visualize_fit(fit_result_multi, figsize=(15, 6), save_path=multi_plot_path)
            plt.show()

            # Side-by-side R² comparison of the two fits.
            single_r2 = fit_result["r2_score"]
            multi_r2 = fit_result_multi["r2_score"]
            print("\n性能比较:")
            print(f"  单特征(PC1) R²: {single_r2:.4f}")
            print(f"  多特征(PC1+PC2) R²: {multi_r2:.4f}")
            print(f"  R²提升: {multi_r2 - single_r2:.4f}")

            # The multi-feature model only wins on a strictly higher R².
            multi_feature_wins = multi_r2 > single_r2
            if multi_feature_wins:
                print("  → 多特征模型表现更好")
            else:
                print("  → 单特征模型表现更好")

# Final selection used by all following cells.
if multi_feature_wins:
    best_fit = fit_result_multi
    best_model_name = "多特征(PC1+PC2)"
else:
    best_fit = fit_result
    best_model_name = "单特征(PC1)"

In [None]:
# Step 5: persist the selected model and predict sand thickness for the whole survey area.
if not best_fit["success"]:
    print("\n模型拟合失败，无法进行后续预测")
else:
    print("\n=== 步骤5: 保存结果并预测全工区 ===")

    # Residuals of the selected fit, shared by the RMSE and MAE entries below.
    fit_residuals = best_fit["y"] - best_fit["y_pred"]

    # One-row summary: metadata, then the fitted parameters and their errors, then scores.
    # Key order defines the CSV column order.
    fit_summary = {
        "model_type": "sigmoid",
        "best_model": best_model_name,
        "features_used": str(best_fit["use_features"]),
        "feature_weights": str(best_fit.get("feature_weights", "None")),
        "n_samples": len(sigmoid_data),
        "n_virtual_points": len(sigmoid_model.current_data) - len(sigmoid_data),
        "virtual_config": str(virtual_config_conservative),
        **best_fit["params"],
        **best_fit["param_errors"],
        "r2_score": best_fit["r2_score"],
        "rmse": np.sqrt(np.mean(fit_residuals**2)),
        "mae": np.mean(np.abs(fit_residuals)),
    }

    # Write the summary next to the other outputs.
    summary_path = os.path.join(output_dir, "sigmoid_model_summary.csv")
    summary_df = pd.DataFrame([fit_summary])
    summary_df.to_csv(summary_path, index=False)
    print(f"模型摘要已保存到: {summary_path}")

    print("\n对全工区进行砂厚预测...")

    # Project the scaled seismic features with the fitted PCA and keep only as many
    # components as the selected model uses.
    seismic_pca_features = pca_results["pca"].transform(pca_results["features_scaled"])
    max_components = len(best_fit["use_features"])
    seismic_pca_df = pd.DataFrame(
        {f"PC{idx + 1}": seismic_pca_features[:, idx] for idx in range(max_components)}
    )

    # Predict with the selected model (weights are None for the single-feature case).
    predicted_thickness = sigmoid_model.predict(
        seismic_pca_df,
        use_features=best_fit["use_features"],
        feature_weights=best_fit.get("feature_weights"),
    )

    # Coordinates plus prediction and two constant columns identifying the model.
    prediction_results = pca_results["coords_clean"].assign(
        Predicted_Sand_Thickness=predicted_thickness,
        Model_Type=best_model_name,
        Model_R2=best_fit["r2_score"],
    )

    prediction_path = os.path.join(output_dir, "predicted_sand_thickness.csv")
    prediction_results.to_csv(prediction_path, index=False)
    print(f"预测结果已保存到: {prediction_path}")

    # Prediction statistics.
    print("\n预测结果统计:")
    print(f"  预测样本数: {len(prediction_results)}")
    print(f"  预测砂厚范围: {predicted_thickness.min():.2f} - {predicted_thickness.max():.2f} m")
    print(f"  预测砂厚均值: {predicted_thickness.mean():.2f} m")
    print(f"  预测砂厚标准差: {predicted_thickness.std():.2f} m")
    print(f"  使用模型: {best_model_name}")
    print(f"  模型R²: {best_fit['r2_score']:.4f}")

    # Comparison with the well measurements. The "coverage" figure is all-or-nothing:
    # 100 when the predicted range encloses the measured range, otherwise 0.
    real_thickness = sigmoid_data["Sand Thickness"].values
    covers_well_range = (
        predicted_thickness.max() >= real_thickness.max() and predicted_thickness.min() <= real_thickness.min()
    )
    print("\n与井点砂厚对比:")
    print(f"  井点砂厚范围: {real_thickness.min():.2f} - {real_thickness.max():.2f} m")
    print(f"  井点砂厚均值: {real_thickness.mean():.2f} m")
    print(f"  预测覆盖率: {100 * covers_well_range:.0f}%")

    print("\n=== Sigmoid建模完成! ===")
    print(f"最佳模型: {best_model_name} (R² = {best_fit['r2_score']:.4f})")

In [None]:
# Step 6: visualise the full-area prediction, reusing visualize_attribute_map,
# then build a 2x2 diagnostic figure (scatter, histograms, residuals, text summary).
if best_fit["success"]:
    print(f"\n=== 步骤6: 可视化预测结果 ===")

    # 1. Spatial distribution of the predicted sand thickness, drawn by visualize_attribute_map
    print("生成预测砂厚空间分布图...")
    visualize_attribute_map(
        data_points=prediction_results,
        attribute_name="Predicted_Sand_Thickness",
        attribute_label="预测砂厚 (m)",
        real_wells=data_well_purpose_surface_filtered,
        pseudo_wells=None,  # no virtual wells at this stage
        target_column="Sand Thickness",
        output_dir=output_dir,
        filename_prefix="sigmoid_prediction",
        class_thresholds=[1, 10],
        figsize=(14, 10),
        dpi=300,
        cmap="viridis",
        point_size=150,
        well_size=200,
        vrange=None,  # let the data determine the colour range
    )

    # 2. Detailed analysis figure (uses the implicit pyplot "current axes", so statement order matters)
    plt.figure(figsize=(15, 10))

    # Subplot 1: predicted vs. measured thickness at the well locations
    plt.subplot(2, 2, 1)
    # The prediction for each well is taken from the nearest prediction point
    well_coords = data_well_purpose_surface_filtered[["X", "Y"]].values
    pred_coords = prediction_results[["X", "Y"]].values

    # Nearest prediction point per well.
    # NOTE(review): cdist materialises a full (n_wells x n_prediction_points) matrix;
    # a KD-tree query would avoid that if the grid is large.
    from scipy.spatial.distance import cdist

    distances = cdist(well_coords, pred_coords)
    closest_indices = np.argmin(distances, axis=1)
    # NOTE(review): positional lookup; assumes predicted_thickness is a NumPy array
    # (or has a default integer index) - confirm against SigmoidModel.predict.
    well_predictions = predicted_thickness[closest_indices]

    plt.scatter(sigmoid_data["Sand Thickness"], well_predictions, alpha=0.7, s=80, edgecolors="black")

    # 1:1 reference line spanning both measured and predicted values
    min_val = min(sigmoid_data["Sand Thickness"].min(), well_predictions.min())
    max_val = max(sigmoid_data["Sand Thickness"].max(), well_predictions.max())
    plt.plot([min_val, max_val], [min_val, max_val], "r--", alpha=0.8, linewidth=2)

    plt.xlabel("真实砂厚 (m)")
    plt.ylabel("预测砂厚 (m)")
    plt.title("预测 vs 真实砂厚")
    plt.grid(True, alpha=0.3)

    # Pearson correlation between measured and nearest-point predicted thickness, shown on the plot
    correlation = np.corrcoef(sigmoid_data["Sand Thickness"], well_predictions)[0, 1]
    plt.text(
        0.05,
        0.95,
        f"相关系数: {correlation:.3f}",
        transform=plt.gca().transAxes,
        bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8),
    )

    # Subplot 2: histogram of all predictions overlaid with the well measurements
    # (raw counts, so the two series are on very different scales when sample sizes differ)
    plt.subplot(2, 2, 2)
    plt.hist(predicted_thickness, bins=50, alpha=0.7, color="skyblue", label=f"预测砂厚 (n={len(predicted_thickness)})")
    plt.hist(
        sigmoid_data["Sand Thickness"], bins=20, alpha=0.7, color="orange", label=f"井点砂厚 (n={len(sigmoid_data)})"
    )
    plt.xlabel("砂厚 (m)")
    plt.ylabel("频数")
    plt.title("砂厚分布对比")
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Subplot 3: residual distribution at the wells (measured minus predicted)
    plt.subplot(2, 2, 3)
    residuals_well = sigmoid_data["Sand Thickness"].values - well_predictions
    plt.hist(residuals_well, bins=15, alpha=0.7, color="lightcoral")
    plt.axvline(x=0, color="red", linestyle="--", alpha=0.8)
    plt.xlabel("残差 (真实 - 预测)")
    plt.ylabel("频数")
    plt.title("井点处残差分布")
    plt.grid(True, alpha=0.3)

    # Residual mean and standard deviation as a text box
    residual_stats = f"均值: {np.mean(residuals_well):.3f}\n标准差: {np.std(residuals_well):.3f}"
    plt.text(
        0.05,
        0.95,
        residual_stats,
        transform=plt.gca().transAxes,
        verticalalignment="top",
        bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8),
    )

    # Subplot 4: text-only model performance summary
    plt.subplot(2, 2, 4)
    plt.axis("off")  # hide the axes, this panel only carries text

    # Summary text. RMSE/MAE here come from the fit's own y / y_pred arrays, whereas the
    # correlation comes from the nearest-point predictions computed above.
    performance_text = f"""
模型性能摘要

最佳模型: {best_model_name}
R^2 评分: {best_fit["r2_score"]:.4f}
RMSE: {np.sqrt(np.mean((best_fit["y"] - best_fit["y_pred"]) ** 2)):.3f} m
MAE: {np.mean(np.abs(best_fit["y"] - best_fit["y_pred"])):.3f} m

预测结果统计:
1. 预测样本数: {len(prediction_results):,}
2. 预测砂厚范围: {predicted_thickness.min():.2f} - {predicted_thickness.max():.2f} m
3. 预测砂厚均值: {predicted_thickness.mean():.2f} m
4. 预测砂厚标准差: {predicted_thickness.std():.2f} m

井点对比:
1. 井点砂厚范围: {sigmoid_data["Sand Thickness"].min():.2f} - {sigmoid_data["Sand Thickness"].max():.2f} m
2. 井点砂厚均值: {sigmoid_data["Sand Thickness"].mean():.2f} m
3. 预测-实际相关系数: {correlation:.3f}
    """

    plt.text(
        0.1,
        0.9,
        performance_text,
        transform=plt.gca().transAxes,
        fontsize=11,
        verticalalignment="top",
        bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.8),
    )

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "sigmoid_prediction_detailed_analysis.png"), dpi=300, bbox_inches="tight")
    plt.show()

    # NOTE(review): the first two file names below are presumably what visualize_attribute_map
    # writes for the "sigmoid_prediction" prefix - confirm against that helper.
    print("预测结果可视化已完成")
    print(f"  - 空间分布图: sigmoid_prediction_map_with_wells.png")
    print(f"  - 属性直方图: sigmoid_prediction_attribute_histogram.png")
    print(f"  - 详细分析图: sigmoid_prediction_detailed_analysis.png")

In [None]:
# Extract evenly spaced seismic samples that will serve as virtual-well candidates.
print("=== 提取样本，准备设置虚拟井 ===")

# 20 rows x 20 columns requested from the filtered seismic data, limited to the
# area_bounds defined in an earlier cell.
seismic_samples = extract_uniform_seismic_samples(
    seismic_data=seismic_attr_filtered,
    n_rows=20,
    n_cols=20,
    area_bounds=area_bounds,
)

# Overview figure: seismic background, real wells, sampled points (drawn in that order).
plt.figure(figsize=(15, 10))

# Background layer: a random subset of roughly 5000 seismic points at most.
sample_ratio = min(1.0, 5000 / len(seismic_attr_filtered))
seismic_sample = seismic_attr_filtered.sample(frac=sample_ratio)
plt.scatter(
    seismic_sample["X"],
    seismic_sample["Y"],
    color="lightgray",
    alpha=0.3,
    s=10,
    label="地震数据(抽样)",
)

# Real well locations.
real_well_x = data_well_purpose_surface_filtered["X"]
real_well_y = data_well_purpose_surface_filtered["Y"]
plt.scatter(real_well_x, real_well_y, color="red", s=100, marker="^", label="真实井点")

# Evenly spaced sample locations.
plt.scatter(
    seismic_samples["X"],
    seismic_samples["Y"],
    color="blue",
    s=50,
    marker="o",
    label="等间距采样点",
)

# Titles, legend, grid.
plt.title("真实井点与等间距采样点分布", fontsize=16)
plt.xlabel("X坐标", fontsize=14)
plt.ylabel("Y坐标", fontsize=14)
plt.legend(loc="upper right")
plt.grid(True, linestyle="--", alpha=0.7)

# Save the figure, then display it.
overview_plot_path = os.path.join(output_dir, "real_wells_and_seismic_samples.png")
plt.savefig(overview_plot_path, dpi=300, bbox_inches="tight")
plt.show()

# Persist the extracted samples.
samples_csv_path = os.path.join(output_dir, "seismic_samples.csv")
seismic_samples.to_csv(samples_csv_path, index=False)
print(f"等间距地震样本数据已保存至 {samples_csv_path}")

# Report how many samples were extracted and the extent they cover.
sample_x_min, sample_x_max = seismic_samples["X"].min(), seismic_samples["X"].max()
sample_y_min, sample_y_max = seismic_samples["Y"].min(), seismic_samples["Y"].max()
print(f"提取的样本数量: {len(seismic_samples)}")
print(f"样本分布区域: X({sample_x_min:.1f} - {sample_x_max:.1f}), Y({sample_y_min:.1f} - {sample_y_max:.1f})")

In [None]:
# Predict sand thickness at the virtual-well candidates with the selected sigmoid model.
print("=== 使用Sigmoid模型预测虚拟井砂厚 ===")

if not best_fit["success"]:
    print("Sigmoid模型拟合失败，无法生成虚拟井预测")
else:
    # Same feature columns, scaler and PCA as used for the PCA analysis.
    feature_columns = pca_results["features_clean"].columns
    sample_features = seismic_samples[feature_columns].values
    sample_features_scaled = pca_results["scaler"].transform(sample_features)
    sample_pca_features = pca_results["pca"].transform(sample_features_scaled)

    # Keep only as many components as the selected model uses.
    max_components = len(best_fit["use_features"])
    sample_pca_df = pd.DataFrame(
        {f"PC{idx + 1}": sample_pca_features[:, idx] for idx in range(max_components)}
    )

    # Predict with the selected model.
    predicted_sample_thickness = sigmoid_model.predict(
        sample_pca_df,
        use_features=best_fit["use_features"],
        feature_weights=best_fit.get("feature_weights"),
    )

    # Attach the prediction to the sample table.
    seismic_samples["Predicted_Sand_Thickness"] = predicted_sample_thickness

    # Negative thickness is not meaningful: report the count and clip to zero.
    negative_count = (predicted_sample_thickness < 0).sum()
    if negative_count:
        print(f"注意: {negative_count} 个负的砂厚预测值已被替换为0")
        seismic_samples["Predicted_Sand_Thickness"] = seismic_samples["Predicted_Sand_Thickness"].clip(lower=0)

    # Prediction statistics (after clipping).
    predicted_column = seismic_samples["Predicted_Sand_Thickness"]
    print("\n虚拟井砂厚预测统计:")
    print(f"  样本数量: {len(seismic_samples)}")
    print(f"  预测砂厚范围: {predicted_column.min():.2f} - {predicted_column.max():.2f} m")
    print(f"  预测砂厚均值: {predicted_column.mean():.2f} m")
    print(f"  预测砂厚标准差: {predicted_column.std():.2f} m")

    # Save the virtual wells together with their predictions.
    virtual_wells_csv_path = os.path.join(output_dir, "virtual_wells_with_predictions.csv")
    seismic_samples.to_csv(virtual_wells_csv_path, index=False)
    print(f"虚拟井预测结果已保存至 {virtual_wells_csv_path}")

In [None]:
# Virtual-well selection: thin the candidate samples in three layers
# (consistency with nearby real wells, spacing, balance across thickness classes).
print("=== 虚拟井优化选择（基于距离和砂厚分布）===")

if best_fit["success"]:
    from scipy.spatial.distance import cdist

    # Seeded generator so the random down-sampling in layer 3, and therefore the saved
    # optimized_virtual_wells.csv, is identical on every run of the notebook.
    selection_rng = np.random.default_rng(42)

    # Work on copies so the source tables stay untouched.
    virtual_wells_data = seismic_samples.copy()
    real_wells_data = data_well_purpose_surface_filtered.copy()

    print(f"开始优化筛选，初始虚拟井数量: {len(virtual_wells_data)}")

    # === Layer 1: drop candidates that sit close to a real well but disagree with it ===
    print("\n第一层筛选：排除靠近真实井点且砂厚差异大的点...")

    proximity_radius = 100  # exclusion radius, in coordinate units (metres)
    max_thickness_diff = 5.0  # largest tolerated thickness difference (metres)

    # Coordinates and thickness of the real wells
    real_coords = real_wells_data[["X", "Y"]].values
    real_thickness = real_wells_data["Sand Thickness"].values

    # Coordinates and predicted thickness of the candidates
    virtual_coords = virtual_wells_data[["X", "Y"]].values
    virtual_thickness = virtual_wells_data["Predicted_Sand_Thickness"].values

    # Distance from every candidate to every real well, and the nearest well per candidate
    distances = cdist(virtual_coords, real_coords)
    min_distances = np.min(distances, axis=1)
    closest_well_indices = np.argmin(distances, axis=1)

    # A candidate is excluded when its nearest real well is within the radius AND the
    # predicted thickness differs from that well's thickness by more than the tolerance.
    thickness_diff = np.abs(virtual_thickness - real_thickness[closest_well_indices])
    exclude_mask = (min_distances <= proximity_radius) & (thickness_diff > max_thickness_diff)
    excluded_count = int(exclude_mask.sum())

    layer1_filtered = virtual_wells_data[~exclude_mask].copy().reset_index(drop=True)
    print(f"第一层筛选完成：排除了 {excluded_count} 个点，剩余 {len(layer1_filtered)} 个点")

    # === Layer 2: greedy selection enforcing minimum spacing ===
    print("\n第二层筛选：基于距离的贪心选择...")

    min_virtual_distance = 200  # minimum spacing between two virtual wells (metres)
    min_real_distance = 150  # minimum spacing between a virtual and a real well (metres)

    # Pairwise distances among the remaining candidates
    layer1_coords = layer1_filtered[["X", "Y"]].values
    virtual_distances = cdist(layer1_coords, layer1_coords)

    # Distance from each remaining candidate to its nearest real well
    distances_to_real = cdist(layer1_coords, real_coords)
    min_distances_to_real = np.min(distances_to_real, axis=1)

    # Candidates are visited in ascending order of predicted thickness.
    thickness_values = layer1_filtered["Predicted_Sand_Thickness"].values
    thickness_order = np.argsort(thickness_values)

    selected_indices = []
    for idx in thickness_order:
        # Too close to a real well
        if min_distances_to_real[idx] < min_real_distance:
            continue

        # Too close to any candidate that has already been accepted
        if selected_indices and (virtual_distances[idx, selected_indices] < min_virtual_distance).any():
            continue

        selected_indices.append(idx)

    layer2_filtered = layer1_filtered.iloc[selected_indices].copy().reset_index(drop=True)
    print(f"第二层筛选完成：选择了 {len(layer2_filtered)} 个距离合适的点")

    # === Layer 3: cap the number of candidates per thickness class ===
    print("\n第三层筛选：基于砂厚分布的均衡选择...")

    # Thickness classes, left-closed / right-open
    thickness_bins = [0, 1, 10, 20, np.inf]
    bin_labels = ["0-1m", "1-10m", "10-20m", ">20m"]

    max_samples_per_bin = 30
    # NOTE(review): min_samples_per_bin is declared but not enforced anywhere in this cell.
    min_samples_per_bin = 5

    final_selected_indices = []

    for i in range(len(thickness_bins) - 1):
        # Candidates whose predicted thickness falls in this class
        bin_mask = (layer2_filtered["Predicted_Sand_Thickness"] >= thickness_bins[i]) & (
            layer2_filtered["Predicted_Sand_Thickness"] < thickness_bins[i + 1]
        )
        bin_indices = layer2_filtered.index[bin_mask].tolist()

        if not bin_indices:
            print(f"  区间 {bin_labels[i]}: 无可用样本")
            continue

        # Over-full classes are down-sampled at random (without replacement)
        if len(bin_indices) > max_samples_per_bin:
            selected_bin_indices = selection_rng.choice(
                bin_indices, size=max_samples_per_bin, replace=False
            ).tolist()
        else:
            selected_bin_indices = bin_indices

        final_selected_indices.extend(selected_bin_indices)
        print(f"  区间 {bin_labels[i]}: 从 {len(bin_indices)} 个中选择了 {len(selected_bin_indices)} 个")

    # Final set of optimised virtual wells
    optimized_virtual_wells = layer2_filtered.loc[final_selected_indices].copy().reset_index(drop=True)

    print("\n虚拟井优化筛选结果:")
    print(f"  原始虚拟井数量: {len(virtual_wells_data)}")
    print(f"  第一层筛选后: {len(layer1_filtered)}")
    print(f"  第二层筛选后: {len(layer2_filtered)}")
    print(f"  最终优化数量: {len(optimized_virtual_wells)}")

    # Save the optimised virtual wells
    optimized_virtual_wells.to_csv(os.path.join(output_dir, "optimized_virtual_wells.csv"), index=False)

    # Distribution of the final set across the thickness classes
    print("\n最终砂厚分布:")
    for i in range(len(thickness_bins) - 1):
        bin_mask = (optimized_virtual_wells["Predicted_Sand_Thickness"] >= thickness_bins[i]) & (
            optimized_virtual_wells["Predicted_Sand_Thickness"] < thickness_bins[i + 1]
        )
        bin_count = bin_mask.sum()
        bin_percent = bin_count / len(optimized_virtual_wells) * 100 if len(optimized_virtual_wells) > 0 else 0
        print(f"  {bin_labels[i]}: {bin_count} 个 ({bin_percent:.1f}%)")

else:
    print("Sigmoid模型拟合失败，无法进行虚拟井优化")

In [None]:
# Virtual-well display and analysis: predict over the filtered seismic area, map the
# virtual wells, and compare their thickness distribution with the real wells.
print("=== 虚拟井展示和分析 ===")

if best_fit["success"]:
    from scipy.spatial.distance import cdist

    # 1. Predict sand thickness for the whole filtered seismic area
    print("为整个地震数据区域预测砂厚...")

    # Same feature columns, scaler and PCA as used for the PCA analysis
    seismic_features = seismic_attr_filtered[pca_results["features_clean"].columns].values
    seismic_features_scaled = pca_results["scaler"].transform(seismic_features)
    seismic_pca_features = pca_results["pca"].transform(seismic_features_scaled)

    # Keep only as many components as the selected model uses
    seismic_pca_df = pd.DataFrame()
    max_components = len(best_fit["use_features"])
    for i in range(max_components):
        seismic_pca_df[f"PC{i + 1}"] = seismic_pca_features[:, i]

    seismic_predicted_thickness = sigmoid_model.predict(
        seismic_pca_df, use_features=best_fit["use_features"], feature_weights=best_fit.get("feature_weights")
    )

    # Negative thickness is not meaningful: clip at zero
    seismic_predicted_thickness = np.maximum(seismic_predicted_thickness, 0)

    # Attach the prediction to a copy of the seismic table
    seismic_attr_filtered_with_pred = seismic_attr_filtered.copy()
    seismic_attr_filtered_with_pred["Predicted_Sand_Thickness"] = seismic_predicted_thickness

    print(f"整个地震数据区域预测完成，预测点数: {len(seismic_attr_filtered_with_pred)}")

    # 2. Map of the virtual wells on top of the predicted thickness
    print("生成虚拟井砂厚分布图...")

    real_wells = data_well_purpose_surface_filtered.copy()

    # Prefer the optimised virtual wells; fall back to the raw samples when the
    # optimisation cell has not been run or produced nothing.
    if "optimized_virtual_wells" in locals() and len(optimized_virtual_wells) > 0:
        pseudo_wells = optimized_virtual_wells.copy()
        print(f"使用优化后的虚拟井数据: {len(pseudo_wells)} 个")
    else:
        pseudo_wells = seismic_samples.copy()
        print(f"使用原始虚拟井数据: {len(pseudo_wells)} 个")
        print("警告：未找到优化后的虚拟井数据，使用原始数据")

    # Single source of truth for the prefix so the summary at the end of the cell
    # cannot drift from the name actually passed to the plotting helper.
    map_filename_prefix = "virtual_wells_optimized"

    visualize_attribute_map(
        data_points=seismic_attr_filtered_with_pred,
        attribute_name="Predicted_Sand_Thickness",
        attribute_label="预测砂厚 (m)",
        real_wells=real_wells,
        pseudo_wells=pseudo_wells,
        target_column="Sand Thickness",
        output_dir=output_dir,
        filename_prefix=map_filename_prefix,
        class_thresholds=[1, 10],
        figsize=(16, 14),
        dpi=300,
        cmap="viridis",
        point_size=140,
        well_size=200,
    )

    # 3. Thickness distribution: real wells vs. virtual wells
    print("创建真实井和虚拟井砂厚分布对比...")

    real_thickness = real_wells["Sand Thickness"].values
    virtual_thickness = pseudo_wells["Predicted_Sand_Thickness"].values

    # Thickness classes. The last edge must stay above 20, otherwise the edges are not
    # monotonically increasing and np.histogram raises ValueError when every thickness
    # is 19 m or less.
    max_thickness = max(np.max(real_thickness), np.max(virtual_thickness))
    thickness_bins = [0, 1, 10, 20, max(max_thickness, 20) + 1]
    thickness_labels = ["0-1", "1-10", "10-20", ">20"]

    # Counts per class
    real_hist, _ = np.histogram(real_thickness, bins=thickness_bins)
    virtual_hist, _ = np.histogram(virtual_thickness, bins=thickness_bins)

    # Percentages relative to each group's own sample count
    real_percent = real_hist / len(real_thickness) * 100
    virtual_percent = virtual_hist / len(virtual_thickness) * 100

    plt.figure(figsize=(12, 8))

    # Side-by-side bars: real wells at r1, virtual wells shifted by one bar width
    bar_width = 0.35
    r1 = np.arange(len(thickness_labels))
    r2 = [x + bar_width for x in r1]

    plt.bar(r1, real_percent, width=bar_width, color="crimson", alpha=0.7, label="真实井点砂厚")
    plt.bar(r2, virtual_percent, width=bar_width, color="royalblue", alpha=0.7, label="虚拟井点砂厚")

    # Percentage labels above each bar
    for i, v in enumerate(real_percent):
        plt.text(r1[i], v + 1, f"{v:.1f}%", ha="center", va="bottom", fontweight="bold", color="crimson")

    for i, v in enumerate(virtual_percent):
        plt.text(r2[i], v + 1, f"{v:.1f}%", ha="center", va="bottom", fontweight="bold", color="royalblue")

    plt.xlabel("砂厚区间(米)", fontsize=14)
    plt.ylabel("百分比(%)", fontsize=14)
    plt.title("真实井点与虚拟井点砂厚分布对比", fontsize=16)
    # Tick centred between the two bars of each class
    plt.xticks([r + bar_width / 2 for r in range(len(thickness_labels))], thickness_labels)
    # 20% headroom for the bar labels
    plt.ylim(0, max(max(real_percent), max(virtual_percent)) * 1.2)

    plt.legend(loc="upper right", fontsize=12)
    plt.grid(True, linestyle="--", alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "real_vs_virtual_thickness_histogram.png"), dpi=300, bbox_inches="tight")
    plt.show()

    # 4. Descriptive statistics, real vs. virtual
    stats_data = {
        "统计指标": ["样本数量", "平均值(米)", "中位数(米)", "标准差(米)", "最小值(米)", "最大值(米)"],
        "真实井点": [
            len(real_thickness),
            np.mean(real_thickness),
            np.median(real_thickness),
            np.std(real_thickness),
            np.min(real_thickness),
            np.max(real_thickness),
        ],
        "虚拟井点": [
            len(virtual_thickness),
            np.mean(virtual_thickness),
            np.median(virtual_thickness),
            np.std(virtual_thickness),
            np.min(virtual_thickness),
            np.max(virtual_thickness),
        ],
    }

    stats_df = pd.DataFrame(stats_data)
    print("\n砂厚统计信息对比:")
    print(
        stats_df.to_string(
            index=False,
            formatters={
                "真实井点": lambda x: f"{x:.2f}" if isinstance(x, float) else str(x),
                "虚拟井点": lambda x: f"{x:.2f}" if isinstance(x, float) else str(x),
            },
        )
    )

    stats_df.to_csv(os.path.join(output_dir, "real_vs_virtual_thickness_stats.csv"), index=False)

    # 5. Quality check: how far is each virtual well from its nearest real well?
    print("\n虚拟井质量评估:")

    real_coords = real_wells[["X", "Y"]].values
    virtual_coords = pseudo_wells[["X", "Y"]].values

    distances = cdist(virtual_coords, real_coords)
    min_distances = np.min(distances, axis=1)

    print("  虚拟井到最近真实井的距离统计:")
    print(f"    平均距离: {np.mean(min_distances):.1f} m")
    print(f"    最小距离: {np.min(min_distances):.1f} m")
    print(f"    最大距离: {np.max(min_distances):.1f} m")
    print(f"    中位数距离: {np.median(min_distances):.1f} m")

    # Histogram of those nearest-well distances
    plt.figure(figsize=(10, 6))
    plt.hist(min_distances, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
    plt.xlabel("到最近真实井的距离 (m)")
    plt.ylabel("虚拟井数量")
    plt.title("虚拟井到最近真实井的距离分布")
    plt.grid(True, alpha=0.3)
    plt.savefig(os.path.join(output_dir, "virtual_wells_distance_distribution.png"), dpi=300, bbox_inches="tight")
    plt.show()

    # Output summary. The map name follows the "<prefix>_map_with_wells.png" pattern that
    # the step-6 cell reports for visualize_attribute_map; confirm against that helper.
    print("虚拟井分析完成，所有结果已保存到输出目录")
    print(f"  - 虚拟井空间分布图: {map_filename_prefix}_map_with_wells.png")
    print("  - 砂厚分布对比图: real_vs_virtual_thickness_histogram.png")
    print("  - 统计信息表: real_vs_virtual_thickness_stats.csv")
    print("  - 距离分布图: virtual_wells_distance_distribution.png")
    print("  - 虚拟井数据: virtual_wells_with_predictions.csv")

else:
    print("Sigmoid模型拟合失败，无法进行虚拟井分析")