# 聚类内部 Sigmoid 建模 + 生成伪样本

In [None]:
# Notebook setup: make the project's src package importable, then pull in dependencies.
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Append the directory two levels above the current working directory to the
# module search path; the `src.*` imports below rely on it being there.
sys.path.append(os.path.abspath("../../"))

# Project modules (must come after the sys.path tweak above)
from src.data_utils import (
    extract_seismic_attributes_for_wells,
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    filter_outlier_wells,
    filter_seismic_by_wells,
    identify_attributes,
    parse_petrel_file,
    preprocess_features,
)
from src.gmm_clustering import evaluate_gmm_clusters, perform_gmm_clustering
from src.pca_analysis import perform_pca_analysis
from src.sigmoid import SigmoidModel
from src.visualization import (
    visualize_attribute_map,
    visualize_feature_distribution,
    visualize_gmm_clustering,
    visualize_pca_clustering,
)

# Configure a font that can render the Chinese plot labels
plt.rcParams["font.family"] = "SimHei"  # SimHei (a Hei-style typeface) supports Chinese glyphs
plt.rcParams["axes.unicode_minus"] = False  # render the minus sign correctly with this font

In [None]:
# ==================== Horizon configuration ====================
# Change this value to process a different horizon (surface).
SURFACE_NAME = "H2-4"  # e.g. "H2-4", "H5-1", "H6-2", ...
# ================================================

## 输入、输出

In [None]:
# Paths and settings derived from the horizon name.
data_dir = "../../data"
data_tmp_dir = "data_tmp"
output_dir = f"{SURFACE_NAME.replace('-', '_')}_ps_output"  # H2-4 -> H2_4_ps_output

# Create the working directories. exist_ok=True makes re-running this cell a
# no-op and removes the check-then-create race of an os.path.exists() guard.
os.makedirs(data_tmp_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

print(f"当前处理层位: {SURFACE_NAME}")
print(f"输出目录: {output_dir}")

In [None]:
# Load the seismic attribute export for the selected horizon.
data_seismic_url = os.path.join(data_dir, SURFACE_NAME)
print(f"地震数据路径: {data_seismic_url}")
data_seismic_attr = parse_petrel_file(data_seismic_url)

# Load the well-location workbook.
well_workbook_path = os.path.join(data_dir, "well_without_attr.xlsx")
data_well_position = pd.read_excel(well_workbook_path)

# Keep only the rows for this horizon, turn the -999 sentinel into NaN,
# drop wells without a sand-thickness value, and renumber the rows from 0.
on_selected_surface = data_well_position["Surface"] == SURFACE_NAME
wells_on_surface = data_well_position[on_selected_surface].replace(-999, np.nan)
wells_with_thickness = wells_on_surface.dropna(subset=["Sand Thickness"])
data_well_purpose_surface_position = wells_with_thickness.reset_index(drop=True)

print(f"层位 {SURFACE_NAME} 的井点数量: {len(data_well_purpose_surface_position)}")
# Last expression of the cell: the notebook renders this preview.
data_well_purpose_surface_position.head()

## 缩小工区范围到井控区

In [None]:
# Remove outlier wells (method="iqr") and report how many remain.
data_well_purpose_surface_filtered = filter_outlier_wells(data_well_purpose_surface_position, method="iqr")

print(f"筛选前井点数量: {len(data_well_purpose_surface_position)}")
print(f"筛选后井点数量: {len(data_well_purpose_surface_filtered)}")

# Axis limits are taken from the unfiltered wells so that both panels share
# the same extent and can be compared directly.
x_min = data_well_purpose_surface_position["X"].min()
x_max = data_well_purpose_surface_position["X"].max()
y_min = data_well_purpose_surface_position["Y"].min()
y_max = data_well_purpose_surface_position["Y"].max()

# Pad the extent by 5% on every side purely for looks.
margin = 0.05
x_range = x_max - x_min
y_range = y_max - y_min
x_min, x_max = x_min - x_range * margin, x_max + x_range * margin
y_min, y_max = y_min - y_range * margin, y_max + y_range * margin

# Side-by-side scatter plots: wells before filtering (left) and after (right).
plt.figure(figsize=(12, 6))
comparison_panels = (
    (1, data_well_purpose_surface_position, "blue", "筛选前井点分布"),
    (2, data_well_purpose_surface_filtered, "red", "筛选后井点分布"),
)
for panel_index, panel_wells, panel_color, panel_title in comparison_panels:
    plt.subplot(1, 2, panel_index)
    plt.scatter(panel_wells["X"], panel_wells["Y"], c=panel_color)
    plt.title(panel_title)
    plt.xlabel("X坐标")
    plt.ylabel("Y坐标")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "well_filtering_comparison.png"))
plt.show()

In [None]:
# Read the list of seismic attribute names from the horizon export.
attribute_names, _ = identify_attributes(data_seismic_url)

# Clean the attribute columns: -999 is the missing-value marker, and outliers
# are handled with the "iqr" method and the "clip" treatment.
processed_features, stats, report = preprocess_features(
    data=data_seismic_attr,
    attribute_columns=attribute_names,
    missing_values=[-999],
    missing_threshold=0.6,
    outlier_method="iqr",
    outlier_threshold=2.0,
    outlier_treatment="clip",  # clip to the boundary (per the original note)
    verbose=True,
)

# Names of the attributes that are present after preprocessing.
attribute_names_filtered = list(processed_features.columns)

# Re-attach the processed attributes to the original X/Y coordinates.
processed_seismic_full = data_seismic_attr[["X", "Y"]].copy()  # type: ignore
for col in attribute_names_filtered:
    processed_seismic_full[col] = processed_features[col]

In [None]:
# Restrict the survey area to the region around the retained wells
seismic_attr_filtered, area_bounds = filter_seismic_by_wells(
    seismic_data=processed_seismic_full,
    well_data=data_well_purpose_surface_filtered,
    expansion_factor=2,  # original note says "expand by 100%" — confirm against filter_seismic_by_wells
    plot=True,
    output_dir=output_dir,
)

# The boundary information in area_bounds can be reused directly later on
print("区域边界信息:")
for key, value in area_bounds.items():
    print(f"  {key}: {value}")

In [None]:
# Look up seismic attributes for each retained well from the processed data.
well_attr_filtered = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_filtered,
    seismic_data=processed_seismic_full,
    max_distance=50,
    num_points=5,
)

# Persist the per-well attribute table to the temporary data directory.
wells_attr_path = os.path.join(data_tmp_dir, "wells_attr_filtered.xlsx")
well_attr_filtered.to_excel(wells_attr_path, index=False)
print("筛选后井点的地震属性已保存到 wells_attr_filtered.xlsx")

In [None]:
# Keep only the attributes judged to be of good quality
good_attributes, anomalous_attributes, attribute_stats = filter_anomalous_attributes(
    seismic_data=seismic_attr_filtered,
    well_data=well_attr_filtered,
    common_attributes=attribute_names_filtered,
    ratio_threshold=5.0,  # threshold on the ratio of means
    range_ratio_threshold=10.0,  # threshold on the ratio of value ranges
    std_ratio_threshold=10.0,  # threshold on the ratio of standard deviations
    output_dir=None,
    verbose=True,
)

# List the attributes that were retained
print("\n筛选后保留的质量良好属性:")
for attr in good_attributes:
    print(f"- {attr}")

## PCA 降维属性用于聚类，并分类建模

In [None]:
# Run PCA on the good-quality attributes within the restricted area.
pca_results = perform_pca_analysis(
    data=seismic_attr_filtered,
    attribute_columns=good_attributes,
    variance_threshold=0.75,
    output_dir=output_dir,
)

# Coordinates that accompany the rows of pca_results["features_pca"].
coords = pca_results["coords_clean"]


def _component_frame(column_name, component_values):
    """Return an X / Y / <column_name> frame pairing coords with one component."""
    return pd.DataFrame({"X": coords["X"].values, "Y": coords["Y"].values, column_name: component_values})


def _map_component(frame, column_name, label, prefix, cmap):
    """Draw one principal component on the map with the real wells overlaid."""
    visualize_attribute_map(
        data_points=frame,
        attribute_name=column_name,  # column to visualise
        attribute_label=label,  # display name for legend / colour bar
        real_wells=data_well_purpose_surface_filtered,  # real well data
        pseudo_wells=None,  # no pseudo wells yet
        target_column="Sand Thickness",  # target column of the wells
        output_dir=output_dir,
        filename_prefix=prefix,  # output file prefix
        class_thresholds=[15, 20],  # sand-thickness class thresholds
        figsize=(14, 12),
        dpi=300,
        cmap=cmap,
        point_size=15,  # seismic point size
        well_size=80,  # well marker size
        vrange=None,  # use the data's own range
    )


# First principal component.
print("\n使用visualize_attribute_map函数可视化PC1分布...")
pc1_data = pca_results["features_pca"][:, 0]
pc1_visualization_data = _component_frame("PC1", pc1_data)
_map_component(pc1_visualization_data, "PC1", "第一主成分 (PC1)", "pc1", "viridis")

# Second principal component, only when PCA kept at least two components.
# A different colour map keeps the PC2 figure distinguishable from PC1.
if pca_results["features_pca"].shape[1] >= 2:
    print("\n可视化PC2分布...")
    pc2_data = pca_results["features_pca"][:, 1]
    pc2_visualization_data = _component_frame("PC2", pc2_data)
    _map_component(pc2_visualization_data, "PC2", "第二主成分 (PC2)", "pc2", "plasma")

In [None]:
# 首先评估最佳聚类数
# gmm_evaluation = evaluate_gmm_clusters(features_pca=pca_results["features_pca"], max_clusters=10, output_dir=output_dir)

# 使用不同的聚类数执行GMM聚类
# 根据BIC/AIC结果选择的最佳聚类数
# best_n = gmm_evaluation["best_n_components"]

In [None]:
best_n = 2  # cluster count for the well-controlled area is fixed at 2 for now

# 1. Run GMM clustering on the PCA features and save the labelled table.
gmm_results = perform_gmm_clustering(
    features=pca_results["features_pca"],
    coords=pca_results["coords_clean"],
    n_clusters=best_n,
)
gmm_clusters_path = os.path.join(output_dir, "gmm_best_clusters.csv")
gmm_results["result_df"].to_csv(gmm_clusters_path, index=False)

# 2. Project the wells into PCA space for plotting. The wells' attribute
#    columns are selected in the order of the PCA input columns, then passed
#    through the same scaler and PCA objects stored in pca_results.
pca_feature_columns = pca_results["features_clean"].columns
well_features = well_attr_filtered[pca_feature_columns].values
well_features_scaled = pca_results["scaler"].transform(well_features)
well_pca_features = pca_results["pca"].transform(well_features_scaled)

# 3. Show the clustering result in PCA space.
visualize_pca_clustering(
    clustering_results=gmm_results,
    pca_results=pca_results,
    n_clusters=best_n,
    output_dir=output_dir,
    prefix="pca",
    well_data=data_well_purpose_surface_filtered,
    well_pca_features=well_pca_features,
    target_column="Sand Thickness",
    class_thresholds=[15, 20],
)

# 4. Show the clustering result in geographic space.
visualize_gmm_clustering(
    clustering_results=gmm_results,
    output_dir=output_dir,
    prefix="pca",
    well_data=data_well_purpose_surface_filtered,
    target_column="Sand Thickness",
    class_thresholds=[15, 20],
    point_size=10,
    well_size=40,
)