# PCA + 根据现有井点进行聚类解释

In [None]:
# 确保src目录在Python路径中
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
sys.path.append(os.path.abspath("../../"))

# 导入模块
from src.data_utils import (
    extract_seismic_attributes_for_wells,
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    filter_outlier_wells,
    filter_seismic_by_wells,
    identify_attributes,
    parse_petrel_file,
    preprocess_features,
)
from src.gmm_clustering import evaluate_gmm_clusters, perform_gmm_clustering
from src.pca_analysis import perform_pca_analysis
from src.sigmoid import SigmoidModel
from src.visualization import (
    visualize_attribute_map,
    visualize_feature_distribution,
    visualize_gmm_clustering,
    visualize_pca_clustering,
)

# Configure matplotlib so Chinese text in titles and labels renders correctly.
plt.rcParams.update(
    {
        "font.family": "SimHei",  # SimHei is a typeface with Chinese glyph coverage
        "axes.unicode_minus": False,  # keep minus signs displaying correctly
    }
)

In [None]:
# ==================== Horizon configuration ====================
# Change this value to process a different horizon (surface).
SURFACE_NAME = "H2-4"  # e.g. "H2-4", "H5-1", "H6-2"
# ================================================

## 导入数据

In [None]:
# Derive the working paths from the configured horizon name.
data_dir = "../../data"
data_tmp_dir = "data_tmp"
output_dir = f"{SURFACE_NAME.replace('-', '_')}_ps_output"  # e.g. "H2-4" -> "H2_4_ps_output"

# Create the working directories. exist_ok=True replaces the separate
# "exists?" check, so re-running the cell is a no-op and there is no
# check-then-create race.
for directory in (data_tmp_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

print(f"当前处理层位: {SURFACE_NAME}")
print(f"输出目录: {output_dir}")

In [None]:
# Load the seismic attribute file of the selected horizon.
data_seismic_url = os.path.join(data_dir, SURFACE_NAME)
print(f"地震数据路径: {data_seismic_url}")

data_seismic_attr = parse_petrel_file(data_seismic_url)

# Load the well locations.
well_table_path = os.path.join(data_dir, "well_without_attr.xlsx")
data_well_position = pd.read_excel(well_table_path)

# Keep only the wells on the selected horizon and turn the -999 sentinel
# into NaN, then drop wells without a sand thickness and renumber the rows.
on_target_surface = data_well_position["Surface"] == SURFACE_NAME
data_well_purpose_surface_position = data_well_position.loc[on_target_surface].replace(-999, np.nan)
data_well_purpose_surface_position = data_well_purpose_surface_position.dropna(
    subset=["Sand Thickness"]
).reset_index(drop=True)

print(f"层位 {SURFACE_NAME} 的井点数量: {len(data_well_purpose_surface_position)}")
data_well_purpose_surface_position.head()

## 数据预处理

In [None]:
# Names of the seismic attributes found in the input file.
attribute_names, _ = identify_attributes(data_seismic_url)

# Clean the attribute columns; -999 is declared as the missing-value marker.
processed_features, stats, report = preprocess_features(
    data=data_seismic_attr,
    attribute_columns=attribute_names,
    missing_values=[-999],
    missing_threshold=0.6,
    outlier_method="iqr",
    outlier_threshold=2.0,
    outlier_treatment="clip",  # clip outliers to the bounds
    verbose=True,
)

# Attributes that are still present after preprocessing.
attribute_names_processed = list(processed_features.columns)

# Re-attach the processed attributes to the original X/Y coordinates.
data_seismic_attr_processed = data_seismic_attr[["X", "Y"]].copy()  # type: ignore
for col in attribute_names_processed:
    data_seismic_attr_processed[col] = processed_features[col]

In [None]:
# Extract the processed seismic attributes at the well locations.
data_well_attr = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_position,
    seismic_data=data_seismic_attr_processed,
    max_distance=50,
    num_points=5,
)

# Persist the result as an Excel sheet in the temporary data directory.
wells_attr_path = os.path.join(data_tmp_dir, f"{SURFACE_NAME.replace('-', '_')}_wells_attr.xlsx")
data_well_attr.to_excel(wells_attr_path, index=False)
# The f-prefix is required: without it the literal text "{data_tmp_dir}"
# was printed instead of the directory name.
print(f"筛选后井点的地震属性已保存到 {data_tmp_dir}")

## PCA 降维，PC1 作为融合属性

In [None]:
# Run PCA on the processed seismic attributes. The returned mapping is read
# later in this notebook through the keys "features_pca", "scaler" and "pca".
# NOTE(review): variance_threshold presumably is the cumulative explained
# variance to retain — confirm in src.pca_analysis.
pca_results = perform_pca_analysis(
    data=data_seismic_attr_processed,
    attribute_columns=attribute_names_processed,
    variance_threshold=0.9,
    output_dir=output_dir,
)

In [None]:
# Map PC1 over the survey area and export the principal components.
print("\n可视化PC1分布...")

# Work on a copy so the processed attribute table itself is not modified.
data_seismic_attr_with_PC = data_seismic_attr_processed.copy()

# Attach the principal-component scores as new columns.
pc_scores = pca_results["features_pca"]
pc1_data = pc_scores[:, 0]  # first principal component
data_seismic_attr_with_PC["PC1"] = pc1_data

# PC2 is only added when PCA kept at least two components.
if pc_scores.shape[1] >= 2:
    pc2_data = pc_scores[:, 1]  # second principal component
    data_seismic_attr_with_PC["PC2"] = pc2_data

pc_columns = [col for col in data_seismic_attr_with_PC.columns if col.startswith("PC")]
print(f"添加主成分后数据形状: {data_seismic_attr_with_PC.shape}")
print(f"新增列: {pc_columns}")

# Spatial map of PC1 with the real wells overlaid.
visualize_attribute_map(
    data_points=data_seismic_attr_with_PC,
    attribute_name="PC1",  # column to plot
    attribute_label="第一主成分 (PC1)",  # name shown in the legend / colour bar
    real_wells=data_well_purpose_surface_position,  # real well data
    pseudo_wells=None,  # no pseudo wells yet
    target_column="Sand Thickness",  # target column of the wells
    output_dir=output_dir,
    filename_prefix="pc1",  # prefix of the output file
    class_thresholds=[1, 13.75],  # sand-thickness class thresholds
    figsize=(14, 12),
    dpi=300,
    cmap="viridis",
    point_size=10,  # size of the seismic data points
    well_size=50,  # size of the well markers
    # Fixed colour range. NOTE(review): the original comment said "use the
    # data's own range", which matches vrange=None as used for PC2 below,
    # not this hard-coded interval — confirm which one is intended.
    vrange=(-2, 2),
)

# ========== Export PC1 as a Petrel-format txt file ==========
print("\n导出PC1为Petrel格式...")

surface_tag = SURFACE_NAME.replace("-", "_")

# Only the coordinates and the component itself are exported.
petrel_pc1_data = data_seismic_attr_with_PC[["X", "Y", "PC1"]].copy()

pc1_filename = f"{surface_tag}_PC1.txt"
pc1_filepath = os.path.join(output_dir, pc1_filename)

# Space-delimited text, header row kept, no index, six decimals.
petrel_pc1_data.to_csv(pc1_filepath, sep=" ", index=False, header=True, float_format="%.6f")

pc1_column = data_seismic_attr_with_PC["PC1"]
print(f"PC1数据已导出: {pc1_filename}")
print(f"  文件路径: {pc1_filepath}")
print(f"  数据点数: {len(petrel_pc1_data):,}")
print(f"  PC1值范围: {pc1_column.min():.4f} 到 {pc1_column.max():.4f}")

# PC2 gets the same treatment when the column is present.
has_pc2 = "PC2" in data_seismic_attr_with_PC.columns
if has_pc2:
    print("\n可视化PC2分布...")

    visualize_attribute_map(
        data_points=data_seismic_attr_with_PC,
        attribute_name="PC2",
        attribute_label="第二主成分 (PC2)",
        real_wells=data_well_purpose_surface_position,
        pseudo_wells=None,
        target_column="Sand Thickness",
        output_dir=output_dir,
        filename_prefix="pc2",
        class_thresholds=[1, 13.75],
        figsize=(14, 12),
        dpi=300,
        cmap="plasma",  # different colour map to tell PC2 apart from PC1
        point_size=10,
        well_size=50,
        vrange=None,
    )

    # ========== Optional: export PC2 in the same Petrel format ==========
    print("\n导出PC2为Petrel格式...")

    petrel_pc2_data = data_seismic_attr_with_PC[["X", "Y", "PC2"]].copy()

    pc2_filename = f"{surface_tag}_PC2.txt"
    pc2_filepath = os.path.join(output_dir, pc2_filename)

    petrel_pc2_data.to_csv(pc2_filepath, sep=" ", index=False, header=True, float_format="%.6f")

    pc2_column = data_seismic_attr_with_PC["PC2"]
    print(f"PC2数据已导出: {pc2_filename}")
    print(f"  PC2值范围: {pc2_column.min():.4f} 到 {pc2_column.max():.4f}")

# Final summary of what was produced.
print("\n=== 数据整合和导出完成 ===")
print(f"输出目录: {output_dir}")
print("完整数据变量: data_seismic_attr_with_PC (包含原始属性+主成分)")
print(f"数据形状: {data_seismic_attr_with_PC.shape}")
print("导出的文件:")
print(f"  • {pc1_filename}")
if has_pc2:
    print(f"  • {pc2_filename}")

## 

## 在井控区进行 GMM 聚类

In [None]:
# Remove outlier wells.
data_well_attr_filtered = filter_outlier_wells(data_well_attr, method="iqr")

# Report how many wells remain.
print(f"筛选前井点数量: {len(data_well_attr)}")
print(f"筛选后井点数量: {len(data_well_attr_filtered)}")

# Side-by-side scatter plots of the wells before and after filtering.
plt.figure(figsize=(12, 6))

# Both panels share axis limits taken from the unfiltered wells, padded by
# 5% of the extent on every side.
margin = 0.05
x_min, x_max = data_well_attr["X"].min(), data_well_attr["X"].max()
y_min, y_max = data_well_attr["Y"].min(), data_well_attr["Y"].max()
x_range = x_max - x_min
y_range = y_max - y_min
x_min, x_max = x_min - x_range * margin, x_max + x_range * margin
y_min, y_max = y_min - y_range * margin, y_max + y_range * margin

# (wells, marker colour, title) for the left and right panel.
panels = (
    (data_well_attr, "blue", "筛选前井点分布"),
    (data_well_attr_filtered, "red", "筛选后井点分布"),
)
for panel_index, (wells, marker_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.scatter(wells["X"], wells["Y"], c=marker_color)
    plt.title(panel_title)
    plt.xlabel("X坐标")
    plt.ylabel("Y坐标")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "well_filtering_comparison.png"))
plt.show()

In [None]:
# Restrict the seismic data to the area around the retained wells.
data_seismic_attr_with_PC_filtered, area_bounds = filter_seismic_by_wells(
    seismic_data=data_seismic_attr_with_PC,
    well_data=data_well_attr_filtered,
    # Original note: "expand by 100%".
    # NOTE(review): confirm how filter_seismic_by_wells interprets this factor.
    expansion_factor=2,
    plot=True,
    output_dir=output_dir,
)

# area_bounds holds the boundary information and can be reused later on.
print("区域边界信息:")
for bound_name, bound_value in area_bounds.items():
    print(f"  {bound_name}: {bound_value}")

In [None]:
# Evaluate and run GMM clustering in the PC1/PC2 plane of the well-controlled area.
# NOTE(review): "PC2" only exists when PCA kept at least two components;
# selecting it here raises KeyError otherwise.
well_area_data = data_seismic_attr_with_PC_filtered

# Feature matrix used for clustering.
clustering_features = well_area_data[["PC1", "PC2"]].values

# Coordinates of the same seismic points.
data_seismic_attr_with_PC_filtered_coords = well_area_data[["X", "Y"]]

# Evaluate candidate cluster counts.
gmm_evaluation = evaluate_gmm_clusters(
    features_for_clustering=clustering_features,
    max_clusters=4,
    output_dir=output_dir,
)

# Take the recommended number of clusters.
best_n = gmm_evaluation["best_n_components"]
print(f"推荐使用 {best_n} 个聚类")

# Fit the GMM with that cluster count and save the labelled points.
gmm_results = perform_gmm_clustering(
    features=clustering_features,
    coords=data_seismic_attr_with_PC_filtered_coords,
    n_clusters=best_n,
)
best_clusters_path = os.path.join(output_dir, "gmm_best_clusters.csv")
gmm_results["result_df"].to_csv(best_clusters_path, index=False)

In [None]:
# 1. Project the wells into PCA space, reusing the scaler and PCA model
#    returned by the PCA step so the wells are transformed the same way.
fitted_scaler = pca_results["scaler"]
fitted_pca = pca_results["pca"]
data_well_attr_filtered_features = data_well_attr_filtered[attribute_names_processed].values
data_well_attr_filtered_features_scaled = fitted_scaler.transform(data_well_attr_filtered_features)
data_well_attr_filtered_pca_features = fitted_pca.transform(data_well_attr_filtered_features_scaled)

# 2. Show the clustering result in PCA space.
visualize_pca_clustering(
    features_pca=clustering_features,
    cluster_labels=gmm_results["cluster_labels"],
    n_clusters=best_n,
    output_dir=output_dir,
    well_data=data_well_attr_filtered,
    well_pca_features=data_well_attr_filtered_pca_features,
    target_column="Sand Thickness",
    class_thresholds=[1, 13.75],
)

# 3. Show the clustering result in geographic space.
visualize_gmm_clustering(
    clustering_results=gmm_results,
    output_dir=output_dir,
    prefix="pca",
    well_data=data_well_attr_filtered,
    target_column="Sand Thickness",
    class_thresholds=[1, 13.75],
    point_size=100,
    well_size=200,
)

## 结果分析

In [None]:
import numpy as np
from scipy.stats import pearsonr

# |r| at or above this value is reported as a correlation. It is defined once
# so the printed threshold, the verdict and the summary flag cannot drift
# apart (the summary previously used 0.35 while everything else used 0.3).
CORRELATION_THRESHOLD = 0.3

print("======== 井控区结果分析 ========")

# 1. Assign every well to a GMM cluster from its first two principal components.
# NOTE(review): this and the "PC2" column below need at least two PCA components.
well_cluster_labels = gmm_results["gmm"].predict(data_well_attr_filtered_pca_features[:, :2])

# 2. Build the table used for plotting and statistics.
well_analysis_data = data_well_attr_filtered.copy()
well_analysis_data["PC1"] = data_well_attr_filtered_pca_features[:, 0]  # first principal component
well_analysis_data["PC2"] = data_well_attr_filtered_pca_features[:, 1]  # second principal component
well_analysis_data["Cluster"] = well_cluster_labels  # cluster label

# Count the wells per cluster once; np.unique returns the labels sorted.
cluster_ids, cluster_counts = np.unique(well_cluster_labels, return_counts=True)

print(f"井点数据形状: {well_analysis_data.shape}")
print("井点聚类分布:")
for cluster, count in zip(cluster_ids, cluster_counts):
    print(f"  聚类 {cluster}: {count} 个井点")

# 3. PC1 vs. sand thickness, coloured by cluster.
print("\n可视化PC1 vs 砂厚（按聚类着色）...")

# Legend text for each cluster label.
cluster_labels_dict = {cluster: f"聚类 {cluster}" for cluster in cluster_ids}

fig1 = visualize_feature_distribution(
    data=well_analysis_data,
    x_feature="PC1",
    y_feature="Sand Thickness",
    color_feature="Cluster",  # colour by cluster
    figsize=(12, 8),
    point_size=100,
    alpha=0.7,
    colormap="tab10",  # discrete colour map
    title="井控区：PC1 vs 砂厚（按聚类分类）",
    save_path=os.path.join(output_dir, "well_pc1_vs_sand_thickness_by_cluster.png"),
    discrete_colors=True,
    color_labels=cluster_labels_dict,
)

# 4. Pearson correlation between PC1 and sand thickness.
pc1_values = well_analysis_data["PC1"].values
sand_thickness_values = well_analysis_data["Sand Thickness"].values

correlation_coef, p_value = pearsonr(pc1_values, sand_thickness_values)
abs_correlation = abs(correlation_coef)
# A NaN coefficient compares False here and falls into the "not significant" branch.
is_correlated = abs_correlation >= CORRELATION_THRESHOLD

print("\n======== 相关性分析结果 ========")
print(f"PC1 与砂厚的皮尔逊相关系数: {correlation_coef:.4f}")
print(f"p值: {p_value:.6f}")
print(f"相关性阈值: {CORRELATION_THRESHOLD}")

# Verdict: direction plus a strength band (>= 0.7 strong, >= 0.5 medium, else weak).
if is_correlated:
    correlation_strength = "正相关" if correlation_coef > 0 else "负相关"

    if abs_correlation >= 0.7:
        correlation_level = "强"
    elif abs_correlation >= 0.5:
        correlation_level = "中等"
    else:
        correlation_level = "弱"

    print(
        f"结论: PC1与砂厚存在{correlation_level}{correlation_strength} "
        f"(|r| = {abs_correlation:.4f} >= {CORRELATION_THRESHOLD})"
    )
else:
    print(f"结论: PC1与砂厚相关性不显著 (|r| = {abs_correlation:.4f} < {CORRELATION_THRESHOLD})")

# 5. Same scatter plot, coloured continuously by sand thickness.
print("\n可视化PC1 vs 砂厚（连续色彩）...")

fig2 = visualize_feature_distribution(
    data=well_analysis_data,
    x_feature="PC1",
    y_feature="Sand Thickness",
    color_feature="Sand Thickness",  # colour by sand thickness
    figsize=(12, 8),
    point_size=100,
    alpha=0.7,
    colormap="viridis",  # continuous colour map
    title="井控区：PC1 vs 砂厚（连续色彩）",
    save_path=os.path.join(output_dir, "well_pc1_vs_sand_thickness_continuous.png"),
    discrete_colors=False,
)

# 6. Per-cluster statistics.
print("\n======== 分聚类统计分析 ========")
for cluster in cluster_ids:
    cluster_mask = well_analysis_data["Cluster"] == cluster
    cluster_data = well_analysis_data[cluster_mask]

    pc1_mean = cluster_data["PC1"].mean()
    pc1_std = cluster_data["PC1"].std()
    sand_mean = cluster_data["Sand Thickness"].mean()
    sand_std = cluster_data["Sand Thickness"].std()

    print(f"聚类 {cluster} (n={len(cluster_data)}):")
    print(f"  PC1: {pc1_mean:.4f} ± {pc1_std:.4f}")
    print(f"  砂厚: {sand_mean:.4f} ± {sand_std:.4f}")

    # Correlation inside the cluster; only computed with at least three wells.
    if len(cluster_data) >= 3:
        cluster_corr, cluster_p = pearsonr(cluster_data["PC1"], cluster_data["Sand Thickness"])
        print(f"  聚类内相关性: {cluster_corr:.4f} (p={cluster_p:.4f})")
    else:
        print("  聚类内相关性: 样本数过少，无法计算")

# 7. Summary of the analysis. The flag uses the same threshold as the verdict above.
# NOTE(review): this dict is only kept in memory; it is never written to a
# file — persist it explicitly if a summary file is wanted.
analysis_summary = {
    "total_wells": len(well_analysis_data),
    "pc1_sand_correlation": correlation_coef,
    "correlation_p_value": p_value,
    "correlation_significant": is_correlated,
    "cluster_distribution": {int(cluster): int(count) for cluster, count in zip(cluster_ids, cluster_counts)},
}

# Save the per-well analysis table.
well_analysis_data.to_csv(os.path.join(output_dir, "well_analysis_with_clusters.csv"), index=False)

print("\n======== 分析完成 ========")
print("井点分析数据已保存到: well_analysis_with_clusters.csv")
print(f"可视化图片已保存到输出目录: {output_dir}")

# Preview of the saved table, limited to the columns that exist.
print("\n井点分析数据预览:")
display_cols = ["X", "Y", "Sand Thickness", "PC1", "PC2", "Cluster"]
available_cols = [col for col in display_cols if col in well_analysis_data.columns]
print(well_analysis_data[available_cols].head())