# PCA + 根据现有井点进行聚类解释

In [None]:
# 确保src目录在Python路径中
import os
import sys
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

warnings.filterwarnings("ignore")
sys.path.append(os.path.abspath("../../"))

# 导入模块
from src.data_utils import (
    extract_seismic_attributes_for_wells,
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    filter_outlier_wells,
    filter_seismic_by_wells,
    identify_attributes,
    parse_petrel_file,
    preprocess_features,
)
from src.gmm_clustering import evaluate_gmm_clusters, perform_gmm_clustering
from src.pca_analysis import perform_pca_analysis
from src.sigmoid import SigmoidModel
from src.visualization import (
    visualize_attribute_map,
    visualize_feature_distribution,
    visualize_gmm_clustering,
    visualize_pca_clustering,
)

# 设置中文字体
plt.rcParams["font.family"] = "SimHei"  # 黑体 SimHei 支持中文
plt.rcParams["axes.unicode_minus"] = False  # 正常显示负号

In [None]:
# ==================== Horizon configuration ====================
# Change this value to process a different horizon (surface).
SURFACE_NAME = "H6-2"  # e.g. "H2-4", "H5-1", "H6-2"
# ================================================

## 导入数据

In [None]:
# Derive the working paths from the configured horizon name.
data_dir = "../../data"
data_tmp_dir = "data_tmp"
output_dir = f"{SURFACE_NAME.replace('-', '_')}_ps_output"  # e.g. "H2-4" -> "H2_4_ps_output"

# Create the scratch and output directories. exist_ok=True replaces the
# check-then-create pattern, so re-running the cell is a no-op and there is
# no window between the existence check and the creation.
for _directory in (data_tmp_dir, output_dir):
    os.makedirs(_directory, exist_ok=True)

print(f"当前处理层位: {SURFACE_NAME}")
print(f"输出目录: {output_dir}")

In [None]:
# Load the seismic attribute data for the configured horizon
data_seismic_url = os.path.join(data_dir, SURFACE_NAME)
print(f"地震数据路径: {data_seismic_url}")

data_seismic_attr = parse_petrel_file(data_seismic_url)

# Load the well locations
data_well_position = pd.read_excel(os.path.join(data_dir, "well_without_attr.xlsx"))

# Keep only the rows of the configured horizon and drop wells without a sand thickness
data_well_purpose_surface_position = (
    data_well_position[data_well_position["Surface"] == SURFACE_NAME]
    .replace(-999, np.nan)  # treat -999 as missing (applies to every column, not only "Sand Thickness")
    .dropna(subset=["Sand Thickness"])  # drop rows whose sand thickness is NaN
    .reset_index(drop=True)  # renumber the remaining rows from 0
)

print(f"层位 {SURFACE_NAME} 的井点数量: {len(data_well_purpose_surface_position)}")
data_well_purpose_surface_position.head()

## 数据预处理

In [None]:
# First obtain the list of seismic attribute names from the Petrel file
attribute_names, _ = identify_attributes(data_seismic_url)

# Clean the attribute columns: -999 is the missing-value sentinel and
# IQR-flagged outliers are clipped rather than dropped.
# NOTE(review): the exact meaning of missing_threshold / outlier_threshold is
# defined in src.data_utils.preprocess_features — confirm there.
processed_features, stats, report = preprocess_features(
    data=data_seismic_attr,
    attribute_columns=attribute_names,
    missing_values=[-999],
    missing_threshold=0.6,
    outlier_method="iqr",
    outlier_threshold=2.0,
    outlier_treatment="clip",  # clip outliers to the boundary
    verbose=True,
)

# Names of the attributes that survived preprocessing
attribute_names_processed = list(processed_features.columns)

# Re-attach the processed attributes to the original X/Y coordinates
data_seismic_attr_processed = data_seismic_attr[["X", "Y"]].copy()  # type: ignore
for col in processed_features.columns:
    data_seismic_attr_processed[col] = processed_features[col]

In [None]:
# Extract the seismic attributes at the retained well locations
# NOTE(review): max_distance / num_points presumably bound the neighbourhood of
# seismic samples used per well — confirm in src.data_utils.
data_well_attr = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_position,
    seismic_data=data_seismic_attr_processed,
    max_distance=50,
    num_points=5,
)

# Persist the per-well attributes
data_well_attr.to_excel(os.path.join(data_tmp_dir, f"{SURFACE_NAME.replace('-', '_')}_wells_attr.xlsx"), index=False)
# The message needs the f-prefix; without it "{data_tmp_dir}" is printed literally.
print(f"筛选后井点的地震属性已保存到 {data_tmp_dir}")

## PCA 降维，PC1 作为融合属性

In [None]:
# Run PCA on the preprocessed seismic attributes. Other cells of this notebook
# read the keys "features_pca", "explained_variance_ratio",
# "explained_variance_ratio_cumsum", "scaler", "pca", "component_contributions"
# and "n_components" from the returned mapping.
# NOTE(review): variance_threshold=0.8 presumably selects the number of
# components by cumulative explained variance — confirm in src.pca_analysis.
pca_results = perform_pca_analysis(
    data=data_seismic_attr_processed,
    attribute_columns=attribute_names_processed,
    variance_threshold=0.8,
    output_dir=output_dir,
)

In [None]:
# Map every principal component back onto the survey area
print("\n可视化主成分分布...")

# Work on a copy so the processed attribute table itself is left untouched
data_seismic_attr_with_PC = data_seismic_attr_processed.copy()

# Append one "PC<k>" column per column of the PCA score matrix
pc_scores = pca_results["features_pca"]
n_components = pc_scores.shape[1]
for number in range(1, n_components + 1):
    data_seismic_attr_with_PC[f"PC{number}"] = pc_scores[:, number - 1]

print(f"添加主成分后数据形状: {data_seismic_attr_with_PC.shape}")
print(f"新增列: {[name for name in data_seismic_attr_with_PC.columns if name.startswith('PC')]}")
print(f"总共添加了 {n_components} 个主成分")

# One colour map per component (reused cyclically) so the maps are easy to tell apart
colormaps = ["viridis", "plasma", "inferno", "magma", "cividis", "turbo", "coolwarm", "seismic", "RdYlBu", "Spectral"]

# Plot and summarise each principal component in turn
for number in range(1, n_components + 1):
    pc_name = f"PC{number}"
    print(f"\n可视化 {pc_name} 分布...")

    pc_min = data_seismic_attr_with_PC[pc_name].min()
    pc_max = data_seismic_attr_with_PC[pc_name].max()

    # Limit the colour scale to the 5th-95th percentile so extreme values do not
    # wash out the map; a constant component keeps the helper's default range.
    if pc_max - pc_min > 0:
        vrange = (
            data_seismic_attr_with_PC[pc_name].quantile(0.05),
            data_seismic_attr_with_PC[pc_name].quantile(0.95),
        )
    else:
        vrange = None

    visualize_attribute_map(
        data_points=data_seismic_attr_with_PC,
        attribute_name=pc_name,  # column to draw
        attribute_label=f"第{number}主成分 ({pc_name})",  # label shown in legend / colour bar
        real_wells=data_well_purpose_surface_position,  # real well locations
        pseudo_wells=None,  # no pseudo wells at this stage
        target_column="Sand Thickness",  # well target column
        output_dir=output_dir,
        filename_prefix=f"pc{number}",  # output file prefix
        class_thresholds=[1, 13.75],  # sand-thickness class boundaries
        figsize=(14, 12),
        dpi=300,
        cmap=colormaps[(number - 1) % len(colormaps)],
        point_size=10,  # seismic sample marker size
        well_size=50,  # well marker size
        vrange=vrange,
    )

    # Basic statistics of this component
    print(f"  {pc_name} 值范围: {pc_min:.4f} 到 {pc_max:.4f}")
    print(f"  {pc_name} 均值: {data_seismic_attr_with_PC[pc_name].mean():.4f}")
    print(f"  {pc_name} 标准差: {data_seismic_attr_with_PC[pc_name].std():.4f}")

    # Individual and cumulative explained-variance ratios
    explained_variance = pca_results["explained_variance_ratio"][number - 1]
    cumulative_variance = pca_results["explained_variance_ratio_cumsum"][number - 1]
    print(f"  {pc_name} 解释方差比: {explained_variance:.4f} ({explained_variance * 100:.2f}%)")
    print(f"  累积解释方差比: {cumulative_variance:.4f} ({cumulative_variance * 100:.2f}%)")

print("\n=== 主成分可视化完成 ===")
print(f"输出目录: {output_dir}")
print("完整数据变量: data_seismic_attr_with_PC (包含原始属性+所有主成分)")
print(f"数据形状: {data_seismic_attr_with_PC.shape}")
print(f"主成分列: {[name for name in data_seismic_attr_with_PC.columns if name.startswith('PC')]}")

# Contribution summary for all components
print("\n=== 主成分贡献度摘要 ===")
for number in range(1, n_components + 1):
    pc_name = f"PC{number}"
    explained_var = pca_results["explained_variance_ratio"][number - 1]
    cumulative_var = pca_results["explained_variance_ratio_cumsum"][number - 1]
    print(
        f"{pc_name}: 解释方差 {explained_var:.3f} ({explained_var * 100:.1f}%), 累积 {cumulative_var:.3f} ({cumulative_var * 100:.1f}%)"
    )

## 在井控区进行 GMM 聚类

In [None]:
# Remove outlier wells
data_well_attr_filtered = filter_outlier_wells(data_well_attr, method="iqr")

# Report how many wells were kept
print(f"筛选前井点数量: {len(data_well_attr)}")
print(f"筛选后井点数量: {len(data_well_attr_filtered)}")

# Axis limits come from ALL wells so both panels share the same extent
x_min, x_max = data_well_attr["X"].min(), data_well_attr["X"].max()
y_min, y_max = data_well_attr["Y"].min(), data_well_attr["Y"].max()

# Pad the extent by 5 % on every side
margin = 0.05
x_range = x_max - x_min
y_range = y_max - y_min
x_min, x_max = x_min - x_range * margin, x_max + x_range * margin
y_min, y_max = y_min - y_range * margin, y_max + y_range * margin

# Left panel: wells before filtering; right panel: wells after filtering
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (data_well_attr, "blue", "筛选前井点分布"),
    (data_well_attr_filtered, "red", "筛选后井点分布"),
)
for ax, (wells, colour, panel_title) in zip(axes, panels):
    ax.scatter(wells["X"], wells["Y"], c=colour)
    ax.set_title(panel_title)
    ax.set_xlabel("X坐标")
    ax.set_ylabel("Y坐标")
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

fig.tight_layout()
fig.savefig(os.path.join(output_dir, "well_filtering_comparison.png"))
plt.show()

In [None]:
# Restrict the seismic data to the area around the retained wells
data_seismic_attr_with_PC_filtered, area_bounds = filter_seismic_by_wells(
    seismic_data=data_seismic_attr_with_PC,
    well_data=data_well_attr_filtered,
    expansion_factor=2,  # original note: "expand by 100%" — TODO confirm how the helper interprets this factor
    plot=True,
    output_dir=output_dir,
)

# area_bounds carries the boundary information and can be reused downstream
print("区域边界信息:")
for key, value in area_bounds.items():
    print(f"  {key}: {value}")

In [None]:
# By default only PC1 and PC2 are used for the clustering evaluation

# Feature matrix used for clustering (PC1 and PC2 columns only)
clustering_features = data_seismic_attr_with_PC_filtered[["PC1", "PC2"]].values

# X/Y coordinates of the seismic samples inside the well-controlled area
data_seismic_attr_with_PC_filtered_coords = data_seismic_attr_with_PC_filtered[["X", "Y"]]

# Evaluate candidate cluster counts
gmm_evaluation = evaluate_gmm_clusters(
    features_for_clustering=clustering_features, max_clusters=4, output_dir=output_dir
)

# Take the cluster count recommended by the evaluation
best_n = gmm_evaluation["best_n_components"]
print(f"推荐使用 {best_n} 个聚类")

# Fit the GMM with the recommended cluster count and save the labelled samples
gmm_results = perform_gmm_clustering(
    features=clustering_features,
    coords=data_seismic_attr_with_PC_filtered_coords,
    n_clusters=best_n,
)
gmm_results["result_df"].to_csv(os.path.join(output_dir, "gmm_best_clusters.csv"), index=False)

In [None]:
# 1. For the PCA-space plot the well data must be projected into PCA space
# Start from the attribute columns of the retained wells
data_well_attr_filtered_features = data_well_attr_filtered[attribute_names_processed].values
# Apply the same scaler and PCA model that were fitted on the seismic data
data_well_attr_filtered_features_scaled = pca_results["scaler"].transform(data_well_attr_filtered_features)
data_well_attr_filtered_pca_features = pca_results["pca"].transform(data_well_attr_filtered_features_scaled)

# 2. Visualise the clustering result in PCA space
# NOTE(review): features_pca holds two columns (PC1, PC2) while well_pca_features
# presumably holds every component returned by the PCA transform — confirm that
# visualize_pca_clustering only relies on the first two.
visualize_pca_clustering(
    features_pca=clustering_features,
    cluster_labels=gmm_results["cluster_labels"],
    n_clusters=best_n,
    output_dir=output_dir,
    well_data=data_well_attr_filtered,
    well_pca_features=data_well_attr_filtered_pca_features,
    target_column="Sand Thickness",
    class_thresholds=[1, 13.75],
)

# 3. Visualise the clustering result in map (geographic) space
visualize_gmm_clustering(
    clustering_results=gmm_results,
    output_dir=output_dir,
    prefix="pca",
    well_data=data_well_attr_filtered,
    target_column="Sand Thickness",
    class_thresholds=[1, 13.75],
    point_size=100,
    well_size=200,
)

## 结果分析与解释

In [None]:
# Interpretation of the PCA loadings
print("======== PCA载荷解释分析 ========")

# Loading categories, in the order the wording tuples below are written
_CATEGORY_KEYS = ("high_positive", "near_zero", "high_negative")

# Attributes that share one set of wording are grouped together; each base
# attribute and its "-dq" variant read the same way.
_FAMILY_WORDING = (
    (("Maximum amplitude", "Maximum amplitude-dq"), ("强波峰", "与波峰强度无关", "弱波峰")),
    (("Minimum amplitude", "Minimum amplitude-dq"), ("弱波谷", "与波谷强度无关", "强波谷")),
    (
        ("RMS amplitude", "RMS amplitude-dq", "Sum of energy", "Sum of energy-dq"),
        ("强能量", "与能量无关", "弱能量"),
    ),
    (("Harmonic mean-ge",), ("强遗传反演数值", "与遗传反演数值无关", "弱遗传反演数值")),
)

# attribute name -> {loading category -> geological wording}; every attribute
# gets its own inner dict.
attribute_interpretations = {
    attribute: dict(zip(_CATEGORY_KEYS, wording))
    for attributes, wording in _FAMILY_WORDING
    for attribute in attributes
}


def get_loading_category(loading_value, threshold=0.3):
    """Classify a PCA loading by sign and magnitude.

    Returns "high_positive" when the loading is strictly above ``threshold``,
    "high_negative" when it is strictly below ``-threshold`` and "near_zero"
    otherwise (values exactly on either bound count as "near_zero").
    """
    if loading_value > threshold:
        return "high_positive"
    return "high_negative" if loading_value < -threshold else "near_zero"


def resolve_conflicts(loadings_dict, attr_interpretations):
    """Collapse redundant attributes and interpret the loading of each survivor.

    Resolution runs in three passes:

    1. Energy attributes (RMS amplitude / Sum of energy and their "-dq"
       variants): when more than one is present, only the one with the largest
       absolute loading is kept.
    2. Post-stack / pre-stack ("-dq") pairs not already handled: the member
       with the larger absolute loading is kept (the post-stack one on a tie).
    3. Every remaining attribute is interpreted as-is.

    Attributes missing from ``attr_interpretations`` are never reported.

    Args:
        loadings_dict: Mapping of attribute name to its loading.
        attr_interpretations: Mapping of attribute name to a mapping of
            loading category to wording.

    Returns:
        dict: attribute name -> {"loading", "interpretation",
        "conflict_resolved"}; "conflict_resolved" is a note string for
        attributes chosen in pass 1 or 2 and None otherwise.
    """
    resolved = {}
    consumed = set()

    def keep(attr, loading, note):
        """Record the wording matching the loading category of ``attr``."""
        wording = attr_interpretations[attr][get_loading_category(loading)]
        resolved[attr] = {
            "loading": loading,
            "interpretation": wording,
            "conflict_resolved": note,
        }

    # Pass 1: energy attributes take priority over the pairwise rule below
    energy_family = ("RMS amplitude", "RMS amplitude-dq", "Sum of energy", "Sum of energy-dq")
    present_energy = [attr for attr in energy_family if attr in loadings_dict]
    if len(present_energy) > 1:
        winner = max(present_energy, key=lambda attr: abs(loadings_dict[attr]))
        if winner in attr_interpretations:
            winner_loading = loadings_dict[winner]
            keep(
                winner,
                winner_loading,
                f"从{len(present_energy)}个能量属性中选择载荷最大的 (|{winner_loading:.3f}|)",
            )
            # The whole family counts as handled, not just the winner
            consumed.update(energy_family)

    # Pass 2: (post-stack, pre-stack) pairs
    stack_pairs = (
        ("Maximum amplitude", "Maximum amplitude-dq"),
        ("Minimum amplitude", "Minimum amplitude-dq"),
        ("RMS amplitude", "RMS amplitude-dq"),
        ("Sum of energy", "Sum of energy-dq"),
    )
    for post_attr, pre_attr in stack_pairs:
        if post_attr not in loadings_dict or pre_attr not in loadings_dict:
            continue
        if post_attr in consumed or pre_attr in consumed:
            continue

        if abs(loadings_dict[post_attr]) >= abs(loadings_dict[pre_attr]):
            winner = post_attr
        else:
            winner = pre_attr

        if winner in attr_interpretations:
            winner_loading = loadings_dict[winner]
            keep(winner, winner_loading, f"选择载荷更大的属性 (|{winner_loading:.3f}|)")
            consumed.update((post_attr, pre_attr))

    # Pass 3: everything that was not part of a resolved conflict
    for attr, loading in loadings_dict.items():
        if attr in consumed or attr not in attr_interpretations:
            continue
        keep(attr, loading, None)

    return resolved


# Loadings of the principal components on the input attributes
component_contributions = pca_results["component_contributions"]
available_attributes = list(component_contributions.columns)

print(f"可用的地震属性: {available_attributes}")

# Print an interpretation table for PC1 and PC2
table_rule = "-" * 90
for pc_name in ["PC1", "PC2"]:
    if pc_name not in component_contributions.index:
        continue

    print(f"\n======== {pc_name} 载荷解释 ========")

    # Loadings of this component, keyed by attribute name
    pc_loadings = component_contributions.loc[pc_name].to_dict()

    # Ignore tiny loadings (|loading| <= 0.1) to keep the table readable
    significant_loadings = {}
    for attr, loading in pc_loadings.items():
        if abs(loading) > 0.1:
            significant_loadings[attr] = loading

    if not significant_loadings:
        print(f"{pc_name} 没有显著的载荷 (>0.1)")
        continue

    # Collapse redundant attributes and attach the wording
    resolved_interpretations = resolve_conflicts(significant_loadings, attribute_interpretations)

    if not resolved_interpretations:
        print(f"{pc_name} 没有可解释的地震属性")
        continue

    # Strongest loading first
    sorted_interpretations = sorted(
        resolved_interpretations.items(), key=lambda item: abs(item[1]["loading"]), reverse=True
    )

    # Table output
    print(f"\n{pc_name} 载荷解释表:")
    print(table_rule)
    print(f"{'地震属性':<25} {'载荷值':<10} {'地质解释':<15} {'备注':<20}")
    print(table_rule)

    for attr, info in sorted_interpretations:
        loading = info["loading"]
        interpretation = info["interpretation"]
        conflict_note = info["conflict_resolved"] or ""

        print(f"{attr:<25}  {loading:>8.3f}          {interpretation:<15} {conflict_note:<20}")

    print(table_rule)

    # Overall reading of the component
    all_interpretations = [info["interpretation"] for _, info in sorted_interpretations]

    print(f"\n{pc_name} 总体地质意义:")
    print(f"主要反映: {', '.join(all_interpretations)}")

    # Explained-variance ratio of this component
    pc_index = int(pc_name.replace("PC", "")) - 1
    if pc_index < len(pca_results["explained_variance_ratio"]):
        variance_explained = pca_results["explained_variance_ratio"][pc_index]
        print(f"解释方差比: {variance_explained:.3f} ({variance_explained * 100:.1f}%)")


print("\n======== PCA载荷解释分析完成 ========")

In [None]:
print("======== 井控区结果分析 ========")

# 1. Assign every well to a GMM cluster using its first two principal components
well_pc12 = data_well_attr_filtered_pca_features[:, :2]
well_cluster_labels = gmm_results["gmm"].predict(well_pc12)

# 2. Assemble the per-well analysis table (attributes + PC1/PC2 + cluster label)
well_analysis_data = data_well_attr_filtered.copy()
for position, column in enumerate(("PC1", "PC2")):
    well_analysis_data[column] = data_well_attr_filtered_pca_features[:, position]
well_analysis_data["Cluster"] = well_cluster_labels

print(f"井点数据形状: {well_analysis_data.shape}")
print("井点聚类分布:")
# np.unique returns the cluster ids in ascending order together with their counts
cluster_ids_found, cluster_sizes = np.unique(well_cluster_labels, return_counts=True)
cluster_stats = dict(zip(cluster_ids_found, cluster_sizes))
for cluster, count in cluster_stats.items():
    print(f"  聚类 {cluster}: {count} 个井点")


# 3. 定义相关性分析函数
def analyze_correlation(x_data, y_data, x_name, y_name, threshold=0.35):
    """Compute the Pearson correlation of two samples and label its strength.

    "Significant" here means ``|r| >= threshold``; the p-value is returned but
    does not take part in that decision. Significant correlations are labelled
    strong (|r| >= 0.7), medium (|r| >= 0.5) or weak, followed by the sign.

    Args:
        x_data: First sample.
        y_data: Second sample, same length as ``x_data``.
        x_name: Display name of the first variable.
        y_name: Display name of the second variable.
        threshold: Minimum |r| for the correlation to count as significant.

    Returns:
        dict with keys "correlation", "p_value", "is_significant", "result"
        and "description".
    """
    r_value, p_value = pearsonr(x_data, y_data)
    magnitude = abs(r_value)
    is_significant = magnitude >= threshold

    if not is_significant:
        verdict = "无显著相关性"
    else:
        if magnitude >= 0.7:
            strength = "强"
        elif magnitude >= 0.5:
            strength = "中等"
        else:
            strength = "弱"
        direction = "正相关" if r_value > 0 else "负相关"
        verdict = f"{strength}{direction}"

    summary = {
        "correlation": r_value,
        "p_value": p_value,
        "is_significant": is_significant,
        "result": verdict,
    }
    summary["description"] = f"{x_name}与{y_name}: r={r_value:.4f}, {verdict}"
    return summary


# 4. Legend label for every cluster id found among the wells
cluster_ids = sorted(np.unique(well_cluster_labels))
cluster_labels_dict = {cluster: f"聚类 {cluster}" for cluster in cluster_ids}

# 5. Correlation results, keyed "<PC>_overall" and "<PC>_cluster_<id>"
correlation_results = {}
overall_figures = {}

# Clusters with fewer wells than this are reported but neither plotted nor correlated
MIN_CLUSTER_SAMPLES = 5

# PC1 and PC2 go through exactly the same steps, so they share one loop.
for pc_label in ("PC1", "PC2"):
    pc_tag = pc_label.lower()
    print(f"\n======== {pc_label} 相关性分析 ========")

    # 5.1 / 5.3  PC vs sand thickness for all wells, coloured by cluster
    print(f"\n可视化{pc_label} vs 砂厚（按聚类着色）...")
    overall_figures[pc_label] = visualize_feature_distribution(
        data=well_analysis_data,
        x_feature=pc_label,
        y_feature="Sand Thickness",
        color_feature="Cluster",
        figsize=(12, 8),
        point_size=100,
        alpha=0.7,
        colormap="tab10",
        title=f"井控区：{pc_label} vs 砂厚（按聚类分类）",
        save_path=os.path.join(output_dir, f"well_{pc_tag}_vs_sand_thickness_by_cluster.png"),
        discrete_colors=True,
        color_labels=cluster_labels_dict,
    )

    # Correlation over all wells
    overall_result = analyze_correlation(
        well_analysis_data[pc_label], well_analysis_data["Sand Thickness"], pc_label, "砂厚"
    )
    correlation_results[f"{pc_label}_overall"] = overall_result
    print(f"{pc_label}整体相关性: {overall_result['description']}")

    # 5.2 / 5.4  The same analysis within each cluster
    print(f"\n分聚类{pc_label}相关性分析:")
    for cluster in cluster_ids:
        cluster_data = well_analysis_data[well_analysis_data["Cluster"] == cluster]
        print(f"\n--- 聚类 {cluster} (n={len(cluster_data)}) ---")

        if len(cluster_data) < MIN_CLUSTER_SAMPLES:
            print(f"  样本数过少(<{MIN_CLUSTER_SAMPLES})，跳过可视化和相关性分析")
            continue

        # Scatter for this cluster only, coloured by sand thickness
        fig_cluster = visualize_feature_distribution(
            data=cluster_data,
            x_feature=pc_label,
            y_feature="Sand Thickness",
            color_feature="Sand Thickness",
            figsize=(10, 6),
            point_size=100,
            alpha=0.8,
            colormap="viridis",
            title=f"聚类 {cluster}：{pc_label} vs 砂厚",
            save_path=os.path.join(output_dir, f"well_{pc_tag}_vs_sand_thickness_cluster_{cluster}.png"),
            discrete_colors=False,
        )

        cluster_result = analyze_correlation(
            cluster_data[pc_label], cluster_data["Sand Thickness"], f"{pc_label}(聚类{cluster})", "砂厚"
        )
        correlation_results[f"{pc_label}_cluster_{cluster}"] = cluster_result
        print(f"  {cluster_result['description']}")

# Keep the names the original cell exposed to the rest of the notebook
fig1, fig2 = overall_figures["PC1"], overall_figures["PC2"]
pc1_overall = correlation_results["PC1_overall"]
pc2_overall = correlation_results["PC2_overall"]

# 6. Summary structure consumed by the export cell
interpretation_summary = {
    "correlation_analysis": {
        "PC1_overall_correlation": pc1_overall["correlation"],
        "PC1_overall_significant": pc1_overall["is_significant"],
        "PC1_overall_result": pc1_overall["result"],
        "PC2_overall_correlation": pc2_overall["correlation"],
        "PC2_overall_significant": pc2_overall["is_significant"],
        "PC2_overall_result": pc2_overall["result"],
    },
    "cluster_analysis": {
        "total_wells": len(well_analysis_data),
        "n_clusters": len(cluster_stats),
        "cluster_distribution": cluster_stats,
        # Only the correlations flagged as significant, in insertion order
        "significant_correlations": {
            key: {"correlation": result["correlation"], "result": result["result"]}
            for key, result in correlation_results.items()
            if result["is_significant"]
        },
    },
    "detailed_results": correlation_results,
}

# 7. Mean and standard deviation of PC1, PC2 and sand thickness per cluster
print("\n======== 分聚类统计分析 ========")
for cluster in cluster_ids:
    cluster_data = well_analysis_data[well_analysis_data["Cluster"] == cluster]

    print(f"聚类 {cluster} (n={len(cluster_data)}):")
    for column, label in (("PC1", "PC1"), ("PC2", "PC2"), ("Sand Thickness", "砂厚")):
        print(f"  {label}: {cluster_data[column].mean():.4f} ± {cluster_data[column].std():.4f}")

# 8. Save the per-well analysis table
well_analysis_path = os.path.join(output_dir, "well_analysis_with_clusters.csv")
well_analysis_data.to_csv(well_analysis_path, index=False)

print("\n======== 相关性分析摘要 ========")
print(f"PC1整体相关性: {pc1_overall['result']} (r={pc1_overall['correlation']:.4f})")
print(f"PC2整体相关性: {pc2_overall['result']} (r={pc2_overall['correlation']:.4f})")

significant_correlations = interpretation_summary["cluster_analysis"]["significant_correlations"]
significant_count = len(significant_correlations)
print(f"显著相关性数量: {significant_count}")

if significant_count > 0:
    print("显著相关性详情:")
    for key, result in significant_correlations.items():
        print(f"  {key}: {result['result']} (r={result['correlation']:.4f})")

print("\n======== 分析完成 ========")
# Report the full path: the original message named only the file, not its directory.
print(f"井点分析数据已保存到: {well_analysis_path}")
print(f"可视化图片已保存到输出目录: {output_dir}")

## 输出主成分数据与解释到 Petrel 文件

In [None]:
print("======== 输出主成分数据到Petrel文件 ========")


# 1. 准备PCA载荷解释数据
def generate_pca_interpretation_text(
    pc_name, component_contributions, attribute_interpretations, explained_variance_ratio=None
):
    """Build the multi-line loading interpretation text of one principal component.

    Args:
        pc_name: Component label such as "PC1"; must be a row label of
            ``component_contributions`` for an interpretation to be produced.
        component_contributions: Table of loadings with one row per component
            and one column per attribute.
        attribute_interpretations: Mapping of attribute name to a mapping of
            loading category to wording.
        explained_variance_ratio: Optional sequence of explained-variance
            ratios indexed by component. When omitted, the notebook-level
            ``pca_results["explained_variance_ratio"]`` is used, which is what
            the function always read before this parameter existed.

    Returns:
        str: The interpretation text, or a one-line message when the component
        is missing, has no loading with |value| > 0.1, or none of its
        attributes can be interpreted.
    """
    if pc_name not in component_contributions.index:
        return f"{pc_name} 数据不可用"

    # Loadings of this component, keyed by attribute name
    pc_loadings = component_contributions.loc[pc_name].to_dict()

    # Drop tiny loadings (|loading| <= 0.1)
    significant_loadings = {attr: loading for attr, loading in pc_loadings.items() if abs(loading) > 0.1}

    if not significant_loadings:
        return f"{pc_name} 没有显著的载荷 (>0.1)"

    # Collapse redundant attributes and attach the wording
    resolved_interpretations = resolve_conflicts(significant_loadings, attribute_interpretations)

    if not resolved_interpretations:
        return f"{pc_name} 没有可解释的地震属性"

    # Strongest loading first
    sorted_interpretations = sorted(resolved_interpretations.items(), key=lambda x: abs(x[1]["loading"]), reverse=True)

    interpretation_text = [f"{pc_name} 载荷解释:"]
    for attr, info in sorted_interpretations:
        line = f"  {attr}: {info['loading']:.3f} -> {info['interpretation']}"
        if info["conflict_resolved"]:
            line += f" ({info['conflict_resolved']})"
        interpretation_text.append(line)

    # Overall reading of the component
    all_interpretations = [info["interpretation"] for _, info in sorted_interpretations]
    interpretation_text.append(f"  总体地质意义: {', '.join(all_interpretations)}")

    # Explained-variance ratio of this component
    pc_index = int(pc_name.replace("PC", "")) - 1
    if explained_variance_ratio is None:
        # Backward-compatible fallback to the notebook-level PCA results
        explained_variance_ratio = pca_results["explained_variance_ratio"]
    # The lower bound keeps a label such as "PC0" from wrapping around to the last component
    if 0 <= pc_index < len(explained_variance_ratio):
        variance_explained = explained_variance_ratio[pc_index]
        interpretation_text.append(f"  解释方差比: {variance_explained:.3f} ({variance_explained * 100:.1f}%)")

    return "\n".join(interpretation_text)


# 2. Summarise the well-control correlation analysis produced earlier in the notebook
correlation_summary = ""
try:
    corr_info = interpretation_summary["correlation_analysis"]
    cluster_info = interpretation_summary["cluster_analysis"]
    significant_info = cluster_info["significant_correlations"]

    # The leading empty entry reproduces the blank first line of the summary text
    summary_lines = [
        "",
        "井控区相关性分析结果:",
        f"  PC1整体相关性: {corr_info['PC1_overall_result']} (r={corr_info['PC1_overall_correlation']:.4f})",
        f"  PC2整体相关性: {corr_info['PC2_overall_result']} (r={corr_info['PC2_overall_correlation']:.4f})",
        f"  总井点数: {cluster_info['total_wells']}",
        f"  聚类数: {cluster_info['n_clusters']}",
        f"  显著相关性数量: {len(significant_info)}",
    ]
    if significant_info:
        summary_lines.append("  显著相关性详情:")
        summary_lines.extend(
            f"    {key}: {result['result']} (r={result['correlation']:.4f})"
            for key, result in significant_info.items()
        )
    correlation_summary = "\n".join(summary_lines)

except Exception as e:
    # Deliberately best-effort: the export still runs when the analysis cell
    # was skipped (NameError) or produced an unexpected structure.
    correlation_summary = f"井控区相关性分析结果读取失败: {str(e)}"

# 3. Write one tab-separated text file per component (PC1, PC2)
surface_tag = SURFACE_NAME.replace("-", "_")
written_pc_files = {}  # pc name -> file name, only for files actually written

for pc_name in ["PC1", "PC2"]:
    if pc_name not in data_seismic_attr_with_PC.columns:
        print(f"警告: {pc_name} 数据不存在，跳过输出")
        continue

    filename = f"{surface_tag}_{pc_name}.txt"
    filepath = os.path.join(output_dir, filename)

    # Loading interpretation embedded in the file header
    pca_interpretation = generate_pca_interpretation_text(
        pc_name, pca_results["component_contributions"], attribute_interpretations
    )

    # Coordinates plus the component values
    output_data = data_seismic_attr_with_PC[["X", "Y", pc_name]].copy()
    pc_values = output_data[pc_name]

    # Every header line is a "#" comment; only non-blank summary lines are included
    header_lines = [
        "# -*- coding: gbk -*-",
        "# Petrel 地震属性数据文件",
        f"# 文件名: {filename}",
        f"# 创建时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"# 层位: {SURFACE_NAME}",
        f"# 属性: {pc_name} (第{pc_name[2:]}主成分)",
        f"# 数据点数: {len(output_data)}",
        "# 坐标系统: 原始地震数据坐标系",
        "#",
        "# ==================== PCA分析结果 ====================",
        *(f"# {line}" for line in pca_interpretation.split("\n")),
        "#",
        "# ==================== 数据统计信息 ====================",
        f"# {pc_name} 最小值: {pc_values.min():.6f}",
        f"# {pc_name} 最大值: {pc_values.max():.6f}",
        f"# {pc_name} 均值: {pc_values.mean():.6f}",
        f"# {pc_name} 标准差: {pc_values.std():.6f}",
        "#",
        "# ==================== 井控区分析结果 ====================",
        *(f"# {line}" for line in correlation_summary.split("\n") if line.strip()),
        "#",
        "# ==================== 数据格式说明 ====================",
        "# 列1: X坐标",
        "# 列2: Y坐标",
        f"# 列3: {pc_name}值",
        f"# 数据行数: {len(output_data)}",
        "# 缺失值标识: 无",
        "#",
        "# ==================== 数据开始 ====================",
        # Column titles
        "X\tY\t" + pc_name,
    ]

    # NOTE: the data files are written as GBK while the report below is UTF-8,
    # exactly as before.
    with open(filepath, "w", encoding="gbk") as f:
        f.write("\n".join(header_lines) + "\n")
        # itertuples avoids building a Series per row (as iterrows does), which
        # matters when there are many seismic samples; the row format is unchanged.
        f.writelines(
            f"{x:.6f}\t{y:.6f}\t{value:.6f}\n"
            for x, y, value in output_data.itertuples(index=False, name=None)
        )

    written_pc_files[pc_name] = filename
    print(f"✓ {pc_name} 数据已输出到: {filepath}")
    print(f"  - 数据点数: {len(output_data)}")
    print(f"  - {pc_name} 值范围: [{pc_values.min():.4f}, {pc_values.max():.4f}]")

# 4. Write the combined analysis report
report_filename = f"{surface_tag}_PCA_analysis_report.txt"
report_filepath = os.path.join(output_dir, report_filename)
section_rule = "=" * 50

with open(report_filepath, "w", encoding="utf-8") as f:
    f.write("PCA分析完整报告\n")
    f.write(section_rule + "\n")
    f.write(f"层位: {SURFACE_NAME}\n")
    f.write(f"分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"数据点总数: {len(data_seismic_attr_with_PC)}\n")
    f.write(f"输出主成分数: {pca_results['n_components']}\n\n")

    # Contribution summary of every component
    f.write("主成分贡献度摘要:\n")
    for i in range(pca_results["n_components"]):
        pc_name = f"PC{i + 1}"
        explained_var = pca_results["explained_variance_ratio"][i]
        cumulative_var = pca_results["explained_variance_ratio_cumsum"][i]
        f.write(
            f"  {pc_name}: 解释方差 {explained_var:.3f} ({explained_var * 100:.1f}%), 累积 {cumulative_var:.3f} ({cumulative_var * 100:.1f}%)\n"
        )

    f.write("\n" + section_rule + "\n")

    # Detailed interpretation of PC1 and PC2
    for pc_name in ["PC1", "PC2"]:
        if pc_name in pca_results["component_contributions"].index:
            f.write(f"\n{pc_name} 详细解释:\n")
            f.write("-" * 30 + "\n")
            pca_interpretation = generate_pca_interpretation_text(
                pc_name, pca_results["component_contributions"], attribute_interpretations
            )
            f.write(pca_interpretation + "\n")

    # Well-control analysis summary
    f.write("\n" + section_rule + "\n")
    f.write("井控区分析结果:\n")
    f.write(correlation_summary + "\n")

print(f"✓ 完整分析报告已输出到: {report_filepath}")

print("\n======== 主成分数据输出完成 ========")
print("输出文件:")
# List only the component files that were really written; skipped ones are not claimed.
for pc_name, filename in written_pc_files.items():
    print(f"  - {pc_name}数据: {filename}")
print(f"  - 分析报告: {report_filename}")
print(f"输出目录: {output_dir}")