# VAE + K-Means


In [None]:
# 确保src目录在Python路径中
import os
import sys
from typing import List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.preprocessing import StandardScaler

sys.path.append(os.path.abspath("../"))

# 导入模块
from src.data_utils import (
    extract_seismic_attributes_for_wells,
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    filter_outlier_wells,
    filter_seismic_by_wells,
    identify_attributes,
    parse_petrel_file,
    preprocess_features,
)
from src.feature_selection import select_best_features
from src.gmm_clustering import evaluate_gmm_clusters, perform_gmm_clustering
from src.pca_analysis import perform_pca_analysis
from src.visualization import visualize_attribute_map, visualize_gmm_clustering, visualize_pca_clustering

# Input data lives one level above the notebook. The path is assembled with os.path.join
# so it resolves on every OS (a literal "..\\data" is only a valid relative path on Windows;
# on Windows this expression still yields exactly "..\\data").
data_dir = os.path.join("..", "data")
output_dir = "output"
# exist_ok=True makes the call a no-op when the folder is already there and avoids the
# check-then-create race of a separate os.path.exists test.
os.makedirs(output_dir, exist_ok=True)


# Use a font with Chinese glyphs so the Chinese plot labels render
plt.rcParams["font.family"] = "SimHei"  # SimHei supports Chinese characters
plt.rcParams["axes.unicode_minus"] = False  # draw the minus sign correctly

## 导入地震数据


In [None]:
# Parse the "H6-2_attr" file from the data directory with the project helper.
# Later cells read the "X" and "Y" columns of the result, so it is used as a table of points.
data_seismic_attr = parse_petrel_file(os.path.join(data_dir, "H6-2_attr"))

## 导入井点位置


In [None]:
data_well_position = pd.read_excel(os.path.join(data_dir, "well_without_attr.xlsx"))

# Keep the rows of the target horizon and drop the rows whose sand thickness is NaN
data_well_purpose_surface_position = (
    data_well_position[data_well_position["Surface"] == "H6-2"]
    .replace(-999, np.nan)  # replace the -999 placeholder with NaN
    .dropna(subset=["Sand Thickness"])  # drop rows whose sand thickness is NaN
    .reset_index(drop=True)  # reset the index
)
# Last expression of the cell: shows the first rows when run in a notebook
data_well_purpose_surface_position.head()

## 筛除离群井


In [None]:
# Remove outlier wells (the helper is asked for its "iqr" method)
data_well_purpose_surface_filtered = filter_outlier_wells(data_well_purpose_surface_position, method="iqr")

# Report the number of wells before and after filtering
print(f"筛选前井点数量: {len(data_well_purpose_surface_position)}")
print(f"筛选后井点数量: {len(data_well_purpose_surface_filtered)}")

# Side-by-side comparison of the well locations before and after filtering
plt.figure(figsize=(12, 6))

# Axis limits are taken from the unfiltered wells so both panels share one extent
all_x = data_well_purpose_surface_position["X"]
all_y = data_well_purpose_surface_position["Y"]
x_min, x_max = all_x.min(), all_x.max()
y_min, y_max = all_y.min(), all_y.max()

# Pad every side by 5% of the data range
margin = 0.05
x_range = x_max - x_min
y_range = y_max - y_min
x_min, x_max = x_min - x_range * margin, x_max + x_range * margin
y_min, y_max = y_min - y_range * margin, y_max + y_range * margin

# One panel per data set: (wells, marker colour, title)
panels = (
    (data_well_purpose_surface_position, "blue", "筛选前井点分布"),
    (data_well_purpose_surface_filtered, "red", "筛选后井点分布"),
)
for panel_index, (wells, colour, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.scatter(wells["X"], wells["Y"], c=colour)
    plt.title(panel_title)
    plt.xlabel("X坐标")
    plt.ylabel("Y坐标")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "well_filtering_comparison.png"))
plt.show()

## 处理属性缺失值


In [None]:
# Read the attribute names from the seismic attribute file
attribute_names, _ = identify_attributes(os.path.join(data_dir, "H6-2_attr"))

# Clean the seismic attributes with the project helper
processed_seismic, attr_stats = preprocess_features(
    data=data_seismic_attr,
    attribute_columns=attribute_names,
    missing_values=[-999],
    missing_threshold=0.6,  # columns with more than 60% missing values are dropped
    outlier_method="iqr",
    outlier_threshold=1.5,
    verbose=True,
)

# Names of the attributes that are present in the processed table
attribute_names_filtered = list(processed_seismic.columns)

# Re-attach the processed attributes to the original X/Y coordinates, one column at a time
processed_seismic_full = data_seismic_attr[["X", "Y"]].copy()
for col in attribute_names_filtered:
    processed_seismic_full[col] = processed_seismic[col]

## 根据井点分布，缩小工区范围


In [None]:
# Restrict the seismic data to the area around the (outlier-filtered) wells
seismic_attr_filtered, area_bounds = filter_seismic_by_wells(
    seismic_data=processed_seismic_full,
    well_data=data_well_purpose_surface_filtered,
    expansion_factor=1.5,  # per the original note: expand by 50% — confirm against the helper
    plot=True,
    output_dir=output_dir,
)

# area_bounds is iterated as a dict below; its entries can be reused by later cells
print("区域边界信息:")
for key, value in area_bounds.items():
    print(f"  {key}: {value}")

## 提取井点处地震属性


In [None]:
# Extract seismic attributes at the wells BEFORE outlier filtering
well_attr = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_position,
    seismic_data=processed_seismic_full,
    max_distance=50,
    num_points=5,
)

# Extract seismic attributes at the wells AFTER outlier filtering (same settings)
well_attr_filtered = extract_seismic_attributes_for_wells(
    well_data=data_well_purpose_surface_filtered, seismic_data=processed_seismic_full, max_distance=50, num_points=5
)

# Save both tables as Excel files; note they are written into the input data directory
well_attr.to_excel(os.path.join(data_dir, "wells_attr.xlsx"), index=False)
print("筛选前井点的地震属性已保存到 wells_attr.xlsx")
well_attr_filtered.to_excel(os.path.join(data_dir, "wells_attr_filtered.xlsx"), index=False)
print("筛选后井点的地震属性已保存到 wells_attr_filtered.xlsx")

## 生成统计摘要


In [None]:
# Keep only the attributes of good quality, comparing the area-restricted seismic data
# with the attributes extracted at the filtered wells
good_attributes, anomalous_attributes, attribute_stats = filter_anomalous_attributes(
    seismic_data=seismic_attr_filtered,
    well_data=well_attr_filtered,
    common_attributes=attribute_names_filtered,
    ratio_threshold=5.0,  # threshold on the ratio of means
    range_ratio_threshold=10.0,  # threshold on the ratio of value ranges
    std_ratio_threshold=10.0,  # threshold on the ratio of standard deviations
    output_dir=None,  # directory for output figures (none is passed here)
    verbose=True,  # print detailed information
)

# List the attributes that were retained
print("\n筛选后保留的质量良好属性:")
for attr in good_attributes:
    print(f"- {attr}")

## VAE


In [None]:
class VAE(nn.Module):
    """
    Variational autoencoder (VAE) for dimensionality reduction of seismic attribute vectors.

    Encoder and decoder are plain MLPs (Linear -> activation -> optional Dropout per hidden
    layer). The encoder ends in two linear heads producing the mean and the log-variance of
    the latent distribution; the decoder ends in a linear reconstruction layer.

    Args:
        input_dim (int): Number of input features (seismic attributes).
        latent_dim (int): Dimension of the latent variable. Defaults to 8.
        encoder_dims (List[int], optional): Hidden layer widths of the encoder,
            defaults to [64, 32, 16].
        decoder_dims (List[int], optional): Hidden layer widths of the decoder,
            defaults to [16, 32, 64].
        activation (str): Activation name (case-insensitive): 'relu', 'leaky_relu', 'tanh',
            'sigmoid', 'elu' or 'gelu'. Defaults to 'relu'.
        dropout_rate (float): Dropout probability after each hidden layer; no Dropout layer
            is inserted when it is 0 or less. Defaults to 0.1.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(
        self,
        input_dim: int,
        latent_dim: int = 8,
        encoder_dims: Optional[List[int]] = None,
        decoder_dims: Optional[List[int]] = None,
        activation: str = "relu",
        dropout_rate: float = 0.1,
    ):
        super().__init__()

        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.dropout_rate = dropout_rate

        # Fall back to the default layer widths when none are given
        self.encoder_dims = [64, 32, 16] if encoder_dims is None else encoder_dims
        self.decoder_dims = [16, 32, 64] if decoder_dims is None else decoder_dims

        # A single activation module is shared by every hidden layer
        self.activation = self._get_activation_function(activation)

        # NOTE: _build_encoder also creates self.fc_mu / self.fc_logvar
        self.encoder = self._build_encoder()
        self.decoder = self._build_decoder()

        self._initialize_weights()

    def _get_activation_function(self, activation: str) -> nn.Module:
        """Return a new activation module for ``activation`` (case-insensitive)."""
        # Factories instead of instances: only the requested activation is constructed
        factories = {
            "relu": nn.ReLU,
            "leaky_relu": lambda: nn.LeakyReLU(0.2),
            "tanh": nn.Tanh,
            "sigmoid": nn.Sigmoid,
            "elu": nn.ELU,
            "gelu": nn.GELU,
        }

        key = activation.lower()
        if key not in factories:
            raise ValueError(f"不支持的激活函数: {activation}. 支持的函数: {list(factories.keys())}")

        return factories[key]()

    def _build_hidden_stack(self, in_dim: int, hidden_dims: List[int]) -> Tuple[nn.ModuleList, int]:
        """Build Linear -> activation -> (Dropout) blocks; return the layers and the last width."""
        layers = nn.ModuleList()
        prev_dim = in_dim
        for dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(self.activation)
            if self.dropout_rate > 0:
                layers.append(nn.Dropout(self.dropout_rate))
            prev_dim = dim
        return layers, prev_dim

    def _build_encoder(self) -> nn.ModuleList:
        """Build the encoder hidden layers and, as a side effect, the mu / logvar heads."""
        layers, prev_dim = self._build_hidden_stack(self.input_dim, self.encoder_dims)

        # Output heads: mean and log-variance of the latent distribution
        self.fc_mu = nn.Linear(prev_dim, self.latent_dim)
        self.fc_logvar = nn.Linear(prev_dim, self.latent_dim)

        return layers

    def _build_decoder(self) -> nn.ModuleList:
        """Build the decoder: hidden layers followed by a linear reconstruction layer."""
        layers, prev_dim = self._build_hidden_stack(self.latent_dim, self.decoder_dims)

        # Output layer (reconstruction), no activation
        layers.append(nn.Linear(prev_dim, self.input_dim))

        return layers

    def _initialize_weights(self):
        """Xavier-uniform initialise every Linear weight and zero every Linear bias."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def encode(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Encode the input.

        Args:
            x (torch.Tensor): Input data [batch_size, input_dim]

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: (mu, logvar, z)
                - mu: mean vector [batch_size, latent_dim]
                - logvar: log-variance vector [batch_size, latent_dim]
                - z: latent variable [batch_size, latent_dim]; sampled in training mode,
                  equal to mu in eval mode (see ``reparameterize``)
        """
        h = x
        for layer in self.encoder:
            h = layer(h)

        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)

        z = self.reparameterize(mu, logvar)

        return mu, logvar, z

    def reparameterize(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
        """
        Reparameterisation trick: draw the latent variable from N(mu, exp(logvar)).

        Args:
            mu (torch.Tensor): mean vector [batch_size, latent_dim]
            logvar (torch.Tensor): log-variance vector [batch_size, latent_dim]

        Returns:
            torch.Tensor: latent variable z [batch_size, latent_dim]
        """
        if not self.training:
            # Inference: return the mean directly, without sampling
            return mu

        # Training: z = mu + eps * std with eps ~ N(0, I)
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z: torch.Tensor) -> torch.Tensor:
        """
        Decode a latent variable.

        Args:
            z (torch.Tensor): latent variable [batch_size, latent_dim]

        Returns:
            torch.Tensor: reconstructed data [batch_size, input_dim]
        """
        h = z
        for layer in self.decoder:
            h = layer(h)

        return h

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Forward pass: encode, then decode the latent variable.

        Args:
            x (torch.Tensor): Input data [batch_size, input_dim]

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: (x_hat, mu, logvar)
                - x_hat: reconstruction [batch_size, input_dim]
                - mu: mean vector [batch_size, latent_dim]
                - logvar: log-variance vector [batch_size, latent_dim]
        """
        mu, logvar, z = self.encode(x)
        x_hat = self.decode(z)

        return x_hat, mu, logvar

    def get_latent_representation(self, x: torch.Tensor) -> torch.Tensor:
        """
        Return the latent representation of ``x`` (used for clustering).

        The encoding runs in eval mode without gradients, so the result is the latent mean.
        The module's previous train/eval mode is restored afterwards, so calling this in the
        middle of training does not silently switch off dropout and sampling.

        Args:
            x (torch.Tensor): Input data [batch_size, input_dim]

        Returns:
            torch.Tensor: latent variable [batch_size, latent_dim]
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                _, _, z = self.encode(x)
        finally:
            self.train(was_training)
        return z

    def generate(self, num_samples: int, device: Optional[torch.device] = None) -> torch.Tensor:
        """
        Generate new samples by decoding draws from a standard normal latent prior.

        The decoder runs in eval mode without gradients; the module's previous train/eval
        mode is restored afterwards.

        Args:
            num_samples (int): Number of samples to generate
            device (torch.device, optional): Device for the latent draws; defaults to the
                device of the model parameters

        Returns:
            torch.Tensor: generated samples [num_samples, input_dim]
        """
        if device is None:
            device = next(self.parameters()).device

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                z = torch.randn(num_samples, self.latent_dim, device=device)
                generated = self.decode(z)
        finally:
            self.train(was_training)

        return generated

    def get_model_info(self) -> dict:
        """
        Return a summary of the model structure.

        Returns:
            dict: input/latent dimensions, encoder/decoder layer widths, total and trainable
                parameter counts, and the dropout rate
        """
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)

        return {
            "input_dim": self.input_dim,
            "latent_dim": self.latent_dim,
            "encoder_dims": self.encoder_dims,
            "decoder_dims": self.decoder_dims,
            "total_parameters": total_params,
            "trainable_parameters": trainable_params,
            "dropout_rate": self.dropout_rate,
        }


def vae_loss_function(
    x_hat: torch.Tensor, x: torch.Tensor, mu: torch.Tensor, logvar: torch.Tensor, beta: float = 1.0
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Compute the VAE loss: reconstruction error plus a weighted KL term.

    Both terms are averaged over all elements of their tensors (mean MSE over the
    reconstruction, mean of the per-element KL expression over the latent tensor).

    Args:
        x_hat (torch.Tensor): Reconstructed data
        x (torch.Tensor): Original data
        mu (torch.Tensor): Latent mean vector
        logvar (torch.Tensor): Latent log-variance vector
        beta (float): Weight of the KL term (beta-VAE)

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: (total_loss, recon_loss, kl_loss)
    """
    # Reconstruction term: mean squared error
    reconstruction = F.mse_loss(x_hat, x, reduction="mean")

    # KL term against a standard normal: -0.5 * mean(1 + logvar - mu^2 - exp(logvar))
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    divergence = -0.5 * torch.mean(kl_elements)

    return reconstruction + beta * divergence, reconstruction, divergence


def train_vae(model, data, epochs=100, batch_size=512, lr=1e-3, beta=1.0, verbose=True):
    """
    Train a VAE with the Adam optimizer.

    The model is switched to training mode and left in it. Every epoch iterates over
    shuffled mini-batches of ``data`` and minimises ``vae_loss_function``.

    Args:
        model: VAE model; calling it must return (x_hat, mu, logvar)
        data: Training tensor, one sample per row; batches are fed to the model as-is,
            so it must already live on the model's device
        epochs: Number of training epochs
        batch_size: Mini-batch size
        lr: Learning rate
        beta: Weight of the KL term
        verbose: Print the averaged losses every 10 epochs

    Returns:
        dict with keys "total", "recon" and "kl", each a list holding one per-sample
        average loss per epoch

    Raises:
        ValueError: If ``data`` contains no samples.
    """
    num_samples = len(data)
    if num_samples == 0:
        # Without this guard the run ends in a ZeroDivisionError (or an opaque sampler error)
        raise ValueError("train_vae requires at least one training sample, got empty data")

    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()

    # Wrap the tensor so it can be shuffled and batched
    dataset = torch.utils.data.TensorDataset(data)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    loss_history = {"total": [], "recon": [], "kl": []}

    for epoch in range(epochs):
        running = {"total": 0.0, "recon": 0.0, "kl": 0.0}

        for (batch_data,) in dataloader:
            optimizer.zero_grad()

            # Forward pass
            x_hat, mu, logvar = model(batch_data)

            # Loss
            total_loss, recon_loss, kl_loss = vae_loss_function(x_hat, batch_data, mu, logvar, beta)

            # Backward pass and parameter update
            total_loss.backward()
            optimizer.step()

            # Batch losses are means; weight them by batch size so that a smaller final
            # batch does not count as much as a full one in the epoch average.
            batch_size_actual = batch_data.size(0)
            running["total"] += total_loss.item() * batch_size_actual
            running["recon"] += recon_loss.item() * batch_size_actual
            running["kl"] += kl_loss.item() * batch_size_actual

        # Per-sample averages for this epoch
        avg_total_loss = running["total"] / num_samples
        avg_recon_loss = running["recon"] / num_samples
        avg_kl_loss = running["kl"] / num_samples

        loss_history["total"].append(avg_total_loss)
        loss_history["recon"].append(avg_recon_loss)
        loss_history["kl"].append(avg_kl_loss)

        if verbose and (epoch + 1) % 10 == 0:
            print(
                f"Epoch [{epoch + 1}/{epochs}] - Total Loss: {avg_total_loss:.4f}, "
                f"Recon Loss: {avg_recon_loss:.4f}, KL Loss: {avg_kl_loss:.4f}"
            )

    return loss_history


In [None]:
# Pick the compute device: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"使用设备: {device}")

# Select the retained attributes from the processed seismic table
seismic_attrs = processed_seismic_full[good_attributes].copy()
print(f"使用的属性数量: {len(good_attributes)}")
print(f"地震样本数量: {len(seismic_attrs)}")

# Standardise the attributes, then move them to the device as a float32 tensor
scaler = StandardScaler()
seismic_attrs_scaled = scaler.fit_transform(seismic_attrs)
seismic_attrs_tensor = torch.tensor(seismic_attrs_scaled, dtype=torch.float32).to(device)

print(f"数据形状: {seismic_attrs_tensor.shape}")

In [None]:
# Model dimensions
input_dim = len(good_attributes)
latent_dim = 8  # latent dimension, adjustable

# Choose the encoder widths from the input dimension
if input_dim <= 16:
    encoder_dims = [32, 16]
elif input_dim <= 32:
    encoder_dims = [64, 32, 16]
else:
    encoder_dims = [128, 64, 32, 16]
# The decoder mirrors the encoder
decoder_dims = encoder_dims[::-1]

vae_model = VAE(
    input_dim=input_dim,
    latent_dim=latent_dim,
    encoder_dims=encoder_dims,
    decoder_dims=decoder_dims,
    activation="relu",
    dropout_rate=0.1,
)
vae_model = vae_model.to(device)

# Print the model summary
model_info = vae_model.get_model_info()
print("VAE模型信息:")
for key, value in model_info.items():
    print(f"  {key}: {value}")

In [None]:
# Train the VAE on the standardised attribute tensor; loss_history keeps the per-epoch
# "total", "recon" and "kl" losses that the next cell plots
print("开始训练VAE模型...")
loss_history = train_vae(
    model=vae_model, data=seismic_attrs_tensor, epochs=100, batch_size=512, lr=1e-3, beta=1.0, verbose=True
)
print("VAE训练完成!")

In [None]:
# Plot the three recorded loss curves side by side
plt.figure(figsize=(15, 5))

# (key in loss_history, panel title)
loss_panels = (("total", "总损失"), ("recon", "重构损失"), ("kl", "KL散度损失"))
for panel_index, (history_key, panel_title) in enumerate(loss_panels, start=1):
    plt.subplot(1, 3, panel_index)
    plt.plot(loss_history[history_key])
    plt.title(panel_title)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "vae_training_loss.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Get the latent representation of every seismic sample
print("提取隐变量表示...")
# get_latent_representation itself switches to eval mode and disables gradients, so the
# explicit eval()/no_grad() here are a harmless extra guard
vae_model.eval()
with torch.no_grad():
    latent_representations = vae_model.get_latent_representation(seismic_attrs_tensor)
    latent_numpy = latent_representations.cpu().numpy()

print(f"隐变量形状: {latent_numpy.shape}")

# Visualise the latent distribution (first two dimensions)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(latent_numpy[:, 0], latent_numpy[:, 1], alpha=0.6, s=1)
plt.xlabel("隐变量维度 1")
plt.ylabel("隐变量维度 2")
plt.title("隐变量空间分布")
plt.grid(True)

# Histogram of the individual latent dimensions
plt.subplot(1, 2, 2)
for i in range(min(latent_dim, 4)):  # show at most the first 4 dimensions
    plt.hist(latent_numpy[:, i], bins=50, alpha=0.7, label=f"维度 {i + 1}")
plt.xlabel("值")
plt.ylabel("频率")
plt.title("隐变量各维度分布")
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "vae_latent_distribution.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# K-means聚类函数
def perform_kmeans_clustering(data, max_clusters=10, random_state=42, silhouette_sample_size=None):
    """
    Run K-means for k = 2..max_clusters and pick the k with the highest silhouette score.

    Args:
        data: Data to cluster, one sample per row
        max_clusters: Largest number of clusters to try (must be at least 2)
        random_state: Random seed for K-means (and for silhouette sub-sampling)
        silhouette_sample_size: Optional number of samples used to estimate the silhouette
            score. None (default) uses every sample, which needs all pairwise distances
            and becomes very slow on large data sets.

    Returns:
        Tuple (results, inertias, silhouette_scores, best_n_clusters):
            - results: dict keyed by cluster count with "model", "labels", "inertia"
              and "silhouette_score"
            - inertias: list of inertias, in order of increasing cluster count
            - silhouette_scores: list of silhouette scores, same order
            - best_n_clusters: cluster count with the highest silhouette score

    Raises:
        ValueError: If ``max_clusters`` is smaller than 2.
    """
    if max_clusters < 2:
        # Otherwise the loop is empty and np.argmax fails with an unrelated message
        raise ValueError(f"max_clusters must be at least 2, got {max_clusters}")

    inertias = []
    silhouette_scores = []
    cluster_range = range(2, max_clusters + 1)

    results = {}

    print("评估不同聚类数的效果...")
    for n_clusters in cluster_range:
        # Fit K-means for this cluster count
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
        cluster_labels = kmeans.fit_predict(data)

        # Evaluation metrics
        inertia = kmeans.inertia_
        silhouette_avg = silhouette_score(
            data, cluster_labels, sample_size=silhouette_sample_size, random_state=random_state
        )

        inertias.append(inertia)
        silhouette_scores.append(silhouette_avg)

        results[n_clusters] = {
            "model": kmeans,
            "labels": cluster_labels,
            "inertia": inertia,
            "silhouette_score": silhouette_avg,
        }

        print(f"聚类数 {n_clusters}: 惯性={inertia:.2f}, 轮廓系数={silhouette_avg:.3f}")

    # Best cluster count according to the silhouette score
    best_n_clusters = cluster_range[int(np.argmax(silhouette_scores))]
    print(f"\n基于轮廓系数的最优聚类数: {best_n_clusters}")

    return results, inertias, silhouette_scores, best_n_clusters


In [None]:
# Run K-means on the latent representation (cluster counts 2..3)
clustering_results, inertias, silhouette_scores, best_n_clusters = perform_kmeans_clustering(
    data=latent_numpy, max_clusters=3, random_state=42
)

# Visualise the clustering evaluation metrics
plt.figure(figsize=(15, 5))

# Elbow plot: inertia against cluster count
plt.subplot(1, 3, 1)
# Cluster counts start at 2, so the x values are 2 .. len(inertias) + 1
cluster_range = range(2, len(inertias) + 2)
plt.plot(cluster_range, inertias, "bo-")
plt.xlabel("聚类数")
plt.ylabel("惯性 (Within-cluster Sum of Squares)")
plt.title("肘部法则")
plt.grid(True)

# Silhouette score against cluster count, with the selected count marked
plt.subplot(1, 3, 2)
plt.plot(cluster_range, silhouette_scores, "ro-")
plt.xlabel("聚类数")
plt.ylabel("轮廓系数")
plt.title("轮廓系数评估")
plt.axvline(x=best_n_clusters, color="g", linestyle="--", label=f"最优聚类数: {best_n_clusters}")
plt.legend()
plt.grid(True)

# Cluster assignment for the best cluster count, shown in the first two latent dimensions
plt.subplot(1, 3, 3)
best_labels = clustering_results[best_n_clusters]["labels"]
scatter = plt.scatter(latent_numpy[:, 0], latent_numpy[:, 1], c=best_labels, cmap="tab10", alpha=0.6, s=1)
plt.xlabel("隐变量维度 1")
plt.ylabel("隐变量维度 2")
plt.title(f"K-means聚类结果 (k={best_n_clusters})")
plt.colorbar(scatter)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "kmeans_evaluation.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Pull the final labels and score out of the result for the selected cluster count
best_result = clustering_results[best_n_clusters]
final_cluster_labels = best_result["labels"]
final_silhouette = best_result["silhouette_score"]

print("最终聚类配置:")
print(f"  聚类数: {best_n_clusters}")
print(f"  轮廓系数: {final_silhouette:.4f}")
print(f"  样本总数: {len(final_cluster_labels)}")

# Count the samples in every cluster and report their share of the total
unique_labels, counts = np.unique(final_cluster_labels, return_counts=True)
print("\n各聚类样本数量:")
for label, count in zip(unique_labels, counts):
    share = count / len(final_cluster_labels) * 100
    print(f"  聚类 {label}: {count} 个样本 ({share:.1f}%)")

# Attach the cluster label to a copy of the seismic table
seismic_clustered = processed_seismic_full.copy()
seismic_clustered["Cluster"] = final_cluster_labels

# Save the labelled table as CSV
clusters_csv_path = os.path.join(output_dir, "seismic_vae_kmeans_clusters.csv")
seismic_clustered.to_csv(clusters_csv_path, index=False)
print(f"\n聚类结果已保存到: {clusters_csv_path}")

In [None]:
# Map the clustering result in space: one overview panel plus one small panel per cluster
plt.figure(figsize=(15, 10))

# Overview of all clusters; (1, 4) spans slots 1 and 4, i.e. the left column of the 2x3 grid
plt.subplot(2, 3, (1, 4))
scatter = plt.scatter(
    seismic_clustered["X"], seismic_clustered["Y"], c=seismic_clustered["Cluster"], cmap="tab10", s=10, alpha=0.7
)
plt.colorbar(scatter, label="聚类标签")
plt.xlabel("X坐标")
plt.ylabel("Y坐标")
plt.title(f"VAE+K-means聚类空间分布 (k={best_n_clusters})")

# Individual clusters go into the four slots that the overview does not occupy.
# Numbering them i + 2 (2, 3, 4, 5) would put the third cluster into slot 4, on top of
# the overview panel, and leave slot 6 unused.
detail_slots = (2, 3, 5, 6)
for slot, cluster_id in zip(detail_slots, unique_labels):  # at most the first 4 clusters
    plt.subplot(2, 3, slot)
    cluster_data = seismic_clustered[seismic_clustered["Cluster"] == cluster_id]
    plt.scatter(cluster_data["X"], cluster_data["Y"], s=1, alpha=0.7)
    plt.title(f"聚类 {cluster_id} ({len(cluster_data)} 样本)")
    plt.xlabel("X坐标")
    plt.ylabel("Y坐标")

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "vae_kmeans_spatial_clusters.png"), dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# 分析各聚类的属性特征
def analyze_cluster_characteristics(data, cluster_labels, attributes, n_clusters):
    """
    Print and return per-cluster summary statistics of the given attributes.

    Args:
        data: Table indexed by attribute name and filterable with a boolean mask
        cluster_labels: Cluster label of every row of ``data``
        attributes: Attribute (column) names to summarise
        n_clusters: Number of clusters; labels 0 .. n_clusters - 1 are analysed

    Returns:
        dict mapping cluster id -> attribute name -> {"mean", "std", "min", "max"}
    """

    def _summarise(values):
        # Four summary statistics of one attribute within one cluster
        return {
            "mean": values.mean(),
            "std": values.std(),
            "min": values.min(),
            "max": values.max(),
        }

    print("各聚类属性特征分析:")
    print("=" * 60)

    per_cluster = {}
    for label in range(n_clusters):
        in_cluster = cluster_labels == label
        members = data[in_cluster]

        print(f"\n聚类 {label} (样本数: {np.sum(in_cluster)}):")
        print("-" * 40)

        # Summarise every attribute and report its mean and standard deviation
        summary = {}
        for name in attributes:
            entry = _summarise(members[name])
            summary[name] = entry
            print(f"  {name}: 均值={entry['mean']:.3f}, 标准差={entry['std']:.3f}")

        per_cluster[label] = summary

    return per_cluster


# 创建聚类特征热力图
def plot_cluster_heatmap(cluster_stats, attributes, n_clusters):
    """
    Draw and save a heatmap of the per-cluster mean of every attribute.

    Each attribute is standardised across the clusters (z-score per column), so a cell
    shows how a cluster's mean compares with the other clusters for that attribute,
    independent of the attribute's unit or magnitude.

    Args:
        cluster_stats: dict cluster id -> attribute name -> stats dict containing "mean"
        attributes: Attribute names, used as heatmap columns
        n_clusters: Number of clusters; ids 0 .. n_clusters - 1 become the heatmap rows

    The figure is written to ``cluster_characteristics_heatmap.png`` in ``output_dir``
    and then shown.
    """
    from sklearn.preprocessing import StandardScaler

    # Rows are clusters, columns are attributes
    heatmap_data = np.zeros((n_clusters, len(attributes)))
    for cluster_id in range(n_clusters):
        for j, attr in enumerate(attributes):
            heatmap_data[cluster_id, j] = cluster_stats[cluster_id][attr]["mean"]

    # StandardScaler standardises column by column. The columns are the attributes, so the
    # matrix is passed as-is; transposing it first would standardise each cluster across
    # attributes of different scales instead of each attribute across clusters.
    scaler = StandardScaler()
    heatmap_data_normalized = scaler.fit_transform(heatmap_data)

    plt.figure(figsize=(15, 8))
    sns.heatmap(
        heatmap_data_normalized,
        annot=True,
        fmt=".2f",
        xticklabels=attributes,
        yticklabels=[f"聚类 {i}" for i in range(n_clusters)],
        cmap="RdYlBu_r",
        center=0,
    )
    plt.title("各聚类属性特征热力图 (标准化后)")
    plt.xlabel("地震属性")
    plt.ylabel("聚类")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "cluster_characteristics_heatmap.png"), dpi=300, bbox_inches="tight")
    plt.show()


In [None]:
# Summarise the retained attributes within every final cluster
cluster_characteristics = analyze_cluster_characteristics(
    data=seismic_clustered, cluster_labels=final_cluster_labels, attributes=good_attributes, n_clusters=best_n_clusters
)

# Draw the heatmap of the per-cluster attribute means
plot_cluster_heatmap(cluster_characteristics, good_attributes, best_n_clusters)

In [None]:
# Save the VAE model and the clustering model
print("保存模型和结果...")

# Save the VAE weights together with what is needed to rebuild and reuse the model:
# its constructor arguments, the scaler parameters, the attribute list and the chosen k
torch.save(
    {
        "model_state_dict": vae_model.state_dict(),
        "model_config": {
            "input_dim": input_dim,
            "latent_dim": latent_dim,
            "encoder_dims": encoder_dims,
            "decoder_dims": decoder_dims,
        },
        "scaler_params": {
            "mean": scaler.mean_.tolist(),  # converted to a Python list
            "scale": scaler.scale_.tolist(),  # converted to a Python list
        },
        "good_attributes": good_attributes,
        "best_n_clusters": best_n_clusters,
    },
    os.path.join(output_dir, "vae_model.pth"),
)

# Save the fitted K-means model of the selected cluster count
import joblib

joblib.dump(clustering_results[best_n_clusters]["model"], os.path.join(output_dir, "kmeans_model.pkl"))


# 定义类型转换函数
def convert_numpy_types(obj):
    """
    Recursively convert NumPy values into native Python types (e.g. for JSON output).

    Handled: NumPy booleans, integers and floats (-> bool / int / float), arrays
    (-> nested lists), and dicts, lists and tuples, which are rebuilt with converted
    contents. Dict keys are converted as well, because ``json.dump`` rejects NumPy scalar
    keys. Any other object is returned unchanged.

    Args:
        obj: Arbitrary (possibly nested) object

    Returns:
        The same structure with NumPy values replaced by Python built-ins
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {convert_numpy_types(key): convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        # Stay a tuple so callers that rely on the container type are unaffected
        return tuple(convert_numpy_types(item) for item in obj)
    return obj


# Write a JSON summary of the run and list every artefact that was saved
import json

summary_path = os.path.join(output_dir, "vae_kmeans_results_summary.json")

# Build the summary with explicit Python ints/floats, then convert any NumPy leftovers
results_summary = convert_numpy_types(
    {
        "vae_config": model_info,
        "best_n_clusters": int(best_n_clusters),
        "silhouette_score": float(final_silhouette),
        "cluster_sizes": {
            int(label): int(size) for label, size in zip(unique_labels.tolist(), counts.tolist())
        },
        "good_attributes": good_attributes,
        "total_samples": int(len(final_cluster_labels)),
    }
)

with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(results_summary, f, ensure_ascii=False, indent=2)

print("保存完成!")
print(f"模型文件: {os.path.join(output_dir, 'vae_model.pth')}")
print(f"聚类模型: {os.path.join(output_dir, 'kmeans_model.pkl')}")
print(f"结果摘要: {summary_path}")
print(f"聚类数据: {os.path.join(output_dir, 'seismic_vae_kmeans_clusters.csv')}")