In [2]:
import json
# Read the phase-1 clustering result. The context manager closes the file
# handle deterministically (the previous bare open(...).read() left it to the
# garbage collector), and the explicit UTF-8 encoding keeps the read
# independent of the platform's default locale encoding.
with open("p1_cluster_data.json", "r", encoding="utf-8") as p1_file:
    json_str = p1_file.read()
p1_cluster_data = json.loads(json_str)

In [3]:
# Collect the "cluster_center" vector of every record in p1_cluster_data,
# preserving the input order.
cluster_centers = [entry["cluster_center"] for entry in p1_cluster_data]

In [45]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def dynamic_clustering_centers_sequential(
    cluster_centers, similarity_threshold=0.3, max_group_size=30
):
    """
    Greedily merge consecutive cluster centers into larger, order-preserving groups.

    The centers are scanned once, in order. Each center is compared (cosine
    similarity) against the running mean of the group currently being built.
    It joins that group when the similarity reaches ``similarity_threshold``
    and the group still has fewer than ``max_group_size`` members; otherwise
    the current group is closed and the center starts a new group.

    Args:
        cluster_centers (list): Sequence of center vectors (each a sequence of
            numbers or a 1-D array). All vectors must have the same length.
        similarity_threshold (float): Minimum cosine similarity to the running
            group mean required for a center to join the current group.
        max_group_size (int): Maximum number of centers allowed in one group.

    Returns:
        tuple:
            cluster_centers_p2 (list): One ``numpy.ndarray`` per group, the mean
                of the group's member vectors. These arrays are always fresh
                copies, so mutating them never affects the input.
            clusters_index (list): For each group, the list of indices into
                ``cluster_centers`` that belong to it (consecutive, ascending).

        Both lists are empty when ``cluster_centers`` is empty.
    """
    cluster_centers_p2 = []  # mean vector of each finished group
    clusters_index = []      # member indices of each finished group

    group_indices = []       # indices of the group under construction
    group_sum = None         # running element-wise sum of the group's vectors
    group_center = None      # running mean of the group's vectors

    for i, raw_center in enumerate(cluster_centers):
        # np.array (not asarray) always copies, so results never alias input.
        vector = np.array(raw_center, dtype=float)

        if group_indices:
            similarity = _cosine_similarity_1d(vector, group_center)
            if similarity >= similarity_threshold and len(group_indices) < max_group_size:
                group_indices.append(i)
                # Maintain a running sum so the mean update is O(dim) instead
                # of re-averaging every member of the group on each join.
                group_sum = group_sum + vector
                group_center = group_sum / len(group_indices)
                continue

            # Not similar enough, or the group is full: close the group.
            clusters_index.append(group_indices)
            cluster_centers_p2.append(group_center)

        # Start a new group seeded with the current vector.
        group_indices = [i]
        group_sum = vector
        group_center = vector

    # Flush the group still under construction, if any.
    if group_indices:
        clusters_index.append(group_indices)
        cluster_centers_p2.append(group_center)

    return cluster_centers_p2, clusters_index


def _cosine_similarity_1d(a, b):
    """Return the cosine similarity of two 1-D vectors; 0.0 if either has zero norm."""
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0.0 or norm_b == 0.0:
        # Avoid division by zero; a zero vector has no direction to compare.
        return 0.0
    # Normalize first, then take the dot product, to stay well-scaled for
    # vectors with very large or very small magnitudes.
    return float(np.dot(a / norm_a, b / norm_b))

In [46]:
# Example call: group the centers using the default similarity threshold and
# the default group-size cap.
cluster_centers_p2, clusters_index = dynamic_clustering_centers_sequential(
    cluster_centers=cluster_centers
)

# Report how many merged centers were produced and which original indices
# ended up in each group.
print("新的聚类中心点数量:", len(cluster_centers_p2))
print("每个聚类的原始索引:", clusters_index)

新的聚类中心点数量: 38
每个聚类的原始索引: [[0, 1], [2, 3, 4, 5, 6, 7, 8], [9, 10, 11], [12, 13, 14], [15, 16, 17], [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32], [33, 34, 35], [36], [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66], [67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96], [97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126], [127], [128, 129, 130, 131, 132, 133, 134], [135], [136], [137], [138, 139, 140, 141, 142, 143, 144], [145, 146, 147, 148], [149, 150, 151, 152], [153], [154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183], [184], [185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,

In [55]:
def combine_clusters_and_centers(clusters, cluster_centers):
    """
    Pair each group with its center vector in a JSON-friendly structure.

    Args:
        clusters (list): One entry per group; each entry is the list of members
            of that group (in this notebook, lists of original center indices).
        cluster_centers (list): One center vector per group, in the same order
            as ``clusters``. Each may be a NumPy array or a plain sequence.

    Returns:
        list: One dict per group with keys ``"cluster_center"`` (the center as
        a plain Python list) and ``"para_list"`` (the group's member list,
        stored as-is, not copied). Pairing uses ``zip``, so if the two inputs
        differ in length the extra items of the longer one are ignored.
    """
    combined_data = []
    for center, para_list in zip(cluster_centers, clusters):
        # ndarray.tolist() converts NumPy scalars to builtin Python numbers.
        # list(ndarray) would keep NumPy scalar objects, and e.g. float32
        # elements are rejected by json.dump.
        if hasattr(center, "tolist"):
            center_values = center.tolist()
        else:
            center_values = list(center)
        combined_data.append({
            "cluster_center": center_values,
            "para_list": para_list
        })
    return combined_data

# Example usage: pair every group's index list with its merged center.
combined_data = combine_clusters_and_centers(
    clusters=clusters_index,
    cluster_centers=cluster_centers_p2,
)

# Inspect the output: show the entry at index 1 (i.e. the second group).
print(combined_data[1])

{'cluster_center': [0.9857987250600543, -0.3283881289618356, 0.4229450768658093, 1.149447568825313, -0.33775764278003145, -0.4667956248990127, 0.3780176171234676, -7.156915596553257, 0.6572872996330261, -0.7464425223214286, -0.2707479461761458, -0.2769071193678038, 1.06223863363266, 0.1694998453770365, 0.26061205353055683, -0.057416643149086406, -0.6479008793830872, 0.20759161774601256, 0.09901451618809785, 9.468122550419398, 0.747743044580732, -0.42488548904657364, -1.2819325498172216, 0.41232914690460476, 0.44255306145974566, -0.7359139536108289, -0.12049431380416666, 0.7839081798280988, 0.7921500099556786, -0.32989270985126495, -0.8316041146005902, 0.13495383305209024, 0.29555640050343107, 0.5573640039988926, -1.0266165137290955, 1.046918851988656, 0.388860006417547, -0.4284930569784982, -0.6335409113339016, -0.351677625307015, -0.13658655913812773, 1.0182086059025355, -0.624523230961391, 1.566949486732483, -0.7954771901879992, 0.21745578715178585, 1.4290210689817155, 0.405232574258

In [56]:
import json

# Write the combined result to a JSON file (UTF-8, non-ASCII characters kept
# verbatim, 4-space indentation).
with open("p2_cluster_data.json", mode="w", encoding="utf-8") as out_file:
    json.dump(obj=combined_data, fp=out_file, ensure_ascii=False, indent=4)

print("数据已保存到 p2_cluster_data.json")


数据已保存到 p2_cluster_data.json
