选择 BEIR 语料 

In [2]:
import os, json, re, uuid, logging, numpy as np
import pathlib
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader

DATASET = "nfcorpus"          # 也可换成 ms_marco, trec-covid 等
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{DATASET}.zip"
out_dir = os.path.join(os.getcwd(), "datasets")
DATA_PATH = util.download_and_unzip(url, out_dir)
corpus, *_ = GenericDataLoader(data_folder=DATA_PATH).load(split="train")

  0%|          | 0/3633 [00:00<?, ?it/s]

100%|██████████| 3633/3633 [00:00<00:00, 72313.18it/s]


文本清洗 & 分块

In [20]:
import re
import logging

# 预编译正则，加速
punct_re = re.compile(r'[\.!?]')

def clean(text: str) -> str:
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def split_into_chunks(text: str, max_chars: int = 600):
    """
    按 max_chars 字符切片，然后向后延展到下一个句号/问号/感叹号，确保不截断句子。
    返回列表，每项为 (chunk_text, start_index, end_index)。
    """
    text = text.strip()
    spans = []
    i, L = 0, len(text)
    while i < L:
        end = min(L, i + max_chars)
        m = punct_re.search(text, end)
        if m:
            end = m.end()
        chunk = text[i:end].strip()
        spans.append((chunk, i, end))
        i = end
    logging.info(f"Created {len(spans)} chunks from text (max_chars={max_chars})")
    return spans

# Take only first 10 docs
doc_items = list(corpus.items())[:3]
docs = {doc_id: clean(record["text"]) for doc_id, record in doc_items}
#docs = {doc_id: clean(record["text"]) for doc_id, record in corpus.items()}

Prompt driven Relation Extraction

In [4]:
import os
import json
from typing import List, Dict, Any
import openai
import spacy

# 1. 初始化 DeepSeek/OpenAI 客户端
client = openai.OpenAI(
    api_key="sk-a1b9aa762afb4c32bfc609449c4462ac",
    base_url="https://api.deepseek.com/v1"
)

# 2. 定义工具调用 schema：extract_entities_relations
tools = [
    {
        "type": "function",
        "function": {
            "name": "extract_entities_relations",
            "description": "Extract entities and relations from a text chunk, with self-audit in one pass",
            "parameters": {
                "type": "object",
                "properties": {
                    "entities": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "entity_name": {"type": "string"},
                                "entity_type": {"type": "string"},
                                "entity_description": {"type": "string"},
                            },
                            "required": ["entity_name", "entity_type", "entity_description"]
                        }
                    },
                    "relations": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "source_entity": {"type": "string"},
                                "relation": {"type": "string"},
                                "target_entity": {"type": "string"},
                                "relation_description": {"type": "string"},
                            },
                            "required": ["source_entity", "relation", "target_entity", "relation_description"]
                        }
                    }
                },
                "required": ["entities", "relations"]
            }
        }
    }
]

system_prompt = {
    "role": "system",
    "content": (
        "You are an information-extraction assistant. In one pass, extract:\n"
        " 1) All entities (name,type,description);\n"
        " 2) All relations (source,relation,target,description);\n"
        "Then self-audit and add any you missed. Return JSON matching the tool schema."
    )
}

# 3. 核心示例和简短系统指令
few_shot = [
    {
        "role": "system",
        "content": (
            "You are an information extraction assistant.  \n"
            "Perform these steps in one pass:\n"
            "1) Extract entities with {entity_name, entity_type, entity_description};\n"
            "2) Extract relations as triples {source_entity, relation, target_entity, relation_description};\n"
            "3) Self-audit: review your own output and add any missing items.\n"
            "Return exactly a JSON matching the tool schema."
        )
    },
    {
        "role": "user",
        "content": (
            "Chunk:\n"
            "\"Alice joined Acme Corp in 2021 and moved to London for work.\""
        )
    },
    {
        "role": "assistant",
        "content": "I will extract entities and relations from this text.",
        "tool_calls": [
            {
                "id": "call_123",
                "type": "function",
                "function": {
                    "name": "extract_entities_relations",
                    "arguments": json.dumps({
                        "entities": [
                            {"entity_name": "Alice", "entity_type": "Person", "entity_description": "An individual who joined a company in 2021"},
                            {"entity_name": "Acme Corp", "entity_type": "Organization", "entity_description": "Company Alice joined in 2021"},
                            {"entity_name": "London", "entity_type": "Location", "entity_description": "City Alice moved to for work"}
                        ],
                        "relations": [
                            {"source_entity": "Alice", "relation": "employment", "target_entity": "Acme Corp", "relation_description": "Alice joined Acme Corp in 2021"},
                            {"source_entity": "Alice", "relation": "relocation", "target_entity": "London", "relation_description": "Alice moved to London for work"}
                        ]
                    })
                }
            }
        ]
    },
    {
        "role": "tool",
        "tool_call_id": "call_123",
        "content": "Successfully extracted entities and relations."
    }
]

def extract_chunk(chunk: str) -> Dict[str, Any]:
    try:
        resp = client.chat.completions.create(
            model="deepseek-chat",
            messages=few_shot + [{"role":"user","content":f"Chunk:\n\"{chunk}\""}],
            tools=tools,
            tool_choice={"type": "function", "function": {"name": "extract_entities_relations"}},
            temperature=0.0
        )
        logging.info(f"Response content: {resp}")
        # 处理工具调用响应
        if resp.choices[0].message.tool_calls:
            tool_call = resp.choices[0].message.tool_calls[0]
            return json.loads(tool_call.function.arguments)
        else:
            logging.error("No tool calls in response")
            raise Exception("No tool calls in response")
    except Exception as e:
        logging.error(f"Error processing chunk: {str(e)}")
        raise

批量读取

In [5]:

logging.basicConfig(format='%(asctime)s - %(message)s',
                   datefmt='%Y-%m-%d %H:%M:%S',
                   level=logging.INFO,
                   handlers=[LoggingHandler()])

In [22]:
from concurrent.futures import ThreadPoolExecutor
from sentence_transformers import SentenceTransformer

EMB = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
chunk_meta = {}          # cid -> {doc, start, end}
all_extractions = {}     # cid -> extraction result
chunk_emb_cache = {}     # cid -> embedding vector (np.ndarray)

def process_chunk(item):
    """
    Worker function for a single chunk: extract relations and embed.
    """
    doc_id, idx, chunk, start, end = item
    cid = f"{doc_id}#{idx}"
    ext = extract_chunk(chunk)
    vec = EMB.encode(chunk, normalize_embeddings=True, convert_to_numpy=True).astype(np.float32)
    return cid, doc_id, start, end, ext, vec

# === 2. Parallel Extraction & Embedding ===
# Prepare task list
tasks = []  # list of (doc_id, idx, chunk, start, end)
for doc_id, text in docs.items():
    for idx, (chunk, start, end) in enumerate(split_into_chunks(text)):
        tasks.append((doc_id, idx, chunk, start, end))

# Process with ThreadPoolExecutor
n_workers = min(8, len(tasks))
with ThreadPoolExecutor(max_workers=n_workers) as executor:
    for cid, doc_id, start, end, ext, vec in executor.map(process_chunk, tasks):
        chunk_meta[cid] = {"doc": doc_id, "start": start, "end": end}
        all_extractions[cid] = ext
        chunk_emb_cache[cid] = vec

2025-04-26 21:36:26 - Use pytorch device_name: mps
2025-04-26 21:36:26 - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-04-26 21:36:26 - Created 3 chunks from text (max_chars=600)
2025-04-26 21:36:26 - Created 3 chunks from text (max_chars=600)
2025-04-26 21:36:26 - Created 2 chunks from text (max_chars=600)
2025-04-26 21:36:27 - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-26 21:36:27 - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-26 21:36:27 - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-26 21:36:27 - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-26 21:36:27 - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-26 21:36:27 - HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-26 21:36:28 - HTTP Reque

Batches: 100%|██████████| 1/1 [00:00<00:00,  7.26it/s]


2025-04-26 21:37:25 - Response content: ChatCompletion(id='bce2f6bc-cf4d-46c4-84e4-c2c5b8b8c3dc', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_b80278b8-9f40-475a-8ac5-075cf09470e2', function=Function(arguments='{"entities":[{"entity_name":"statins","entity_type":"Drug","entity_description":"Medications studied for their effects on breast cancer"},{"entity_name":"simvastatin","entity_type":"Drug","entity_description":"A specific statin studied for preventing growth in breast cancer cell lines and animal models"},{"entity_name":"breast cancer","entity_type":"Disease","entity_description":"The disease under investigation in the study"},{"entity_name":"English cancer registries","entity_type":"Organization","entity_description":"Source of data for the cohort study"},{"entity_name":"National Ca

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]


2025-04-26 21:37:51 - Response content: ChatCompletion(id='3164543c-dcad-447e-a40a-2a903da6f121', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_17b1ba6c-9db3-403b-99eb-26c28bcfba85', function=Function(arguments='{"entities":[{"entity_name":"38","entity_type":"Statistic","entity_description":"A numerical value indicating a statistical result"},{"entity_name":"P < 0.01","entity_type":"Statistical Significance","entity_description":"Indicates a statistically significant result with a p-value less than 0.01"},{"entity_name":"processed fish products","entity_type":"Food Product","entity_description":"A category of food items derived from fish"},{"entity_name":"beta = 0.59","entity_type":"Statistic","entity_description":"A regression coefficient indicating the strength of association"},{"entity_n

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.44it/s]


2025-04-26 21:38:13 - Response content: ChatCompletion(id='937241b6-bd78-41ff-ab92-f698a70d33be', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_5819920a-88e6-4ec3-ad69-2e0ecea3248b', function=Function(arguments='{"entities":[{"entity_name":"hazard ratios (HRs)","entity_type":"Statistical Measure","entity_description":"Unadjusted and adjusted hazard ratios for breast cancer-specific and all-cause mortality"},{"entity_name":"breast cancer","entity_type":"Disease","entity_description":"A type of cancer affecting breast tissue"},{"entity_name":"statin users","entity_type":"Population Group","entity_description":"Individuals who use statins after breast cancer diagnosis"},{"entity_name":"time-dependent Cox regression models","entity_type":"Statistical Method","entity_description":"Models used to

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s]


2025-04-26 21:38:30 - Response content: ChatCompletion(id='0308046c-115a-4c8e-98cc-8433bb070a5e', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_8841c2d2-5e1f-42b4-b9e9-4bc5fefdf4f7', function=Function(arguments='{"entities":[{"entity_name":"79 (0.63-1.00)","entity_type":"Statistical Value","entity_description":"A statistical value representing a range of 0.63 to 1.00"},{"entity_name":"0.81 (0.70-0.95)","entity_type":"Statistical Value","entity_description":"A statistical value representing a range of 0.70 to 0.95"},{"entity_name":"large population-based breast cancer cohort","entity_type":"Study Group","entity_description":"A group studied in the research involving breast cancer patients"},{"entity_name":"statin users","entity_type":"Study Group","entity_description":"Individuals who use st

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.31it/s]


2025-04-26 21:38:57 - Response content: ChatCompletion(id='f08e6455-5d55-498c-902a-ac0212d64762', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_79a79f25-43b9-4931-9cf5-da5e5e98a9ef', function=Function(arguments='{"entities":[{"entity_name":"4-nonylphenol (NP)","entity_type":"Chemical","entity_description":"A chemical compound measured in human milk samples"},{"entity_name":"4-octylphenol (OP)","entity_type":"Chemical","entity_description":"A chemical compound measured in human milk samples"},{"entity_name":"human milk samples","entity_type":"Biological Sample","entity_description":"Samples collected for measuring NP and OP concentrations"},{"entity_name":"mothers\' demographics","entity_type":"Demographic Data","entity_description":"Data related to the mothers\' characteristics"},{"entity_n

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.98it/s]


2025-04-26 21:39:31 - Response content: ChatCompletion(id='f27b44da-4570-4664-8aed-ea1782a43fe5', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_5d16040e-5868-4463-803f-03d7fcb75795', function=Function(arguments='{"entities":[{"entity_name":"Cox proportional hazards regression method","entity_type":"Method","entity_description":"Statistical method used to estimate mortality among statin users"},{"entity_name":"statin users","entity_type":"Group","entity_description":"Participants who used statins"},{"entity_name":"4,151 participants","entity_type":"Group","entity_description":"Total number of participants who used statins"},{"entity_name":"median follow-up of 3.25 years","entity_type":"Time","entity_description":"Median duration of follow-up after diagnosis"},{"entity_name":"6,011 participan

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.06it/s]


2025-04-26 21:39:58 - Response content: ChatCompletion(id='5c24014f-8054-42d2-ad8a-6c3ab34155aa', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_963771d3-c7bc-4272-98fc-f7d5782b9f12', function=Function(arguments='{"entities":[{"entity_name":"post-diagnostic statin use","entity_type":"Medical Treatment","entity_description":"Statin use after diagnosis, associated with risk decrease"},{"entity_name":"healthy adherer bias","entity_type":"Bias","entity_description":"Bias where healthier patients are more likely to adhere to treatment"},{"entity_name":"dying cancer patients","entity_type":"Patient Group","entity_description":"Cancer patients who are dying, less likely to continue statin use"},{"entity_name":"pre-diagnostic statin users","entity_type":"Patient Group","entity_description":"Patients

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.42it/s]


可视化知识图谱

In [27]:
from pyvis.network import Network

net = Network(height="750px", width="100%", bgcolor="#ffffff", font_color="black")
all_nodes = set()
for cid, ie in all_extractions.items():
    # add entities
    for ent in ie.get("entities", []):
        all_nodes.add(ent["entity_name"])
    # add relation endpoints
    for rel in ie.get("relations", []):
        all_nodes.add(rel["source_entity"])
        all_nodes.add(rel["target_entity"])
# add nodes to graph
for node in all_nodes:
    net.add_node(node)
# add edges
for cid, ie in all_extractions.items():
    for rel in ie.get("relations", []):
        net.add_edge(
            rel["source_entity"],
            rel["target_entity"],
            title=rel.get("relation_description",""),
            label=rel.get("relation","")
        )
net.toggle_physics(True)
net.show_buttons(['physics'])
net.write_html("knowledge_graph.html")
print("Knowledge graph saved to knowledge_graph.html")

Knowledge graph saved to knowledge_graph.html


从 all_extractions 构建 concept2cid / attr2aid / triples

In [30]:
from typing import Dict, Any, List, Tuple
concept2cid: Dict[str,int] = {}
attr2aid: Dict[str,int] = {}
triples: List[Tuple[int,int,int,str]] = []

for cid, ie in all_extractions.items():
    # register concept IDs
    for ent in ie.get("entities", []):
        name = ent["entity_name"].lower()
        concept2cid.setdefault(name, len(concept2cid))
    for rel in ie.get("relations", []):
        src = rel["source_entity"].lower()
        trg = rel["target_entity"].lower()
        concept2cid.setdefault(src, len(concept2cid))
        concept2cid.setdefault(trg, len(concept2cid))
    # build relation triples
    for rel in ie.get("relations", []):
        src = rel["source_entity"].lower()
        trg = rel["target_entity"].lower()
        pred = rel["relation"].lower()
        attr2aid.setdefault(pred, len(attr2aid))
        aid = attr2aid[pred]
        cid_h, cid_t = concept2cid[src], concept2cid[trg]
        triples.append((cid_h, aid, cid_t, cid))

logging.info(f"Built mapping: {len(concept2cid)} concepts, {len(attr2aid)} relations, {len(triples)} triples")

2025-04-26 21:57:27 - Built mapping: 68 concepts, 25 relations, 36 triples


MiniLM 嵌入 + Q8/Q4

In [31]:
import torch
from sentence_transformers import SentenceTransformer

EMB = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
D   = EMB.get_sentence_embedding_dimension()

def quant_tensor(vec: torch.Tensor, bits: int) -> torch.Tensor:
    # vec: 1-D float Tensor (已归一化)
    vmax = vec.abs().max().item()
    scale = (2**(bits-1) - 1) / max(1e-6, vmax)
    return (vec * scale).round().to(torch.int8)

# 4.1 Q8 量化概念向量
concept_vecs_t = torch.zeros((len(concept2cid), D), dtype=torch.int8)
for name, cid in concept2cid.items():
    text = f"{name} [Entity]"  # name + type
    vec_t = EMB.encode(text, normalize_embeddings=True, convert_to_tensor=True)
    q = quant_tensor(vec_t.cpu(), bits=8)
    concept_vecs_t[cid] = q

# 4.2 Q4 量化关系向量
attr_vecs_t = torch.zeros((len(attr2aid), D), dtype=torch.int8)
for pred, aid in attr2aid.items():
    vec_t = EMB.encode(pred, normalize_embeddings=True, convert_to_tensor=True)
    q = quant_tensor(vec_t.cpu(), bits=4)
    attr_vecs_t[aid] = q

2025-04-26 21:57:32 - Use pytorch device_name: mps
2025-04-26 21:57:32 - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.52it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.31it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.19it/s]
Batches: 1

Super-Community 检测 & 聚簇

In [34]:
import igraph as ig
from sklearn.cluster import KMeans, MiniBatchKMeans

# 5.1 社区检测（Leiden）
G = ig.Graph(n=len(concept2cid), edges=[(h,t) for h,_,t,_ in triples], directed=False)
leiden = G.community_leiden(objective_function="modularity")

# 修改社区映射的创建方式
cid2scid = {}
for comm_id, community in enumerate(leiden):
    for vertex in community:
        cid2scid[vertex] = comm_id

# 5.2 概念簇 & 属性簇 聚类
# 先转 numpy 用于 sklearn
concept_np = concept_vecs_t.numpy().astype(np.float32)
attr_np = attr_vecs_t.numpy().astype(np.float32)

cid_centroids = {}
aid_centroids = {}

for scid in set(cid2scid.values()):
    members = [cid for cid, sc in cid2scid.items() if sc == scid]
    if not members:
        continue
    
    # 概念簇
    k = min(32, len(members))
    km = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(concept_np[members])
    for idx, cid in enumerate(members):
        cid_centroids[cid] = km.cluster_centers_[km.labels_[idx]]
    
    # 属性簇（针对每个概念）
    for cid in members:
        rel_aids = [aid for h,aid,t,_ in triples if h==cid or t==cid]
        if not rel_aids:
            continue
        rel_vecs = attr_np[rel_aids]
        k2 = min(8, len(rel_aids))
        mb = MiniBatchKMeans(n_clusters=k2, random_state=42).fit(rel_vecs)
        for j, aid in enumerate(rel_aids):
            aid_centroids[aid] = mb.cluster_centers_[mb.labels_[j]]

Leaf-Bucket (倒排 + IndexFlatIP)

In [35]:
import json
import numpy as np
import faiss
import pathlib
import shutil
from collections import defaultdict

# 1. 把所有 chunk embedding 写成单个 .npy 并输出 chunk_ids 列表
chunk_ids   = list(chunk_emb_cache.keys())
chunk_vecs  = np.stack([chunk_emb_cache[cid] for cid in chunk_ids], axis=0).astype(np.float32)

np.save("chunk_vecs.npy", chunk_vecs)
with open("chunk_ids.json", "w", encoding="utf-8") as f:
    json.dump(chunk_ids, f, ensure_ascii=False, indent=2)

# 2. 建立 cid -> row_idx 映射，加载 memmap
chunk_idx_map = { cid: i for i, cid in enumerate(chunk_ids) }
chunk_vecs_mm  = np.load("chunk_vecs.npy", mmap_mode="r")  # 只在需要时加载页面

# 3. 为每个 aid 构建 Leaf-Bucket 的 integer pointer 列表
aid_chunks = defaultdict(list)
for _, aid, _, cid in triples:
    # triples 中最后一位就是 chunk_id
    if cid in chunk_idx_map:
        aid_chunks[aid].append(chunk_idx_map[cid])

# 4. 持久化每个 leaf：只存 indices + IVFFlatIP（或 FlatIP）
leaf_dir = pathlib.Path("leaf_buckets")
shutil.rmtree(leaf_dir, ignore_errors=True)
leaf_dir.mkdir(parents=True)

for aid, idxs in aid_chunks.items():
    uniq = sorted(set(idxs))
    if not uniq:
        continue
    
    # 4.1 切出对应 chunk vectors
    vecs = chunk_vecs_mm[uniq]       # shape = (len(uniq), D)
    d_dim = vecs.shape[1]

    # 4.2 用 FlatIP （零训练、100% 精确）
    index = faiss.IndexFlatIP(d_dim)
    index.add(vecs)                  # 加载进内存仅这一小批

    # 4.3 保存索引 & 指针列表
    faiss.write_index(index, str(leaf_dir / f"aid_{aid}.flatip"))
    with open(leaf_dir / f"aid_{aid}.idx.json", "w", encoding="utf-8") as f:
        json.dump(uniq, f, ensure_ascii=False)

print(f"Built {len(list(leaf_dir.glob('*.flatip')))} leaf buckets.")

Built 25 leaf buckets.
