# 向量库流程构建
离线流程

In [38]:
from utils.common_utils import load_articles, build_doubao_embedding
from llama_index.core.schema import TextNode
import json
import os
import uuid

        
emb_model = build_doubao_embedding()
def get_chunk_with_embedding(article_chunks, embedding_path):
    for a_name, chunks in article_chunks.items():
        print(f"Processing {a_name}")
        chunk_texts = [chunk["chunk"] for chunk in chunks]
        response = emb_model(
            # model="doubao-embedding-large-text-250515",
            model="doubao-embedding-text-240715",
            input=chunk_texts,
            encoding_format="float"
        ) 
        chunks_with_embedding = []
        for chunk, data in zip(chunks, response.data):
            chunk["embedding"] = data.embedding
            chunk["doc_id"] = str(uuid.uuid4())
            chunks_with_embedding.append(chunk)
        save_embeddings(chunks_with_embedding, embedding_path)
    return chunks_with_embedding
def save_embeddings(chunks_with_embedding, output_path):
    # 判断 filename 是否存在，如果存在则追加写入，否则创建新文件
    if os.path.exists(output_path):
        with open(output_path, 'r', encoding="utf-8") as f:
            existing = json.load(f)
        existing.extend(chunks_with_embedding)
        with open(output_path, 'w', encoding="utf-8") as f:
            json_str = json.dumps(existing, ensure_ascii=False, indent=4)
            json_str = json_str.replace(',\n\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020', ', ')
            f.write(json_str)
            # json.dump(existing, f, ensure_ascii=False, indent=4)
    else:
        with open(output_path, 'w', encoding="utf-8") as f:
            json_str = json.dumps(chunks_with_embedding, ensure_ascii=False, indent=4)
            # 替换掉 list 中的换行符，让 list 内容显示在一行
            json_str = json_str.replace(',\n\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020\u0020', ', ')
            f.write(json_str)
    print(f"Embedding saved to {output_path}")

In [39]:
article_chunks = load_articles("outputs_chunks/article_chunks06.json")
chunks_with_embedding = get_chunk_with_embedding(article_chunks, embedding_path="outputs_chunks/chunk_embedding08.json")

Processing 2-对联苯-8-羟基喹啉锌...及其应用于新型白光OLED_赵婷_llm_correct.md
Embedding saved to outputs_chunks/chunk_embedding08.json
Processing 2.7”a-Si_TFT矩阵(英文)_熊绍珍_llm_correct.md
Embedding saved to outputs_chunks/chunk_embedding08.json
Processing 2005_OLED行业一瞥_王力_llm_correct.md
Embedding saved to outputs_chunks/chunk_embedding08.json
Processing 200mm×200mm_OLED步进投影曝光机_周畅_llm_correct.md
Embedding saved to outputs_chunks/chunk_embedding08.json


In [3]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.schema import TextNode
from utils.common_utils import load_articles

def get_nodes(embedding_path):
    chunks_with_embedding = load_articles(embedding_path)
    chunks = chunks_with_embedding
    nodes = []
    for chunk in chunks:
        node = TextNode(
            text=chunk["chunk"], 
            id=chunk["chunk_id"], 
            doc_id=chunk["doc_id"],
            embedding=chunk["embedding"],
            metadata={
                "source": chunk["source"],
            }
        )
        nodes.append(node)
    return nodes

In [41]:
embedding_path = "outputs_chunks/chunk_embedding08.json"
allnodes = get_nodes(embedding_path)
docstore = SimpleDocumentStore()
docstore.add_documents(allnodes)

In [None]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import Settings, StorageContext, VectorStoreIndex

# docstore.persist(persist_path="llama_index/docstore.json")
# docstore = SimpleDocumentStore.from_persist_path("llama_index/docstore.json")
db = chromadb.PersistentClient(path="llama_index/chroma_db")
chroma_collection = db.get_or_create_collection(name="sc_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store, docstore=docstore)
# index.storage_context.persist(persist_dir="./vector_index")
storage_context.persist(persist_dir="./vector_index")

In [None]:
vector_store.add(allnodes)

['c2bf6507-7b5a-4474-b58f-56f8bec05dcc',
 'a2b2e4e3-0d15-4aeb-b577-a9e3b2dea74b',
 'c95c7597-9fde-44f0-aecc-8c9b93e386e1',
 '7142332c-6373-4174-92a3-6e2ecdf4767f',
 'f3e47c01-65a3-4027-aeb1-3a8c96a3cf45',
 '3993d0f1-2c72-44a9-a889-7a3b4afa5e42',
 '12ede737-13d4-410e-a929-b18294feabb8',
 '241cad18-5598-4799-b5c8-131facd634b7',
 'b503d1b5-f7f7-4357-b97e-3d343f7420f9',
 'e75b80fb-1616-4800-8f33-c21adef1f89e',
 'baaa6088-fc0b-49fd-ac28-a2d7a4eccd67',
 '2af3ee24-858d-415b-b384-5591254a9e51',
 '64da9b40-17f9-4808-99b1-640559efc613',
 '32822fe6-ef30-4944-9ce2-3f6d930bb634',
 'bca97f89-39f9-4404-9ea6-b81cb578113e',
 '071211c4-236e-4e23-9d39-2fbeea196d44',
 '5bcdbd89-34cb-4609-a2fc-adf8034539e3',
 '9a71fc40-9bfe-4415-a0dd-2aff23ab2a94',
 '18163620-e783-4469-bf0f-ff84124e53dd',
 '9afb4d3b-a539-45b5-b9aa-a35204305649',
 '51b2e0c3-9805-43bc-aba3-06ee33b8311f',
 'f008e965-7b46-4d16-b3ff-a90b8cd97b41',
 '2afe7ed6-3eb1-4ca0-b981-8ee47fb70815',
 'a2a3dd8a-2652-4178-b3ff-7d58ea5d6f0e',
 '25b2bc0a-8783-

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding


embedding_model = OpenAIEmbedding(
    model="doubao-embedding-text-240715",
    api_key=os.environ.get("COMPLETION_OPENAI_API_KEY"),
    api_base=os.environ.get("COMPLETION_OPENAI_BASE_URL"),
)
Settings.embedding = embedding_model


In [None]:
index = VectorStoreIndex.from_vector_store(
    vector_store,
    storage_context=storage_context,
    show_progress=True,
    embed_model=embedding_model
)

In [None]:
# 将 index 永久化到 storage_context 中，并返回一个包含 index 的响应。
index.storage_context.persist(persist_dir="./vector_index")

# 检索流程构建 
在线流程

In [4]:
from llama_index.retrievers.bm25 import BM25Retriever
embedding_path = "outputs_chunks/chunk_embedding08.json"
allnodes = get_nodes(embedding_path)
bm25_retriever = BM25Retriever.from_defaults(
    nodes=allnodes,
    # docstore=docstore,
    similarity_top_k=10,
)
bm25_retriever.persist("llama_index/bm25_retriever.json")

Finding newlines for mmindex: 100%|██████████| 218k/218k [00:00<00:00, 43.6MB/s]


In [None]:
loaded_bm25_retriever = BM25Retriever.from_persist_dir("llama_index/bm25_retriever.json")

In [6]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import Settings, StorageContext, VectorStoreIndex

docstore = SimpleDocumentStore.from_persist_path("llama_index/docstore.json")
db = chromadb.PersistentClient(path="llama_index/chroma_db")
chroma_collection = db.get_or_create_collection(name="sc_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store, docstore=docstore)

In [20]:
from typing import List
from llama_index.core.base.embeddings.base import BaseEmbedding
from utils.common_utils import build_doubao_embedding
import os

emb_model = build_doubao_embedding()
class DouBaoEmbedding(BaseEmbedding):
    def __init__(self, model_name: str = "doubao-embedding-text-240715", **kwargs):
        super().__init__(**kwargs)
        self.model_name = model_name

    def _get_embedding(self, texts: list[str] | str) -> List[float] | List[List[float]]:
        # 这里替换为实际调用豆包平台的 API 获取 embedding 的逻辑
        # 例如通过 requests 请求、认证等
        single_text = isinstance(texts, str)
        if single_text:
            texts = [texts]
        response = emb_model(
            model=self.model_name,
            input=texts
        )
        embeddings = [
            embedding_data.embedding for embedding_data in response.data
        ]
        if single_text:
            return embeddings[0]
        return embeddings  # 返回浮点数列表

    async def _aget_embedding(self, text: str) -> List[float]:
        return self._get_embedding(text)

    def _get_text_embedding(self, text: list[str]) -> List[List[float]]:
        return self._get_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        return self._get_embedding(query)
    async def _aget_text_embedding(self, text: list[str]) -> List[List[float]]:
        return self._get_text_embedding(text)
    async def _aget_query_embedding(self, query: str) -> List[float]:
        return self._get_query_embedding(query)
embedding_model = DouBaoEmbedding(
    model="doubao-embedding-text-240715",
    api_key=os.environ.get("COMPLETION_OPENAI_API_KEY"),
    api_base=os.environ.get("COMPLETION_OPENAI_BASE_URL"),
)


In [21]:
index = VectorStoreIndex.from_vector_store(
    vector_store,
    storage_context=storage_context,
    show_progress=True,
    embed_model=embedding_model
)

vector_retriever = index.as_retriever(
    similarity_top_k=10, 
    verbose=True
)

In [22]:
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
)
from llama_index.core.schema import NodeWithScore
from llama_index.core import QueryBundle
from typing import List

# 4. 创建自定义的检索器
class CustomRetriever(BaseRetriever):
    """custom retriever that performs both vector and keyword table retrieval"""
    def __init__(self,
                 vector_retriever: VectorIndexRetriever,
                 bm25_retriever: BM25Retriever,
                 mode: str = "OR",
    ) -> None:
        self._vector_retriever = vector_retriever
        self._bm25_retriever = bm25_retriever
        if mode not in ["AND", "OR"]:
            raise ValueError("mode must be either AND or OR")
        self._mode = mode
        super().__init__()
    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """retrieve nodes given query"""
        print(f"Retrieving nodes for query: {query_bundle.query_str}")
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        bm25_nodes = self._bm25_retriever.retrieve(query_bundle)
        
        vector_ids = {node.node.node_id for node in vector_nodes}
        bm25_ids = {node.node.node_id for node in bm25_nodes}
        
        combined_dict = {node.node.node_id: node for node in vector_nodes}
        combined_dict.update({node.node.node_id: node for node in bm25_nodes})
        
        if self._mode == "AND":
            retrieve_ids = vector_ids.intersection(bm25_ids)
        if self._mode == "OR":
            retrieve_ids = vector_ids.union(bm25_ids)
        
        retrieve_nodes = [combined_dict[node_id] for node_id in retrieve_ids]
        print(f"{len(retrieve_nodes)} nodes retrieved")
        return retrieve_nodes


In [23]:
custom_retriever = CustomRetriever(
    vector_retriever, 
    bm25_retriever, 
)
retrieved_nodes = custom_retriever.retrieve("What is the capital of France?")

Retrieving nodes for query: What is the capital of France?
20 nodes retrieved


In [25]:
retrieved_nodes[0]

NodeWithScore(node=TextNode(id_='baaa6088-fc0b-49fd-ac28-a2d7a4eccd67', embedding=None, metadata={'source': '2-对联苯-8-羟基喹啉锌...及其应用于新型白光OLED_赵婷_llm_correct.md'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='这种利用空穴阻挡层来制备白光OLED 的方法, 工艺过程简单、器件稳定性好, 有利于工业化的大规模生产. # 1  实验部分', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=0.0)

In [27]:
retrieved_nodes[0].node.node_id


'baaa6088-fc0b-49fd-ac28-a2d7a4eccd67'

In [None]:
questions = [
    "Who is the author of the book 'The Great Gatsby'?",
    "What is the main character in 'The Great Gatsby'?",
    "What is the setting of 'The Great Gatsby'?",
    "What is the plot of 'The Great Gatsby'?",
    "What is the theme of 'The Great Gatsby'?",
    "What is the genre of 'The Great Gatsby'?",
]