In [None]:
import nest_asyncio
# nest_asyncio.apply() patches asyncio so that an event loop can be re-entered,
# which the notebook kernel (already running a loop) needs for nested async calls.
nest_asyncio.apply()
import os
from dotenv import load_dotenv
load_dotenv()  # Load the environment variables defined in the .env file

In [None]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.llms.langchain import LangChainLLM
from llama_index.embeddings.openai import OpenAIEmbedding
from langchain_openai import ChatOpenAI
from llama_index.embeddings.dashscope import (
    DashScopeEmbedding,
    DashScopeTextEmbeddingModels,
    DashScopeTextEmbeddingType,
)

# Two embedding models are instantiated: OpenAI and DashScope (document text type).
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
embed_model_ali = DashScopeEmbedding(model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V3,text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT)

# Two LLMs are instantiated: OpenAI directly, and a Doubao model reached through
# LangChain's ChatOpenAI client wrapped in LangChainLLM.
llm = OpenAI(model="gpt-4o-mini")
llm_doubao = LangChainLLM(llm=ChatOpenAI(model="doubao-1.5-vision-32k"))

# Global LlamaIndex defaults for this notebook: DashScope embeddings + Doubao LLM.
Settings.embed_model = embed_model_ali
Settings.llm = llm_doubao


In [None]:
from llama_cloud_services import LlamaParse
import os
import pickle

# Only parse the PDF when the md_json_list.pkl cache has not been generated yet.
if not os.path.exists("md_json_list.pkl"):
    parser = LlamaParse(
        result_type="markdown",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="gemini-2.0-flash-001",
        language="ch_sim"
    )
    md_json_objs = parser.get_json_result("data/中文大模型基准测评2025年3月报告.pdf")
    md_json_list = md_json_objs[0]["pages"]

In [None]:
# 把解析结果取下来，只需要运行一次，要等待上一步parse完成，找到这里的job_id

import pickle
if not os.path.exists("md_json_list.pkl"):
    md_result =await parser._get_job_result('7e1be6ef-af58-462f-a339-1a3eba62fd0a',result_type='json')
    md_result["job_id"] = "7e1be6ef-af58-462f-a339-1a3eba62fd0a"

    image_dicts = parser.get_images([md_result], download_path="data_images")
    md_json_list = md_result["pages"]

    #存储到本地，方便后续使用
    pickle.dump(md_json_list, open("md_json_list.pkl", "wb"))
    pickle.dump(image_dicts, open("image_dicts.pkl", "wb"))
else:
    md_json_list = pickle.load(open("md_json_list.pkl", "rb"))
    image_dicts = pickle.load(open("image_dicts.pkl", "rb"))
    print(md_json_list[13])
    print(image_dicts[1])

In [None]:
from pathlib import Path
from llama_index.core.schema import TextNode
from typing import Optional
# get pages loaded through llamaparse
import re

def get_page_number(file_name):
    """Return the page number encoded in an image file name.

    The string form of ``file_name`` must end in ``-page_<digits>.jpg``;
    any other name yields 0.
    """
    found = re.search(r"-page_(\d+)\.jpg$", str(file_name))
    return int(found.group(1)) if found else 0

def _get_sorted_image_files(image_dir):
    """Return the ``.jpg`` files directly inside ``image_dir``, ordered by page number."""
    jpg_files = []
    for entry in Path(image_dir).iterdir():
        if entry.is_file() and entry.suffix == '.jpg':
            jpg_files.append(entry)
    # In-place sort; the page number parsed from each file name is the key.
    jpg_files.sort(key=get_page_number)
    return jpg_files

# Sanity check: list the .jpg files found in ./data_images in page order.
print(_get_sorted_image_files("./data_images"))

In [None]:
# Control parameters

# Whether to generate a text summary for each page
ENABLE_TEXT_SUMMARY = True

# Whether to extract the questions each page can answer
ENABLE_EXTRACTED_QUESTIONS = True

# Whether to split the pages into small text nodes
ENABLE_SMALL_TEXT_NODES = False

# Whether to build the keyword index
ENABLE_KEYWORDS_INDEX = False

In [None]:
# Prompt template (Chinese) asking for a short, retrieval-oriented summary of
# one chunk; {CHUNK_CONTENT} is replaced with the chunk's Markdown.
summary_prompt = """
这是我们希望在整个文档中查找的块：
<chunk>
{CHUNK_CONTENT}
</chunk>
请给出一个简短的摘要，以方便在整个文档中定位这个块，以提高块的检索效果。只回答简短的摘要内容，不要回答其他内容。
"""
def generate_summary(json_dicts, llm):
    """Add a ``"summary"`` entry to every page dict and return the same list.

    Args:
        json_dicts: Iterable of page dicts; each must provide ``"page"``
            (used for progress output) and ``"md"`` (the text to summarise).
            The dicts are updated in place.
        llm: Object with a ``complete(prompt)`` method whose result converts
            to the summary text via ``str()``.

    Returns:
        ``json_dicts`` itself, with ``obj["summary"]`` set to a plain string.
    """
    for obj in json_dicts:
        print(f'Start generating summary for page {obj["page"]}')
        prompt = summary_prompt.format(CHUNK_CONTENT=obj["md"])
        # Keep only the text of the completion. Storing the response object
        # itself would pickle a provider-specific object into the cache and
        # force every consumer to call str() on it.
        summary = str(llm.complete(prompt))
        print(summary)
        obj["summary"] = summary
    return json_dicts

# Generate the per-page summaries, or reload them from the local cache.
if ENABLE_TEXT_SUMMARY:

    import pickle,os
    if os.path.exists("json_dicts_with_summary.pkl"):
        # NOTE: pickle.load can execute arbitrary code from the file; only
        # load cache files that this notebook produced.
        with open("json_dicts_with_summary.pkl", "rb") as cache_file:
            json_dicts_with_summary = pickle.load(cache_file)
    else:
        json_dicts_with_summary = generate_summary(md_json_list, llm_doubao)
        # `with` closes the file, so the cache is fully flushed to disk.
        with open("json_dicts_with_summary.pkl", "wb") as cache_file:
            pickle.dump(json_dicts_with_summary, cache_file)

    for obj in json_dicts_with_summary:
        print(obj["summary"])
        

In [None]:
from copy import deepcopy
from pathlib import Path

def get_text_nodes(image_dir=None, json_dicts=None):
    """Build one ``TextNode`` per parsed page.

    Args:
        image_dir: Directory with the page images, or ``None``. When given,
            the page at position ``idx`` is paired with the ``idx``-th file
            returned by ``_get_sorted_image_files`` and that path is stored
            as ``image_path`` metadata. When ``None``, ``image_path`` is
            left out.
        json_dicts: Iterable of per-page dicts. Each needs an ``"md"`` entry
            (used as the node text); a ``"summary"`` entry, if present, is
            stored as ``text_summary`` metadata. ``None`` means no pages.

    Returns:
        A list of ``TextNode`` in input order. ``page_num`` metadata is the
        1-based position of the page in ``json_dicts``.

    Raises:
        IndexError: ``image_dir`` was given but holds fewer page images than
            there are pages.
    """
    nodes = []

    image_files = _get_sorted_image_files(image_dir) if image_dir is not None else None

    # `or []` makes the documented default (None) behave as "no pages".
    for idx, obj in enumerate(json_dicts or []):

        chunk_metadata = {"page_num": idx + 1}

        # Only pair a page with an image when an image directory was supplied;
        # indexing into None would otherwise raise a TypeError.
        if image_files is not None:
            if idx >= len(image_files):
                raise IndexError(
                    f"page {idx + 1} has no matching image: "
                    f"only {len(image_files)} image(s) found in {image_dir!r}"
                )
            chunk_metadata["image_path"] = str(image_files[idx])

        if "summary" in obj:
            chunk_metadata["text_summary"] = str(obj["summary"])

        nodes.append(TextNode(text=obj["md"], metadata=chunk_metadata))

    return nodes

# Build the page-level nodes from the summary-enriched pages when summaries are
# enabled, otherwise from the raw parsed pages.
text_nodes = get_text_nodes(
    image_dir="data_images",
    json_dicts=json_dicts_with_summary if ENABLE_TEXT_SUMMARY else md_json_list,
)

print(text_nodes[1].metadata)

In [None]:
from llama_index.core.extractors import QuestionsAnsweredExtractor
DEFAULT_QUESTION_GEN_TMPL = """\
以下是上下文信息:
{context_str}
根据上下文信息，生成 {num_questions} 个问题，这些问题的具体答案不太可能在其他地方找到。
注意问题必须是上下文能够回答的问题，不要问关于上下文之外的问题。
直接输出问题，不要多余说明。
"""

if ENABLE_EXTRACTED_QUESTIONS:
    if os.path.exists("extract_questions.pkl"):
        # NOTE: pickle.load can execute arbitrary code from the file; only
        # load cache files that this notebook produced.
        with open("extract_questions.pkl", "rb") as cache_file:
            extract_questions = pickle.load(cache_file)
    else:
        questions_extractor = QuestionsAnsweredExtractor(llm=llm_doubao,prompt_template=DEFAULT_QUESTION_GEN_TMPL,metadata_mode='none',questions=5)
        extract_questions = questions_extractor.extract(text_nodes)
        with open("extract_questions.pkl", "wb") as cache_file:
            pickle.dump(extract_questions, cache_file)

    # The extracted metadata is matched to nodes by position. A cache built
    # from a different set of pages must be rejected up front instead of
    # failing half-way through the loop or leaving nodes without questions.
    if len(extract_questions) != len(text_nodes):
        raise ValueError(
            f"extract_questions.pkl holds {len(extract_questions)} entries but there are "
            f"{len(text_nodes)} text nodes; delete the cache file and re-run this cell"
        )

    for node, question_metadata in zip(text_nodes, extract_questions):
        node.metadata.update(question_metadata)

    for node in text_nodes:
        print(node.metadata)

In [None]:
# Keep bookkeeping fields out of the text that gets embedded, and additionally
# keep the summary and the extracted questions out of the text shown to the LLM.
for text_node in text_nodes:
    text_node.excluded_llm_metadata_keys = [
        "page_num",
        "image_path",
        "text_summary",
        "questions_this_excerpt_can_answer",
    ]
    text_node.excluded_embed_metadata_keys = ["page_num", "image_path"]

In [None]:
from llama_index.core.node_parser import MarkdownNodeParser,SemanticSplitterNodeParser

if ENABLE_SMALL_TEXT_NODES:
    # Break every page-level node into smaller nodes with the Markdown node parser.
    markdown_parser = MarkdownNodeParser(show_progress=True)
    small_text_nodes = markdown_parser.get_nodes_from_documents(text_nodes)

    # Report how many small nodes were produced, then the metadata of each one.
    print(len(small_text_nodes))
    for small_node in small_text_nodes:
        print(small_node.metadata)

In [None]:
import os
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import (
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
    KeywordTableIndex
)

# Placeholders; the retrievers are assigned further down in this cell.
vector_retriever = None
kw_retriever = None

# Number of results to retrieve: `small_top_k` is used when
# ENABLE_SMALL_TEXT_NODES is set, `top_k` otherwise.
small_top_k = 20
top_k = 3

def create_vector_retriever(nodes_to_index):
    """Build the Chroma-backed vector index (or reload it) and return its retriever.

    The presence of ``./storage_nodes/vector`` on disk decides the path taken:
    when it is missing, the ``rag_collection`` Chroma collection is reset and
    the index is built from ``nodes_to_index`` and persisted; when it exists,
    the index is loaded from that directory and ``nodes_to_index`` is only
    used for the progress message.

    Args:
        nodes_to_index: Nodes to embed and index on a fresh build.

    Returns:
        A retriever over the index, with ``similarity_top_k`` set to
        ``small_top_k`` when ENABLE_SMALL_TEXT_NODES is on, else ``top_k``.
    """
    STORAGE_DIR_VECTOR = "./storage_nodes/vector"
    COLLECTION_NAME = "rag_collection"

    chroma_client = chromadb.HttpClient(host="localhost", port=8000)
    # Evaluate once so the "reset collection" and "build index" decisions agree.
    needs_build = not os.path.exists(STORAGE_DIR_VECTOR)

    if needs_build:
        # Start from an empty collection. delete_collection raises when the
        # collection does not exist (e.g. the very first run), so only delete
        # it when it is there. Depending on the chromadb version,
        # list_collections() yields Collection objects or plain names.
        existing_names = {
            getattr(entry, "name", entry) for entry in chroma_client.list_collections()
        }
        if COLLECTION_NAME in existing_names:
            chroma_client.delete_collection(COLLECTION_NAME)

    collection = chroma_client.get_or_create_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=collection)

    if needs_build:
        print(f'Creating vector index【{len(nodes_to_index)} nodes】...\n')
        storage_context =  StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex(nodes_to_index,storage_context=storage_context,show_progress=True,insert_batch_size=5)
        index.storage_context.persist(persist_dir=STORAGE_DIR_VECTOR)
    else:
        print(f'Loading vector index【{len(nodes_to_index)} nodes】...\n')
        storage_context = StorageContext.from_defaults(persist_dir=STORAGE_DIR_VECTOR,vector_store=vector_store)
        index = load_index_from_storage(storage_context=storage_context)

    # Vector retrieval
    return index.as_retriever(similarity_top_k=small_top_k if ENABLE_SMALL_TEXT_NODES else top_k)

def create_keyword_retriever(nodes_to_index):
    """Build the keyword-table index (or reload it from disk) and return its retriever.

    When ``./storage_nodes/keyword`` exists the index is loaded from it;
    otherwise it is built from ``nodes_to_index`` and persisted there.
    """
    STORAGE_DIR_KEYWORD = "./storage_nodes/keyword"

    if os.path.exists(STORAGE_DIR_KEYWORD):
        print(f'Loading keyeword index【{len(nodes_to_index)} nodes】...\n')
        kw_index = load_index_from_storage(
            storage_context=StorageContext.from_defaults(persist_dir=STORAGE_DIR_KEYWORD)
        )
    else:
        print(f'Creating keyeword index【{len(nodes_to_index)} nodes】...\n')

        # Build the keyword table index and persist it for later runs.
        kw_index = KeywordTableIndex(nodes_to_index,show_progress=True)
        kw_index.storage_context.persist(persist_dir=STORAGE_DIR_KEYWORD)

    # Hand back the keyword retriever.
    chunks_per_query = small_top_k if ENABLE_SMALL_TEXT_NODES else top_k
    return kw_index.as_retriever(num_chunks_per_query=chunks_per_query)

# Index the small nodes when splitting is enabled, otherwise the page-level nodes.
vector_retriever = create_vector_retriever(small_text_nodes if ENABLE_SMALL_TEXT_NODES else text_nodes)

# The keyword retriever is optional and stays None when its flag is off.
kw_retriever = (
    create_keyword_retriever(small_text_nodes if ENABLE_SMALL_TEXT_NODES else text_nodes)
    if ENABLE_KEYWORDS_INDEX
    else None
)


In [None]:

from llama_index.core.retrievers import QueryFusionRetriever

if not ENABLE_KEYWORDS_INDEX:
    # Without a keyword index there is nothing to fuse: use the vector retriever as is.
    fusion_retriever = vector_retriever
else:
    # Combine vector and keyword results with reciprocal-rank reranking.
    fusion_retriever = QueryFusionRetriever(
        [vector_retriever, kw_retriever],
        similarity_top_k=small_top_k if ENABLE_SMALL_TEXT_NODES else top_k,
        num_queries=1,  # set this to 1 to disable query generation
        mode="reciprocal_rerank",
        use_async=True,
        verbose=True
    )

In [None]:
#单个问题检索测试
import pandas as pd
from llama_index.core.schema import ImageNode, NodeWithScore, MetadataMode
from llama_index.core.prompts import PromptTemplate

def get_parent_nodes(nodes):
    """Map retrieved nodes back to their page-level parent nodes.

    Every page-level node in the module-level ``text_nodes`` whose
    ``page_num`` matches at least one retrieved node becomes a parent; its
    score is the highest score among the retrieved nodes of that page.

    Args:
        nodes: Retrieved ``NodeWithScore`` items; each wrapped node must carry
            ``page_num`` metadata. A score of ``None`` is treated as 0.

    Returns:
        At most ``top_k`` ``NodeWithScore`` parents, highest score first
        (ties keep their ``text_nodes`` order).
    """
    # One pass over the hits: remember the best score seen for each page.
    best_score_by_page = {}
    for hit in nodes:
        page_num = hit.node.metadata["page_num"]
        # NodeWithScore.score is optional; comparing None with a float fails.
        hit_score = hit.score if hit.score is not None else 0
        if page_num not in best_score_by_page or hit_score > best_score_by_page[page_num]:
            best_score_by_page[page_num] = hit_score

    parent_nodes = [
        NodeWithScore(node=page_node, score=best_score_by_page[page_node.metadata["page_num"]])
        for page_node in text_nodes
        if page_node.metadata["page_num"] in best_score_by_page
    ]

    parent_nodes.sort(key=lambda x: x.score, reverse=True)

    return parent_nodes[0:top_k]

def recursive_retrieve(query):
    """Retrieve nodes for ``query`` through ``fusion_retriever``.

    With ENABLE_SMALL_TEXT_NODES on, the hits are mapped to their page-level
    parents (which are printed and returned); otherwise the hits are returned
    unchanged.
    """
    nodes = fusion_retriever.retrieve(query)

    if not ENABLE_SMALL_TEXT_NODES:
        return nodes

    parent_nodes = get_parent_nodes(nodes)
    print(f"{len(parent_nodes)} parent nodes retrieved")

    for parent in parent_nodes:
        print(f'score: {parent.score},metadata: {parent.node.metadata}')

    return parent_nodes

# Retrieval smoke test with a single question; the cell displays the returned nodes.
recursive_retrieve('SuperCLUE通用基准数据集哪几个维度构成？')

In [None]:
import pandas as pd

# Evaluation: to measure retrieval accuracy, prepare eva_cases.xlsx with one
# question and its expected page number per row ("question" and "page" columns).
# Load the evaluation cases from ./data/eva_cases.xlsx
eval_cases = pd.read_excel("./data/eva_cases.xlsx")

def score_retrieval(eval_cases):
    """Score retrieval quality over a table of evaluation cases.

    For every row, the question is retrieved with ``recursive_retrieve`` and
    the first result whose ``page_num`` equals the expected page earns
    ``top_k - rank`` points (rank 0 is the best hit); no match earns 0.

    Args:
        eval_cases: DataFrame with a ``"question"`` and a ``"page"`` column.

    Returns:
        The sum of the per-question scores (0 for an empty table).
    """
    total_score = 0
    for _, row in eval_cases.iterrows():

        score = 0
        max_score = top_k

        question = row["question"]
        true_page = row["page"]

        # Retrieve nodes for the question
        nodes = recursive_retrieve(question)

        # Only the best-ranked hit on the expected page counts.
        for rank, node in enumerate(nodes):
            retrieved_page = node.node.metadata["page_num"]
            if retrieved_page == true_page:
                score += max_score - rank
                break

        total_score += score
        print(f"Question: {question},true_page: {true_page}, score: {score}")

    # Guard the average: an empty table used to raise ZeroDivisionError.
    # NOTE(review): the x20 factor maps the average onto 0-100 only when the
    # per-question maximum (top_k) is 5 - confirm the intended scale.
    num_cases = len(eval_cases)
    average_score = (total_score / num_cases) * 20 if num_cases else 0
    print('\nAverage score:', average_score)

    return total_score

# Calculate the score
#total_score = score_retrieval(eval_cases)

In [None]:
from llama_index.core.query_engine import CustomQueryEngine, SimpleMultiModalQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core.schema import ImageNode, NodeWithScore, MetadataMode
from llama_index.core.prompts import PromptTemplate
from llama_index.core.base.response.schema import Response
from typing import Optional
from llama_index.multi_modal_llms.dashscope import DashScopeMultiModal
from llama_index.core.multi_modal_llms import MultiModalLLM
import os
import base64
from doubao import DoubaoVisionLLM

# Vision model client used by the query engine below; model_name is passed through to DoubaoVisionLLM.
lvm = DoubaoVisionLLM(model_name='ep-20250205153642-hzqpj')

QA_PROMPT_TMPL = """\
以下是幻灯片中解析的Markdown文本和图片信息。Markdown文本已经尝试将相关图表转换为表格。
优先使用图片信息来回答问题。在无法理解图像时才使用Markdown文本信息。

---------------------
{context_str}
---------------------

-- 根据上下文信息并且不依赖先验知识, 回答查询。
-- 解释你是从解析的markdown、还是图片中得到答案的, 如果有差异, 请说明最终答案的理由。
-- 尽可能详细的回答问题。
-- 给出你重点参考的图片路径。

输出格式：{{"response": #你的Markdown格式的回答#, "image_path": [#与答案最相关的图片路径#]}}

查询: {query_str}
答案: """

# Default prompt of MultimodalQueryEngine; filled with context_str and query_str at query time.
QA_PROMPT = PromptTemplate(QA_PROMPT_TMPL)

class MultimodalQueryEngine(CustomQueryEngine):
    """Query engine that answers from retrieved page text plus the matching page images.

    Retrieval goes through the module-level ``recursive_retrieve``; the
    answer is produced by ``multi_modal_llm.generate_response``.
    """

    # Prompt with ``context_str`` and ``query_str`` placeholders.
    qa_prompt: PromptTemplate
    # Model client; must provide ``generate_response(prompt=..., image_paths=...)``.
    multi_modal_llm: MultiModalLLM | DoubaoVisionLLM

    def __init__(self, qa_prompt: Optional[PromptTemplate] = None, **kwargs) -> None:
        """Initialize, falling back to the module-level QA_PROMPT when no prompt is given."""
        super().__init__(qa_prompt=qa_prompt or QA_PROMPT, **kwargs)

    def image_to_base64(self,image_path):
        """Return the content of the file at ``image_path`` as a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def custom_query(self, query_str: str):
        """Answer ``query_str`` from the retrieved pages and their images.

        Returns:
            A ``Response`` whose text is ``str()`` of the model output, whose
            ``source_nodes`` are the retrieved nodes, and whose metadata holds
            the retrieved text nodes and the derived image nodes.
        """
        nodes = recursive_retrieve(query_str)

        # One ImageNode per retrieved text node, pointing at that page's image.
        image_nodes = [
            NodeWithScore(node=ImageNode(image_path=n.metadata["image_path"]))
            for n in nodes
        ]

        # Context = LLM-visible content of each node, followed by the image it
        # came from. The inner quotes differ from the f-string's own quotes:
        # reusing the same quote is a SyntaxError before Python 3.12.
        context_str = "\n\n".join(
            r.get_content(metadata_mode=MetadataMode.LLM) + f'\n以上来自图片：{r.metadata["image_path"]}'
            for r in nodes
        )
        fmt_prompt = self.qa_prompt.format(context_str=context_str, query_str=query_str)

        response = self.multi_modal_llm.generate_response(
            prompt=fmt_prompt,
            image_paths = [image_node.node.image_path for image_node in image_nodes]
        )

        return Response(
            response=str(response),
            source_nodes=nodes,
            # Report the nodes used for this answer, not the notebook-wide
            # `text_nodes` list of every page.
            metadata={"text_nodes": nodes, "image_nodes": image_nodes},
        )
    
# Engine instance used for the queries below; it keeps the default QA prompt.
multi_query_engine = MultimodalQueryEngine(multi_modal_llm=lvm)


In [None]:
response = multi_query_engine.query("3月份中文大模型评测，通用能力水平最高的模型前五名是谁？")

import json
from IPython.display import Markdown


def _parse_llm_answer(raw_text):
    """Split the model reply into ``(answer, image_paths)``.

    The prompt asks for a JSON object with ``response`` and ``image_path``
    keys, but a model reply is not guaranteed to follow it. A surrounding
    Markdown code fence is removed before parsing; when the text still is not
    a JSON object, the raw text is returned as the answer with no images.
    """
    text = raw_text.strip()
    if text.startswith("```"):
        # Drop the opening fence line and, if present, the closing one.
        fence_lines = text.splitlines()[1:]
        if fence_lines and fence_lines[-1].strip().startswith("```"):
            fence_lines = fence_lines[:-1]
        text = "\n".join(fence_lines)
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return raw_text, []
    if not isinstance(parsed, dict):
        return raw_text, []
    paths = parsed.get("image_path", [])
    if isinstance(paths, str):
        # A single path given as a string must not be iterated per character.
        paths = [paths]
    return parsed.get("response", ""), paths or []


print("\n*****************************************************************RESPONSE*****************************************************************************************\n")
answer, image_paths = _parse_llm_answer(response.response)

# Format the answer and image paths in markdown
markdown_output = f"### Answer:\n\n{answer}\n\n### Images:\n"
for image_path in image_paths:
    markdown_output += f"![Image]({image_path})\n"

display(Markdown(markdown_output))

In [None]:
from markitdown import MarkItDown

# Convert a PDF with MarkItDown and print the extracted text content.
md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
result = md.convert("data/中文大模型基准测评2024年度报告_simple.pdf")
print(result.text_content)

In [None]:
from markitdown import MarkItDown
from doubao import DoubaoVisionLLM

# Imported here so this cell does not depend on the query cell having run first.
from IPython.display import Markdown

# Prompt (Chinese) asking the vision model to transcribe an image into Markdown.
# The original literal opened with four quotes, which put a stray "' " at the
# very start of the prompt text sent to the model.
llm_prompt = '''
用中文提取图片中的详细信息，并使用Markdown格式化输出。
-- 对于其中的文字，使用OCR识别，并尽量保持原格式或类似格式输出。
-- 对于其中的表格与统计图表信息，选择表格结合文字的方式进行描述。
-- 对于其他有意义的图像部分，请使用文字描述。 
-- 合理排版，使得输出内容清晰易懂
'''

lvm = DoubaoVisionLLM()

result = lvm.generate_response(prompt=llm_prompt, image_paths=["data/report_test.png"])
display(Markdown(result))