In [1]:
!pip install google-cloud-documentai pypdf openai

Collecting google-cloud-documentai
  Downloading google_cloud_documentai-2.34.0-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-cloud-documentai)
  Downloading google_api_core-2.21.0-py3-none-any.whl.metadata (2.8 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0dev,>=2.14.1 (from google-cloud-documentai)
  Downloading google_auth-2.35.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-cloud-documentai)
  Downloading proto_plus-1.25.0-py3-none-any.whl.metadata (2.2 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2 (from google-cloud-documentai)
  Downloading protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Colle

In [2]:
import os
from typing import Optional, Literal
from pathlib import Path
from glob import glob

from pydantic import BaseModel, SecretStr
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.oauth2 import service_account
from pypdf import PdfReader
import openai
from IPython.display import display, Markdown

In [9]:
# GCP Config
# Each setting may be supplied through an environment variable of the same
# name; the literals below are only fallbacks ("..." is a placeholder that
# must be replaced before the notebook can talk to Document AI).
GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "...")
GCP_LOCATION = os.environ.get("GCP_LOCATION", "us")
GCP_PROCESSOR_ID = os.environ.get("GCP_PROCESSOR_ID", "...")
GCP_CREDENTIAL_PATH = os.environ.get("GCP_CREDENTIAL_PATH", "...")

# single file
file_path = "../data/pdf/EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations.pdf"

# multiple files
pdf_path = "../data/pdf"

mime_type = "application/pdf"

# setdefault keeps a key that is already exported in the environment instead
# of overwriting it with the placeholder; never commit a real key here.
os.environ.setdefault("OPENAI_API_KEY", "...")

In [5]:
class GCPConfig(BaseModel):
    """Validated bundle of the Google Cloud settings used to reach a Document AI processor."""

    # Google Cloud project that owns the processor.
    GCP_PROJECT_ID: str
    # Processor location; only "us" and "eu" pass validation.
    GCP_LOCATION: Literal["us", "eu"]
    # Identifier of the Document AI processor to call.
    GCP_PROCESSOR_ID: str
    # Filesystem path to the service-account credential file.
    GCP_CREDENTIAL_PATH: str

class DocumentAI:
    """Send one local file to a Document AI processor.

    Constructing an instance builds the service client, resolves the
    processor resource name and reads the file into a ``RawDocument``.
    """

    def __init__(self, filepath, gcp_config: GCPConfig, mime_type="application/pdf"):
        """Prepare the client and load ``filepath`` into memory.

        Args:
            filepath: Path of the file to process.
            gcp_config: Project, location, processor and credential settings.
            mime_type: MIME type attached to the raw document.
        """
        self.filepath = filepath
        self.mime_type = mime_type

        options = self._get_client_options(gcp_config)
        creds = self._get_credentials(gcp_config)
        self.client = documentai.DocumentProcessorServiceClient(
            client_options=options, credentials=creds
        )

        # The processor path is built by the client, so it must come after it.
        self.process_name = self._get_process_name(gcp_config)
        self.load_file()

    def load_file(self):
        """Read the file as bytes, store it as ``self.raw_document`` and return it."""
        with open(self.filepath, "rb") as stream:
            payload = stream.read()

        raw = documentai.RawDocument(content=payload, mime_type=self.mime_type)
        self.raw_document = raw
        return raw

    def run_ocr(self, pages):
        """Process only the selected pages and return the service response.

        Args:
            pages: Page numbers forwarded to the individual page selector.

        Returns:
            The result of ``client.process_document`` for the request.
        """
        if self.raw_document is None:
            self.load_file()

        selector = documentai.ProcessOptions.IndividualPageSelector(pages=pages)
        options = documentai.ProcessOptions(individual_page_selector=selector)
        request = documentai.ProcessRequest(
            name=self.process_name,
            raw_document=self.raw_document,
            process_options=options,
        )
        return self.client.process_document(request=request)

    def _get_client_options(self, gcp_config):
        """Build client options pointing at the location-specific API endpoint."""
        endpoint = f"{gcp_config.GCP_LOCATION}-documentai.googleapis.com"
        return ClientOptions(api_endpoint=endpoint)

    def _get_credentials(self, gcp_config):
        """Load service-account credentials from the configured file path."""
        key_path = gcp_config.GCP_CREDENTIAL_PATH
        return service_account.Credentials.from_service_account_file(key_path)

    def _get_process_name(self, gcp_config):
        """Return the processor resource path for project, location and processor id."""
        return self.client.processor_path(
            gcp_config.GCP_PROJECT_ID,
            gcp_config.GCP_LOCATION,
            gcp_config.GCP_PROCESSOR_ID,
        )

class DocumentParser:
    """Per-page accessors over a processed document.

    The wrapped ``document`` must expose ``text`` (the full extracted text)
    and ``pages``; each page carries ``page_number``, ``lines`` and ``image``.
    """

    def __init__(self, document):
        """Keep a reference to ``document`` and to its full text."""
        self.document = document
        self.text = document.text

    @property
    def pages(self):
        """Pages of the wrapped document."""
        return self.document.pages

    def _find_page(self, page_number):
        """Return the first page whose ``page_number`` matches.

        Raises:
            ValueError: If no page in the document has that number.
        """
        for page in self.pages:
            if page.page_number == page_number:
                return page
        raise ValueError(
            f"page {page_number} not found in the processed document"
        )

    def get_content_by_page(self, page_number):
        """Return the text of one page, concatenated line by line.

        Each line's text anchor holds segments that are start/end offsets
        into the document-level text; the slices are joined in order.

        Raises:
            ValueError: If ``page_number`` is not present in the document.
        """
        page = self._find_page(page_number)
        return "".join(
            self.text[segment.start_index:segment.end_index]
            for line in page.lines
            for segment in line.layout.text_anchor.text_segments
        )

    def get_image_by_page(self, page_number):
        """Return the image content stored for one page.

        Raises:
            ValueError: If ``page_number`` is not present in the document.
        """
        return self._find_page(page_number).image.content
    
def refine(ocr_processed, image, model="gpt-4o-mini", mime_type="image/png"):
    """Ask a vision-capable chat model to correct OCR text against the page image.

    Args:
        ocr_processed: OCR text of the page, embedded verbatim in the prompt.
        image: Base64-encoded image data (a string), embedded in a data URL.
        model: Chat model name passed to the completions endpoint.
        mime_type: MIME type declared in the data URL for ``image``.

    Returns:
        The text of the first completion choice, or an empty string when the
        response carries no text content.
    """
    client = openai.OpenAI()

    prompt = f"""# Role
You are an expert at comparing and refining OCR processing and images.
Given an image and OCR processing, refine the OCR processing and answer the question.
If a table exists, convert it to markdown and answer.
Do not include ``` or ```markdown in your answer.
** Don't add anything else and only answer the refined content. **
# ocr_processed
{ocr_processed}
# image
"""
    # The payload must follow "base64," directly; a space after the comma
    # makes the data URL malformed.
    image_url = f"data:{mime_type};base64,{image}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    # Normalise a missing content field to "" so callers always get a string.
    return response.choices[0].message.content or ""

def transform_to_base64(byte):
    """Encode a bytes-like object as base64 and return it as a UTF-8 string."""
    from base64 import b64encode

    encoded = b64encode(byte)
    return encoded.decode("utf-8")

In [10]:
gcp_config = GCPConfig(
    GCP_PROJECT_ID=GCP_PROJECT_ID,
    GCP_LOCATION=GCP_LOCATION,
    GCP_PROCESSOR_ID=GCP_PROCESSOR_ID,
    GCP_CREDENTIAL_PATH=GCP_CREDENTIAL_PATH
)
dai = DocumentAI(file_path, gcp_config)

# Pages are sent in batches; a single request is presumably capped at
# 15 pages - TODO confirm against the Document AI limits for this processor.
MAX_PAGES_PER_REQUEST = 15

pages = len(PdfReader(file_path).pages)

# 1-based, contiguous page ranges. The window width must equal the stride,
# otherwise the last page of every full batch (15, 30, ...) is never sent.
page_splits = [
    range(start, min(start + MAX_PAGES_PER_REQUEST, pages + 1))
    for start in range(1, pages + 1, MAX_PAGES_PER_REQUEST)
]

for page_split in page_splits:
    result = dai.run_ocr(page_split)
    document = result.document

    dp = DocumentParser(document)

    for page in page_split:
        content = dp.get_content_by_page(page)
        image = dp.get_image_by_page(page)
        image = transform_to_base64(image)

        print(f"{page=}")
        display(Markdown(refine(content, image)))

page=1


arXiv:2410.10315v2 [cs.CL] 15 Oct 2024  
EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations  
Zhangchi Feng¹, Dongdong Kuang¹, Zhongyuan Wang¹, Zhijie Nie¹, Yaowei Zheng¹, Richong Zhang¹*  
¹CCSE, School of Computer Science and Engineering, Beihang University, Beijing, China  
{zcmuller, kuangdd, wangzy23, hiyouga}@buaa.edu.cn, {niezj, zhangrc}@act.buaa.edu.cn  

Abstract  
This paper presents EasyRAG, a simple, lightweight, and efficient retrieval-augmented generation framework for automated network operations. Our framework has three advantages. The first is accurate question answering. We designed a straightforward RAG scheme based on (1) a specific data processing workflow (2) dual-route sparse retrieval for coarse ranking (3) LLM Reranker for reranking (4) LLM answer generation and optimization. This approach achieved first place in the GLM4 track in the preliminary round and second place in the GLM4 track in the semifinals. The second is simple deployment. Our method primarily consists of BM25 retrieval and BGE-reranker reranking, requiring no fine-tuning of any models, occupying minimal VRAM, easy to deploy, and highly scalable; we provide a flexible code library with various search and generation strategies, facilitating custom process implementation. The last one is efficient inference. We designed an efficient inference acceleration scheme for the entire coarse ranking, reranking, and generation process that significantly reduces the inference latency of RAG while maintaining a good level of accuracy; each acceleration scheme can be plug-and-play into any component of the RAG process, consistently enhancing the efficiency of the RAG system. Our code and data are released at https://github.com/BUAADreamer/EasyRAG.

1 Introduction  
Our solution can be summarized by Fig. 1, which includes a data processing workflow (Section 1.1) and the RAG process (Section 1.2).  
*Corresponding author  
'This work is a technical report of our solution at the 2024 (7th) CCF International AIOps Challenge. The official website of the Challenge is https://competition.aiops-challenge.com

1.1 Ingestion  
1.1.1 zedx file processing  
Due to the discovery that the original processing script missed some files, we have reprocessed the zedx files using the following steps:  
1. zedx Decompression: Decompress the official source data from four .zedx files, obtaining four packages of HTML documents.  
2. Path Parsing: Read the knowledge path and the actual file path from the nodetree.xml in each document package.  
3. Document Extraction: Extract the text, image titles, and image paths from each HTML document using BeautifulSoup.  
4. Saving: Save the document text in txt format, maintaining the relative location consistent with the HTML document. Also, save the knowledge path, file path, and image path information.  

1.1.2 Text Segmentation  
Segmentation Settings: We used SentenceSplitter for document segmentation, initially splitting into sentences using Chinese punctuation, then merging according to the set text block size. The used block size (chunk-size) is 1024, and the block overlap size (chunk-overlap) is 200.  

Eliminating Path Influence in Segmentation: In practice, we found that the original implementation of llama-index used a simple but unstable method of handling path information, subtracting the file path length from the text length to determine the actual text length used. This approach could cause different segmentation results with the same chunk-size and chunk-overlap, depending on the data path. During the preliminary competition, we observed that changing paths could lead to a fluctuation of...

page=2


1. Ingestion
   - Documents

2. EasyRAG
   - question
   - bm25
   - query rewrite
   - split
   - chunks:
     - chunk 1
     - chunk 2
   - chunk retrieval
   - metadata:
     - file path
   - Rerank Doc Embed / Retrieval filter
   - knowledge path --> Retrieval Doc Embed
   - extract:
     - LLM Context Embed
     - image content
     - generate describe
     - paddleocr
     - filtered images
     - image path
     - word rule
     - sentence rule
     - glm4v-96-chat
     - filter
   - chunk 1
   - chunk n-1
   - chunk k
   - chunk n
   - bge- -> reranker-v2-
   - minicom
   - rerank
   - generate
   - GLM-4
   - Ans':
   - path retrieval
   - chunk ntm
   - query <--
   - refine
     - GLM-4
     - Ans

Figure 1: EasyRAG Framework

Up to 3 percentage points in the final evaluation results is unacceptable in practice. To address this issue, we implemented a custom segmentation class that eliminates the use of path length, ensuring stable reproducibility.

1.1.3 Image Information Extraction
- Image Content Extraction Using a Multimodal Large Model
  - We extracted information from all images using GLM-4V-9B (GLM et al., 2024). The simple prompt that achieves good results is: 
    - Briefly describe the image.
  
- Image Filtering Based on Various Rules
  - We found that a small number of images are beneficial for the final question answering, but not all images are useful. Therefore, we designed a flexible strategy to filter out useless images using the following steps:
    1. Use the PP-OCRv4 model to extract text content from images and filter out images that do not contain Chinese.
    2. Filter images whose titles contain specific keywords (e.g., network diagrams, architecture).
    3. Filter images that are referenced in the text in a specific way (e.g., configuration as shown in Figure x, file as shown in Figure x).
  
With these filtering steps, we reduced the number of images from an original 6000 to fewer than 200. The filtering process is easily configurable, allowing for tuning to suit real-world scenarios.

1.2 RAG Pipeline
1.2.1 Query Rewriting
- Given the queries were very brief, we identified issues with some queries being semantically awkward or having unclear keywords. For instance, "What types of alarms are there in EMSPLUS?" and "What are the sources of faults?". Before inputting these queries into the RAG Pipeline, we used a Large Language Model (LLM, GLM4) for query rewriting, which involved two methods: query expansion and Hypothetical Document Embedding (HyDE) (Gao et al., 2022).

Query Expansion
- During the preliminary round, we summarized the characteristics of queries in the current operational maintenance scenario:
  - Technical keywords in queries are crucial.
  - Queries are short and vary greatly in the amount of information provided.

In this context, we attempted to summarize the key terms in the queries or other potentially relevant keywords using the LLM, i.e., using the LLM's knowledge for keyword association and summary in the fields of operation and communication. This is referred to as keyword expansion.

page=3


After manually annotating several data points with keywords and potential associations, we utilized the LLM (GLM4) for few-shot keyword summarization and expansion. Following (Wang et al., 2023), we generated new queries by directly concatenating the expanded keywords with the original query and then re-summarizing them using a large language model.

Let L represent the Large Language Model LLM, with q and p denoting the initial query and the prompt, respectively. Pexp represents the expanded query prompt, including manually annotated data points, and psum represents the prompt for summarizing and concatenating the sentence and expanded keywords using the large model.

In situations where queries lack specificity or identifiable elements, making it difficult for both dense and sparse retrieval methods to locate the target document, we designed a set of hypothetical document embedding methods, inspired by (Gao et al., 2022).

For the generation of fictional documents, we devised two approaches: 
1. Following the paper's methodology, we input the prompt phy and the original question q into the large language model L to produce the fictional document g0. 
2. During the semifinals, we discovered that such fictional documents contained a significant amount of irrelevant keywords and redundant information due to the large model's hallucinations, greatly affecting the effectiveness of the retrieval process. Therefore, we attempted to minimize the hallucinations and redundant information in the initial fictional document g0 by using the BM25 algorithm and dense retrieval (using GTE-QWEN encoding) to identify the most relevant top-1 document and use it for context prompting.

For the generated fictional documents, we also adopted two application methods: 
1. Using the fictional document q' combined with the original document q for coarse ranking retrieval. 
2. Using only the fictional document q' combined with the original document q for re-ranking of retrieval results.

### Dual-route Sparse Retrieval for Coarse Ranking

In the sparse retrieval section, we utilized the BM25 algorithm to construct the retriever. The core idea of BM25 is based on term frequency (TF) and inverse document frequency (IDF), and it also incorporates document length information to calculate the relevance between the document and query q. Specifically, the BM25 retriever primarily consists of a Chinese tokenizer and a stopword list. We will introduce each component in detail.

#### Chinese Tokenizer
For the Chinese tokenizer, we used the widely known jieba Chinese tokenizer, which is lightweight and supports multi-threaded mode to accelerate tokenization and part-of-speech analysis. It allows for customization of word frequency or dictionaries to adjust tokenization preferences. We also attempted to customize the vocabulary; in the current 5G communication maintenance scenario, we chose a related IT field lexicon collected by Tsinghua University loaded into the tokenizer. However, the results in practice were mediocre, so we ultimately continued using the original jieba lexicon.

#### Stopword List
For the Chinese stopword list, we adopted the common Chinese stopword list collected by Harbin Institute of Technology as a reference for filtering out meaningless words during Chinese tokenization. By filtering out irrelevant words and special symbols, we improve the hit rate of valid keywords and increase the recall rate of correct documents.

### Dual-route Retrieval
The BM25 dual-route retrieval for coarse ranking consists of text block retrieval and path retrieval.
1. **Text block retrieval:** Use BM25 to search the segmented text blocks, recalling the top 192 text blocks with a coarse ranking score greater than 0.
2. **Path retrieval:** Considering that some questions are highly relevant to our extracted knowledge paths, such as the question "How...".

page=4


many types of VNF elasticity are there?",
where both VNF and elasticity can be directly found in related knowledge paths. Hence, we designed a path search using BM25 to search the knowledge paths, recalling the top 6 text blocks with a coarse ranking score greater than 0.

Retrieval Process The BM25 retriever follows the document retrieval process below for a given query q:
1. Document Expansion. For text block retrieval, we concatenate the file path and each text block together to serve as expanded documents for retrieval.
2. Document Preprocessing. First, filter all documents (text blocks or paths) with stopwords, then use the Chinese tokenizer for tokenization, and pre-compute the IDF scores of the documents.
3. Query Processing. Filter the query q with stopwords and perform Chinese tokenization.
4. Similarity Recall. Count the keywords of query q and calculate the TF values of each document, compute the relevance scores based on TF and IDF values, and recall relevant documents based on scores.
5. File Path Filtering. For text block retrieval, we use the file paths provided in the competition to compare metadata, filtering out text blocks from other sources.

Dense Retrieval for Coarse Ranking
In the dense retrieval section, we employed the gte-Qwen2-7B-instruct model developed by Alibaba (Li et al., 2023), which has achieved advanced results on the MTEB benchmark.

Retrieval Process The dense retriever for a given query q follows the specific document retrieval process as outlined below:
1. Document Expansion. We concatenate the file path with each text block to serve as expanded documents for retrieval.
2. Document Encoding. All text blocks are input into the model to be encoded and the representations are stored in a Qdrant vector database.
3. Query Encoding. Using a query prompt template, we transform q into an input suitable for the GTE model and encode it using the model.
4. Similarity Recall. During retrieval, cosine similarity is used for matching, recalling the top 288 text blocks.
5. File Path Filtering. Using the file paths provided in the competition, we employ a Qdrant filter to eliminate text blocks from other sources.

LLM Reranker Re-ranking
We utilized the bge-reranker-v2-minicpm-layerwise model (Chen et al., 2024), a LLM Reranker trained on a hybrid of multiple multilingual ranking datasets using MiniCPM-2B-dpo-bf16. This model exhibits advanced ranking performance in both Chinese and English and includes accompanying tool code, which can be conveniently fine-tuned for specific scenarios.

Re-ranking Process The LLM-Reranker for a given query q and k' coarsely ranked text blocks follows the specific document ranking process as outlined below:
1. Document Expansion. We concatenate the knowledge paths with each text block to serve as expanded documents for retrieval.
2. Text Processing. Combine q with the k' text blocks to form k' query-document pairs, which are then input into the tokenizer to generate input data for the LLM.
3. Similarity Ranking. The input data is fed into the LLM to obtain re-ranking scores for the query and each text block, and the blocks are sorted according to these scores. The highest ranked k (typically 6) text blocks are returned.

Multi-route Ranking Fusion
Fusion Algorithm Since we designed multiple routes for coarse retrieval, it is also necessary to design corresponding ranking fusion strategies. We primarily used two strategies: simple merging and Reciprocal Rank Fusion (RRF). The simple merging strategy directly de-duplicates and merges text blocks obtained from multiple routes. Reciprocal Rank Fusion sums the reciprocals of the ranks of the same document across multiple retrieval paths to compute the fusion score for re-ranking.

page=5


Coarse Ranking Fusion  
The most straightforward use of ranking fusion is to merge the text blocks obtained from multi-route coarse retrieval into a single set of text blocks, which are then passed to the Reranker for re-ranking. In the semifinals, we used simple merging to combine results from two sparse retrieval routes.  

Re-ranking Fusion  
We can also perform fusion after coarse ranking and re-ranking for each route. In the preliminary rounds, we fused text blocks from sparse and dense retrieval routes. For these two routes, we designed three re-ranking fusion methods.  
1. Use RRF to merge the results after coarse and fine ranking.  
2. Input the text blocks from each route into the LLM to obtain respective answers, selecting the longer answer as the final one.  
3. Input the text blocks from each route into the LLM to obtain respective answers and directly concatenate the answers from all routes.  

1.2.6 LLM Answer Generation  
In this section, we first concatenate the contents of the top 6 text blocks obtained from re-ranking using the following template to create a context string:  
### Document 0: {chunk_i}  
### Document 5: {chunk_i}  

Note that the text blocks input into GLM4 here include concatenated image content, whereas the text blocks in the previous coarse and re-ranking processes did not include image content.  

We then combine the context string and the question using the following question-and-answer template, and input it into GLM4 to obtain an answer:  
The context information is as follows:  
{context_str}  
Please answer the following question based on the context information and not your own knowledge. Answers can be itemized. If the context does not contain relevant information, you may respond with "uncertain" and should not restate the context information:  
{query_str}  
Answer:  

Additionally, we have designed other formats of question-and-answer templates. Inspired by Chain-of-Thought (Wei et al., 2022), we designed a Chain-of-Thought question-and-answer template (see Appendix A.2). Drawing from COSTAR (Teo, 2023), we designed a markdown format question-and-answer template (see Appendix A.1). To emphasize the importance of the top 1 document, we designed a focused question-and-answer template (see Appendix A.3). Related experimental results are discussed therein.  

1.2.7 LLM Answer Optimization  
Due to our observation that the LLM gives attention to each text block, which may result in the effective information from the top 1 text block not being fully utilized, we designed an answer integration prompt (see Appendix B). This prompt allows us to integrate and supplement the answers derived from the 6 text blocks using the top 1 text block, leading to the final answer.  

2 Accuracy  
2.1 Abbreviations Introduction  
For ease of writing, we first introduce some important component identifiers.  
Data ① represents the official processed txt data.  
① represents our own processed version 0 txt data, which supplements some missing data compared to the official data.  
② is similar to ① but each txt begins with a concatenated knowledge path.  
③ represents our own processed version 1 txt data.

page=6


| id | data | chunk       | coarse ranking | re-ranking | fusion | accuracy |
|----|------|-------------|----------------|------------|--------|----------|
| 0  | 0    | 1024,50    | 0,3            | -          | -      | 57.86    |
| 1  | 0    | 1024,50    | 1,8            | -          | -      | 68.59    |
| 2  | 0    | 1024,50    | 3,8            | -          | -      | 69.55    |
| 3  | 0    | 1024,50    | 1,192          | 0,8        | -      | 73.73    |
| 4  | 0    | 1024,50    | 1,256          | 0,8        | -      | 70.68    |
| 5  | 0    | 1024,50    | 2,192          | 0,8        | -      | 69.25    |
| 6  | 0    | 1024,50    | 1,288          | 2,8        | -      | 77.07    |
| 7  | 1    | 1024,50    | 1,288          | 2,8        | -      | 77.51    |
| 8  | 2    | 1024,50    | 1,288          | 2,8        | -      | 77.92    |
| 9  | 2    | 1024,50    | 1,256          | 2,8        | -      | 78.49    |
| 10 | 5    | 1024,50    | 3,192          | 2,6        | -      | 80.90    |
| 11 | 2    | 1024,50    | 3,192          | 2,6        | -      | 81.38    |
| 12 | 2    | 1024,100   | 3,192          | 3,6        | -      | 81.77    |
| 13 | 2    | 1024,100   | 3,192          | 3,6        | -      | 81.88    |
| 14 | 2    | 1024,200   | 3,192          | 3,6        | -      | 82.87    |
| 15 | 2    | 1024,200   | 3,192          | 3,6        | -      | 82.97    |
| 16 | 2    | 1024,200   | 4,288          | 3,6        | -      | 83.02    |
| 17 | 2    | 1024,200   | 4,288          | 3,192      | (3),6  | 81.80    |
| 18 | 2    | 1024,200   | 4,288          | 3,192      | -      | 82.50    |
| 19 | 2    | 1024,200   | 4,288          | 3,192      | 3).6   | 83.45    |
| 20 | 2    | 1024,200   | 4,288          | 3,192      | -      | 83.70    |
| 21 | 2    | 1024,200   | 4,288          | 3,192      | -      | 84.38    |

page=7


id | data      | chunk      | coarse ranking | re-ranking | fusion | image | answer merge | accuracy
---|-----------|------------|----------------|------------|--------|-------|--------------|---------
0  | 960,200   | 3,192      | (3).6          | -          | -      | -     | -            | 91.53
1  | 960,200   | 4,288      | (3).6          | -          | -      | -     | -            | 88.40
2  | 960,200   | 4,288, 3,192 | (3).6        | 2          | -      | -     | -            | 90.00
3  | 1024,200  | 3,192      | (3).6          | -          | -      | -     | -            | 90.26
4  | 1024,200  | 3,192      | (3).6          | -          | -      | -     | -            | 91.38
5  | 1024,200  | 3,192      | (3).6          | -          | -      | -     | -            | 92.70
6  | 1024,200  | 3,192.1    | (3).6          | -          | -      | -     | -            | 89.30
7  | 1024,200  | 3,192.2    | (3).6          | -          | -      | -     | -            | 87.12
8  | 1024,200  | 3,192.1    | (3).6          | -          | -      | -     | -            | 92.43
9  | 1024,200  | 3,192.2    | (3).6          | -          | -      | -     | -            | 93.11
10 | 1024,200  | 3,192.2    | (3).6          | -          | -      | -     | -            | 90.17
11 | 1024,200  | 3,192.2    | (3).6          | -          | OCR Filter | -  | -            | 92.5
12 | 1024,200  | 3,192.2    | (3).6          | -          | Rule Filter | -| -            | 94.24
13 | 1024,200  | 3,192.2    | (5).6          | -          | Rule Filter | -| -            | 94.49
14 | 1024,200  | 3,192.5    | (3).6          | -          | Rule Filter | -| document concat | 96.65
15 | 1024,200  | 3,192.6    | (3).6          | -          | Rule Filter | -| prompt merge   | 95.72

page=8


Method | Preliminary Accuracy
--- | ---
Original | 82.0
Concat | 78.2
Summary | 79.4

Method | Semi-final Accuracy
--- | ---
Original | 92.7
Retrieval+HyDE | 89.2
rerank+HyDE | 88.2

Prompt Type | Semi-final Accuracy
--- | ---
Normal QA Template | 94.49
COT QA Template | 89.75
Markdown Format QA Template | 92.27
Focused QA Template | 93.51

page=9


5 Inference Latency  
5.1 Standard Scheme  
**Standard Time Delay**  
In the semi-final's standard scheme, we set the batch size for re-ranking to 32, with the inference latency for a question being 26 seconds, of which document sorting takes 6 seconds, and calling GLM4 twice takes 20 seconds.  

**Removing Answer Integration**  
By eliminating the answer integration step and directly returning the top 6 generated answers, only one call to GLM4 is needed, reducing the inference latency to 16 seconds.  

**Increasing Re-ranking Batch Size**  
Increasing the batch size to 256 increases the GPU memory usage but can reduce the inference latency to 24 seconds.  

**Full Process Acceleration Scheme**  
Beyond simple optimization strategies, we have also designed a full process acceleration scheme, which will be introduced in the following three subsections. This scheme aims to reduce time costs at each step. Due to the instability of GLM4 outputs, all experiments in this section terminate after the first generation of answers, without the final answer integration step, allowing for a more rigorous comparison of the impact of various acceleration methods on performance.  

5.2 BM25 Acceleration  
Since our retrieval stage relies heavily on BM25 for keyword matching, we introduced the bm25s (Lù, 2024) library to optimize the speed of BM25 retrieval.  

| Implementation  | Time (s) | Accuracy  |
|-----------------|----------|-----------|
| BM25Okapi       | 17       | 94.49     |
| BM25s           | 0.05     | 94.24     |

**Table 6:** Effects of BM25 acceleration on the test set. Time represents the total search time for 103 questions related to BM25, and accuracy represents the evaluation score of the final generated answers.  

5.3 Reranker Acceleration  
We used the bge-reranker-v2-minicpm-layerwise model developed by the Zhejiang University's Institute for AI (Chen et al., 2024) as the LLM Reranker. This model supports customization of the number of inference layers, allowing selection from 8-40 layers based on one's needs and resource constraints, thus reducing GPU memory overhead. In our preliminary experiments, we found that 28 layers performed slightly better than 40 layers, with a difference of about 0.2 points, consistent with the empirical research conclusions given in the original repository. Therefore, both the preliminary and semi-final accuracy experiments utilized 28 layers.  

However, since the Reranker is time-consuming in practical inference, we considered whether fewer layers could be used to speed up the process. Classic early-exit techniques in BERT, such as FastBERT (Liu et al., 2020) and DeeBERT (Xin et al., 2020), use information entropy exceeding a threshold as the condition for early exit, which is computationally intensive and results in unstable effects. Therefore, we designed a model early-exit algorithm based on maximum similarity selection; for each query, we check if the softmax similarity output at the 12th layer in the first batch contains any values exceeding a certain threshold; if so, this query is inferred using just 12 layers, otherwise 28 layers are used.  

We conducted an experiment using an A100 40G GPU to explore inference time, GPU memory usage, and accuracy at a batch size of 32, comparing different layers and early-exit methods. We randomly selected 10 queries and chose 192 text blocks for each, including 6 ground truth text blocks sorted using 28 layers in the complete RAG and 186 other random blocks.  

We predicted the sum of softmax scores of ground truth blocks relative to all blocks using various methods. Then, we assessed the similarity accuracy by dividing the predicted proportion by the proportion obtained with 28 layers and compared the ranking accuracy of predicted ground truth with the 28-layer results.  

It can be seen that our proposed model early-exit method, while reducing inference time by 33%, is able to maintain ranking results consistent with those obtained using 28 layers directly, surpassing the entropy selection methods.  

5.4 Context Compression  
We designed a context compression method based on BM25 semantic similarity, which we call BM25-Extract. For each chunk, we first split it into sentences, then use BM25 to calculate the similarity between the query and each sentence, and finally add sentences to the list in order of decreasing similarity until a set compression rate is reached. The sentences are then concatenated in their original order.

page=10


Method | Time(s) | Similarity (%) | Rank
--- | --- | --- | ---
8-layer | 1.67 | 73 | 2.5
12-layer | 2.20 | 88 | 3.2
20-layer | 3.58 | 86 | 4.0
28-layer | 5.25 | 100 | 6.0
40-layer | 7.71 | 100 | 5.4
Maximum (0.1) | 2.59 | 90 | 3.7
Maximum (0.2) | 3.55 | 96 | 4.5
Maximum (0.4) | 4.57 | 97 | 5.4
Entropy (0.2) | 2.74 | 89 | 3.4
Entropy (0.4) | 3.37 | 91 | 3.6
Entropy (0.6) | 4.01 | 91 | 4.0

Compression Algorithm | Compression Rate (%) | Tokens Saved | Accuracy | Time (s)
--- | --- | --- | --- | ---
Original Content | 100 | 0 | 94.49 | 9.30
LLMLingua(0.5) | 62.80 | 143k | 83.44 | 10.47
LongLLMLingua(0.5) | 62.80 | 143k | 80.86 | 10.52
BM25-Extract(0.5) | 65.92 | 160k | 86.48 | 8.16
BM25-Extract(0.8) | 83.84 | 59k | 89.00 | 7.70

page=11


Weijie Liu, Peng Zhou, Zhiruo Wang, Zhe Zhao, Haotang Deng, and Qi Ju. 2020. FastBERT: a self-distilling BERT with adaptive inference time. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 6035-6044, Online. Association for Computational Linguistics.  
Xing Han Lù. 2024. Bm25s: Orders of magnitude faster lexical search via eager sparse scoring. Preprint, arXiv:2407.03618.  
Sheila Teo. 2023. How I won Singapore's GPT-4 prompt engineering competition.  
Liang Wang, Nan Yang, and Furu Wei. 2023. Query2doc: Query expansion with large language models. arXiv preprint arXiv:2303.07678.  

## A Question-and-Answer Prompt Templates  

### A.1 Markdown Format Question-and-Answer Template  
```
## Objective
Please, based on the information from k private domain documents about 5G operational maintenance, answer the given question.
## Requirements
1. You may itemize your answer; be as detailed and specific as possible.
2. Do not merely repeat information from the context.
3. Do not use your own knowledge; rely solely on the content from the context documents.
## Context
{context_str}
## Question
{query_str}
## Answer
```
Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, 35:24824-24837.  

### A.2 Chain of Thought Question-and-Answer Template  
Context information as follows:  
{context_str}  
Ji Xin, Raphael Tang, Jaejun Lee, Yaoliang Yu, and Jimmy Lin. 2020. DeeBERT: Dynamic early exiting for accelerating BERT inference. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 2246-2251, Online. Association for Computational Linguistics.  
Please answer the following question based on the context information rather than your own knowledge. Think step by step, first provide an analysis process, then generate an answer:  
{query_str}  
Answer:  
11

page=12


### A.3 Focused Question-and-Answer Template  
Context information as follows:  
{context_str}  
Please answer the following question based on the context information rather than your own knowledge. You may itemize your answer. Document 0's content is particularly important, consider it carefully. If the context does not contain relevant knowledge, you may respond with 'uncertain'. Do not simply restate the context information:  
{query_str}  
Answer:  

## B Answer Integration Template  
Context:  
{top1_content_str}  
You will see a question and a corresponding reference answer. Please, based on the context knowledge and not your own knowledge, supplement the reference answer to make it more complete in addressing the question. Please note, strictly retain every character of the reference answer and reasonably integrate your supplement with the reference answer to produce a longer, more complete answer containing more terms and itemization.  
Question:  
{query_str}  
Reference answer:  
{answer_str}  
New answer:  
12