### Load Data and Construct VectorDatabase

In [1]:
from Utils import *
import os

# initialize the database
db = VectorDatabase()

# gpu if available
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:4000"

# initialize the database
db.initialize_process(chunk_size=256 ,chunk_overlap=200)

resource module not available on Windows
< VectorDatabase initialized > 
  - loading data into ChromaDB 
     - loading [faq] ...
        ... collection [faq] deleted.
     - loading [insurance] ...
        ... collection [insurance] deleted.
     - loading [finance] ...
        ... collection [finance] deleted.


### Vector Retriever

In [2]:
from Utils import *

# initialize the retriever
retriever = Retriever()
# do question 
retriever.process_questions(method='Vector')

# evaluate the accuracy
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized > 
< Retriever initialized > 
  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 87.33%
     - Category: [insurance], Accuracy: 90.00%
     - Category: [finance], Accuracy: 74.00%
     - Category: [faq], Accuracy: 98.00%


### BM25 Retriever

In [3]:
from Utils import *

# initialize the retriever
retriever = Retriever(question_path='enhanced_questions.json')
# do question 
retriever.process_questions(method='original')

# evaluate the accuracy
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized > 
< Retriever initialized > 


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\marks\AppData\Local\Temp\jieba.cache
Loading model cost 0.592 seconds.
Prefix dict has been built successfully.


  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 66.67%
     - Category: [insurance], Accuracy: 70.00%
     - Category: [finance], Accuracy: 44.00%
     - Category: [faq], Accuracy: 86.00%


### BM25 + Vector Fusion Retriever

In [None]:
from Utils import *

# initialize the retriever
retriever = Retriever(question_path='enhanced_questions.json')
# do question 
retriever.process_questions(method='BM25_Vector')

# evaluate the accuracy
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized > 
< Retriever initialized > 


## Llama index BM25

In [None]:
from Utils import *

# initialize the retriever
retriever = Retriever(question_path='enhanced_questions.json')
# do question 
retriever.process_questions(method='BM25')

# evaluate the accuracy
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized > 
< Retriever initialized > 
  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 80.67%
     - Category: [insurance], Accuracy: 96.00%
     - Category: [finance], Accuracy: 56.00%
     - Category: [faq], Accuracy: 90.00%


## Llama index BM25 filter

In [None]:
from Utils import *

# initialize the retriever
retriever = Retriever(question_path='enhanced_questions.json')
# do question 
retriever.process_questions(method='Vector_filter_BM25')

# evaluate the accuracy
evaluator = Evaluation()
evaluator.output_evaluation()

### weight

In [None]:
from Utils import *

# initialize the retriever
retriever = Retriever(question_path='enhanced_questions.json')
# do question 
retriever.process_questions(method='w')

# evaluate the accuracy
evaluator = Evaluation()
evaluator.output_evaluation()

resource module not available on Windows
< VectorDatabase initialized > 
< Retriever initialized > 


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\marks\AppData\Local\Temp\jieba.cache
Loading model cost 0.616 seconds.
Prefix dict has been built successfully.


  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 86.00%
     - Category: [insurance], Accuracy: 94.00%
     - Category: [finance], Accuracy: 66.00%
     - Category: [faq], Accuracy: 98.00%


In [None]:
from Utils import *
import os

# initialize the database
db = VectorDatabase()

# initialize the database
s = db.summary_index()

In [None]:
from llama_index.core import PromptTemplate
from transformers import AutoModelForCausalLM,AutoTokenizer
system_prompt = "你是一個來自台灣的AI助理，你的名字是 TAIDE，樂於以台灣人的立場幫助使用者，會用正體中文回答問題。"
# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

tokenizer = AutoTokenizer.from_pretrained(
"taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"
)
stopping_ids = [
tokenizer.eos_token_id,
tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# # Transform a string into input zephyr-specific input
# def completion_to_prompt(completion):
#     return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"


# # Transform a list of chat messages into zephyr-specific input
# def messages_to_prompt(messages):
#     prompt = ""
#     for message in messages:
#         if message.role == "system":
#             prompt += f"<|system|>\n{message.content}</s>\n"
#         elif message.role == "user":
#             prompt += f"<|user|>\n{message.content}</s>\n"
#         elif message.role == "assistant":
#             prompt += f"<|assistant|>\n{message.content}</s>\n"

#     # ensure we start with a system prompt, insert blank if needed
#     if not prompt.startswith("<|system|>\n"):
#         prompt = "<|system|>\n</s>\n" + prompt

#     # add final assistant prompt
#     prompt = prompt + "<|assistant|>\n"

#     return prompt


import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

Settings.llm = HuggingFaceLLM(
    model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1",
    tokenizer_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1",
    context_window=2048,
    max_new_tokens=400,
    generate_kwargs={"temperature": 0.6, "top_p": 0.9, 'do_sample': True},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    stopping_ids=stopping_ids,
    tokenizer_kwargs={"max_length": 8000},
    # messages_to_prompt=messages_to_prompt,
    # completion_to_prompt=completion_to_prompt,
    device_map="cuda",
    model_kwargs={"torch_dtype": torch.float16},
)

import logging

# 定義 logging 輸出格式
FORMAT = '%(asctime)s %(filename)s %(levelname)s:%(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)








Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
query_gen_str = """\
你是一個來自台灣非常強大的中文問答助理，你很擅長根據一個原始問題進行改寫，生成出多個相關\
、幫助加強語意的問題。 請生成 {num_queries} 搜尋問題，一個問題一行，以加強語意的角度，\
改寫下列輸入的問題：
問題： {query}
生成的問題：
"""
query_gen_prompt = PromptTemplate(query_gen_str)

def generate_queries(query: str, llm, num_queries: int = 4):
    response = llm.predict(
        query_wrapper_prompt, num_queries=num_queries, query=query
    )
    # assume LLM proper put each query on a newline
    queries = response.split("\n")
    queries_str = "\n".join(queries)
    print(f"Generated queries:\n{queries_str}")
    return queries

generate_queries("在2022年第3季，長榮公司有無從事衍生工具交易的情事？", Settings.llm)

In [1]:
from typing import List, Optional
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class QuestionOptimizer:
    def __init__(self, model_name: str = "Llama3-TAIDE-LX-8B-Chat-Alpha1"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16
        )
        
    def _generate_response(self, prompt: str) -> str:
        """使用模型生成回應"""
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,  # 明確啟用截斷
            max_length=2048,  # 設定輸入長度限制
            padding=True
        ).to(self.model.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=400,  # 只使用 max_new_tokens
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )
            
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def enhance_question(self, query: str, num_variants: int = 3) -> List[str]:
        """生成改寫後的問題"""
        prompt = f"""
請依照以下規則，將問題改寫成{num_variants}個不同版本：
1. 保持原始問題的核心意思
2. 使用不同的表達方式
3. 添加相關的關鍵詞
4. 重組句子結構
5. 確保改寫後的問題更容易被搜尋系統理解

原始問題：{query}
"""
        response = self._generate_response(prompt)
        enhanced_questions = [q.strip() for q in response.split('\n') if q.strip()]
        return enhanced_questions[:num_variants]

    def process_batch(self, questions: List[dict]) -> List[dict]:
        """批次處理問題"""
        return [{
            'qid': q['qid'],
            'original': q['query'],
            'enhanced': self.enhance_question(q['query'])
        } for q in questions]

if __name__ == "__main__":
    optimizer = QuestionOptimizer()
    test_query = "在2022年第3季，長榮公司有無從事衍生工具交易的情事？"
    enhanced = optimizer.enhance_question(test_query)
    
    print("原始問題:", test_query)
    print("\n改寫後的問題:")
    for i, q in enumerate(enhanced, 1):
        print(f"{i}. {q}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


原始問題: 在2022年第3季，長榮公司有無從事衍生工具交易的情事？

改寫後的問題:
1. 請依照以下規則，將問題改寫成3個不同版本：
2. 1. 保持原始問題的核心意思
3. 2. 使用不同的表達方式
