In [None]:
from typing import List, Optional
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class QuestionOptimizer:
    """Rewrite a question into several alternative phrasings using a causal language model.

    The tokenizer and model are loaded once in ``__init__`` and reused by every call.
    """

    def __init__(self, model_name: str = "Llama3-TAIDE-LX-8B-Chat-Alpha1"):
        """Load the tokenizer and the float16 model.

        Args:
            model_name: Identifier or path passed unchanged to ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16
        )

    def _generate_response(self, prompt: str) -> str:
        """Sample a completion for ``prompt`` and return only the newly generated text.

        Args:
            prompt: Full prompt text; it is truncated to 2048 tokens.

        Returns:
            The decoded completion with special tokens removed. The prompt itself is
            not included.
        """
        # A single string needs no padding, so none is requested: asking for padding
        # raises on tokenizers that define no pad token.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,  # explicitly enable truncation
            max_length=2048,  # cap on the input length
        ).to(self.model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=400,  # bound the output by new tokens only
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )

        # generate() returns the prompt tokens followed by the new tokens. Decoding the
        # whole sequence would echo the prompt back to the caller, so drop the prefix.
        generated_ids = outputs[0][prompt_length:]
        return self.tokenizer.decode(generated_ids, skip_special_tokens=True)

    def enhance_question(self, query: str, num_variants: int = 3) -> List[str]:
        """Ask the model for rewritten versions of ``query``.

        Args:
            query: The original question.
            num_variants: Number of rewrites requested in the prompt, and the maximum
                number of lines returned.

        Returns:
            The first ``num_variants`` non-empty lines of the model's reply, stripped.
            Fewer are returned when the reply has fewer non-empty lines.
        """
        prompt = f"""
請依照以下規則，將問題改寫成{num_variants}個不同版本：
1. 保持原始問題的核心意思
2. 使用不同的表達方式
3. 添加相關的關鍵詞
4. 重組句子結構
5. 確保改寫後的問題更容易被搜尋系統理解

原始問題：{query}
"""
        response = self._generate_response(prompt)
        enhanced_questions = [q.strip() for q in response.split('\n') if q.strip()]
        return enhanced_questions[:num_variants]

    def process_batch(self, questions: List[dict]) -> List[dict]:
        """Rewrite every question in a batch.

        Args:
            questions: Dicts that each provide the keys ``'qid'`` and ``'query'``.

        Returns:
            One dict per input with the keys ``'qid'``, ``'original'`` (the query text)
            and ``'enhanced'`` (the list returned by ``enhance_question``).
        """
        return [{
            'qid': q['qid'],
            'original': q['query'],
            'enhanced': self.enhance_question(q['query'])
        } for q in questions]

if __name__ == "__main__":
    # Demo: rewrite one sample question and print the variants, numbered from 1.
    test_query = "在2022年第3季，長榮公司有無從事衍生工具交易的情事？"
    optimizer = QuestionOptimizer()
    enhanced = optimizer.enhance_question(test_query)

    print("原始問題:", test_query)
    print("\n改寫後的問題:")
    for i, q in enumerate(enhanced, start=1):
        print(f"{i}.", q)

In [None]:
from Utils import *
from llama_index.core.node_parser import SentenceSplitter,SentenceWindowNodeParser
from llama_index.core import VectorStoreIndex, Document, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Initialize the database and fetch the finance corpus.
db = VectorDatabase()
# Bound to `corpus_finance` rather than `dict`: rebinding the builtin at notebook scope
# would break every later `dict(...)` call made in this kernel.
corpus_finance = db.corpus_dict_finance

my_embedding = HuggingFaceEmbedding(
    model_name="TencentBAC/Conan-embedding-v1"
)
# Prepare documents: one Document per corpus entry, tagged with its category and the
# corpus key (`pid`) so retrieval can filter on them.
documents = [
    Document(
        text=text,
        id_=f"doc_id_{pid}",
        metadata={"category": 'finance', "pid": pid}
    ) for pid, text in corpus_finance.items()
]

# NOTE(review): `splitter` is not used anywhere in this cell; kept because another cell
# may rely on the name — confirm and remove if it is dead.
splitter = SentenceSplitter(chunk_size=256, chunk_overlap=200)

# Parse the documents into nodes; the parser is configured to store its window and the
# original text under the metadata keys "window" and "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
nodes_sentence = node_parser.get_nodes_from_documents(documents)

# Embed the nodes and build the vector index.
sentence_vector_index = VectorStoreIndex(
    nodes=nodes_sentence,
    embed_model=my_embedding,
    show_progress=True,
)
# NOTE(review): this persist directory ends in ".json" and is spelled "fiance", while the
# index is later loaded from './database/sentence/fiance' — confirm which location is intended.
sentence_vector_index.storage_context.persist(persist_dir='./database/sentence/fiance.json')

In [None]:
from Utils import *
from llama_index.core.node_parser import SentenceSplitter,SentenceWindowNodeParser
from llama_index.core import VectorStoreIndex, Document, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Initialize the database and fetch the finance corpus.
db = VectorDatabase()
# Bound to `corpus_finance` rather than `dict`: rebinding the builtin at notebook scope
# would break every later `dict(...)` call made in this kernel.
corpus_finance = db.corpus_dict_finance

my_embedding = HuggingFaceEmbedding(
    model_name="TencentBAC/Conan-embedding-v1"
)
# Prepare documents: one Document per corpus entry, tagged with its category and the
# corpus key (`pid`) so the retriever below can filter on them.
documents = [
    Document(
        text=text,
        id_=f"doc_id_{pid}",
        metadata={"category": 'finance', "pid": pid}
    ) for pid, text in corpus_finance.items()
]

# NOTE(review): `splitter` is not used anywhere in this cell; kept because another cell
# may rely on the name — confirm and remove if it is dead.
splitter = SentenceSplitter(chunk_size=256, chunk_overlap=200)

# Parse the documents into nodes; the parser is configured to store its window and the
# original text under the metadata keys "window" and "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
nodes_sentence = node_parser.get_nodes_from_documents(documents)

# Embed the nodes, build the vector index and write it to disk.
sentence_vector_index = VectorStoreIndex(
    nodes=nodes_sentence,
    embed_model=my_embedding,
    show_progress=True,
)
sentence_vector_index.storage_context.persist(persist_dir='./database/sentence/fiance')

# Reload the index from the directory it was just persisted to, with the same embedding model.
from llama_index.core import VectorStoreIndex, Document, StorageContext, load_index_from_storage
index = load_index_from_storage(
    StorageContext.from_defaults(persist_dir="./database/sentence/fiance"),
    embed_model=my_embedding,
)

from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, FilterOperator
# Metadata filters: category must equal "finance" and pid must be one of the listed ids.
# NOTE(review): the pid values are ints; this assumes the corpus keys stored in the
# metadata are ints as well — TODO confirm.
filters = MetadataFilters(
    filters=[
        MetadataFilter(key="category", value="finance", operator=FilterOperator.EQ),
        MetadataFilter(key="pid", value=[41, 70, 359, 870, 900, 951, 59], operator=FilterOperator.IN)
    ]
)

# Initialize the vector index retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=1,  # return only the single best-scoring node that passes the filters
    filters=filters
)
results = retriever.retrieve("鴻海在2023年第1季度中，集團存貨之帳面金額是多少？")

In [None]:
from llama_index.core.indices.document_summary import DocumentSummaryIndexLLMRetriever

from typing import Any, Callable, Dict, List, Optional, Union, cast
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import torch
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)

# 4-bit NF4 quantization settings with double quantization and float16 compute.
# NOTE(review): `quantization_config` is never passed to `from_pretrained` below, so as
# written the model is loaded with torch_dtype=bfloat16 and these settings have no
# effect — confirm whether quantization was intended.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the tokenizer and model once at notebook scope; the custom LLM class reads these globals.
model_name = "Llama3-TAIDE-LX-8B-Chat-Alpha1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
).eval()
# Custom LlamaIndex LLM backed by the locally loaded model.
class OurLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` that answers with the notebook-level ``model`` and ``tokenizer``.

    Only non-streaming completion is supported.
    """
    # Context size reported to LlamaIndex through `metadata`.
    context_window: int = 4096
    # Output budget reported through `metadata`; also the generation cap in the fallback path.
    num_output: int = 1024
    # Name reported through `metadata`.
    model_name_: str = "custom"

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        # The field on this class is `model_name_`, while LLMMetadata's field is
        # `model_name`. The original read `self.model_name`, which does not exist here.
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name_,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Return the model's reply to ``prompt`` as a single-turn completion.

        Args:
            prompt: User text to answer.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A ``CompletionResponse`` whose ``text`` is the generated reply.
        """
        # Models that ship a `chat` helper keep the original call path.
        if hasattr(model, "chat"):
            text, _history = model.chat(tokenizer, prompt, history=[], temperature=0.1)
            return CompletionResponse(text=text)

        # Models without a `chat` method: build the prompt with the tokenizer's chat
        # template and call generate() directly.
        inputs = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=self.num_output,
                do_sample=True,
                temperature=0.1,
            )
        # generate() returns prompt tokens followed by new tokens; keep only the new ones.
        new_ids = output_ids[0][inputs["input_ids"].shape[-1]:]
        return CompletionResponse(text=tokenizer.decode(new_ids, skip_special_tokens=True))

    @llm_completion_callback()
    def stream_complete(
            self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Streaming is not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError()

In [None]:
from llama_index.core import PromptTemplate
from transformers import AutoModelForCausalLM,AutoTokenizer
# System prompt handed to the LLM wrapper configured later in this cell.
system_prompt = "你是一個來自台灣的AI助理，你的名字是 TAIDE，樂於以台灣人的立場幫助使用者，會用正體中文回答問題。"
# Wraps the default prompts that are internal to llama-index around each query.
# NOTE(review): confirm the <|USER|>/<|ASSISTANT|> markers match the chat format this model expects.
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

tokenizer = AutoTokenizer.from_pretrained("taide/Llama3-TAIDE-LX-8B-Chat-Alpha1")
# Token ids used as stop ids: the tokenizer's EOS id plus the id of "<|eot_id|>".
stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# # Transform a string into input zephyr-specific input
# def completion_to_prompt(completion):
#     return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"


# # Transform a list of chat messages into zephyr-specific input
# def messages_to_prompt(messages):
#     prompt = ""
#     for message in messages:
#         if message.role == "system":
#             prompt += f"<|system|>\n{message.content}</s>\n"
#         elif message.role == "user":
#             prompt += f"<|user|>\n{message.content}</s>\n"
#         elif message.role == "assistant":
#             prompt += f"<|assistant|>\n{message.content}</s>\n"

#     # ensure we start with a system prompt, insert blank if needed
#     if not prompt.startswith("<|system|>\n"):
#         prompt = "<|system|>\n</s>\n" + prompt

#     # add final assistant prompt
#     prompt = prompt + "<|assistant|>\n"

#     return prompt


import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

# Register the TAIDE model as the default LLM for llama-index.
# NOTE(review): the tokenizer max_length (8000) is larger than context_window (2048) —
# confirm that this mismatch is intended.
Settings.llm = HuggingFaceLLM(
    # Model and tokenizer
    model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1",
    tokenizer_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1",
    tokenizer_kwargs={"max_length": 8000},
    model_kwargs={"torch_dtype": torch.float16},
    device_map="cuda",
    # Prompting
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    # messages_to_prompt / completion_to_prompt are not passed.
    # Generation
    context_window=2048,
    max_new_tokens=400,
    generate_kwargs={"temperature": 0.6, "top_p": 0.9, "do_sample": True},
    stopping_ids=stopping_ids,
)

import logging

# Logging output format: timestamp, source file name, level, then the message.
FORMAT = '%(asctime)s %(filename)s %(levelname)s:%(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)

In [None]:
from pydantic import BaseModel
from typing import Any, Optional
from unstructured.partition.pdf import partition_pdf
 
# Location of the PDF to parse.
path = "reference/finance/"
file = "351.pdf"

# Partition the PDF into elements, with image extraction and table-structure inference
# enabled; extracted images are written next to the source file.
raw_pdf_elements = partition_pdf(
    filename=path + file,
    infer_table_structure=True,
    extract_images_in_pdf=True,
    image_output_dir_path=path,
    # Post-processing: aggregate text blocks under the title they follow.
    chunking_strategy="by_title",
    # Chunking parameters:
    #   max_characters             - require a maximum chunk size of 4000 chars
    #   new_after_n_chars          - attempt to create a new chunk at 3800 chars
    #   combine_text_under_n_chars - attempt to keep chunks > 2000 chars
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
)

class Element(BaseModel):
    """A parsed PDF element reduced to a category label and its content."""
    # Category label; the categorization code in this notebook assigns "table" or "text".
    type: str
    # Element content; the categorization code in this notebook stores str(element) here.
    text: Any
 
# Categorize by type: each element is labelled by a substring of its type repr.
# Markers are tried in order (Table first); elements matching neither marker are dropped.
_TYPE_MARKERS = (
    ("unstructured.documents.elements.Table", "table"),
    ("unstructured.documents.elements.CompositeElement", "text"),
)
categorized_elements = []
for element in raw_pdf_elements:
    type_repr = str(type(element))
    for marker, label in _TYPE_MARKERS:
        if marker in type_repr:
            categorized_elements.append(Element(type=label, text=str(element)))
            break

# Tables
table_elements = [item for item in categorized_elements if item.type == "table"]
print(table_elements)
# Author's recorded output: 28 elements in the PDF file

# Text
text_elements = [item for item in categorized_elements if item.type == "text"]
print(text_elements)
# Author's recorded output: 127 elements in the PDF file