In [1]:
import os
import json
import requests
from pymilvus import MilvusClient
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel,AutoModelForCausalLM
import numpy as np


In [2]:
# Customer question that is driven through the retrieve -> rerank -> answer steps below.
test_question = "MobileApp本地轉賬澳元收費?"

In [3]:
# Step 1: turn text into an embedding by calling the embedding HTTP service.
def get_embedding(text, url='http://127.0.0.1:8888/embed', timeout=30):
    """Request an embedding for ``text`` from the embedding HTTP service.

    Args:
        text: Value sent as the ``texts`` field of the JSON request body.
        url: Endpoint that receives the POST request.
        timeout: Seconds to wait for the service before giving up; without a
            timeout a hung service would block the notebook indefinitely.

    Returns:
        str: The raw, undecoded response body. Callers parse it themselves
        (e.g. with ``json.loads``).

    Raises:
        requests.HTTPError: The service answered with a 4xx/5xx status.
        requests.RequestException: The request could not be completed
            (connection failure, timeout, ...).
    """
    payload = {
        'texts': text,
    }

    # `json=` lets requests serialise the dict and set the content type.
    response = requests.post(url, json=payload, timeout=timeout)

    # Surface HTTP failures here; otherwise an error body would be returned as
    # if it were an embedding and only blow up later when it is parsed.
    response.raise_for_status()
    return response.text


# Embed the test question; the result is the service's raw response body (a string).
test_question_embedding = get_embedding(test_question)

In [4]:
# Display the raw response body returned by get_embedding for the test question.
test_question_embedding

'{"count":1024,"embeddings":[-0.09237251430749893,0.011979890055954456,-0.007704391609877348,-0.08744870871305466,0.017107440158724785,-0.051423657685518265,0.04246022179722786,0.015550370328128338,0.01517690159380436,0.03196905553340912,-0.09213170409202576,0.029719168320298195,0.01790362037718296,-0.0069508664309978485,-0.05007468909025192,0.05250541493296623,0.009290176443755627,-0.07903514802455902,-0.03490309417247772,0.08292306959629059,-0.03364456817507744,0.020554833114147186,0.01180817000567913,0.03141379356384277,0.04272416606545448,0.02664758637547493,-0.02304256707429886,0.042257219552993774,-0.015944529324769974,-0.04035498574376106,0.05115848407149315,-0.0016141999512910843,-0.013144683092832565,0.02442389540374279,-0.01770941913127899,-0.010782282799482346,0.029263891279697418,-0.035706933587789536,0.020373767241835594,0.0015548889059573412,-0.05475395545363426,-0.024050336331129074,-0.013189819641411304,0.044405899941921234,-0.0329226516187191,-0.03151173144578934,0.001

In [5]:
# Step 2: look up the 5 most similar text chunks in Milvus.
# The embedding response is a JSON string; the vector sits under "embeddings".
query_vector = json.loads(test_question_embedding)['embeddings']

# MilvusClient takes the server address through `uri`. The previous `url=`
# keyword is not that parameter, so the address given here was not what
# selected the server.
client = MilvusClient(uri='http://127.0.0.1:19530', db_name="hsbc_c_db")

res = client.search(
    collection_name="hsbc_c",
    data=[query_vector],  # one query vector -> one list of hits in `res`
    limit=5,  # number of results to return
    search_params={"metric_type": "IP"},
    output_fields=["context"]
)

# Keep only the hits whose score is strictly above this cutoff.
MIN_RETRIEVAL_SCORE = 0.7
docs = [
    line['entity']["context"]
    for hit in res
    for line in hit
    if line['distance'] > MIN_RETRIEVAL_SCORE
]

In [6]:
# Step 3: load the reranker model used to re-order the retrieved chunks.

RERANKER_MODEL_PATH = "E:\\RAG\\reranker\\qwen3-reranker"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# These names are module-level on purpose: process_inputs, format_instruction
# and compute_logits read them as globals.
# Left padding keeps every sequence's final token in the last position, which
# is the position compute_logits reads.
tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL_PATH, padding_side='left')

# Load the model, place it on DEVICE (previously DEVICE was computed but never
# applied, so the model always stayed where from_pretrained put it) and switch
# to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(RERANKER_MODEL_PATH).to(DEVICE).eval()

# Vocabulary ids of the "no" and "yes" tokens.
token_false_id = tokenizer.convert_tokens_to_ids("no")
token_true_id = tokenizer.convert_tokens_to_ids("yes")

# Maximum total sequence length (prefix + pair text + suffix).
max_length = 8192

# System-prompt prefix instructing the model to make a yes/no judgement.
prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
# Suffix that opens the assistant turn.
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"

# Pre-tokenised prefix/suffix; process_inputs wraps every pair with them and
# subtracts their lengths from the truncation budget.
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)

# 处理输入数据的函数
def process_inputs(pairs):
    """Tokenise instruction strings and wrap them with the prompt prefix/suffix.

    Reads the module-level ``tokenizer``, ``model``, ``prefix_tokens``,
    ``suffix_tokens`` and ``max_length``.

    Args:
        pairs: List of formatted instruction/query/document strings.

    Returns:
        The padded encoding produced by ``tokenizer.pad`` with every entry
        moved to ``model.device``.
    """
    # Reserve room for the fixed prefix and suffix inside max_length.
    text_budget = max_length - len(prefix_tokens) - len(suffix_tokens)

    # Tokenise without padding; padding happens after the wrapping below.
    inputs = tokenizer(
        pairs,
        padding=False,
        truncation='longest_first',
        return_attention_mask=False,
        max_length=text_budget,
    )

    # Surround each token sequence with the prefix and suffix, in place.
    token_lists = inputs['input_ids']
    token_lists[:] = [prefix_tokens + ids + suffix_tokens for ids in token_lists]

    # Pad the batch to a common length and convert to PyTorch tensors.
    inputs = tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=max_length)

    # Move every tensor to the device the model lives on.
    for name in list(inputs.keys()):
        inputs[name] = inputs[name].to(model.device)
    return inputs

def format_instruction(instruction, query, doc):
    """Render one instruction/query/document triple as the reranker prompt text.

    Args:
        instruction: Task description placed after ``<Instruct>:``.
        query: User question placed after ``<Query>:``.
        doc: Candidate document placed after ``<Document>:``.

    Returns:
        str: The three labelled fields, one per line.
    """
    return f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}"

def compute_logits(inputs, **kwargs):
    """Score each input sequence with the probability of the "yes" token.

    Reads the module-level ``model``, ``token_true_id`` and ``token_false_id``.

    Args:
        inputs: Mapping of keyword arguments passed straight to ``model``.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        list[float]: One value per sequence in the batch: the softmax
        probability of "yes" computed over just the "yes" and "no" logits at
        the final sequence position.
    """
    # Inference only: without no_grad the forward pass records an autograd
    # graph that is never used and only costs memory.
    with torch.no_grad():
        # Logits at the last position, i.e. where the answer token is predicted.
        last_position_logits = model(**inputs).logits[:, -1, :]

    no_logits = last_position_logits[:, token_false_id]
    yes_logits = last_position_logits[:, token_true_id]

    # Shape [batch, 2] with column 0 = "no" and column 1 = "yes".
    yes_no_logits = torch.stack([no_logits, yes_logits], dim=1)

    # Normalise over the two candidates only, then leave log space.
    log_probs = torch.nn.functional.log_softmax(yes_no_logits, dim=1)
    return log_probs[:, 1].exp().tolist()



In [7]:
# Rerank the retrieved chunks against the question and keep the confident ones.
task = '智能客服查找文檔后回答客服問題'

queries = [test_question]
documents = docs

# Build one prompt per (query, document) combination. pair_documents[i] is the
# document behind pairs[i]; indexing `documents` with a pair index is only
# correct while there is exactly one query.
pairs = []
pair_documents = []
for queries_ in queries:
    for document in documents:
        pairs.append(format_instruction(task, queries_, document))
        pair_documents.append(document)

# Only pairs scoring strictly above this cutoff are kept.
MIN_RERANK_SCORE = 0.9

scores = []
reranker_docs = []
# Retrieval may legitimately return nothing above its own cutoff; skip the
# tokenizer/model call entirely in that case and leave reranker_docs empty.
if pairs:
    inputs = process_inputs(pairs)
    scores = compute_logits(inputs)

    # Walk the pairs from highest to lowest relevance score.
    descending_indices = np.argsort(np.array(scores))[::-1]
    for index in descending_indices:
        if scores[index] > MIN_RERANK_SCORE:
            reranker_docs.append(pair_documents[index])
            


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [8]:
# Display the documents that passed the reranker score cutoff, best first.
reranker_docs

['指定服務0收費及費用優惠2.本地外幣轉賬:你可透過恒生MobileApp以12種指定外幣進行本地轉賬，如澳元、人民幣、美元等，一律0收費²。',
 '指定服務0收費及費用優惠3. Global Money+海外轉賬:你可透過恒生MobileApp的Global Money+以當地貨幣轉賬至超過50個指定國家或地區，包括英國（英）美國（美金）澳洲（澳元）及紐西蘭（紐西蘭元）等，無需支付任何費用，最快即日到戶。']

In [15]:
# 1. Load the local Qwen3 model and tokenizer used for answer generation.
# NOTE: this rebinds the module-level `tokenizer` and `model` that
# process_inputs / compute_logits also read, so re-running the rerank cell
# after this one would use the generator instead of the reranker.
model_name = "./qwen3-0.6b"  # replace with your own model path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
)

def generate_answer_with_context(query, retrieved_docs, task_description="", max_new_tokens=512):
    """Answer ``query`` with the generator model, grounded in ``retrieved_docs``.

    Reads the module-level ``tokenizer`` and ``model``.

    Args:
        query: The user's question.
        retrieved_docs: List of retrieved document strings; joined with
            newlines to form the "known information" section of the prompt.
        task_description: Extra instruction line inserted into the prompt,
            e.g. "answer in Chinese".
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: Only the newly generated text, with surrounding whitespace
        stripped.
    """
    # 2. Build the prompt.
    context_text = "\n".join(retrieved_docs)

    # The quotation marks below used to be written as \“ and \”, which are not
    # valid escape sequences: Python kept the backslashes in the prompt and
    # warns about the literal. They are plain characters and need no escaping.
    prompt = f"""你的角色是銀行的智能客服，請基於以下已知資訊，簡潔和專業地回答用戶的問題。
如果無法從已知資訊中得到答案，請說“根據已知資訊無法回答該問題，請您咨詢人工客服”，不允許在答案中添加編造成分。
{task_description}

已知資訊:
{context_text}

問題:
{query}

请基於上述已知資訊回答該問題:
"""

    # 3. Encode the prompt and generate.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # 4. Keep only the continuation. The returned sequence starts with the
    # prompt tokens, so slice them off by length. Removing the prompt with
    # str.replace on the decoded text silently fails whenever decoding does
    # not reproduce the prompt character for character.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generated_ids[0][prompt_length:]
    answer = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    return answer



# Generate the final answer from the question and the reranked documents.
answer = generate_answer_with_context(
    query=test_question,
    retrieved_docs=reranker_docs,
    task_description=task,
)

In [None]:
# Show the text that follows the first "答案" marker in the model output.
# The marker is produced by the model and is not guaranteed to be present, so
# fall back to the whole answer instead of raising IndexError.
answer_parts = answer.split("答案")
print(answer_parts[1] if len(answer_parts) > 1 else answer)

:
根據已知資訊，指定服務0收費及費用優惠2中提到，透過恒生MobileApp以12種指定外幣進行本地轉賬，如澳元，可透過該服務進行轉賬，費用為0。因此，MobileApp本地轉賬澳元的收費為0。

**
