In [6]:
import os
import json
import numpy as np

from faiss_embed.faiss_indexer import embeddings_to_faiss, search_similar_embeddings
from faiss_embed.embedding_model import EmbeddingModel

# Tribe code of the dataset to test against
tribe_code = "CAA"
data_dir = os.path.join("data", tribe_code)

# Load the pre-generated embedding_list and sentence_index_map
emb_file = os.path.join(data_dir, f"{tribe_code}_embedding_list.npy")
idx_file = os.path.join(data_dir, f"{tribe_code}_sentence_index_map.npy")

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code -- only load files that come from a trusted source.
embedding_list = np.load(emb_file, allow_pickle=True)
sentence_index_map = np.load(idx_file, allow_pickle=True)

# If the full dialogue text is needed later on, the JSON can be loaded as well
dialogue_file = os.path.join(data_dir, f"{tribe_code}_dialogue.json")
with open(dialogue_file, 'r', encoding='utf-8') as f:
    dialogues = json.load(f)

# Build the Faiss index
faiss_index = embeddings_to_faiss(embedding_list)

# Initialise the model that is used to embed queries
model = EmbeddingModel()

def find_next_sentence_in_dialogue(query_sentence: str):
    """Return the utterance that follows the closest match to *query_sentence*.

    The query is embedded with the module-level ``model``, the single nearest
    entry is searched for in ``faiss_index``, and the resulting row index is
    mapped through ``sentence_index_map`` to a ``(d_id, u_id)`` pair that
    addresses ``dialogues[d_id][u_id]``.

    Args:
        query_sentence: Text to look up.

    Returns:
        Element ``[1]`` of the entry right after the best match inside the
        same dialogue, or ``None`` when the search produced no match or the
        match is the last entry of its dialogue.
    """
    query_emb = model.get_single_embedding(query_sentence)
    # Only the single most similar result is requested.
    _, indices = search_similar_embeddings(query_emb, faiss_index, top_k=1)
    best_idx = indices[0][0]

    # Faiss reports a missing neighbour (e.g. an empty index) as label -1.
    # Assuming search_similar_embeddings passes Faiss labels through unchanged,
    # an unchecked -1 would wrap around to the LAST row of sentence_index_map
    # and silently return an unrelated sentence.
    if best_idx < 0:
        return None

    d_id, u_id = sentence_index_map[best_idx]
    dialogue = dialogues[d_id]
    next_pos = u_id + 1

    # The match may already be the final entry of its dialogue.
    if next_pos < len(dialogue):
        return dialogue[next_pos][1]
    return None

# Smoke test: look up one query and print the sentence that follows its match
test_query = "今天天氣如何？"
next_sentence = find_next_sentence_in_dialogue(test_query)
print(f"查詢句子: {test_query}")
# Compare against None explicitly: an empty-string next sentence is still a
# valid result and must not be reported as "no follow-up".
if next_sentence is not None:
    print(f"回傳下一句: {next_sentence}")
else:
    print("已經是對話的最後一句，沒有後續了。")

initial target device: 100%|██████████| 2/2 [00:05<00:00,  2.57s/it]
Chunks:   0%|          | 0/1 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Chunks: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s]

查詢句子: 今天天氣如何？
回傳下一句: 太陽出來了，天氣很熱。



