In [7]:
import re
import json
from typing import List, Tuple
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import pipeline
from datasets import Dataset
import numpy as np
import logging
import gc
from tqdm import tqdm
import emoji
import os
import opencc
from functools import lru_cache

# Memory-management setting read by PyTorch's CUDA allocator
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Logging setup: INFO level, "<timestamp> - <message>" format
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)

# Multi-GPU environment
device_ids = [0, 1]  # two A10G cards
# Default device: first CUDA GPU when available, otherwise CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Release cached GPU memory
torch.cuda.empty_cache()

# Classification label sets (dialogue acts and topics) passed to the zero-shot classifier
act_labels = ["提問", "補充", "引入", "表達情感", "讚美", "道歉", "感謝", "請求", "拒絕", "建議", "問候", "告別", "確認", "表達意圖", "吐槽", "催促", "安慰", "接梗", "回答", "提及", "其他"]
topic_labels = ["活動", "感情", "學業", "娛樂", "人際", "課業壓力", "社團活動", "感情八卦", "生活瑣事", "科技", "健康", "旅行"]

# Traditional <-> Simplified Chinese converters
converter_to_simplified = opencc.OpenCC('t2s')
converter_to_traditional = opencc.OpenCC('s2t')

# Dynamic batch size
def get_dynamic_batch_size(messages: List[str], base_batch_size: int = 128) -> int:
    """Pick an inference batch size based on current GPU memory pressure.

    Args:
        messages: Accepted for interface compatibility; it does not influence
            the result.
        base_batch_size: Batch size returned when memory pressure is low.

    Returns:
        ``base_batch_size`` normally, or ``max(16, base_batch_size // 2)`` when
        more than 80% of the assumed 24 GB is already reserved on ``device``.
        On hosts without CUDA, ``base_batch_size`` is returned unchanged.
    """
    # torch.cuda.memory_reserved() rejects non-CUDA devices, so a CPU-only host
    # must not reach the query below.
    if not torch.cuda.is_available():
        return base_batch_size

    reserved_gib = torch.cuda.memory_reserved(device) / 1024**3
    max_memory = 24  # A10G has 24 GB
    if reserved_gib > max_memory * 0.8:
        return max(16, base_batch_size // 2)
    return base_batch_size

# Single-process sharded embedding computation
@lru_cache(maxsize=1)
def load_embedder():
    """Construct the sentence-embedding model; the cache hands back the same instance on later calls."""
    model_name = 'paraphrase-MiniLM-L6-v2'
    embedder = SentenceTransformer(model_name)
    return embedder

def compute_chunk(chunk: List[str], gpu_id: int, batch_size: int, embedder: SentenceTransformer) -> torch.Tensor:
    """Encode one shard of messages on the given GPU and return the embeddings on cuda:0.

    Args:
        chunk: Messages to encode.
        gpu_id: CUDA device index the embedder is moved to for this shard.
        batch_size: Initial encode batch size; halved after each CUDA
            out-of-memory error (never below 1).
        embedder: The sentence-embedding model.

    Returns:
        The embeddings tensor for ``chunk``, moved to ``cuda:0``.

    Raises:
        torch.cuda.OutOfMemoryError: If encoding still runs out of memory at
            batch size 1.
    """
    torch.cuda.set_device(gpu_id)
    embedder.to(f"cuda:{gpu_id}")

    current_batch_size = max(1, batch_size)
    while True:
        try:
            with torch.no_grad():
                embeddings = embedder.encode(
                    chunk,
                    convert_to_tensor=True,
                    batch_size=current_batch_size,
                    show_progress_bar=False,
                )
            break
        except torch.cuda.OutOfMemoryError:
            if current_batch_size == 1:
                raise
        # Reached only after an OOM. The retry is prepared outside the `except`
        # clause so the exception (and the frames it references) is released
        # before the cache is flushed and encoding is attempted again.
        logger.warning(f"GPU {gpu_id} 記憶體不足，減半批次大小重試...")
        current_batch_size = max(1, current_batch_size // 2)
        gc.collect()
        torch.cuda.empty_cache()

    embeddings = embeddings.to("cuda:0")
    torch.cuda.empty_cache()
    gc.collect()
    return embeddings

def precompute_embeddings(messages: List[str], num_gpus: int = 2, chunk_size: int = 5000, cache_file: str = "embeddings.pt") -> torch.Tensor:
    """Compute (or load from cache) one embedding row per message.

    Messages are converted to Simplified Chinese, split into shards of
    ``chunk_size`` and encoded shard by shard, alternating over the first
    ``num_gpus`` entries of ``device_ids``.

    Args:
        messages: Messages to embed, in the order the rows should appear.
        num_gpus: How many entries of ``device_ids`` to rotate over.
        chunk_size: Number of messages per shard.
        cache_file: Path of the ``torch.save`` cache. It is reused only when
            its row count equals ``len(messages)``.

    Returns:
        A CPU tensor with ``len(messages)`` rows.
    """
    if os.path.exists(cache_file):
        logger.info(f"從快取檔案 {cache_file} 載入嵌入向量...")
        # map_location keeps the load working on hosts without the GPU the
        # tensor was saved from; weights_only avoids unpickling arbitrary objects.
        cached = torch.load(cache_file, map_location="cpu", weights_only=True)
        # A cache produced for a different message list would silently misalign
        # every downstream embeddings[i] lookup, so only accept a matching row count.
        # NOTE(review): same-length but different content cannot be detected here.
        if isinstance(cached, torch.Tensor) and cached.shape[0] == len(messages):
            return cached
        logger.warning(f"快取檔案 {cache_file} 與目前訊息數 ({len(messages)}) 不符，重新計算嵌入向量...")

    logger.info(f"開始預計算 {len(messages)} 條訊息的嵌入向量...")
    embedder = load_embedder()
    if not messages:
        # torch.cat() rejects an empty list; return an empty matrix instead.
        return torch.empty((0, embedder.get_sentence_embedding_dimension()))

    simplified_messages = [converter_to_simplified.convert(msg) for msg in messages]
    chunks = [simplified_messages[i:i + chunk_size] for i in range(0, len(messages), chunk_size)]
    batch_size = get_dynamic_batch_size(messages)

    # Never index past the configured devices, even if num_gpus is larger.
    gpu_pool = device_ids[:max(1, num_gpus)]
    results = []

    for i, chunk in enumerate(tqdm(chunks, desc="計算嵌入分片")):
        gpu_id = gpu_pool[i % len(gpu_pool)]
        logger.info(f"處理分片 {i+1}/{len(chunks)} on GPU {gpu_id}")
        chunk_embeddings = compute_chunk(chunk, gpu_id, batch_size, embedder)
        results.append(chunk_embeddings)

    embeddings = torch.cat(results, dim=0)
    embeddings = embeddings.cpu()
    torch.save(embeddings, cache_file)
    logger.info(f"嵌入計算完成，已儲存至 {cache_file}")
    torch.cuda.empty_cache()
    return embeddings

# Text cleaning
def clean_text(text: str) -> str:
    """Strip chat-export noise from a message.

    Returns "" for empty input, for text shorter than three characters once
    stripped, and for voice-message placeholders. Otherwise removes
    timestamps, media placeholders, recall notices, call records and
    asterisk-only content, then trims surrounding whitespace.
    """
    if not text:
        return ""
    if len(text.strip()) < 3:
        return ""
    if "[語音訊息]" in text:
        return ""

    noise = re.compile(r'(上午|下午)\d{1,2}:\d{2}\s*|\[照片\]|\[影片\]|\[貼圖\]|.*已收回訊息.*|☎.*|^\*+$')
    without_noise = noise.sub('', text)
    return without_noise.strip()

# Keyword-based classification (extended acts and topics).
# Insertion order is the match priority: "提問" is listed first so questions win.
_DIALOGUE_ACT_KEYWORDS = {
    "提問": ['有沒有', '什麼', '會不會', '你覺得', '怎麼', '為什麼', '嗎', '哪個', '可不可以', '到了沒', '幾點', '到底', '知道', '多少', '誰', '哪裡', '咩', '欸？'],
    "補充": ['因為', '所以', '雖然', '不過', '但是', '然後', '而且', '結果', '還有', '就', '其實', '剛剛', '出來', '聽到', '天才', '之前'],
    "引入": ['重點是', '主要是', '說到', '講到', '提到', '我也不知道', '話說'],
    "表達情感": ['難過', '尷尬', '自豪', '麻煩', '緊張', '開心', '笑死', '希望', '怕', '覺得', '累了', '冷淡', '好笑', '靠北', '你娘', '超趕', '超好笑', '恨', '想哭', '啊啊', '嚇死', '聊不起來'],
    "讚美": ['漂亮', '帥', '很棒', '很讚', '很可愛', '適合', '好會', '好一點', '顏值', '很高'],
    "道歉": ['對不起', '抱歉', '不好意思', 'sorry'],
    "感謝": ['謝謝', '感謝', '感激', '多謝'],
    "請求": ['拜託', '請', '幫我', '可以嗎'],
    "拒絕": ['不要', '不行', '不可以', '不ok', '沒辦法', '不會', '我沒'],
    "建議": ['建議', '不如', '要不要', '不然', '等等再', '應該'],
    "問候": ['嗨', '嘿', '你好', '早安'],
    "告別": ['掰掰', '拜拜', '再見', '晚安', '解散', '出門', '你刪掉啊'],
    "確認": ['真的假的', '確定', '確認', '好啦', '是嗎', '不確定'],
    "表達意圖": ['我要', '我想', '我會', '可能', '不可能'],
    "吐槽": ['笑死', '怎麼可能', '超醜', '靠腰', '你快點', '智障', '扁你'],
    "催促": ['快點', '最好快點', '趕緊', '供三信'],
    "安慰": ['不會吧', '又沒差', '沒事的'],
    "接梗": ['超好笑', '哈哈哈', '我也是', '後來也換'],
    "回答": ['是', '不是', '對', '錯', '好', '嗯', '喔', '應該', '沒有'],
    "提及": ['許恩開', '陳姿佑', '呂忠言', '張少權', '李冠毅', '尹俊翔', '蔡', '賀哥', '魏氏凱', '余葶', '理工男']
}


def keyword_infer_dialogue_act(message: str, prev_act: str = None) -> str:
    """Infer a dialogue act from substring keywords.

    The message is lower-cased and checked against each act's keyword list in
    table order; the first act with any keyword contained in the message wins.

    Args:
        message: The chat message.
        prev_act: Kept for interface compatibility. The previous revision's
            context branch returned "其他" on both paths, so this argument has
            no effect on the result.

    Returns:
        The matched act label, or "其他" when no keyword matches.
    """
    message_lower = message.lower()
    for act, kws in _DIALOGUE_ACT_KEYWORDS.items():
        if any(kw in message_lower for kw in kws):
            return act
    return "其他"

# Initialise the classifiers
logger.info("正在初始化情感分類器（j-hartmann/emotion-english-distilroberta-base）...")
# Emotion classifier on the first configured GPU; inputs are truncated to 128 tokens.
# add_emotion_labels consumes each result as a list of {label, score} entries.
# NOTE(review): the model name says "english", while the text passed to it is
# Chinese chat converted to Simplified — confirm this model is the intended one.
sentiment_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    device=device_ids[0],
    truncation=True,
    max_length=128,
    top_k=None
)

logger.info("正在初始化零樣本分類器...")
# Zero-shot classifier on the second configured GPU; used for both dialogue acts and topics.
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device_ids[1],
    truncation=True,
    max_length=512
)

# Dialogue-act classification
def infer_dialogue_act(messages: List[str], prev_acts: List[str] = None, embeddings: torch.Tensor = None) -> List[str]:
    """Label every message with a dialogue act.

    Each message is converted to Simplified Chinese and scored by the
    zero-shot classifier against ``act_labels``. Low-confidence predictions
    (top score < 0.3) fall back to context or keyword rules.

    Args:
        messages: Messages to classify.
        prev_acts: Optional per-message context acts, one per message.
            NOTE(review): when omitted every entry is None, so the two
            context rules below never fire and only the keyword fallback is
            active — confirm whether running predictions were meant to feed
            back in.
        embeddings: Optional tensor with one row per message, aligned with
            ``messages``; enables the similarity check on low-confidence items.

    Returns:
        One act label per message; never empty/None ("其他" is the fallback).

    Raises:
        ValueError: If ``prev_acts`` is given with a different length than
            ``messages`` (zip() would otherwise silently drop messages).
    """
    if prev_acts is None:
        prev_acts = [None] * len(messages)
    elif len(prev_acts) != len(messages):
        raise ValueError(
            f"prev_acts has {len(prev_acts)} entries but there are {len(messages)} messages"
        )

    simplified_messages = [converter_to_simplified.convert(msg) for msg in messages]
    batch_size = get_dynamic_batch_size(messages)
    logger.info(f"正在對 {len(messages)} 條訊息進行對話行為分類，批次大小: {batch_size}")

    num_batches = (len(messages) + batch_size - 1) // batch_size
    results = []
    with torch.no_grad():
        for i in tqdm(range(0, len(messages), batch_size), total=num_batches, desc="行為分類批次處理"):
            batch = simplified_messages[i:i + batch_size]
            batch_results = zero_shot_classifier(batch, act_labels, multi_label=False, batch_size=len(batch))
            # Defensive: a bare dict here would make extend() add its keys.
            if isinstance(batch_results, dict):
                batch_results = [batch_results]
            results.extend(batch_results)
            torch.cuda.empty_cache()
            gc.collect()

    predicted_acts = []
    for i, (message, prev_act, result) in tqdm(enumerate(zip(messages, prev_acts, results)), total=len(messages), desc="後處理對話行為"):
        predicted_act = result['labels'][0]
        confidence = result['scores'][0]

        if confidence < 0.3 and embeddings is not None and i > 0:
            # Low confidence: inherit the neighbour's act when the two messages
            # are semantically close, otherwise use the keyword rules.
            similarity = util.cos_sim(embeddings[i], embeddings[i-1]).item()
            if similarity > 0.7 and prev_acts[i-1]:
                predicted_act = prev_acts[i-1]
            else:
                predicted_act = keyword_infer_dialogue_act(message, prev_act)
        elif prev_act == "提問" and predicted_act not in ["提問", "補充", "回答"]:
            predicted_act = "其他"

        predicted_acts.append(predicted_act if predicted_act else "其他")
    return predicted_acts

# Parse the chat log
def parse_chat_log(chat_log: str, user_id: str, friend_id: str, time_threshold: int = 5) -> List[List[Tuple[str, str, int]]]:
    """Split an exported chat log into conversation groups.

    A message line has the form ``<上午|下午>H:MM <A|B> <text>``. Consecutive
    messages stay in one group while the gap between them is at most
    ``time_threshold`` minutes.

    Args:
        chat_log: Full text of the exported chat.
        user_id: Suffix for the role name given to speaker "B".
        friend_id: Suffix for the role name given to speaker "A".
        time_threshold: Maximum gap, in minutes, inside one group.

    Returns:
        A list of groups; each group is a list of ``(role, message, time)``
        tuples. ``time`` is in minutes: minute-of-day, plus a per-day offset
        once a date header has been seen, so messages from different days no
        longer share the same value.
        NOTE(review): date headers are assumed to be a bare ``YYYY/MM/DD``
        optionally followed by a short parenthesised weekday — confirm against
        the real export. Other lines starting with a date are skipped.
    """
    from datetime import date  # local import: only this function needs it

    date_prefix = re.compile(r'\d{4}/\d{2}/\d{2}')
    date_header = re.compile(r'(\d{4})/(\d{2})/(\d{2})\s*(?:[（(][^（）()]{1,4}[）)])?')
    message_pattern = re.compile(r'(上午|下午)(\d{1,2}):(\d{2})\s*([A-B])\s+(.+)')

    lines = [line.strip() for line in chat_log.split('\n') if line.strip() and "儲存日期" not in line]
    conversation_groups = []
    current_group = []
    prev_time = None
    day_offset = 0  # minutes contributed by the most recent date header

    logger.info(f"開始解析聊天記錄 (User: {user_id}, Friend: {friend_id})...")
    for line in tqdm(lines, desc="解析聊天行"):
        if date_prefix.match(line):
            header = date_header.fullmatch(line)
            if header:
                year, month, day = (int(part) for part in header.groups())
                try:
                    day_offset = date(year, month, day).toordinal() * 24 * 60
                except ValueError:
                    logger.warning(f"忽略無法解析的日期行: {line}")
            # Date-led lines are never messages.
            continue

        match = message_pattern.match(line)
        if not match:
            continue
        period, hour, minute, speaker, message = match.groups()
        cleaned_message = clean_text(message)
        if not cleaned_message:
            continue

        # 12-hour clock: 上午12:xx is 00:xx and 下午12:xx is 12:xx.
        hour_24 = int(hour) % 12 + (12 if period == "下午" else 0)
        current_time = day_offset + hour_24 * 60 + int(minute)
        # `is not None` so a legitimate time of 0 (midnight) still counts.
        if prev_time is not None and abs(current_time - prev_time) > time_threshold:
            conversation_groups.append(current_group)
            current_group = []
        role = f"User{user_id}" if speaker == "B" else f"User{friend_id}"
        current_group.append((role, cleaned_message, current_time))
        prev_time = current_time

    if current_group:
        conversation_groups.append(current_group)
    logger.info(f"解析完成，生成 {len(conversation_groups)} 組對話")
    return conversation_groups

# Add emotion labels
def _apply_emotion_keywords(message_lower: str, emotion: str) -> str:
    """Return the emotion of the first matching keyword rule, else ``emotion`` unchanged."""
    if '笑死' in message_lower or '好笑' in message_lower or '哈哈' in message_lower:
        return '開心' if '超' not in message_lower else '興奮'
    if '難過' in message_lower or '尷尬' in message_lower or '怕' in message_lower or '冷淡' in message_lower or '想哭' in message_lower:
        return '難過'
    if '嚇死' in message_lower:
        return '害怕'
    if '扁你' in message_lower:
        return '憤怒'
    if '沒安全感' in message_lower:
        return '焦慮'
    if '很高' in message_lower and '顏值' in message_lower:
        return '喜愛'
    if '你娘' in message_lower or '靠北' in message_lower or '耖你媽' in message_lower:
        return '難過' if '笑' not in message_lower else '開心'
    if '怎麼辦' in message_lower or '到底' in message_lower or '超趕' in message_lower:
        return '焦慮'
    if '我要睡了' in message_lower or '又沒差' in message_lower:
        return '懶散'
    if '去宜蘭玩' in message_lower or '超好笑' in message_lower:
        return '興奮' if '超' in message_lower else '開心'
    if '我沒準備' in message_lower or '我他媽' in message_lower or '超醜' in message_lower or '減肥' in message_lower:
        return '焦慮' if '怎麼辦' in message_lower else '無奈'
    if '不知道' in message_lower or '搞不好' in message_lower:
        return '困惑'
    if '超級冷' in message_lower and '好' not in message_lower:
        return '難過'
    return emotion


def add_emotion_labels(conversation_groups: List[List[Tuple[str, str, int]]], embeddings: torch.Tensor) -> List[List[Tuple[str, str, int, str]]]:
    """Append an emotion label to every message tuple.

    For each message the label is decided in this order, later steps
    overriding earlier ones: classifier top label -> emoji shortcode found in
    the demojized text -> (low confidence only) emotion inherited from the
    previous message of the same group when the two are similar -> keyword
    rules.

    Args:
        conversation_groups: Groups of ``(speaker, message, time)`` tuples.
        embeddings: One row per message, aligned with the flattened groups.

    Returns:
        The same grouping with ``(speaker, message, time, emotion)`` tuples.
    """
    all_messages = [emoji.demojize(msg) for group in conversation_groups for _, msg, _ in group]
    simplified_messages = [converter_to_simplified.convert(msg) for msg in all_messages]
    batch_size = get_dynamic_batch_size(all_messages)
    logger.info("開始全批量情感分類...")
    num_batches = (len(all_messages) + batch_size - 1) // batch_size
    results = []
    with torch.no_grad():
        for i in tqdm(range(0, len(all_messages), batch_size), total=num_batches, desc="情感分類批次處理"):
            batch = simplified_messages[i:i + batch_size]
            batch_results = sentiment_classifier(batch, batch_size=len(batch), truncation=True, max_length=128)
            results.extend(batch_results)
            torch.cuda.empty_cache()
            gc.collect()

    labeled_groups = [[] for _ in conversation_groups]
    emotion_map = {
        'joy': '開心', 'sadness': '難過', 'anger': '憤怒', 'fear': '害怕', 'love': '喜愛', 
        'surprise': '驚訝', 'neutral': '中性', ':smiling_face_with_heart-eyes:': '喜愛', 
        ':pouting_face:': '難過', ':laughing:': '開心', ':sob:': '難過', ':angry_face:': '憤怒',
        ':face_with_tears_of_joy:': '開心', ':pleading_face:': '喜愛', ':scream:': '興奮', 
        ':thinking_face:': '焦慮', ':weary_face:': '無奈', ':sleeping_face:': '懶散'
    }
    # Only the ":shortcode:" entries are emoji rules; the plain keys are
    # classifier labels and must not be matched as substrings of message text.
    # NOTE(review): verify these shortcodes against what emoji.demojize emits
    # (e.g. ':sob:' / ':scream:' look like alias-style names).
    emoji_emotions = {key: emo for key, emo in emotion_map.items() if key.startswith(':')}

    msg_idx = 0
    for group_idx, group in enumerate(tqdm(conversation_groups, desc="組裝情感標籤")):
        for speaker, message, time in group:
            best = max(results[msg_idx], key=lambda x: x['score'])
            confidence = best['score']
            emotion = emotion_map.get(best['label'], '中性')

            # Emoji check: shortcodes only exist in the demojized text, so the
            # raw message (which still holds the emoji characters) cannot match.
            demojized = all_messages[msg_idx]
            for emoji_key, emo in emoji_emotions.items():
                if emoji_key in demojized:
                    emotion = emo
                    break

            # Context: a low-confidence message that is close to its
            # predecessor inherits the previous label within the same group.
            if confidence < 0.5 and msg_idx > 0:
                similarity = util.cos_sim(embeddings[msg_idx], embeddings[msg_idx-1]).item()
                if similarity > 0.7:
                    emotion = labeled_groups[group_idx][-1][3] if labeled_groups[group_idx] else '中性'

            # Keyword post-processing has the final say.
            emotion = _apply_emotion_keywords(message.lower(), emotion)
            labeled_groups[group_idx].append((speaker, message, time, emotion))
            msg_idx += 1

    torch.cuda.empty_cache()
    logger.info("情感標籤添加完成")
    return labeled_groups

# Add topic labels
# Keyword -> topic rules, applied in this order; a topic is appended when any
# of its keywords occurs in the lower-cased message.
_TOPIC_KEYWORDS = (
    ('感情八卦', ('女朋友', '喜歡', '在一起', '約會', 'fu', '曖昧')),
    ('人際', ('群組', '男生', '女生', '室友', '朋友')),
    ('娛樂', ('笑死', '好笑', '照片', '電影', '超好笑')),
    ('社團活動', ('遊覽車', '出去玩', '計畫', '活動', '參加')),
    ('課業壓力', ('填', '名額', '家教', '考試', '進度')),
    ('生活瑣事', ('睡了', '洗澡', '行李', '出門', '吃飯', '起床')),
    ('科技', ('手機', '電腦', '科技', '版本')),
    ('健康', ('健康', '運動', '生病', '暈車')),
    ('旅行', ('桃園', '宜蘭', '走路', '車上', '台中', '台北')),
)


def add_topic_labels(conversation_groups: List[List[Tuple[str, str, int, str]]], embeddings: torch.Tensor) -> List[List[Tuple[str, str, int, str, str]]]:
    """Append a topic string to every message tuple.

    Candidate topics are the zero-shot classifier's top two labels scoring
    above 0.4, followed by keyword-rule topics. The first two distinct
    candidates are joined with "+" in sorted order. With no candidate, the
    previous message's topic (same group) is inherited when the two messages
    are similar; otherwise the topic is "日常".

    Args:
        conversation_groups: Groups of ``(speaker, message, time, emotion)``.
        embeddings: One row per message, aligned with the flattened groups.

    Returns:
        The same grouping with ``(speaker, message, time, emotion, topic)``.
    """
    all_messages = [msg for group in conversation_groups for _, msg, _, _ in group]
    simplified_messages = [converter_to_simplified.convert(msg) for msg in all_messages]
    batch_size = get_dynamic_batch_size(all_messages)
    logger.info("開始全批量話題分類...")
    num_batches = (len(all_messages) + batch_size - 1) // batch_size
    results = []
    with torch.no_grad():
        for i in tqdm(range(0, len(all_messages), batch_size), total=num_batches, desc="話題分類批次處理"):
            batch = simplified_messages[i:i + batch_size]
            batch_results = zero_shot_classifier(batch, topic_labels, multi_label=True, batch_size=len(batch))
            results.extend(batch_results)
            torch.cuda.empty_cache()
            gc.collect()

    labeled_groups = [[] for _ in conversation_groups]
    msg_idx = 0
    for group_idx, group in enumerate(tqdm(conversation_groups, desc="組裝話題標籤")):
        for speaker, message, time, emotion in group:
            result = results[msg_idx]
            ranked = sorted(zip(result['labels'], result['scores']), key=lambda x: x[1], reverse=True)[:2]
            predicted_topics = [label for label, score in ranked if score > 0.4]

            message_lower = message.lower()
            for topic, topic_keywords in _TOPIC_KEYWORDS:
                if any(kw in message_lower for kw in topic_keywords):
                    predicted_topics.append(topic)

            if not predicted_topics and msg_idx > 0:
                similarity = util.cos_sim(embeddings[msg_idx], embeddings[msg_idx-1]).item()
                if similarity > 0.7:
                    topic_str = labeled_groups[group_idx][-1][4] if labeled_groups[group_idx] else "日常"
                else:
                    topic_str = "日常"
            elif predicted_topics:
                # De-duplicate BEFORE truncating: a model label repeated by a
                # keyword rule must not consume the second slot and push out a
                # distinct topic. dict.fromkeys keeps first-seen order.
                distinct_topics = list(dict.fromkeys(predicted_topics))
                topic_str = "+".join(sorted(distinct_topics[:2]))
            else:
                topic_str = "日常"
            labeled_groups[group_idx].append((speaker, message, time, emotion, topic_str))
            msg_idx += 1

    torch.cuda.empty_cache()
    logger.info("話題標籤添加完成")
    return labeled_groups

# Format the training data
def format_training_data(conversation_groups: List[List[Tuple[str, str, int, str, str]]], embeddings: torch.Tensor, max_turns: int = 10, similarity_threshold: float = 0.7) -> List[dict]:
    """Turn labelled conversation groups into prompt/response training pairs.

    For every adjacent pair of messages in a group, the preceding messages
    (at most ``max_turns``) form the prompt and the next message the
    response. A pair is kept only when the two messages are similar enough,
    the response has at least three characters and contains one of a fixed
    set of characters, and both share the same first topic.

    Args:
        conversation_groups: Groups of ``(speaker, message, time, emotion, topic)``.
        embeddings: One row per message, aligned with the flattened order of
            ``conversation_groups``.
        max_turns: Maximum number of context messages in a prompt (values
            below 1 are treated as 1).
        similarity_threshold: Minimum cosine similarity between a message and
            its successor.

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts.
    """
    training_data = []
    msg_idx = 0
    max_turns = max(1, max_turns)  # an empty context window would break context[-1]

    logger.info("開始格式化訓練資料...")
    all_messages = [item[1] for group in conversation_groups for item in group]
    logger.info("預先批量計算所有對話行為...")
    all_dialogue_acts = infer_dialogue_act(all_messages, embeddings=embeddings)
    act_idx = 0

    for group in tqdm(conversation_groups, desc="格式化對話組"):
        messages = [item[1] for item in group]
        dialogue_acts = all_dialogue_acts[act_idx:act_idx + len(messages)]
        act_idx += len(messages)

        if len(messages) > 1:
            curr_embeddings = embeddings[msg_idx:msg_idx + len(messages) - 1]
            next_embeddings = embeddings[msg_idx + 1:msg_idx + len(messages)]
            # Diagonal = similarity of each message with its direct successor.
            similarities = util.cos_sim(curr_embeddings, next_embeddings).diagonal().cpu().numpy()
        else:
            similarities = np.array([])

        for i in range(len(group) - 1):
            context_start = max(0, i - max_turns + 1)
            context = group[context_start:i + 1]
            next_msg = group[i + 1]
            similarity = similarities[i] if i < len(similarities) else 0.0

            # Semantic completeness and topic consistency checks
            context_topic = context[-1][4].split("+")[0]  # first topic of the joined label
            response_topic = next_msg[4].split("+")[0]
            if (similarity >= similarity_threshold and len(next_msg[1]) >= 3 and 
                any(c in next_msg[1] for c in ['。', '！', '？', '是', '不', '我', '你', '他', '有', '沒']) and
                context_topic == response_topic):
                # `j` counts from the start of the context window, while
                # dialogue_acts is indexed from the start of the group, so the
                # window offset must be added back.
                prompt = "\n".join(f"{s}: {m} [情緒: {e}, 行為: {dialogue_acts[context_start + j]}, 話題: {t}]" 
                                 for j, (s, m, _, e, t) in enumerate(context))
                response = f"{next_msg[1]} [情緒: {next_msg[3]}, 行為: {dialogue_acts[i + 1]}, 話題: {next_msg[4]}]"
                training_data.append({"prompt": prompt, "response": response})
        msg_idx += len(group)

    logger.info(f"格式化完成，生成 {len(training_data)} 筆訓練資料")
    torch.cuda.empty_cache()
    return training_data

# Mix several chats together
def mix_conversations(chat_logs: List[str], friend_ids: List[str], time_threshold: int = 5, max_turns: int = 10, similarity_threshold: float = 0.7) -> List[dict]:
    """Parse, label and merge several chat logs, then build training pairs.

    Every log is parsed into groups, all messages are embedded and labelled
    with emotion and topic, then all messages are sorted by time and
    re-grouped: a new group starts when the time gap exceeds
    ``time_threshold`` or the first topic changes.

    Args:
        chat_logs: Raw text of each chat export.
        friend_ids: Friend id for each chat log (same length as ``chat_logs``).
        time_threshold: Maximum gap, in minutes, inside one group.
        max_turns: Forwarded to ``format_training_data``.
        similarity_threshold: Forwarded to ``format_training_data``.

    Returns:
        The list of prompt/response dicts from ``format_training_data``.

    Raises:
        ValueError: If ``chat_logs`` and ``friend_ids`` differ in length, or
            the embeddings do not have one row per message.
    """
    if len(chat_logs) != len(friend_ids):
        # zip() would silently drop the unmatched logs.
        raise ValueError(
            f"chat_logs has {len(chat_logs)} entries but friend_ids has {len(friend_ids)}"
        )

    all_groups = []
    for user_id, (chat_log, friend_id) in enumerate(tqdm(zip(chat_logs, friend_ids), desc="解析聊天檔案", total=len(chat_logs))):
        groups = parse_chat_log(chat_log, str(user_id), friend_id, time_threshold)
        all_groups.extend(groups)

    all_messages = [msg for group in all_groups for _, msg, _ in group]
    embeddings = precompute_embeddings(all_messages, num_gpus=2, chunk_size=5000)
    if embeddings.shape[0] != len(all_messages):
        raise ValueError(
            f"got {embeddings.shape[0]} embedding rows for {len(all_messages)} messages"
        )

    logger.info("開始添加情緒標籤...")
    all_groups = add_emotion_labels(all_groups, embeddings)
    logger.info("開始添加話題標籤...")
    all_groups = add_topic_labels(all_groups, embeddings)

    logger.info("開始混合對話...")
    flat_items = [item for group in all_groups for item in group]
    # Sort by time, remembering the permutation so the embeddings can be put
    # in the same order; format_training_data indexes them by position in the
    # mixed groups. sorted() is stable, matching list.sort() with the same key.
    order = sorted(range(len(flat_items)), key=lambda k: flat_items[k][2])
    all_conversations = [flat_items[k] for k in order]
    embeddings = embeddings[torch.tensor(order, dtype=torch.long)]

    mixed_groups = []
    current_group = []
    prev_time = None
    prev_topic = None
    for item in tqdm(all_conversations, desc="混合對話"):
        speaker, message, time, emotion, topic = item
        main_topic = topic.split("+")[0]
        # `is not None` so a legitimate time of 0 still counts as "seen".
        if prev_time is not None and (abs(time - prev_time) > time_threshold or main_topic != prev_topic):
            if current_group:
                mixed_groups.append(current_group)
            current_group = []
        current_group.append((speaker, message, time, emotion, topic))
        prev_time = time
        prev_topic = main_topic
    if current_group:
        mixed_groups.append(current_group)

    logger.info(f"混合完成，生成 {len(mixed_groups)} 組對話")
    return format_training_data(mixed_groups, embeddings, max_turns, similarity_threshold)

# Main entry point
def process_chat_to_training_data(chat_files: List[str], friend_ids: List[str], output_file: str, time_threshold: int = 5, max_turns: int = 10, similarity_threshold: float = 0.7):
    """Read chat exports, build training pairs and write them as JSON.

    Args:
        chat_files: Paths of the chat export files (UTF-8).
        friend_ids: Friend id for each chat file.
        output_file: Destination JSON path; its directory is created if needed.
        time_threshold: Forwarded to ``mix_conversations``.
        max_turns: Forwarded to ``mix_conversations``.
        similarity_threshold: Forwarded to ``mix_conversations``.

    Returns:
        None. If any input file is missing, an error is logged and nothing is
        written.
    """
    chat_logs = []
    for chat_file in tqdm(chat_files, desc="讀取聊天檔案"):
        try:
            with open(chat_file, 'r', encoding='utf-8') as f:
                chat_logs.append(f.read())
            logger.info(f"成功讀取檔案: {chat_file}")
        except FileNotFoundError:
            logger.error(f"找不到檔案: {chat_file}")
            return

    # Create the output directory before the expensive pipeline runs, so a
    # missing folder cannot fail the final write after all the GPU work.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    training_data = mix_conversations(chat_logs, friend_ids, time_threshold, max_turns, similarity_threshold)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(training_data, f, ensure_ascii=False, indent=2)

    logger.info(f"訓練資料已儲存至 {output_file}，總共 {len(training_data)} 筆資料")

# Test run: process one chat export, then log the first three generated samples
if __name__ == "__main__":
    chat_files = ["data/claire.txt"]
    friend_ids = ["2"]
    output_file = "output/out.json"
    
    process_chat_to_training_data(chat_files, friend_ids, output_file)
    
    # Read the result back as a sanity check; a missing file means the run
    # above did not write its output.
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            logger.info("前 3 筆訓練資料範例：")
            for i, sample in enumerate(data[:3]):
                logger.info(f"樣本 {i + 1}:")
                logger.info(f"Prompt: {sample['prompt']}")
                logger.info(f"Response: {sample['response']}")
    except FileNotFoundError:
        logger.error("無法讀取輸出檔案，請檢查處理過程是否成功。")

2025-03-28 15:39:09,872 - 正在初始化情感分類器（j-hartmann/emotion-english-distilroberta-base）...
Device set to use cuda:0
2025-03-28 15:39:10,642 - 正在初始化零樣本分類器...
Device set to use cuda:1
讀取聊天檔案:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-28 15:39:12,517 - 成功讀取檔案: data/claire.txt
讀取聊天檔案: 100%|██████████| 1/1 [00:00<00:00, 284.92it/s]
解析聊天檔案:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-28 15:39:12,524 - 開始解析聊天記錄 (User: 0, Friend: 2)...

解析聊天行: 100%|██████████| 2014/2014 [00:00<00:00, 200449.15it/s]
2025-03-28 15:39:12,540 - 解析完成，生成 116 組對話
解析聊天檔案: 100%|██████████| 1/1 [00:00<00:00, 47.89it/s]
2025-03-28 15:39:12,543 - 從快取檔案 embeddings.pt 載入嵌入向量...
2025-03-28 15:39:12,547 - 開始添加情緒標籤...
2025-03-28 15:39:12,608 - 開始全批量情感分類...
情感分類批次處理: 100%|██████████| 12/12 [00:02<00:00,  4.14it/s]
組裝情感標籤: 100%|██████████| 116/116 [00:00<00:00, 7498.37it/s]
2025-03-28 15:39:15,528 - 情感標籤添加完成
2025-03-28 15:39:15,530 - 開始添加話題標籤...
2025-03-28 15:39:15,545 - 開始全批量話題分類...
話題分類批次處理: 100%|██████████| 12/12 [00:48<00:00

In [9]:
import re
import json
from typing import List, Tuple
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import pipeline
from datasets import Dataset
import numpy as np
import logging
import gc
from tqdm import tqdm
import emoji
import os
import opencc

# Memory-management setting read by PyTorch's CUDA allocator
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Logging setup: INFO level, "<timestamp> - <message>" format
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)

# Multi-GPU environment
device_ids = [0, 1]  # two A10G cards
# Default device: first CUDA GPU when available, otherwise CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Release cached GPU memory
torch.cuda.empty_cache()

# Classification label sets (dialogue acts and topics) passed to the zero-shot classifier
act_labels = ["提問", "補充", "引入", "表達情感", "讚美", "道歉", "感謝", "請求", "拒絕", "建議", "問候", "告別", "確認", "表達意圖", "吐槽", "催促", "安慰", "接梗"]
topic_labels = ["活動", "感情", "學業", "娛樂", "人際", "課業壓力", "社團活動", "感情八卦", "生活瑣事"]

# Traditional <-> Simplified Chinese converters
converter_to_simplified = opencc.OpenCC('t2s')
converter_to_traditional = opencc.OpenCC('s2t')

# Dynamic batch size
def get_dynamic_batch_size(messages: List[str], base_batch_size: int = 256) -> int:
    """Pick an inference batch size based on current GPU memory pressure.

    Args:
        messages: Accepted for interface compatibility; it does not influence
            the result.
        base_batch_size: Batch size returned when memory pressure is low.

    Returns:
        ``base_batch_size`` normally, or ``max(16, base_batch_size // 2)`` when
        more than 80% of the assumed 24 GB is already reserved on ``device``.
        On hosts without CUDA, ``base_batch_size`` is returned unchanged.
    """
    # torch.cuda.memory_reserved() rejects non-CUDA devices, so a CPU-only host
    # must not reach the query below.
    if not torch.cuda.is_available():
        return base_batch_size

    reserved_gib = torch.cuda.memory_reserved(device) / 1024**3
    max_memory = 24  # A10G has 24 GB
    if reserved_gib > max_memory * 0.8:
        return max(16, base_batch_size // 2)
    return base_batch_size

# Single-process sharded embedding computation
def compute_chunk(chunk: List[str], gpu_id: int, batch_size: int, embedder: SentenceTransformer) -> torch.Tensor:
    """Encode one shard of messages on the given GPU and return the embeddings on cuda:0.

    Args:
        chunk: Messages to encode.
        gpu_id: CUDA device index the embedder is moved to for this shard.
        batch_size: Initial encode batch size; halved after each CUDA
            out-of-memory error (never below 1).
        embedder: The sentence-embedding model.

    Returns:
        The embeddings tensor for ``chunk``, moved to ``cuda:0``.

    Raises:
        torch.cuda.OutOfMemoryError: If encoding still runs out of memory at
            batch size 1.
    """
    torch.cuda.set_device(gpu_id)
    embedder.to(f"cuda:{gpu_id}")

    current_batch_size = max(1, batch_size)
    while True:
        try:
            # Inference only: no autograd state needs to be kept.
            with torch.no_grad():
                embeddings = embedder.encode(
                    chunk,
                    convert_to_tensor=True,
                    batch_size=current_batch_size,
                    show_progress_bar=False,
                )
            break
        except torch.cuda.OutOfMemoryError:
            if current_batch_size == 1:
                raise
        # Reached only after an OOM. The retry is prepared outside the `except`
        # clause so the exception (and the frames it references) is released
        # before the cache is flushed and encoding is attempted again.
        logger.warning(f"GPU {gpu_id} 記憶體不足，減半批次大小重試...")
        current_batch_size = max(1, current_batch_size // 2)
        gc.collect()
        torch.cuda.empty_cache()

    embeddings = embeddings.to("cuda:0")
    torch.cuda.empty_cache()
    gc.collect()
    return embeddings

def precompute_embeddings(messages: List[str], num_gpus: int = 2, chunk_size: int = 5000) -> torch.Tensor:
    """Compute one embedding row per message.

    Messages are converted to Simplified Chinese, split into shards of
    ``chunk_size`` and encoded shard by shard, alternating over the first
    ``num_gpus`` entries of ``device_ids``.

    Args:
        messages: Messages to embed, in the order the rows should appear.
        num_gpus: How many entries of ``device_ids`` to rotate over.
        chunk_size: Number of messages per shard.

    Returns:
        A CPU tensor with ``len(messages)`` rows.
    """
    logger.info(f"開始預計算 {len(messages)} 條訊息的嵌入向量...")
    embedder = SentenceTransformer('intfloat/multilingual-e5-large')
    if not messages:
        # torch.cat() rejects an empty list; return an empty matrix instead.
        return torch.empty((0, embedder.get_sentence_embedding_dimension()))

    simplified_messages = [converter_to_simplified.convert(msg) for msg in messages]
    chunks = [simplified_messages[i:i + chunk_size] for i in range(0, len(messages), chunk_size)]
    batch_size = get_dynamic_batch_size(messages)

    # Never index past the configured devices, even if num_gpus is larger.
    gpu_pool = device_ids[:max(1, num_gpus)]
    results = []

    for i, chunk in enumerate(tqdm(chunks, desc="計算嵌入分片")):
        gpu_id = gpu_pool[i % len(gpu_pool)]
        logger.info(f"處理分片 {i+1}/{len(chunks)} on GPU {gpu_id}")
        chunk_embeddings = compute_chunk(chunk, gpu_id, batch_size, embedder)
        results.append(chunk_embeddings)

    embeddings = torch.cat(results, dim=0)
    embeddings = embeddings.cpu()
    logger.info("嵌入計算完成")
    torch.cuda.empty_cache()
    return embeddings

# Text cleaning
def clean_text(text: str) -> str:
    """Strip chat-export noise from a message.

    Returns "" for empty input. Otherwise removes timestamps, media
    placeholders, recall notices, call records and asterisk-only content,
    then trims surrounding whitespace.
    """
    if text:
        noise = re.compile(r'(上午|下午)\d{1,2}:\d{2}\s*|\[照片\]|\[影片\]|\[貼圖\]|.*已收回訊息.*|☎.*|^\*+$')
        without_noise = noise.sub('', text)
        return without_noise.strip()
    return ""

# Keyword-based classification (extended acts and topics)
def keyword_infer_dialogue_act(message: str, prev_act: str = None) -> str:
    """Infer a dialogue act from substring keywords.

    The lower-cased message is checked against each act's keyword list in
    table order; the first act with a keyword contained in the message wins.
    With no match, "回答" is returned when ``prev_act`` is "提問"; otherwise
    the result is None (despite the ``str`` annotation).
    """
    keywords = {
        "提問": ['有沒有', '什麼', '會不會', '你覺得呢', '怎麼', '為什麼', '嗎', '哪個', '可不可以', '到了沒', '幾點', '到底', '知道', '多少'],
        "補充": ['因為', '所以', '雖然', '不過', '但是', '然後', '而且', '結果', '還有', '幫', '就'],
        "引入": ['重點是', '主要是', '說到', '講到', '提到', '我也不知道'],
        "表達情感": ['難過', '尷尬', '自豪', '麻煩', '緊張', '開心', '笑死', '希望', '怕', '覺得', '累了', '冷淡', '好笑', '靠北', '你娘', '超趕'],
        "讚美": ['漂亮', '帥', '很棒', '很讚', '很可愛', '適合', '好會', '好一點', '顏值', '很高'],
        "道歉": ['對不起', '抱歉', '不好意思', 'sorry'],
        "感謝": ['謝謝', '感謝', '感激', '多謝'],
        "請求": ['拜託', '請', '幫我', '可以嗎'],
        "拒絕": ['不要', '不行', '不可以', '不ok', '沒辦法', '不會', '我沒'],
        "建議": ['建議', '不如', '要不要', '不然', '等等再'],
        "問候": ['嗨', '嘿', '你好', '早安'],
        "告別": ['掰掰', '拜拜', '再見', '晚安', '解散', '出門', '你刪掉啊'],
        "確認": ['真的假的', '確定', '確認', '好啦'],
        "表達意圖": ['我要', '我想', '我會'],
        "吐槽": ['笑死', '怎麼可能', '超醜', '靠腰', '你快點'],
        "催促": ['快點', '最好快點', '趕緊', '供三信'],
        "安慰": ['不會吧', '又沒差', '沒事的'],
        "接梗": ['超好笑', '哈哈哈', '我也是吧', '後來也換']
    }
    lowered = message.lower()
    # First act (in table order) that has a keyword inside the message.
    matched_act = next(
        (act for act, act_keywords in keywords.items()
         if any(keyword in lowered for keyword in act_keywords)),
        None,
    )
    if matched_act is not None:
        return matched_act
    # No keyword hit: a message following a question is treated as an answer.
    if prev_act == "提問":
        return "回答"
    return None

# Initialise the classifiers
logger.info("正在初始化情感分類器（Erlangshen-Roberta-110M-Sentiment）...")
# Sentiment classifier on the first configured GPU; inputs are truncated to 128 tokens.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment",
    device=device_ids[0],
    truncation=True,
    max_length=128
)

logger.info("正在初始化零樣本分類器...")
# Zero-shot classifier, also on the first configured GPU in this revision.
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device_ids[0],
    truncation=True,
    max_length=512
)

# Dialogue-act classification
def infer_dialogue_act(messages: List[str], prev_acts: List[str] = None, embeddings: torch.Tensor = None) -> List[str]:
    """Label every message with a dialogue act.

    Each message is converted to Simplified Chinese and scored by the
    zero-shot classifier against ``act_labels``. Very low-confidence
    predictions (top score < 0.1) fall back to context or keyword rules.

    Args:
        messages: Messages to classify.
        prev_acts: Optional per-message context acts, one per message; all
            None when omitted.
        embeddings: Optional tensor with one row per message, aligned with
            ``messages``; enables the similarity check on low-confidence items.

    Returns:
        One act label per message. When neither the context nor the keyword
        fallback yields a label, "其他" is used so that None never ends up in
        the formatted training text.

    Raises:
        ValueError: If ``prev_acts`` is given with a different length than
            ``messages`` (zip() would otherwise silently drop messages).
    """
    if prev_acts is None:
        prev_acts = [None] * len(messages)
    elif len(prev_acts) != len(messages):
        raise ValueError(
            f"prev_acts has {len(prev_acts)} entries but there are {len(messages)} messages"
        )

    simplified_messages = [converter_to_simplified.convert(msg) for msg in messages]
    batch_size = get_dynamic_batch_size(messages)
    logger.info(f"正在對 {len(messages)} 條訊息進行對話行為分類，批次大小: {batch_size}")

    num_batches = (len(messages) + batch_size - 1) // batch_size
    results = []
    for i in tqdm(range(0, len(messages), batch_size), total=num_batches, desc="行為分類批次處理"):
        batch = simplified_messages[i:i + batch_size]
        batch_results = zero_shot_classifier(batch, act_labels, multi_label=False, batch_size=len(batch))
        # Defensive: a bare dict here would make extend() add its keys.
        if isinstance(batch_results, dict):
            batch_results = [batch_results]
        results.extend(batch_results)
        torch.cuda.empty_cache()
        gc.collect()

    predicted_acts = []
    for i, (message, prev_act, result) in tqdm(enumerate(zip(messages, prev_acts, results)), total=len(messages), desc="後處理對話行為"):
        predicted_act = result['labels'][0]
        confidence = result['scores'][0]

        if confidence < 0.1 and embeddings is not None and i > 0:  # threshold lowered to 0.1
            similarity = util.cos_sim(embeddings[i], embeddings[i-1]).item()
            if similarity > 0.7:
                predicted_act = prev_acts[i-1]
            else:
                predicted_act = keyword_infer_dialogue_act(message, prev_act)
        elif prev_act == "提問" and predicted_act not in ["提問", "補充", "回答"]:
            predicted_act = "回答"

        # Both fallbacks above can produce None (unset context entry, or no
        # keyword match); substitute a real label instead of leaking None.
        predicted_acts.append(predicted_act if predicted_act else "其他")
    return predicted_acts

# Parse the chat log
def parse_chat_log(chat_log: str, user_id: str, friend_id: str, time_threshold: int = 5) -> List[List[Tuple[str, str, int]]]:
    """Split an exported chat log into conversation groups.

    A message line has the form ``<上午|下午>H:MM <A|B> <text>``. Messages
    whose cleaned text is shorter than three characters are dropped.
    Consecutive messages stay in one group while the gap between them is at
    most ``time_threshold`` minutes.

    Args:
        chat_log: Full text of the exported chat.
        user_id: Suffix for the role name given to speaker "B".
        friend_id: Suffix for the role name given to speaker "A".
        time_threshold: Maximum gap, in minutes, inside one group.

    Returns:
        A list of groups; each group is a list of ``(role, message, time)``
        tuples. ``time`` is in minutes: minute-of-day, plus a per-day offset
        once a date header has been seen, so messages from different days no
        longer share the same value.
        NOTE(review): date headers are assumed to be a bare ``YYYY/MM/DD``
        optionally followed by a short parenthesised weekday — confirm against
        the real export. Other lines starting with a date are skipped.
    """
    from datetime import date  # local import: only this function needs it

    date_prefix = re.compile(r'\d{4}/\d{2}/\d{2}')
    date_header = re.compile(r'(\d{4})/(\d{2})/(\d{2})\s*(?:[（(][^（）()]{1,4}[）)])?')
    message_pattern = re.compile(r'(上午|下午)(\d{1,2}):(\d{2})\s*([A-B])\s+(.+)')

    lines = [line.strip() for line in chat_log.split('\n') if line.strip() and "儲存日期" not in line]
    conversation_groups = []
    current_group = []
    prev_time = None
    day_offset = 0  # minutes contributed by the most recent date header

    logger.info(f"開始解析聊天記錄 (User: {user_id}, Friend: {friend_id})...")
    for line in tqdm(lines, desc="解析聊天行"):
        if date_prefix.match(line):
            header = date_header.fullmatch(line)
            if header:
                year, month, day = (int(part) for part in header.groups())
                try:
                    day_offset = date(year, month, day).toordinal() * 24 * 60
                except ValueError:
                    logger.warning(f"忽略無法解析的日期行: {line}")
            # Date-led lines are never messages.
            continue

        match = message_pattern.match(line)
        if not match:
            continue
        period, hour, minute, speaker, message = match.groups()
        cleaned_message = clean_text(message)
        if not cleaned_message or len(cleaned_message) < 3:
            continue

        # 12-hour clock: 上午12:xx is 00:xx and 下午12:xx is 12:xx.
        hour_24 = int(hour) % 12 + (12 if period == "下午" else 0)
        current_time = day_offset + hour_24 * 60 + int(minute)
        # `is not None` so a legitimate time of 0 (midnight) still counts.
        if prev_time is not None and abs(current_time - prev_time) > time_threshold:
            conversation_groups.append(current_group)
            current_group = []
        role = f"User{user_id}" if speaker == "B" else f"User{friend_id}"
        current_group.append((role, cleaned_message, current_time))
        prev_time = current_time

    if current_group:
        conversation_groups.append(current_group)
    logger.info(f"解析完成，生成 {len(conversation_groups)} 組對話")
    return conversation_groups

# 添加情緒標籤
def add_emotion_labels(conversation_groups: List[List[Tuple[str, str, int]]], embeddings: torch.Tensor) -> List[List[Tuple[str, str, int, str]]]:
    """Append an emotion label to every message tuple.

    The label is decided in four passes, each able to override the previous:
    sentiment classifier output, emoji tokens, similarity to the preceding
    message (only when classifier confidence is low), and keyword rules.

    Args:
        conversation_groups: Groups of ``(speaker, message, time)`` tuples.
        embeddings: One row per message, in the flattened order of
            ``conversation_groups``.

    Returns:
        The same groups with ``(speaker, message, time, emotion)`` tuples.
    """
    sentiment_to_emotion = {'Positive': '開心', 'Negative': '難過', 'Neutral': '中性'}
    # Kept separate from the sentiment labels so that a message containing
    # the literal text "Positive"/"Negative" is not treated as an emoji hit.
    # NOTE(review): some keys (':sob:', ':laughing:', ':scream:') look like
    # alias-style names; confirm they match what emoji.demojize() emits.
    emoji_to_emotion = {
        ':smiling_face_with_heart-eyes:': '喜愛', ':pouting_face:': '難過',
        ':laughing:': '開心', ':sob:': '難過', ':angry_face:': '憤怒',
        ':face_with_tears_of_joy:': '開心', ':pleading_face:': '喜愛',
        ':scream:': '興奮', ':thinking_face:': '焦慮', ':weary_face:': '無奈',
        ':sleeping_face:': '懶散'
    }

    all_messages = [emoji.demojize(msg) for group in conversation_groups for _, msg, _ in group]
    simplified_messages = [converter_to_simplified.convert(msg) for msg in all_messages]
    batch_size = get_dynamic_batch_size(all_messages)
    logger.info("開始全批量情感分類...")
    num_batches = (len(all_messages) + batch_size - 1) // batch_size
    results = []
    for i in tqdm(range(0, len(all_messages), batch_size), total=num_batches, desc="情感分類批次處理"):
        batch = simplified_messages[i:i + batch_size]
        batch_results = sentiment_classifier(batch, batch_size=len(batch), truncation=True, max_length=128)
        results.extend(batch_results)
        torch.cuda.empty_cache()
        gc.collect()

    labeled_groups = [[] for _ in conversation_groups]
    msg_idx = 0
    for group_idx, group in enumerate(tqdm(conversation_groups, desc="組裝情感標籤")):
        for speaker, message, time in group:
            sentiment = results[msg_idx]['label']
            confidence = results[msg_idx]['score']
            emotion = sentiment_to_emotion.get(sentiment, '中性')
            # Emoji names only exist in the demojized text; the raw message
            # still holds the emoji characters, so it can never match a key.
            demojized = all_messages[msg_idx]
            for emoji_key, emo in emoji_to_emotion.items():
                if emoji_key in demojized:
                    emotion = emo
                    break
            # Low classifier confidence: inherit from a similar previous
            # message in the same group.
            if confidence < 0.5 and msg_idx > 0:
                similarity = util.cos_sim(embeddings[msg_idx], embeddings[msg_idx-1]).item()
                if similarity > 0.7:
                    emotion = labeled_groups[group_idx][-1][3] if labeled_groups[group_idx] else '中性'
            # Keyword rules have the final say; the first matching branch wins.
            message_lower = message.lower()
            if '笑死' in message_lower or '好笑' in message_lower or '哈哈' in message_lower:
                emotion = '開心' if '超' not in message_lower else '興奮'
            elif '難過' in message_lower or '尷尬' in message_lower or '怕' in message_lower or '冷淡' in message_lower:
                emotion = '難過'
            elif '很高' in message_lower and '顏值' in message_lower:
                emotion = '喜愛'
            elif '你娘' in message_lower or '靠北' in message_lower or '耖你媽' in message_lower:
                emotion = '難過' if '笑' not in message_lower else '開心'
            elif '怎麼辦' in message_lower or '到底' in message_lower or '超趕' in message_lower:
                emotion = '焦慮'
            elif '我要睡了' in message_lower or '又沒差' in message_lower:
                emotion = '懶散'
            elif '去宜蘭玩' in message_lower or '超好笑' in message_lower:
                emotion = '興奮' if '超' in message_lower else '開心'
            elif '我沒準備' in message_lower or '我他媽' in message_lower:
                emotion = '焦慮' if '怎麼辦' in message_lower else '無奈'
            labeled_groups[group_idx].append((speaker, message, time, emotion))
            msg_idx += 1

    torch.cuda.empty_cache()
    logger.info("情感標籤添加完成")
    return labeled_groups

# 添加話題標籤
def add_topic_labels(conversation_groups: List[List[Tuple[str, str, int, str]]], embeddings: torch.Tensor) -> List[List[Tuple[str, str, int, str, str]]]:
    """Append a topic string to every message tuple.

    Topics come from the zero-shot classifier (top 3 labels scoring above
    0.4) plus keyword rules. When neither yields a topic, the previous
    message's topic is reused if the two messages are similar; otherwise the
    topic is "日常". Multiple topics are de-duplicated, sorted and joined
    with "+".

    Args:
        conversation_groups: Groups of ``(speaker, message, time, emotion)``.
        embeddings: One row per message, in the flattened order of
            ``conversation_groups``.

    Returns:
        The same groups with ``(speaker, message, time, emotion, topic)``.
    """
    # (keywords, topic): the topic is added when any keyword occurs.
    keyword_topics = (
        (('女朋友', '喜歡', '在一起'), '感情八卦'),
        (('群組', '男生', '女生', '室友'), '人際'),
        (('笑死', '好笑', '照片'), '娛樂'),
        (('遊覽車', '出去玩', '計畫', '股練'), '社團活動'),
        (('填', '名額', '家教'), '課業壓力'),
        (('睡了', '洗澡', '行李', '出門'), '生活瑣事'),
    )

    all_messages = [msg for group in conversation_groups for _, msg, _, _ in group]
    simplified_messages = [converter_to_simplified.convert(msg) for msg in all_messages]
    batch_size = get_dynamic_batch_size(all_messages)
    logger.info("開始全批量話題分類...")
    num_batches = (len(all_messages) + batch_size - 1) // batch_size
    results = []
    for i in tqdm(range(0, len(all_messages), batch_size), total=num_batches, desc="話題分類批次處理"):
        batch = simplified_messages[i:i + batch_size]
        batch_results = zero_shot_classifier(batch, topic_labels, multi_label=True, batch_size=len(batch))
        # Guard against a bare dict being returned for a one-item batch
        # (believed to happen on some transformers releases); extending
        # with it would add its keys instead of one result.
        if isinstance(batch_results, dict):
            batch_results = [batch_results]
        results.extend(batch_results)
        torch.cuda.empty_cache()
        gc.collect()

    labeled_groups = [[] for _ in conversation_groups]
    msg_idx = 0
    for group_idx, group in enumerate(tqdm(conversation_groups, desc="組裝話題標籤")):
        for speaker, message, time, emotion in group:
            result = results[msg_idx]
            ranked = sorted(zip(result['labels'], result['scores']), key=lambda x: x[1], reverse=True)[:3]
            predicted_topics = [label for label, score in ranked if score > 0.4]
            message_lower = message.lower()
            for keywords, topic in keyword_topics:
                if any(kw in message_lower for kw in keywords):
                    predicted_topics.append(topic)

            if predicted_topics:
                topic_str = "+".join(sorted(set(predicted_topics)))
            else:
                topic_str = "日常"
                # Nothing predicted: inherit from a similar previous message
                # in the same group.
                if msg_idx > 0 and labeled_groups[group_idx]:
                    similarity = util.cos_sim(embeddings[msg_idx], embeddings[msg_idx-1]).item()
                    if similarity > 0.7:
                        topic_str = labeled_groups[group_idx][-1][4]
            labeled_groups[group_idx].append((speaker, message, time, emotion, topic_str))
            msg_idx += 1

    torch.cuda.empty_cache()
    logger.info("話題標籤添加完成")
    return labeled_groups

# 優化格式化訓練數據
def format_training_data(conversation_groups: List[List[Tuple[str, str, int, str, str]]], embeddings: torch.Tensor, max_turns: int = 10, similarity_threshold: float = 0.8) -> List[dict]:
    """Turn labelled conversation groups into prompt/response samples.

    For each pair of consecutive messages in a group, a sample is emitted
    when their cosine similarity is at least ``similarity_threshold`` and the
    reply has at least 3 characters. The prompt is the up-to-``max_turns``
    messages ending at the earlier one; the response is the later one. Both
    are annotated with emotion, dialogue act and topic.

    Args:
        conversation_groups: Groups of
            ``(speaker, message, time, emotion, topic)`` tuples.
        embeddings: One row per message; must be in the flattened order of
            ``conversation_groups``.
        max_turns: Maximum number of context messages in a prompt.
        similarity_threshold: Minimum similarity between a message and its
            reply for the pair to be kept.

    Returns:
        A list of ``{"prompt": str, "response": str}`` dicts.
    """
    training_data = []
    msg_idx = 0

    logger.info("開始格式化訓練資料...")
    all_messages = [item[1] for group in conversation_groups for item in group]
    logger.info("預先批量計算所有對話行為...")
    all_dialogue_acts = infer_dialogue_act(all_messages, embeddings=embeddings)
    act_idx = 0

    for group in tqdm(conversation_groups, desc="格式化對話組"):
        messages = [item[1] for item in group]
        dialogue_acts = all_dialogue_acts[act_idx:act_idx + len(messages)]
        act_idx += len(messages)

        if len(messages) > 1:
            curr_embeddings = embeddings[msg_idx:msg_idx + len(messages) - 1]
            next_embeddings = embeddings[msg_idx + 1:msg_idx + len(messages)]
            similarities = util.cos_sim(curr_embeddings, next_embeddings).diagonal().cpu().numpy()
        else:
            similarities = np.array([])

        for i in range(len(group) - 1):
            start = max(0, i - max_turns + 1)
            context = group[start:i + 1]
            # Slice the acts with the same window as the context; indexing
            # from 0 would pair messages with the acts of the group's first
            # turns once the window has slid past them.
            context_acts = dialogue_acts[start:i + 1]
            next_msg = group[i + 1]
            similarity = similarities[i] if i < len(similarities) else 0.0

            if similarity >= similarity_threshold and len(next_msg[1]) >= 3:
                prompt = "\n".join(f"{s}: {m} [情緒: {e}, 行為: {act}, 話題: {t}]"
                                   for (s, m, _, e, t), act in zip(context, context_acts))
                response = f"{next_msg[1]} [情緒: {next_msg[3]}, 行為: {dialogue_acts[i + 1]}, 話題: {next_msg[4]}]"
                training_data.append({"prompt": prompt, "response": response})
        msg_idx += len(group)

    logger.info(f"格式化完成，生成 {len(training_data)} 筆訓練資料")
    torch.cuda.empty_cache()
    return training_data

# 混合多人群聊
def mix_conversations(chat_logs: List[str], friend_ids: List[str], time_threshold: int = 5, max_turns: int = 10, similarity_threshold: float = 0.8) -> List[dict]:
    """Parse, label, merge and format several chat logs into training data.

    Every log is parsed into groups, all messages are embedded once and
    labelled with emotion and topic, then the messages of all logs are
    merged by time of day and regrouped whenever the time gap exceeds
    ``time_threshold`` or the primary topic changes.

    Args:
        chat_logs: Raw text of each chat export.
        friend_ids: Friend id for each log, paired positionally with
            ``chat_logs``; the log's index is used as the user id.
        time_threshold: Maximum gap in minutes inside one group.
        max_turns: Forwarded to ``format_training_data``.
        similarity_threshold: Forwarded to ``format_training_data``.

    Returns:
        The samples produced by ``format_training_data``.
    """
    all_groups = []
    for user_id, (chat_log, friend_id) in enumerate(tqdm(zip(chat_logs, friend_ids), desc="解析聊天檔案", total=len(chat_logs))):
        groups = parse_chat_log(chat_log, str(user_id), friend_id, time_threshold)
        all_groups.extend(groups)

    all_messages = [msg for group in all_groups for _, msg, _ in group]
    embeddings = precompute_embeddings(all_messages, num_gpus=2, chunk_size=5000)

    logger.info("開始添加情緒標籤...")
    all_groups = add_emotion_labels(all_groups, embeddings)
    logger.info("開始添加話題標籤...")
    all_groups = add_topic_labels(all_groups, embeddings)

    logger.info("開始混合對話...")
    all_conversations = [item for group in all_groups for item in group]
    # Sort indices rather than the items so the embedding rows can be
    # permuted the same way; format_training_data indexes embeddings by
    # position in the regrouped message order.
    order = sorted(range(len(all_conversations)), key=lambda idx: all_conversations[idx][2])
    all_conversations = [all_conversations[idx] for idx in order]
    embeddings = embeddings.index_select(
        0, torch.tensor(order, dtype=torch.long, device=embeddings.device)
    )

    mixed_groups = []
    current_group = []
    prev_time = None
    prev_topic = None
    for item in tqdm(all_conversations, desc="混合對話"):
        speaker, message, time, emotion, topic = item
        primary_topic = topic.split("+")[0]
        # prev_time may legitimately be 0, so test against None.
        if prev_time is not None and (abs(time - prev_time) > time_threshold or primary_topic != prev_topic):
            if current_group:
                mixed_groups.append(current_group)
            current_group = []
        current_group.append((speaker, message, time, emotion, topic))
        prev_time = time
        prev_topic = primary_topic
    if current_group:
        mixed_groups.append(current_group)

    logger.info(f"混合完成，生成 {len(mixed_groups)} 組對話")
    return format_training_data(mixed_groups, embeddings, max_turns, similarity_threshold)

# 主函數
def process_chat_to_training_data(chat_files: List[str], friend_ids: List[str], output_file: str, time_threshold: int = 5, max_turns: int = 10, similarity_threshold: float = 0.8):
    """Read chat files, build training samples and write them as JSON.

    Args:
        chat_files: Paths of UTF-8 chat exports.
        friend_ids: Friend id for each file, paired positionally.
        output_file: Destination JSON path; its parent directory is created
            if missing.
        time_threshold: Forwarded to ``mix_conversations``.
        max_turns: Forwarded to ``mix_conversations``.
        similarity_threshold: Forwarded to ``mix_conversations``.

    Returns:
        None. If any input file is missing, an error is logged and nothing
        is written.
    """
    chat_logs = []
    for chat_file in tqdm(chat_files, desc="讀取聊天檔案"):
        try:
            with open(chat_file, 'r', encoding='utf-8') as f:
                chat_logs.append(f.read())
            logger.info(f"成功讀取檔案: {chat_file}")
        except FileNotFoundError:
            logger.error(f"找不到檔案: {chat_file}")
            return

    # Create the output directory before the expensive processing so a
    # missing folder cannot discard the finished result at write time.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    training_data = mix_conversations(chat_logs, friend_ids, time_threshold, max_turns, similarity_threshold)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(training_data, f, ensure_ascii=False, indent=2)

    logger.info(f"訓練資料已儲存至 {output_file}，總共 {len(training_data)} 筆資料")

# 測試用
if __name__ == "__main__":
    # Smoke run: process one chat export, then log the first three samples
    # read back from the written JSON file.
    output_file = "output/out.json"
    friend_ids = ["2"]
    chat_files = ["data/claire.txt"]

    process_chat_to_training_data(chat_files, friend_ids, output_file)

    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            logger.info("前 3 筆訓練資料範例：")
            for number, sample in enumerate(data[:3], start=1):
                logger.info(f"樣本 {number}:")
                logger.info(f"Prompt: {sample['prompt']}")
                logger.info(f"Response: {sample['response']}")
    except FileNotFoundError:
        # The processing step returns without writing when an input is missing.
        logger.error("無法讀取輸出檔案，請檢查處理過程是否成功。")

2025-03-28 15:46:06,908 - 正在初始化情感分類器（Erlangshen-Roberta-110M-Sentiment）...
Device set to use cuda:0
2025-03-28 15:46:07,990 - 正在初始化零樣本分類器...
Device set to use cuda:0
讀取聊天檔案:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-28 15:46:09,862 - 成功讀取檔案: data/claire.txt
讀取聊天檔案: 100%|██████████| 1/1 [00:00<00:00, 382.62it/s]
解析聊天檔案:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-28 15:46:09,870 - 開始解析聊天記錄 (User: 0, Friend: 2)...

解析聊天行: 100%|██████████| 2014/2014 [00:00<00:00, 212201.77it/s]
2025-03-28 15:46:09,885 - 解析完成，生成 116 組對話
解析聊天檔案: 100%|██████████| 1/1 [00:00<00:00, 47.95it/s]
2025-03-28 15:46:09,888 - 開始預計算 1499 條訊息的嵌入向量...
2025-03-28 15:46:09,910 - Use pytorch device_name: cuda
2025-03-28 15:46:09,911 - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large
計算嵌入分片:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-28 15:46:13,783 - 處理分片 1/1 on GPU 0
計算嵌入分片: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]
2025-03-28 15:46:16,527 - 嵌入計算完成
2025-03-28 15:46:16,527 - 開始添加情緒標籤...
2025-03

In [12]:
import re
import json
from typing import List, Tuple
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import pipeline
from datasets import Dataset
import numpy as np
import logging
import gc
from tqdm import tqdm
import emoji
import os
import opencc

# Configure the PyTorch CUDA allocator through its environment variable
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)

# Multi-GPU setup
device_ids = [0, 1]  # two A10G GPUs
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Release cached GPU memory
torch.cuda.empty_cache()

# Candidate labels for dialogue-act and topic classification
act_labels = ["提問", "補充", "引入", "表達情感", "讚美", "道歉", "感謝", "請求", "拒絕", "建議", "問候", "告別", "確認", "表達意圖", "吐槽", "催促", "安慰", "接梗"]
topic_labels = ["活動", "感情", "學業", "娛樂", "人際", "課業壓力", "社團活動", "感情八卦", "生活瑣事", "宿舍生活", "考試", "旅行", "美食", "科技"]

# Simplified/Traditional Chinese converters
converter_to_simplified = opencc.OpenCC('t2s')
converter_to_traditional = opencc.OpenCC('s2t')

# 動態批次大小
def get_dynamic_batch_size(messages: List[str], base_batch_size: int = 256) -> int:
    """Choose a batch size based on how much GPU memory is currently reserved.

    Args:
        messages: Not inspected; accepted for call-site symmetry.
        base_batch_size: Batch size to use when memory pressure is low.

    Returns:
        ``base_batch_size`` normally; half of it (but at least 16) when more
        than 80% of the assumed 24 GB is reserved on ``device``.
    """
    capacity_gib = 24  # A10G 24GB
    reserved_gib = torch.cuda.memory_reserved(device) / 1024**3
    if reserved_gib > capacity_gib * 0.8:
        halved = base_batch_size // 2
        return max(16, halved)
    return base_batch_size

# 單進程分片計算嵌入
def compute_chunk(chunk: List[str], gpu_id: int, batch_size: int, embedder: SentenceTransformer) -> torch.Tensor:
    """Encode one chunk of texts on the given GPU and return it on cuda:0.

    If encoding runs out of GPU memory, the cache is flushed and the chunk
    is encoded once more with half the batch size (never below 1).

    Args:
        chunk: Texts to encode.
        gpu_id: CUDA device index to run the encoder on.
        batch_size: Encoder batch size for the first attempt.
        embedder: Model used for encoding; it is moved to ``cuda:{gpu_id}``.

    Returns:
        The embeddings tensor, moved to ``cuda:0``.

    Raises:
        torch.cuda.OutOfMemoryError: If the retry also runs out of memory.
    """
    torch.cuda.set_device(gpu_id)
    embedder.to(f"cuda:{gpu_id}")

    def _encode(size: int) -> torch.Tensor:
        encoded = embedder.encode(chunk, convert_to_tensor=True, batch_size=size, show_progress_bar=False)
        return encoded.to("cuda:0")

    try:
        embeddings = _encode(batch_size)
    except torch.cuda.OutOfMemoryError:
        embeddings = None

    if embeddings is None:
        # Retry outside the except block: while the handler is active the
        # exception keeps the failed call's frames (and tensors) alive.
        logger.warning(f"GPU {gpu_id} 記憶體不足，減半批次大小重試...")
        gc.collect()
        torch.cuda.empty_cache()
        embeddings = _encode(max(1, batch_size // 2))

    torch.cuda.empty_cache()
    gc.collect()
    return embeddings

def precompute_embeddings(messages: List[str], num_gpus: int = 2, chunk_size: int = 5000) -> torch.Tensor:
    """Embed all messages in chunks, rotating chunks over the available GPUs.

    Messages are converted to Simplified Chinese before encoding.

    Args:
        messages: Texts to embed.
        num_gpus: Desired number of GPUs; clamped to the entries of
            ``device_ids`` and to the number of visible CUDA devices.
        chunk_size: Number of messages encoded per chunk.

    Returns:
        A CPU tensor with one row per message, in input order. For empty
        input an empty ``(0, 0)`` tensor is returned without loading the
        model.
    """
    logger.info(f"開始預計算 {len(messages)} 條訊息的嵌入向量...")
    if not messages:
        # torch.cat() rejects an empty list, so short-circuit here.
        return torch.empty((0, 0))

    simplified_messages = [converter_to_simplified.convert(msg) for msg in messages]
    chunks = [simplified_messages[i:i + chunk_size] for i in range(0, len(messages), chunk_size)]
    batch_size = get_dynamic_batch_size(messages)

    # Never index past device_ids or address a GPU that is not present.
    usable_count = max(1, min(num_gpus, len(device_ids), torch.cuda.device_count()))
    usable_gpus = device_ids[:usable_count]

    # NOTE(review): E5 models are usually fed "query: "/"passage: " prefixed
    # text; confirm against the model card whether that is needed here.
    embedder = SentenceTransformer('intfloat/multilingual-e5-large')
    results = []

    for i, chunk in enumerate(tqdm(chunks, desc="計算嵌入分片")):
        gpu_id = usable_gpus[i % len(usable_gpus)]
        logger.info(f"處理分片 {i+1}/{len(chunks)} on GPU {gpu_id}")
        chunk_embeddings = compute_chunk(chunk, gpu_id, batch_size, embedder)
        results.append(chunk_embeddings)

    embeddings = torch.cat(results, dim=0)
    embeddings = embeddings.cpu()
    logger.info("嵌入計算完成")
    torch.cuda.empty_cache()
    return embeddings

# 清洗文本
def clean_text(text: str) -> str:
    """Strip chat-export noise from a message and trim surrounding whitespace.

    Removed: leading-style time stamps, the [照片]/[影片]/[貼圖] placeholders,
    any text containing "已收回訊息", everything from a "☎" onward, and a
    message made only of asterisks. Falsy input yields an empty string.
    """
    if text:
        noise = re.compile(r'(上午|下午)\d{1,2}:\d{2}\s*|\[照片\]|\[影片\]|\[貼圖\]|.*已收回訊息.*|☎.*|^\*+$')
        stripped = noise.sub('', text)
        return stripped.strip()
    return ""

# 關鍵詞分類（優化行為標籤）
# Dialogue acts in priority order; the first act with any keyword contained
# in the message wins.
_DIALOGUE_ACT_KEYWORDS = (
    ("提問", ('有沒有', '什麼', '會不會', '你覺得呢', '怎麼', '為什麼', '嗎', '哪個', '可不可以', '到了沒', '幾點', '到底', '知道', '多少')),
    ("補充", ('因為', '所以', '雖然', '不過', '但是', '然後', '而且', '結果', '還有', '幫', '就', '剛剛')),
    ("引入", ('重點是', '主要是', '說到', '講到', '提到', '我也不知道')),
    ("表達情感", ('難過', '尷尬', '自豪', '麻煩', '緊張', '開心', '笑死', '希望', '怕', '覺得', '累了', '冷淡', '好笑', '靠北', '你娘', '超趕')),
    ("讚美", ('漂亮', '帥', '很棒', '很讚', '很可愛', '適合', '好會', '好一點', '顏值', '很高')),
    ("道歉", ('對不起', '抱歉', '不好意思', 'sorry')),
    ("感謝", ('謝謝', '感謝', '感激', '多謝')),
    ("請求", ('拜託', '請', '幫我', '可以嗎')),
    ("拒絕", ('不要', '不行', '不可以', '不ok', '沒辦法', '不會', '我沒')),
    ("建議", ('建議', '不如', '要不要', '不然', '等等再', '你可以')),
    ("問候", ('嗨', '嘿', '你好', '早安')),
    ("告別", ('掰掰', '拜拜', '再見', '晚安', '解散', '出門', '你刪掉啊')),
    ("確認", ('真的假的', '確定', '確認', '好啦')),
    ("表達意圖", ('我要', '我想', '我會')),
    ("吐槽", ('笑死', '怎麼可能', '超醜', '靠腰', '你快點')),
    ("催促", ('快點', '最好快點', '趕緊', '供三信')),
    ("安慰", ('不會吧', '又沒差', '沒事的')),
    ("接梗", ('超好笑', '哈哈哈', '我也是吧', '後來也換')),
)


def keyword_infer_dialogue_act(message: str, prev_act: str = None) -> str:
    """Guess a dialogue act from keywords, with simple context fallbacks.

    Args:
        message: The message text; matching is done on its lower-cased form.
        prev_act: Act of the preceding message, if known.

    Returns:
        The first act in the keyword table with a keyword in the message.
        Otherwise "回答" after a "提問", "表達情感" for text containing
        '覺得' or '很', "確認" when both '真的' and '假的' occur, and
        "閒聊" as the default.
    """
    text = message.lower()
    keyword_hit = next(
        (act for act, needles in _DIALOGUE_ACT_KEYWORDS
         if any(needle in text for needle in needles)),
        None,
    )
    if keyword_hit is not None:
        return keyword_hit

    # No keyword matched: fall back on context and weaker cues.
    if prev_act == "提問" and '嗎' not in text:
        return "回答"
    if '覺得' in text or '很' in text:
        return "表達情感"
    if '真的' in text and '假的' in text:
        return "確認"
    return "閒聊"

# Initialise the classifiers (created once at module level and used by the
# labelling functions in this file)
logger.info("正在初始化情感分類器（Erlangshen-Roberta-110M-Sentiment）...")
# Sentiment pipeline on the first GPU in device_ids; inputs truncated to 128 tokens.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment",
    device=device_ids[0],
    truncation=True,
    max_length=128
)

logger.info("正在初始化零樣本分類器...")
# Zero-shot pipeline used for both dialogue-act and topic labels.
# NOTE(review): facebook/bart-large-mnli appears to be an English NLI model,
# while the inputs and candidate labels here are Chinese — confirm whether a
# multilingual NLI model is needed.
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device_ids[0],
    truncation=True,
    max_length=512
)

# 對話行為分類
def infer_dialogue_act(messages: List[str], prev_acts: List[str] = None, embeddings: torch.Tensor = None) -> List[str]:
    """Classify the dialogue act of every message.

    The zero-shot classifier's top label is the starting point. Two context
    rules may override it:

    * very low confidence (< 0.1): copy the previous act when the message is
      similar to the previous one, otherwise use keyword inference;
    * the previous act is "提問" and the label is not one of
      提問/補充/回答: label the message "回答".

    Args:
        messages: Messages in conversation order.
        prev_acts: Optional caller-supplied context acts, aligned with
            ``messages``. When omitted, the act predicted for the preceding
            message is used as context; without this the list was all
            ``None`` and neither rule could use a previous act.
        embeddings: Optional tensor with one row per message, used for the
            similarity check.

    Returns:
        One act label per message.
    """
    caller_supplied_context = prev_acts is not None
    if not caller_supplied_context:
        prev_acts = [None] * len(messages)

    simplified_messages = [converter_to_simplified.convert(msg) for msg in messages]
    batch_size = get_dynamic_batch_size(messages)
    logger.info(f"正在對 {len(messages)} 條訊息進行對話行為分類，批次大小: {batch_size}")

    num_batches = (len(messages) + batch_size - 1) // batch_size
    results = []
    for i in tqdm(range(0, len(messages), batch_size), total=num_batches, desc="行為分類批次處理"):
        batch = simplified_messages[i:i + batch_size]
        batch_results = zero_shot_classifier(batch, act_labels, multi_label=False, batch_size=len(batch))
        # Guard against a bare dict being returned for a one-item batch
        # (believed to happen on some transformers releases).
        if isinstance(batch_results, dict):
            batch_results = [batch_results]
        results.extend(batch_results)
        torch.cuda.empty_cache()
        gc.collect()

    predicted_acts = []
    for i, (message, given_prev, result) in tqdm(enumerate(zip(messages, prev_acts, results)), total=len(messages), desc="後處理對話行為"):
        predicted_act = result['labels'][0]
        confidence = result['scores'][0]

        if caller_supplied_context:
            # Unchanged behaviour for explicit context.
            prev_act = given_prev
            neighbour_act = prev_acts[i - 1] if i > 0 else None
        else:
            prev_act = neighbour_act = predicted_acts[-1] if predicted_acts else None

        if confidence < 0.1 and embeddings is not None and i > 0:
            similarity = util.cos_sim(embeddings[i], embeddings[i-1]).item()
            if similarity > 0.7 and neighbour_act:
                predicted_act = neighbour_act
            else:
                predicted_act = keyword_infer_dialogue_act(message, prev_act)
        elif prev_act == "提問" and predicted_act not in ["提問", "補充", "回答"]:
            predicted_act = "回答"

        if not predicted_act:
            predicted_act = "閒聊"

        predicted_acts.append(predicted_act)
    return predicted_acts

# 解析聊天記錄
def parse_chat_log(chat_log: str, user_id: str, friend_id: str, time_threshold: int = 10) -> List[List[Tuple[str, str, int]]]:
    """Split one raw chat export into time-contiguous conversation groups.

    Blank lines, lines containing "儲存日期" and lines starting with a
    ``YYYY/MM/DD`` date are dropped. A remaining line is kept only if it
    matches ``<上午|下午><H>:<MM> <A|B> <text>`` and its text, after
    ``clean_text``, is at least 3 characters long.

    Args:
        chat_log: Full text of one chat export.
        user_id: Suffix of the role name assigned to speaker "B".
        friend_id: Suffix of the role name assigned to the other speaker.
        time_threshold: A new group starts when two consecutive kept messages
            are more than this many minutes apart.

    Returns:
        A list of groups, each a list of
        ``(role, cleaned_message, minutes_since_midnight)`` tuples.
    """
    date_line = re.compile(r'\d{4}/\d{2}/\d{2}')
    message_line = re.compile(r'(上午|下午)(\d{1,2}):(\d{2})\s*([A-B])\s+(.+)')
    lines = [
        line.strip()
        for line in chat_log.split('\n')
        if line.strip() and "儲存日期" not in line and not date_line.match(line)
    ]
    conversation_groups = []
    current_group = []
    prev_time = None

    logger.info(f"開始解析聊天記錄 (User: {user_id}, Friend: {friend_id})...")
    for line in tqdm(lines, desc="解析聊天行"):
        match = message_line.match(line)
        if not match:
            continue
        period, hour, minute, speaker, message = match.groups()
        cleaned_message = clean_text(message)
        if not cleaned_message or len(cleaned_message) < 3:
            continue
        # 12-hour clock: "12" is hour 0 of its half-day, so 上午12:05 is 00:05
        # and 下午12:30 is 12:30 (not 24:30).
        hour_24 = int(hour) % 12 + (12 if period == "下午" else 0)
        current_time = hour_24 * 60 + int(minute)
        # Compare with None explicitly: midnight is minute 0, which is falsy.
        if prev_time is not None and abs(current_time - prev_time) > time_threshold:
            conversation_groups.append(current_group)
            current_group = []
        role = f"User{user_id}" if speaker == "B" else f"User{friend_id}"
        current_group.append((role, cleaned_message, current_time))
        prev_time = current_time

    if current_group:
        conversation_groups.append(current_group)
    logger.info(f"解析完成，生成 {len(conversation_groups)} 組對話")
    return conversation_groups

# 添加情緒標籤（優化）
def add_emotion_labels(conversation_groups: List[List[Tuple[str, str, int]]], embeddings: torch.Tensor) -> List[List[Tuple[str, str, int, str]]]:
    """Append an emotion label to every message tuple.

    The label is decided in four passes, each able to override the previous:
    sentiment classifier output, emoji tokens, context (only when classifier
    confidence is below 0.6), and keyword rules.

    Args:
        conversation_groups: Groups of ``(speaker, message, time)`` tuples.
        embeddings: One row per message, in the flattened order of
            ``conversation_groups``.

    Returns:
        The same groups with ``(speaker, message, time, emotion)`` tuples.
    """
    sentiment_to_emotion = {'Positive': '開心', 'Negative': '難過', 'Neutral': '中性'}
    # Kept separate from the sentiment labels so that a message containing
    # the literal text "Positive"/"Negative" is not treated as an emoji hit.
    # NOTE(review): some keys (':sob:', ':laughing:', ':scream:') look like
    # alias-style names; confirm they match what emoji.demojize() emits.
    emoji_to_emotion = {
        ':smiling_face_with_heart-eyes:': '喜愛', ':pouting_face:': '難過',
        ':laughing:': '開心', ':sob:': '難過', ':angry_face:': '憤怒',
        ':face_with_tears_of_joy:': '開心', ':pleading_face:': '喜愛',
        ':scream:': '興奮', ':thinking_face:': '焦慮', ':weary_face:': '無奈',
        ':sleeping_face:': '懶散'
    }

    all_messages = [emoji.demojize(msg) for group in conversation_groups for _, msg, _ in group]
    simplified_messages = [converter_to_simplified.convert(msg) for msg in all_messages]
    batch_size = get_dynamic_batch_size(all_messages)
    logger.info("開始全批量情感分類...")
    num_batches = (len(all_messages) + batch_size - 1) // batch_size
    results = []
    for i in tqdm(range(0, len(all_messages), batch_size), total=num_batches, desc="情感分類批次處理"):
        batch = simplified_messages[i:i + batch_size]
        batch_results = sentiment_classifier(batch, batch_size=len(batch), truncation=True, max_length=128)
        results.extend(batch_results)
        torch.cuda.empty_cache()
        gc.collect()

    labeled_groups = [[] for _ in conversation_groups]
    msg_idx = 0
    for group_idx, group in enumerate(tqdm(conversation_groups, desc="組裝情感標籤")):
        for speaker, message, time in group:
            sentiment = results[msg_idx]['label']
            confidence = results[msg_idx]['score']
            emotion = sentiment_to_emotion.get(sentiment, '中性')
            # Computed up front: the context rules below read it, and
            # assigning it later made them see the previous message's text.
            message_lower = message.lower()
            # Emoji names only exist in the demojized text; the raw message
            # still holds the emoji characters, so it can never match a key.
            demojized = all_messages[msg_idx]
            for emoji_key, emo in emoji_to_emotion.items():
                if emoji_key in demojized:
                    emotion = emo
                    break
            # Context rules for low-confidence predictions.
            if confidence < 0.6 and msg_idx > 0:
                similarity = util.cos_sim(embeddings[msg_idx], embeddings[msg_idx-1]).item()
                if similarity > 0.75 and labeled_groups[group_idx]:
                    prev_emotion = labeled_groups[group_idx][-1][3]
                    if prev_emotion in ['開心', '興奮', '喜愛'] and '愛' in message_lower:
                        emotion = '喜愛'
                    elif prev_emotion in ['難過', '焦慮', '無奈'] and '不' in message_lower:
                        emotion = '難過'
                    else:
                        emotion = prev_emotion
                elif '嗎' in message or '什麼' in message or '怎麼' in message:
                    emotion = '焦慮'
            # Keyword rules have the final say; the first matching branch wins.
            if '笑死' in message_lower or '好笑' in message_lower or '哈哈' in message_lower:
                emotion = '開心' if '超' not in message_lower else '興奮'
            elif '難過' in message_lower or '尷尬' in message_lower or '怕' in message_lower:
                emotion = '難過'
            elif '很高' in message_lower and '顏值' in message_lower:
                emotion = '喜愛'
            elif '你娘' in message_lower or '靠北' in message_lower or '耖你媽' in message_lower:
                emotion = '難過' if '笑' not in message_lower else '開心'
            elif '怎麼辦' in message_lower or '到底' in message_lower or '超趕' in message_lower:
                emotion = '焦慮'
            elif '我要睡了' in message_lower or '又沒差' in message_lower or '我沒' in message_lower:
                emotion = '懶散'
            elif '去宜蘭玩' in message_lower or '超好笑' in message_lower:
                emotion = '興奮' if '超' in message_lower else '開心'
            # NOTE(review): '我沒準備' can never reach this branch because
            # '我沒' is matched above; confirm the intended priority.
            elif '我沒準備' in message_lower or '我他媽' in message_lower:
                emotion = '焦慮' if '怎麼辦' in message_lower else '無奈'
            # Parentheses only make the existing precedence explicit.
            # NOTE(review): confirm '不' was not meant to veto '希望' as well.
            elif '希望' in message_lower or ('覺得' in message_lower and '不' not in message_lower):
                emotion = '開心'
            elif '愛' in message_lower or '喜歡' in message_lower:
                emotion = '喜愛'
            elif len(message) < 5 and '我' not in message_lower:
                emotion = '中性'
            labeled_groups[group_idx].append((speaker, message, time, emotion))
            msg_idx += 1

    torch.cuda.empty_cache()
    logger.info("情感標籤添加完成")
    return labeled_groups

# 添加話題標籤（優化）
def add_topic_labels(conversation_groups: List[List[Tuple[str, str, int, str]]], embeddings: torch.Tensor) -> List[List[Tuple[str, str, int, str, str]]]:
    """Append a topic string to every message tuple.

    Topics come from the zero-shot classifier (top 3 labels scoring above
    0.3) plus keyword rules; they are de-duplicated, sorted and joined with
    "+", or "日常" when there are none. If the message is similar to the
    previous message of the same group and either has no topics of its own
    or shares that message's primary topic, the primary topic alone is used.

    Args:
        conversation_groups: Groups of ``(speaker, message, time, emotion)``.
        embeddings: One row per message, in the flattened order of
            ``conversation_groups``.

    Returns:
        The same groups with ``(speaker, message, time, emotion, topic)``.
    """
    # (keywords, topic): the topic is added when any keyword occurs.
    keyword_topics = (
        (('女朋友', '喜歡', '在一起', '害羞'), '感情八卦'),
        (('群組', '男生', '女生', '室友', '合拍'), '人際'),
        (('笑死', '好笑', '照片'), '娛樂'),
        (('遊覽車', '出去玩', '計畫', '溜冰'), '社團活動'),
        (('填', '名額', '家教', '討論完'), '課業壓力'),
        (('睡了', '洗澡', '行李', '出門'), '生活瑣事'),
        (('宿舍', '床', '吵'), '宿舍生活'),
        (('考試', '成績', '期末'), '考試'),
        (('宜蘭', '台中', '旅行'), '旅行'),
        (('吃', '好吃', '餐廳'), '美食'),
        (('手機', '電腦', 'app', '帳號'), '科技'),
    )

    all_messages = [msg for group in conversation_groups for _, msg, _, _ in group]
    simplified_messages = [converter_to_simplified.convert(msg) for msg in all_messages]
    batch_size = get_dynamic_batch_size(all_messages)
    logger.info("開始全批量話題分類...")
    num_batches = (len(all_messages) + batch_size - 1) // batch_size
    results = []
    for i in tqdm(range(0, len(all_messages), batch_size), total=num_batches, desc="話題分類批次處理"):
        batch = simplified_messages[i:i + batch_size]
        batch_results = zero_shot_classifier(batch, topic_labels, multi_label=True, batch_size=len(batch))
        # Guard against a bare dict being returned for a one-item batch
        # (believed to happen on some transformers releases).
        if isinstance(batch_results, dict):
            batch_results = [batch_results]
        results.extend(batch_results)
        torch.cuda.empty_cache()
        gc.collect()

    labeled_groups = [[] for _ in conversation_groups]
    msg_idx = 0
    for group_idx, group in enumerate(tqdm(conversation_groups, desc="組裝話題標籤")):
        for speaker, message, time, emotion in group:
            result = results[msg_idx]
            ranked = sorted(zip(result['labels'], result['scores']), key=lambda x: x[1], reverse=True)[:3]
            predicted_topics = [label for label, score in ranked if score > 0.3]
            message_lower = message.lower()
            for keywords, topic in keyword_topics:
                if any(kw in message_lower for kw in keywords):
                    predicted_topics.append(topic)

            topic_str = "+".join(sorted(set(predicted_topics))) if predicted_topics else "日常"
            # Context consistency: stay on the previous primary topic when
            # the message is similar and does not contradict it.
            if msg_idx > 0 and labeled_groups[group_idx]:
                similarity = util.cos_sim(embeddings[msg_idx], embeddings[msg_idx-1]).item()
                if similarity > 0.75:
                    prev_topic = labeled_groups[group_idx][-1][4].split("+")[0]
                    if prev_topic in predicted_topics or not predicted_topics:
                        topic_str = prev_topic
            labeled_groups[group_idx].append((speaker, message, time, emotion, topic_str))
            msg_idx += 1

    torch.cuda.empty_cache()
    logger.info("話題標籤添加完成")
    return labeled_groups

# 優化格式化訓練數據
def format_training_data(conversation_groups: List[List[Tuple[str, str, int, str, str]]], embeddings: torch.Tensor, max_turns: int = 5, similarity_threshold: float = 0.85) -> List[dict]:
    """Turn labelled conversation groups into prompt/response samples.

    For each pair of consecutive messages in a group, a sample is emitted
    when their cosine similarity is at least ``similarity_threshold`` and the
    reply has at least 3 characters. The prompt is the up-to-``max_turns``
    messages ending at the earlier one; the response is the later one.
    Emotion, dialogue act and topic are stored under ``metadata``.

    Args:
        conversation_groups: Groups of
            ``(speaker, message, time, emotion, topic)`` tuples.
        embeddings: One row per message; must be in the flattened order of
            ``conversation_groups``.
        max_turns: Maximum number of context messages in a prompt.
        similarity_threshold: Minimum similarity between a message and its
            reply for the pair to be kept.

    Returns:
        A list of ``{"prompt": str, "response": str, "metadata": dict}``.
    """
    training_data = []
    msg_idx = 0

    logger.info("開始格式化訓練資料...")
    all_messages = [item[1] for group in conversation_groups for item in group]
    logger.info("預先批量計算所有對話行為...")
    all_dialogue_acts = infer_dialogue_act(all_messages, embeddings=embeddings)
    act_idx = 0

    for group in tqdm(conversation_groups, desc="格式化對話組"):
        messages = [item[1] for item in group]
        dialogue_acts = all_dialogue_acts[act_idx:act_idx + len(messages)]
        act_idx += len(messages)

        if len(messages) > 1:
            curr_embeddings = embeddings[msg_idx:msg_idx + len(messages) - 1]
            next_embeddings = embeddings[msg_idx + 1:msg_idx + len(messages)]
            similarities = util.cos_sim(curr_embeddings, next_embeddings).diagonal().cpu().numpy()
        else:
            similarities = np.array([])

        for i in range(len(group) - 1):
            start = max(0, i - max_turns + 1)
            context = group[start:i + 1]
            # Slice the acts with the same window as the context; indexing
            # from 0 would pair messages with the acts of the group's first
            # turns once the window has slid past them.
            context_acts = dialogue_acts[start:i + 1]
            next_msg = group[i + 1]
            similarity = similarities[i] if i < len(similarities) else 0.0

            if similarity >= similarity_threshold and len(next_msg[1]) >= 3:
                prompt_text = "\n".join(f"{s}: {m}" for s, m, _, _, _ in context)
                response_text = next_msg[1]
                metadata = {
                    "context": [{"speaker": s, "message": m, "emotion": e, "act": act, "topic": t}
                                for (s, m, _, e, t), act in zip(context, context_acts)],
                    "response": {"speaker": next_msg[0], "message": next_msg[1], "emotion": next_msg[3],
                                 "act": dialogue_acts[i + 1], "topic": next_msg[4]}
                }
                training_data.append({"prompt": prompt_text, "response": response_text, "metadata": metadata})
        msg_idx += len(group)

    logger.info(f"格式化完成，生成 {len(training_data)} 筆訓練資料")
    torch.cuda.empty_cache()
    return training_data

# Mix multi-party group chats
def mix_conversations(chat_logs: List[str], friend_ids: List[str], time_threshold: int = 10, max_turns: int = 5, similarity_threshold: float = 0.85) -> List[dict]:
    """Parse several chat logs, merge them by time, and build training data.

    Each log is parsed into groups, all messages are embedded and labelled
    with emotion and topic, and the labelled messages are then sorted by
    their time value and regrouped: a new group starts whenever the gap to
    the previous message exceeds ``time_threshold`` or the primary topic
    (the part of the topic label before the first ``+``) changes.

    Args:
        chat_logs: Raw text of each chat log.
        friend_ids: Friend identifier for each log, paired positionally with
            ``chat_logs``.
        time_threshold: Largest allowed gap between consecutive messages of
            one group, in the same unit as the parsed message time values.
        max_turns: Forwarded to ``format_training_data``.
        similarity_threshold: Forwarded to ``format_training_data``.

    Returns:
        The list of training samples produced by ``format_training_data``.
    """
    all_groups = []
    for user_id, (chat_log, friend_id) in enumerate(tqdm(zip(chat_logs, friend_ids), desc="解析聊天檔案", total=len(chat_logs))):
        all_groups.extend(parse_chat_log(chat_log, str(user_id), friend_id, time_threshold))

    # Parsed items are unpacked as 3-element records here; the labelling
    # steps below are expected to extend them to the 5-element form.
    all_messages = [msg for group in all_groups for _, msg, _ in group]
    embeddings = precompute_embeddings(all_messages, num_gpus=2, chunk_size=5000)

    logger.info("開始添加情緒標籤...")
    all_groups = add_emotion_labels(all_groups, embeddings)
    logger.info("開始添加話題標籤...")
    all_groups = add_topic_labels(all_groups, embeddings)

    logger.info("開始混合對話...")
    all_conversations = [item for group in all_groups for item in group]

    # Sort by time through an explicit permutation so that the embeddings can
    # be reordered identically. format_training_data locates embeddings by
    # flattened position, so sorting only the messages would pair messages
    # with the wrong embedding rows.
    # NOTE(review): assumes the labelling helpers keep message order and count
    # unchanged, i.e. row k of ``embeddings`` still belongs to message k.
    order = sorted(range(len(all_conversations)), key=lambda k: all_conversations[k][2])
    if order != list(range(len(order))):
        all_conversations = [all_conversations[k] for k in order]
        embeddings = embeddings[order]

    mixed_groups = []
    current_group = []
    prev_time = None
    prev_topic = None
    for item in tqdm(all_conversations, desc="混合對話"):
        speaker, message, timestamp, emotion, topic = item
        main_topic = topic.split("+")[0]
        # Compare against None explicitly: a previous time value of 0 is a
        # valid timestamp and must not disable the split check.
        if prev_time is not None and (abs(timestamp - prev_time) > time_threshold or main_topic != prev_topic):
            if current_group:
                mixed_groups.append(current_group)
            current_group = []
        current_group.append((speaker, message, timestamp, emotion, topic))
        prev_time = timestamp
        prev_topic = main_topic
    if current_group:
        mixed_groups.append(current_group)

    logger.info(f"混合完成，生成 {len(mixed_groups)} 組對話")
    return format_training_data(mixed_groups, embeddings, max_turns, similarity_threshold)

# Main entry point
def process_chat_to_training_data(chat_files: List[str], friend_ids: List[str], output_file: str, time_threshold: int = 10, max_turns: int = 5, similarity_threshold: float = 0.85):
    """Read chat files, build training samples, and save them as JSON.

    Args:
        chat_files: Paths of the UTF-8 chat log files to read.
        friend_ids: Friend identifier for each chat file, passed through to
            ``mix_conversations`` together with the file contents.
        output_file: Path of the JSON file to write. Its parent directory is
            created when it does not exist yet.
        time_threshold: Forwarded to ``mix_conversations``.
        max_turns: Forwarded to ``mix_conversations``.
        similarity_threshold: Forwarded to ``mix_conversations``.

    Returns:
        None. When any input file is missing, an error is logged and the
        function returns early without writing anything.
    """
    chat_logs = []
    for chat_file in tqdm(chat_files, desc="讀取聊天檔案"):
        try:
            with open(chat_file, 'r', encoding='utf-8') as f:
                chat_logs.append(f.read())
        except FileNotFoundError:
            logger.error(f"找不到檔案: {chat_file}")
            return
        else:
            logger.info(f"成功讀取檔案: {chat_file}")

    training_data = mix_conversations(chat_logs, friend_ids, time_threshold, max_turns, similarity_threshold)

    # Make sure the destination directory exists so the results of the
    # expensive processing above are not lost to a missing folder.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(training_data, f, ensure_ascii=False, indent=2)

    logger.info(f"訓練資料已儲存至 {output_file}，總共 {len(training_data)} 筆資料")

# Manual test run: process one sample chat file, then preview the output.
if __name__ == "__main__":
    chat_files = ["data/claire.txt"]
    friend_ids = ["2"]
    output_file = "output/out.json"

    process_chat_to_training_data(chat_files, friend_ids, output_file)

    # Reload the written file and log its first three samples as a sanity
    # check; a missing file means the processing step did not produce output.
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        logger.error("無法讀取輸出檔案，請檢查處理過程是否成功。")
    else:
        logger.info("前 3 筆訓練資料範例：")
        for number, sample in enumerate(data[:3], start=1):
            metadata_json = json.dumps(sample['metadata'], ensure_ascii=False)
            logger.info(f"樣本 {number}:")
            logger.info(f"Prompt: {sample['prompt']}")
            logger.info(f"Response: {sample['response']}")
            logger.info(f"Metadata: {metadata_json}")

2025-03-28 16:24:10,861 - 正在初始化情感分類器（Erlangshen-Roberta-110M-Sentiment）...
Device set to use cuda:0
2025-03-28 16:24:11,988 - 正在初始化零樣本分類器...
Device set to use cuda:0
讀取聊天檔案:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-28 16:24:13,838 - 成功讀取檔案: data/claire.txt
讀取聊天檔案: 100%|██████████| 1/1 [00:00<00:00, 170.59it/s]
解析聊天檔案:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-28 16:24:13,873 - 開始解析聊天記錄 (User: 0, Friend: 2)...

解析聊天行:   0%|          | 0/26848 [00:00<?, ?it/s][A
解析聊天行: 100%|██████████| 26848/26848 [00:00<00:00, 211445.88it/s][A
2025-03-28 16:24:14,006 - 解析完成，生成 1253 組對話
解析聊天檔案: 100%|██████████| 1/1 [00:00<00:00,  6.05it/s]
2025-03-28 16:24:14,011 - 開始預計算 19809 條訊息的嵌入向量...
2025-03-28 16:24:14,193 - Use pytorch device_name: cuda
2025-03-28 16:24:14,194 - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large
計算嵌入分片:   0%|          | 0/4 [00:00<?, ?it/s]2025-03-28 16:24:17,592 - 處理分片 1/4 on GPU 0
計算嵌入分片:  25%|██▌       | 1/4 [00:04<00:12,  4.16s/it]2025-03-28 16:24:21,