**前置設定**

In [1]:
!echo "Mounting Google Drive..."
%cd /

# 此處為了訓練方便，掛載了Google Drive，需要在我的雲端硬碟建立'Fine_Tune'並在裡面放入訓練資料
from google.colab import drive
drive.mount('/content/drive')
# 包含訓練用資料的位置
%cd /content/drive/MyDrive/Fine_Tune

Mounting Google Drive...
/
Mounted at /content/drive
/content/drive/MyDrive/Fine_Tune


In [2]:
import torch
# 模型相關設定
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PRE_TRAINED_MODEL_NAME = 'microsoft/unixcoder-base'
FINE_TUNED_MODEL_PATH = './' + PRE_TRAINED_MODEL_NAME.replace("/", "-")



**文字預處理**

In [6]:
import pandas as pd
import re
from collections import Counter

from nltk.stem import WordNetLemmatizer

# 此為可選項，可以選擇開啟或不開啟
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """對 token 列表進行詞形還原"""
    return [lemmatizer.lemmatize(token) for token in tokens]

def load_code_snippets(file_path):
    """從 CSV 檔案載入程式碼片段"""
    return pd.read_csv(file_path)

def tokenize(text, n_gram_range=(1, 1), keep_both=True):
    """
    將程式碼文本進行斷詞，並可選擇性地生成 n-grams
    - n_gram_range: (min_n, max_n)，例如 (1,2) 會同時生成unigram與bigram
    - keep_both: 是否同時保留"空格形式"與"原樣形式"
    """
    # 保留運算符作為獨立的 token
    tokens = re.findall(r'\w+|==|!=|<=|>=|[\+\-\*/=<>!&|%\^~]', text)
    tokens = [token.lower() for token in tokens]

    # 如果只要 unigram
    if n_gram_range == (1, 1):
        return tokens

    ngrams = []
    for n in range(n_gram_range[0], n_gram_range[1] + 1):
        for i in range(len(tokens) - n + 1):
            slice_tokens = tokens[i:i+n]
            if keep_both:
                # 空格形式（以空格進行連接，但可能造成詞語的改變）
                ngrams.append(" ".join(slice_tokens))
                # 原樣形式直接連起來）
                ngrams.append("".join(slice_tokens))
            else:
                ngrams.append(" ".join(slice_tokens))
    return ngrams

def preprocess(df, n_gram_range=(1, 1)):
    """對程式碼片段進行預處理"""
    df['tokens'] = df['code'].apply(lambda x: tokenize(x, n_gram_range=n_gram_range))
    return df

def create_semantic_mapping():
    """建立從自然語言到程式碼運算符的語意映射"""

    # 考慮到此處的程式碼為python，運算符號本身存在意義，需要另外保留運算符（+-*/等）作為獨立的Token，並為自然語言的查詢建立對應的映射，使其能與運算符匹配
    return {
        "add": "+", "sum": "+", "plus": "+", "addition": "+",
        "concatenate": "+", "join": "+",
        "assign": "=", "set": "=",
        "subtract": "-", "minus": "-", "subtraction": "-",
        "multiply": "*", "times": "*", "multiplication": "*",
        "divide": "/", "division": "/",
        "equals": "==", "is": "==",
        "less": "<", "smaller": "<",
        "greater": ">", "larger": ">",
    }

def build_statistics(processed_df):
    """建立詞彙庫和其他統計數據"""
    tokenized_docs = processed_df['tokens'].tolist()

    # 文件頻率，計算每個Token出現在多少篇文件中
    doc_freq = Counter()
    for doc in tokenized_docs:
        # 遍歷每個doc
        doc_freq.update(set(doc))

    # 詞彙庫
    vocab = list(doc_freq.keys())

    # token總數
    total_tokens = sum(len(doc) for doc in tokenized_docs)

    # 文件總數
    num_docs = len(tokenized_docs)

    # 平均文件長度
    avg_doc_len = total_tokens / num_docs if num_docs > 0 else 0

    return vocab, doc_freq, tokenized_docs, avg_doc_len

if __name__ == '__main__':
    # 載入資料
    code_snippets_df = load_code_snippets('code_snippets.csv')

    # 預處理資料
    processed_df = preprocess(code_snippets_df)

    # 建立統計數據
    vocab, doc_freq, tokenized_docs, avg_doc_len = build_statistics(processed_df)

    # 建立語意映射
    semantic_mapping = create_semantic_mapping()

    # 印出一些統計數據
    print(f"詞彙庫大小: {len(vocab)}")
    print(f"文件總數: {len(tokenized_docs)}")
    print(f"平均文件長度: {avg_doc_len:.2f}")
    print("\n前 10 個最常見的 token:")
    print(doc_freq.most_common(10))
    print("\n語意映射:")
    print(semantic_mapping)

詞彙庫大小: 4946
文件總數: 500
平均文件長度: 57.23

前 10 個最常見的 token:
[('def', 500), ('=', 429), ('return', 362), ('if', 302), ('self', 297), ('in', 191), ('none', 155), ('for', 153), ('not', 152), ('0', 130)]

語意映射:
{'add': '+', 'sum': '+', 'plus': '+', 'addition': '+', 'concatenate': '+', 'join': '+', 'assign': '=', 'set': '=', 'subtract': '-', 'minus': '-', 'subtraction': '-', 'multiply': '*', 'times': '*', 'multiplication': '*', 'divide': '/', 'division': '/', 'equals': '==', 'is': '==', 'less': '<', 'smaller': '<', 'greater': '>', 'larger': '>'}


**sparse_retrieval**

In [9]:
import numpy as np
from collections import Counter

class TFIDFRetriever:
    """TF-IDF檢索器"""
    def __init__(self, documents):
        """初始化TF-IDF檢索器"""
        self.documents = documents
        # 建立詞彙庫、文件頻率等統計數據
        self.vocab, self.doc_freq, self.tokenized_docs, self.avg_doc_len = build_statistics(documents)
        self.num_docs = len(self.tokenized_docs)

        # 把詞彙表（vocab）裡的每個單詞對應到一個整數索引，之後可以用來查詢詞彙對應的索引
        self.vocab_map = {word: i for i, word in enumerate(self.vocab)}

        # 計算IDF
        self.idf = self._calculate_idf()
        # 建立TF-IDF向量
        self.doc_vectors = self._create_doc_vectors()

    def _calculate_idf(self):
        """用公式計算IDF分數"""
        idf = np.zeros(len(self.vocab))
        for i, word in enumerate(self.vocab):
            idf[i] = np.log(self.num_docs / (self.doc_freq[word] + 1)) # +1避免分母為0
        return idf

    def _create_doc_vectors(self):
        """建立每篇文件的TF-IDF向量"""

        # 建立一個全零矩陣，大小是(文件數量 × 詞彙表大小)
        doc_vectors = np.zeros((self.num_docs, len(self.vocab)))
        for i, doc in enumerate(self.tokenized_docs):
            # doc代表每個文件斷詞後的詞彙列表，利用Counter計算每個詞彙在該文件中出現的次數
            tf = Counter(doc)
            for word, count in tf.items():
                if word in self.vocab_map:
                    # 使用次線性詞頻縮放 (sublinear tf scaling): 1 + log(tf)
                    doc_vectors[i, self.vocab_map[word]] = 1 + np.log(count)
        return doc_vectors * self.idf # 矩陣或向量中對應位置的元素進行乘法運算

    def retrieve(self, query, k=10, query_expansion=False):
        """進行query，返回文件在資料庫中的索引"""
        query_tokens = tokenize(query)
        if query_expansion:
            # 擴展功能，可選擇是否對token進行詞型還原
            lemmatized_tokens = lemmatize_tokens(query_tokens)
            query_tokens.extend(lemmatized_tokens)

        # 計算query的TF
        query_vector = np.zeros(len(self.vocab))
        tf = Counter(query_tokens)
        for word, count in tf.items():
            if word in self.vocab_map:
                query_vector[self.vocab_map[word]] = 1 + np.log(count)
        # 計算query的TF-IDF
        query_vector = query_vector * self.idf

        # 計算查詢向量的範數(norm)
        query_norm = np.linalg.norm(query_vector)

        # 如果查詢向量的範數為0 (代表查詢詞皆不存在於語料庫中)，則所有分數為0
        if query_norm == 0:
            scores = np.zeros(self.num_docs)
        else:
            # 計算餘弦相似度（cosine公式）
            doc_norms = np.linalg.norm(self.doc_vectors, axis=1)
            scores = np.dot(self.doc_vectors, query_vector) / (doc_norms * query_norm)
            # 將分母為0可能導致的 nan 值替換為 0，確保數值穩定性
            scores = np.nan_to_num(scores)

        # 取得前k個結果
        top_k_indices = np.argsort(scores)[::-1][:k]
        top_k_scores = scores[top_k_indices]

        """
        TF衡量詞在文件內的重要性
        IDF衡量詞在整個語料庫中的稀有程度（越稀有越重要）
        TF×IDF = 詞在文件中的加權重要性 → 形成TF-IDF向量，高分的詞通常是該文件獨有且重要的詞，在計算相似度時的貢獻也更大
        """
        return top_k_indices, top_k_scores


class BM25Retriever:
    """BM25 檢索器"""
    def __init__(self, documents, k1=1.5, b=0.75):
        """初始化 BM25 檢索器"""
        self.documents = documents

        # k1較大時，高詞頻的詞對分數貢獻增加，飽和程度降低 => 讓高頻詞影響更大
        # k1較小時，高詞頻對分數的邊際影響降低，快速飽和 => 避免長文本裡同一個詞過度加分
        self.k1 = k1

        # 當 b=1 → 完全使用長度正規化，長文件的詞頻被縮小
        # 當 b=0 → 不考慮文件長度，所有文件同樣計算
        self.b = b

        # 建立詞彙庫、文件頻率等統計數據
        self.vocab, self.doc_freq, self.tokenized_docs, self.avg_doc_len = build_statistics(documents)
        self.num_docs = len(self.tokenized_docs)
        self.doc_len = [len(doc) for doc in self.tokenized_docs]
        self.vocab_map = {word: i for i, word in enumerate(self.vocab)}

        # 計算 IDF
        self.idf = self._calculate_idf()

    def _calculate_idf(self):
        """計算BM25的IDF分數"""
        idf = np.zeros(len(self.vocab))
        for i, word in enumerate(self.vocab):
            idf[i] = np.log(((self.num_docs - self.doc_freq[word] + 0.5) / (self.doc_freq[word] + 0.5)) + 1)
        return idf

    def retrieve(self, query, k=10, query_expansion=False):
        """根據查詢檢索文件"""
        query_tokens = tokenize(query)
        if query_expansion:
            lemmatized_tokens = lemmatize_tokens(query_tokens)
            query_tokens.extend(lemmatized_tokens)
        scores = np.zeros(self.num_docs)

        for i in range(self.num_docs):
            tf = Counter(self.tokenized_docs[i])
            score = 0
            for word in query_tokens:
                if word in self.vocab_map:
                    tf_word = tf[word]
                    idf_word = self.idf[self.vocab_map[word]]
                    # 計算BM25分數
                    score += idf_word * (tf_word * (self.k1 + 1)) / (tf_word + self.k1 * (1 - self.b + self.b * self.doc_len[i] / self.avg_doc_len))
            scores[i] = score

        # 取得前k個結果
        top_k_indices = np.argsort(scores)[::-1][:k]
        top_k_scores = scores[top_k_indices]
        return top_k_indices, top_k_scores

if __name__ == '__main__':
    # 載入並預處理資料
    code_snippets_df = load_code_snippets('code_snippets.csv')
    processed_df = preprocess(code_snippets_df)

    # 初始化檢索器
    tfidf_retriever = TFIDFRetriever(processed_df)
    bm25_retriever = BM25Retriever(processed_df)

    # 範例查詢
    query = "add two numbers"

    # TF-IDF
    tfidf_top_k = tfidf_retriever.retrieve(query)
    print(f"查詢: '{query}'")
    print(f"TF-IDF 前 10 個檢索到的文件 ID: {tfidf_top_k}")

    # BM25
    bm25_top_k = bm25_retriever.retrieve(query)
    print(f"BM25 前 10 個檢索到的文件 ID: {bm25_top_k}")

查詢: 'add two numbers'
TF-IDF 前 10 個檢索到的文件 ID: (array([214, 452, 288, 271, 126, 158, 159, 160, 161, 186]), array([0.23646849, 0.16435265, 0.13304508, 0.11034009, 0.08994983,
       0.        , 0.        , 0.        , 0.        , 0.        ]))
BM25 前 10 個檢索到的文件 ID: (array([214, 452, 288, 271, 126, 158, 159, 160, 161, 186]), array([6.30910208, 5.31547725, 4.64946264, 3.44801661, 3.10240623,
       0.        , 0.        , 0.        , 0.        , 0.        ]))


**評估sparse_retrieval本地分數**

In [12]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
import pandas as pd
from tqdm import tqdm

# 由於提供的檔案中沒有用於驗證稀疏檢所器的data，此處利用用於微調密集檢索器的資料train_queries.csv建立一個新的語料庫以及對應的問題-答案集，用於評估當前檢索器的效能


def evaluate(retriever, df, query_expansion=False):
    """
    在完整的資料集上評估檢索器的效能
    資料集同時作為語料庫和查詢集
    """
    recall_at_10 = 0
    # 使用tqdm顯示進度條
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Evaluating {retriever.__class__.__name__}"):
        query = row['query']
        true_code_id = row['code_id']

        top_k_indices, _ = retriever.retrieve(query, k=10, query_expansion=query_expansion)
        top_k_code_ids = retriever.documents.iloc[top_k_indices]['code_id'].tolist()

        if true_code_id in top_k_code_ids:
            recall_at_10 += 1

    return recall_at_10 / len(df)

if __name__ == '__main__':
    # 載入train_queries.csv
    print("Loading train_queries.csv for self-evaluation...")
    queries_df = pd.read_csv('train_queries.csv')

    # 建立一個唯一的code_id作為真實答案
    queries_df['code_id'] = range(len(queries_df))

    # --- 預處理語料庫 ---
    # 在此情境下，完整的queries_df就是我們的語料庫
    print("\nPreprocessing corpus...")
    processed_corpus = preprocess(queries_df.copy(), n_gram_range=(1, 1))

    # --- 實驗一: 基本Unigram(無查詢擴充) ---
    print("\n--- Evaluating with Unigrams (No Query Expansion) ---")
    # 初始化檢索器
    tfidf_retriever = TFIDFRetriever(processed_corpus)
    bm25_retriever = BM25Retriever(processed_corpus)

    # 在完整的資料集上進行評估
    tfidf_recall = evaluate(tfidf_retriever, queries_df, query_expansion=False)
    bm25_recall = evaluate(bm25_retriever, queries_df, query_expansion=False)

    print(f"TF-IDF Recall@10: {tfidf_recall:.4f}")
    print(f"BM25 Recall@10: {bm25_recall:.4f}")

    # --- 實驗二:Unigram+查詢擴充 ---
    print("\n--- Evaluating with Unigrams (With Query Expansion) ---")
    # 檢索器已建立，直接調用評估函式並開啟查詢擴充
    tfidf_recall_qe = evaluate(tfidf_retriever, queries_df, query_expansion=True)
    bm25_recall_qe = evaluate(bm25_retriever, queries_df, query_expansion=True)

    print(f"TF-IDF with Query Expansion Recall@10: {tfidf_recall_qe:.4f}")
    print(f"BM25 with Query Expansion Recall@10: {bm25_recall_qe:.4f}")

Loading train_queries.csv for self-evaluation...

Preprocessing corpus...

--- Evaluating with Unigrams (No Query Expansion) ---


Evaluating TFIDFRetriever: 100%|██████████| 500/500 [00:04<00:00, 101.50it/s]
Evaluating BM25Retriever: 100%|██████████| 500/500 [00:08<00:00, 61.60it/s]


TF-IDF Recall@10: 0.7660
BM25 Recall@10: 0.6680

--- Evaluating with Unigrams (With Query Expansion) ---


Evaluating TFIDFRetriever: 100%|██████████| 500/500 [00:07<00:00, 65.88it/s] 
Evaluating BM25Retriever: 100%|██████████| 500/500 [00:19<00:00, 26.31it/s]

TF-IDF with Query Expansion Recall@10: 0.7860
BM25 with Query Expansion Recall@10: 0.6720





**生成篩選困難負樣本的資料**

In [16]:
import pandas as pd
import json
from tqdm import tqdm


def prepare_hard_negatives(top_k=50):
    """
    為訓練查詢生成困難負樣本。
    對於train_queries.csv（訓練資料）中的每個查詢，使用TF-IDF在code_snippets.csv中（test_queries的語料庫，也是微調模型時的負樣本）
    檢索top-k的相似程式碼，並將其作為困難負樣本儲存。
    """
    print("--- 開始生成困難負樣本 ---")

    # 1. 載入所有需要的資料
    print("步驟 1/4: 載入資料...")
    train_queries_df = pd.read_csv('train_queries.csv')
    code_snippets_df = load_code_snippets('code_snippets.csv')

    # 2. 預處理程式碼片段並初始化TF-IDF檢索器
    print("步驟 2/4: 初始化 TF-IDF 檢索器...")
    processed_snippets_df = preprocess(code_snippets_df)
    # 使用TF-IDF檢索器
    tfidf_retriever = TFIDFRetriever(processed_snippets_df)
    print("檢索器初始化完成。")

    # 3. 為每個訓練查詢尋找困難負樣本
    print("步驟 3/4: 挖掘困難負樣本...")
    training_data_with_negatives = []
    for _, row in tqdm(train_queries_df.iterrows(), total=len(train_queries_df), desc="處理查詢"):
        query = row['query']
        positive_code = row['code']

        # 使用 TF-IDF 檢索 Top-K個候選
        # 由於 train_queries.csv 中的 code 不存在於 code_snippets.csv 中，不需要擔心檢索到正樣本
        top_indices, _ = tfidf_retriever.retrieve(query, k=top_k, query_expansion=True)

        # 將檢索到的索引 (indices) 轉換為實際的 code_id（code_snippets中有code_id）
        hard_negative_ids = [int(code_snippets_df.iloc[i]['code_id']) for i in top_indices]

        training_data_with_negatives.append({
            'query': query,
            'positive_code': positive_code,
            'hard_negative_ids': hard_negative_ids
        })

    # 4. 儲存結果到JSON檔案
    output_path = 'train_data_with_negatives.json'
    print(f"步驟 4/4: 儲存結果到 {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(training_data_with_negatives, f, indent=4, ensure_ascii=False)

    print(f"--- 成功生成並儲存 {len(training_data_with_negatives)} 筆含困難負樣本的訓練資料 ---")

if __name__ == '__main__':
    prepare_hard_negatives()


--- 開始生成困難負樣本 ---
步驟 1/4: 載入資料...
步驟 2/4: 初始化 TF-IDF 檢索器...
檢索器初始化完成。
步驟 3/4: 挖掘困難負樣本...


處理查詢: 100%|██████████| 500/500 [00:08<00:00, 61.76it/s] 

步驟 4/4: 儲存結果到 train_data_with_negatives.json...
--- 成功生成並儲存 500 筆含困難負樣本的訓練資料 ---





**密集模型微調**

In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import json
import random



# 可選擇'top5_single'或'stratified_multi'，在Kaggle分數最高的前兩個策略
STRATEGY = 'top5_single'

num_layers = 4 # 選擇要用最後幾層的Output平均作為特徵
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# 【每個正樣本要搭配的困難負樣本數量
NUM_NEGATIVES_PER_POSITIVE = 4 # 每個正樣本要搭配的困難負樣本數量

class TripletDataset(Dataset):
    """三元組數據集，用於微調密集檢索模型"""
    def __init__(self, train_data_with_negatives, code_id_to_code_map, strategy = 'top5_single'):
        """初始化數據集，並在此處完成數據增強（分層抽樣）"""
        self.triplets = []
        self.strategy = strategy
        print("\nCreating training triplets with STRATIFIED negatives...")

        # 此處挑選在Kaggle分數最高的前兩個策略
        if self.strategy == 'top5_single':
            print("\nCreating training triplets with Top-5 Single Negative strategy...")
            for item in tqdm(train_data_with_negatives):
                query = item['query']
                positive_code = item['positive_code']
                hard_negatives_pool = item['hard_negative_ids'][:5] # 只取前 5 個
                if hard_negatives_pool:
                    neg_id = random.choice(hard_negatives_pool)
                    negative_code = code_id_to_code_map[neg_id]
                    self.triplets.append([query, positive_code, negative_code])

        elif self.strategy == 'stratified_multi':
            print("\nCreating training triplets with Stratified Multi-Negative strategy...")
            for item in tqdm(train_data_with_negatives):
                query = item['query']
                positive_code = item['positive_code']
                hard_negatives_pool = item['hard_negative_ids'] # 使用全部 Top 50

                if hard_negatives_pool:
                    # 實作分層抽樣
                    # 層 1: Top 1-10
                    # 層 2: Top 11-20
                    # 層 3: Top 21-35
                    # 層 4: Top 36-50
                    strata = [hard_negatives_pool[0:10], hard_negatives_pool[10:20], hard_negatives_pool[20:35], hard_negatives_pool[35:50]]
                    for stratum in strata:
                        if stratum:
                            neg_id = random.choice(stratum)
                            negative_code = code_id_to_code_map[neg_id]
                            self.triplets.append([query, positive_code, negative_code])
        else:
            raise ValueError("Invalid strategy specified. Choose 'top5_single' or 'stratified_multi'.")

    def __len__(self):
        """返回數據集的大小"""
        return len(self.triplets)

    def __getitem__(self, idx):
        """獲取一個數據樣本 (一個三元組)"""
        return self.triplets[idx]

def collate_fn(batch, tokenizer, max_length=512):
    """將 batch 的文字一次性 tokenizer，提高效率"""
    anchors, positives, negatives = zip(*batch)

    anchor_inputs = tokenizer(list(anchors), return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
    positive_inputs = tokenizer(list(positives), return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
    negative_inputs = tokenizer(list(negatives), return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)

    return {
        'anchor': {key: val.to(DEVICE) for key, val in anchor_inputs.items()},
        'positive': {key: val.to(DEVICE) for key, val in positive_inputs.items()},
        'negative': {key: val.to(DEVICE) for key, val in negative_inputs.items()}
    }

def get_embedding(model, tokenizer, text, max_length=512):
    """輔助函式，用於獲取單個文本的嵌入向量"""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length).to(DEVICE)
    with torch.no_grad():
        # 判斷模型是否為 Encoder-Decoder 架構
        if hasattr(model, 'get_encoder'):
            outputs = model.get_encoder()(**inputs, output_hidden_states=True)
        else:
            outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states
        stacked_layers = torch.stack(hidden_states[-num_layers:])
        mean_last_layers = torch.mean(stacked_layers, dim=0)
        embedding = mean_last_layers.mean(dim=1)
    return embedding.cpu()

# 將anchor、positive、negative的token輸入模型，取多層hidden_state做平均
def get_layerwise_embeddings(model, batch_inputs, num_layers=num_layers):
    """
    batch_inputs: batch['anchor'] / batch['positive'] / batch['negative']
    num_layers: 取最後幾層做平均（作為文字的特徵）
    """
    # Check if the model has an encoder (i.e., is an encoder-decoder model)
    if hasattr(model, 'get_encoder'):
        outputs = model.get_encoder()(input_ids=batch_inputs['input_ids'],attention_mask=batch_inputs['attention_mask'], output_hidden_states=True)
    else:
        outputs = model(**batch_inputs, output_hidden_states=True)

    hidden_states = outputs.hidden_states  # tuple of all layers

    # 取最後 num_layers 層平均
    stacked_layers = torch.stack(hidden_states[-num_layers:])  # shape: (num_layers, batch_size, seq_len, hidden_size)
    mean_last_layers = torch.mean(stacked_layers, dim=0)      # shape: (batch_size, seq_len, hidden_size)

    # 對 token 平均 pooling，得到每個樣本的句子向量
    embeddings = mean_last_layers.mean(dim=1)  # shape: (batch_size, hidden_size)
    return embeddings


def evaluate_recall(model, tokenizer, val_df, corpus_df, cached_corpus_embeddings=None):
    model.eval()
    #  先計算全部的語料庫特徵
    if cached_corpus_embeddings is None:
        print("\nCreating cached embeddings for the corpus...")
        all_codes = list(corpus_df['code'])
        corpus_embeddings = []
        batch_size = 32
        for i in tqdm(range(0, len(all_codes), batch_size), desc="Corpus Embeddings"):
            batch_codes = all_codes[i:i+batch_size]
            inputs = tokenizer(batch_codes, return_tensors='pt', truncation=True, padding='max_length', max_length=512).to(DEVICE)
            with torch.no_grad():
                outputs = model(**inputs, output_hidden_states=True)
                hidden_states = outputs.hidden_states
                stacked_layers = torch.stack(hidden_states[-num_layers:])
                mean_last_layers = torch.mean(stacked_layers, dim=0)
                embeddings = mean_last_layers.mean(dim=1)
            corpus_embeddings.append(embeddings.cpu())
        corpus_embeddings = torch.cat(corpus_embeddings, dim=0)
    else:
        corpus_embeddings = cached_corpus_embeddings

    recall_at_10 = 0
    for _, row in tqdm(val_df.iterrows(), total=val_df.shape[0], desc="Evaluating Recall@10"):
        query = row['query']
        true_code_string = row['code']
        query_embedding = get_embedding(model, tokenizer, query)

        # 計算餘弦相似度
        scores = torch.nn.functional.cosine_similarity(query_embedding, corpus_embeddings)
        top_k_indices = torch.argsort(scores, descending=True)[:10]
        top_k_codes = corpus_df.iloc[top_k_indices]['code'].values
        if true_code_string in top_k_codes:
            recall_at_10 += 1
    return recall_at_10 / len(val_df), corpus_embeddings

class DenseRetriever:
    """密集檢索器"""
    def __init__(self, documents, model_name_or_path, batch_size=32):
        """初始化密集檢索器"""
        self.documents = documents
        # 載入預訓練模型和斷詞器
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        self.model.to(DEVICE)
        self.model.eval() # 預設為評估模式
        self.batch_size = batch_size
        # 建立文件的嵌入向量（使用 batch 化）
        self.doc_embeddings = self._create_doc_embeddings()

    def _create_doc_embeddings(self):
        """建立所有文件的嵌入向量 (batch 化加速)"""
        all_codes = list(self.documents['code'])
        embeddings = []
        for i in tqdm(range(0, len(all_codes), self.batch_size), desc="Creating document embeddings"):
            batch_codes = all_codes[i:i+self.batch_size]
            inputs = self.tokenizer(batch_codes, return_tensors='pt', truncation=True, padding='max_length', max_length=512).to(DEVICE)
            with torch.no_grad():
                # Check if the model has an encoder (i.e., is an encoder-decoder model)
                if hasattr(self.model, 'get_encoder'):
                    outputs = self.model.get_encoder()(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], output_hidden_states=True)
                else:
                     outputs = self.model(**inputs, output_hidden_states=True)
                hidden_states = outputs.hidden_states
                stacked_layers = torch.stack(hidden_states[-num_layers:])
                mean_last_layers = torch.mean(stacked_layers, dim=0)
                batch_embeddings = mean_last_layers.mean(dim=1)
            embeddings.append(batch_embeddings.cpu().numpy())

        embeddings = np.vstack(embeddings)
        # 對所有文件嵌入向量進行 L2 正規化
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        embeddings = embeddings / norms
        return embeddings

    def retrieve(self, query, k=10):
        """根據查詢檢索文件"""
        # 對單一 query 編碼
        query_embedding = get_embedding(self.model, self.tokenizer, query).cpu().numpy()
        # 對查詢嵌入向量進行 L2 正規化
        query_norm = np.linalg.norm(query_embedding)
        query_embedding = query_embedding / query_norm

        # 計算餘弦相似度 (經過正規化後點積等同於餘弦相似度)
        scores = np.dot(self.doc_embeddings, query_embedding.T).flatten()
        top_k_indices = np.argsort(scores)[::-1][:k]
        top_k_scores = scores[top_k_indices]
        return top_k_indices, top_k_scores

def split_data(train_queries_df):
    # 90% 的code-query配對用於訓練，剩餘10%的query用於評估並對答案
    # 每個 code 是一個 group
    groups = train_queries_df['code']
    gss = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    train_idx, val_idx = next(gss.split(train_queries_df, groups=groups))
    train_df = train_queries_df.iloc[train_idx].reset_index(drop=True)
    val_df = train_queries_df.iloc[val_idx].reset_index(drop=True)
    return train_df, val_df


def fine_tune_model(model, tokenizer, train_data_with_negatives, code_id_to_code_map, strategy, epochs=3, lr=2e-5, batch_size=8):
    """微調預訓練模型"""
    model.to(DEVICE)

    # 這是對訓練資料的準備，會產生每個樣本的anchor/positive/negative張量
    dataset = TripletDataset(train_data_with_negatives, code_id_to_code_map, strategy)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            collate_fn=lambda x: collate_fn(x, tokenizer))  # 使用 collate_fn 做 batch tokenizer

    # 設定優化器和損失函數
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr) # 標準Transformer訓練用優化器
    loss_fn = torch.nn.TripletMarginLoss(margin=1.0) # 三元組損失，目標是讓anchor（查詢）與positive（正確答案之間的距離小於anchor與negative（錯誤答案）之間的距離，至少相差一個margin

    # 訓練模型
    for epoch in range(epochs):
        model.train() # 切換到訓練模式
        total_loss = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
            optimizer.zero_grad() # 清除上一步梯度

            anchor_embeddings = get_layerwise_embeddings(model, batch['anchor'])
            positive_embeddings = get_layerwise_embeddings(model, batch['positive'])
            negative_embeddings = get_layerwise_embeddings(model, batch['negative'])

            # 計算損失
            loss = loss_fn(anchor_embeddings, positive_embeddings, negative_embeddings)
            loss.backward()  # 計算梯度
            optimizer.step()  # 更新參數
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch: {epoch+1}, Average Loss: {avg_loss:.4f}")

    return model

if __name__ == '__main__':
    # --- 1. 準備資料 ---
    print("--- Preparing Data for Fine-tuning ---")

    with open('train_data_with_negatives.json', 'r', encoding='utf-8') as f:
        #  這是事先根據train_queries以及code_snippests製作的每個query對應的code以及用TF-IDF挑選出的前50個負樣本ID
        train_data_with_negatives = json.load(f)

    code_snippets_df = pd.read_csv('code_snippets.csv')
    code_id_to_code_map = pd.Series(code_snippets_df.code.values, index=code_snippets_df.code_id).to_dict()

    # --- 2. 初始化模型 ---
    print("\n--- Initializing Model ---")
    model_name = PRE_TRAINED_MODEL_NAME
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # --- 3. 微調模型 ---
    print("\n--- Fine-tuning model with Multi-Negative Strategy ---")
    fine_tuned_model = fine_tune_model(model, tokenizer, train_data_with_negatives, code_id_to_code_map, STRATEGY, epochs=3, lr=2e-5, batch_size=8)

    # --- 4. 儲存模型 ---
    output_dir = FINE_TUNED_MODEL_PATH
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print(f"\nSaving the final model to {output_dir}...")
    fine_tuned_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("\n--- Model training complete. ---")

--- Preparing Data for Fine-tuning ---

--- Initializing Model ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]


--- Fine-tuning model with Multi-Negative Strategy ---

Creating training triplets with STRATIFIED negatives...

Creating training triplets with Top-5 Single Negative strategy...


100%|██████████| 500/500 [00:00<00:00, 240003.66it/s]
Epoch 1/3:   0%|          | 0/63 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/504M [00:00<?, ?B/s]

Epoch 1/3: 100%|██████████| 63/63 [02:14<00:00,  2.13s/it]


Epoch: 1, Average Loss: 0.1120


Epoch 2/3: 100%|██████████| 63/63 [02:12<00:00,  2.11s/it]


Epoch: 2, Average Loss: 0.0173


Epoch 3/3: 100%|██████████| 63/63 [02:12<00:00,  2.10s/it]


Epoch: 3, Average Loss: 0.0032

Saving the final model to ./microsoft-unixcoder-base...

--- Model training complete. ---


**預訓練密集模型與微調密集模型本地驗證**

In [4]:

# dense_retrieval
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd


# 選擇要測試的model
try_prtrained_model = False
try_fine_tuned_model = True


if __name__ == '__main__':
    # 1. 載入資料並準備驗證集
    print("--- 1. 載入資料並準備驗證集 ---")
    # 注意，此處用於測試的語料庫也來自於train_queries.csv，訓練時保留了10%的query沒有用於訓練
    train_queries_df = pd.read_csv('train_queries.csv')


    # 使用與微調腳本完全相同的分割方式
    _, val_df = split_data(train_queries_df)
    print(f"已載入 {len(val_df)} 筆樣本用於驗證。")

    # 2. 評估預訓練模型
    if try_prtrained_model:
        print("\n--- 2. 評估預訓練模型 ---")
        print(f"模型: {PRE_TRAINED_MODEL_NAME}")
        pretrained_tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
        pretrained_model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME).to(DEVICE)

        pretrained_recall, corpus_embeddings_pretrained = evaluate_recall(pretrained_model, pretrained_tokenizer, val_df, train_queries_df)
        print(f"\n預訓練模型 Recall@10: {pretrained_recall:.4f}")

    # 3. 評估微調後的模型
    if try_fine_tuned_model:
        print("\n--- 3. 評估微調後的模型 ---")
        print(f"模型: {FINE_TUNED_MODEL_PATH}")
        try:
            finetuned_tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_MODEL_PATH)
            finetuned_model = AutoModel.from_pretrained(FINE_TUNED_MODEL_PATH).to(DEVICE)

            # 微調後的模型需要重新計算語料庫的嵌入向量
            finetuned_recall, _ = evaluate_recall(finetuned_model, finetuned_tokenizer, val_df, train_queries_df)
            print(f"\n微調後模型 Recall@10: {finetuned_recall:.4f}")

        except OSError:
            print(f"錯誤: 在 '{FINE_TUNED_MODEL_PATH}' 找不到微調後的模型。")
            print("請先執行 'fine_tune_model.py' 來訓練並儲存模型。")

    print("\n--- 評估完成 ---")

--- 1. 載入資料並準備驗證集 ---
已載入 50 筆樣本用於驗證。

--- 3. 評估微調後的模型 ---
模型: ./microsoft-unixcoder-base

Creating cached embeddings for the corpus...


Corpus Embeddings: 100%|██████████| 16/16 [00:14<00:00,  1.11it/s]
Evaluating Recall@10: 100%|██████████| 50/50 [00:01<00:00, 29.72it/s]


微調後模型 Recall@10: 0.9600

--- 評估完成 ---





**生成四種模型的Submission**

In [14]:
# 生成dense model的submission
import pandas as pd
import os
from tqdm import tqdm
def generate_submission(retriever, test_df, output_path, query_expansion=False):
    """
    Generates a submission file for a given retriever.
    """
    print(f"Generating submission for {output_path}...")
    results = []
    # 使用tqdm顯示進度條
    for _, row in tqdm(test_df.iterrows(), total=test_df.shape[0], desc=output_path):
        query_id = row['query_id']
        query = row['query']
        top_k_indices, _ = retriever.retrieve(query, k=10)

        # 直接使用檢索器內部儲存的 documents DataFrame 來獲取 code_id
        top_k_code_ids = retriever.documents.iloc[top_k_indices]['code_id'].tolist()

        results.append({
            'query_id': query_id,
            'code_id': ' '.join(map(str, top_k_code_ids))
        })

    submission_df = pd.DataFrame(results)
    submission_df.to_csv(output_path, index=False)
    print(f"Submission file saved to {output_path}")

if __name__ == '__main__':
    # --- 載入資料 ---
    print("Loading data...")
    code_snippets_df = pd.read_csv('code_snippets.csv')
    test_queries_df = pd.read_csv('test_queries.csv')


    # --- 稀疏模型 ---
    print("\nInitializing sparse models with best parameters...")
    processed_snippets_df = preprocess(code_snippets_df.copy())

    # TF-IDF Retriever with Query Expansion
    tfidf_retriever = TFIDFRetriever(processed_snippets_df)
    generate_submission(tfidf_retriever, test_queries_df, 'submission_tfidf.csv', query_expansion=True)

    # BM25 Retriever with optimized parameters and Query Expansion
    bm25_retriever = BM25Retriever(processed_snippets_df, k1=2.0, b=0.9)
    generate_submission(bm25_retriever, test_queries_df, 'submission_bm25.csv', query_expansion=True)


    # --- 密集模型 ---
    # 檢查微調後的模型是否存在
    finetuned_model_path = FINE_TUNED_MODEL_PATH

    # 預訓練的密集檢索器
    print("\nInitializing pre-trained dense model...")
    pretrained_retriever = DenseRetriever(code_snippets_df, model_name_or_path=PRE_TRAINED_MODEL_NAME)
    generate_submission(pretrained_retriever, test_queries_df, 'submission_pretrained.csv')

    if not os.path.exists(finetuned_model_path):
        print(f"\nFine-tuned model not found at '{finetuned_model_path}'.")
        print("Skipping submission generation for the fine-tuned model.")
    else:
        # 微調後的密集檢索器
        print("\nInitializing fine-tuned dense model...")
        finetuned_retriever = DenseRetriever(code_snippets_df, model_name_or_path=finetuned_model_path)
        generate_submission(finetuned_retriever, test_queries_df, 'submission_finetuned.csv')

    print("\nAll submission files have been generated.")

Loading data...

Initializing sparse models with best parameters...
Generating submission for submission_tfidf.csv...


submission_tfidf.csv: 100%|██████████| 500/500 [00:09<00:00, 51.66it/s]


Submission file saved to submission_tfidf.csv
Generating submission for submission_bm25.csv...


submission_bm25.csv: 100%|██████████| 500/500 [00:11<00:00, 44.08it/s]


Submission file saved to submission_bm25.csv

Initializing pre-trained dense model...


Creating document embeddings: 100%|██████████| 16/16 [00:14<00:00,  1.11it/s]


Generating submission for submission_pretrained.csv...


submission_pretrained.csv: 100%|██████████| 500/500 [00:16<00:00, 30.50it/s]


Submission file saved to submission_pretrained.csv

Initializing fine-tuned dense model...


Creating document embeddings: 100%|██████████| 16/16 [00:14<00:00,  1.11it/s]


Generating submission for submission_finetuned.csv...


submission_finetuned.csv: 100%|██████████| 500/500 [00:16<00:00, 29.59it/s]

Submission file saved to submission_finetuned.csv

All submission files have been generated.





**其他**

**混合模型**

In [18]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict





def reciprocal_rank_fusion(ranked_lists, k=60):
    """
    使用 RRF 演算法融合多個排名列表。
    :param ranked_lists: 一個包含多個排名列表的列表。每個排名列表是 code_id 或 code_content 的列表。
    :param k: RRF 演算法中的常數，通常設為 60。
    :return: 融合併重新排序後的項目列表。
    """

    """
    RRF的核心思想是完全忽略掉原始分數，只關心每個檢索器給出的排名
    對於每一個候選的程式碼（code_id），它的最終RRF分數是它在每個檢索結果列表中的倒數排名分數的總和
    總而言之RRF會獎勵那些在多個不同檢索系統中都穩定地排在前面的項目，它完全繞開了不同系統之間分數無法直接比較的問題
    """
    rrf_scores = defaultdict(float)

    for ranked_list in ranked_lists:
        for rank, item in enumerate(ranked_list):
            rrf_scores[item] += 1 / (k + rank + 1)

    sorted_items = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)
    fused_list = [item[0] for item in sorted_items]

    return fused_list

if __name__ == '__main__':
    # --- 模式設定 ---
    # True: 執行本地驗證 (使用 train_queries.csv)
    # False: 產生 Kaggle 提交檔案 (使用 test_queries.csv)
    RUN_VALIDATION = True

    # --- 1. 本地驗證模式 ---
    if RUN_VALIDATION:
        # --- 測試模式設定 ---
        # 'tfidf': 只測試 TF-IDF 的表現
        # 'dense': 只測試 Dense Retriever 的表現
        # 'rrf': 測試 RRF 混合模型的表現
        TEST_MODE = 'rrf' # 可以切換這個值來進行測試

        print(f"--- Running in Local Validation Mode (Test Mode: {TEST_MODE}) ---")

        # 載入訓練資料並分割
        print("Loading and splitting train_queries.csv for validation...")
        train_queries_df = pd.read_csv('train_queries.csv')
        _, val_df = split_data(train_queries_df)

        # 驗證時，整個 train_queries_df 就是我們的語料庫
        corpus_df = train_queries_df
        print(f"Using {len(val_df)} queries for validation against a corpus of {len(corpus_df)} code snippets.")

        # 初始化檢索器 (使用 train_queries 作為語料庫)
        print("\nInitializing retrievers for validation...")
        processed_corpus_df = preprocess(corpus_df.copy())

        if TEST_MODE == 'tfidf' or TEST_MODE == 'rrf':
            tfidf_retriever = TFIDFRetriever(processed_corpus_df)

        if TEST_MODE == 'dense' or TEST_MODE == 'rrf':
            finetuned_model_path = FINE_TUNED_MODEL_PATH
            print(f"Loading dense model from: {finetuned_model_path}")
            dense_retriever = DenseRetriever(corpus_df, model_name_or_path=finetuned_model_path)

        top_n_candidates = 100
        recall_at_10_count = 0

        print(f"\nEvaluating {TEST_MODE} retrieval (top_n={top_n_candidates})...")
        for _, row in tqdm(val_df.iterrows(), total=val_df.shape[0]):
            query = row['query']
            true_code_content = row['code']

            if TEST_MODE == 'tfidf':
                tfidf_indices, _ = tfidf_retriever.retrieve(query, k=10, query_expansion=True)
                top_10_codes = corpus_df.iloc[tfidf_indices]['code'].tolist()

            elif TEST_MODE == 'dense':
                dense_indices, _ = dense_retriever.retrieve(query, k=10)
                top_10_codes = corpus_df.iloc[dense_indices]['code'].tolist()

            elif TEST_MODE == 'rrf':
                tfidf_indices, _ = tfidf_retriever.retrieve(query, k=top_n_candidates, query_expansion=True)
                tfidf_ranked_codes = corpus_df.iloc[tfidf_indices]['code'].tolist()

                dense_indices, _ = dense_retriever.retrieve(query, k=top_n_candidates)
                dense_ranked_codes = corpus_df.iloc[dense_indices]['code'].tolist()

                fused_ranked_list = reciprocal_rank_fusion([tfidf_ranked_codes, dense_ranked_codes])
                top_10_codes = fused_ranked_list[:10]

            if true_code_content in top_10_codes:
                recall_at_10_count += 1

        final_recall = recall_at_10_count / len(val_df)
        print(f"\n--- Validation Complete ---")
        print(f"Model: {TEST_MODE}, Local Recall@10: {final_recall:.4f}")

    # --- 2. Kaggle 預測模式 ---
    else:
        print("--- Running in Prediction Mode ---")

        # 載入資料
        print("Loading data for prediction...")
        code_snippets_df = load_code_snippets('code_snippets.csv')
        test_queries_df = pd.read_csv('test_queries.csv')

        # 初始化檢索器 (使用 code_snippets 作為語料庫)
        print("\nInitializing retrievers for prediction...")
        processed_snippets_df = preprocess(code_snippets_df.copy())
        tfidf_retriever = TFIDFRetriever(processed_snippets_df)

        finetuned_model_path = FINE_TUNED_MODEL_PATH
        print(f"Loading dense model from: {finetuned_model_path}")
        dense_retriever = DenseRetriever(code_snippets_df, model_name_or_path=finetuned_model_path)

        #  執行RRF混合檢索
        top_n_candidates = 100
        final_results = []

        print(f"\nGenerating hybrid retrieval submission with RRF (top_n={top_n_candidates})...")
        for _, row in tqdm(test_queries_df.iterrows(), total=test_queries_df.shape[0]):
            query_id = row['query_id']
            query = row['query']

            tfidf_indices, _ = tfidf_retriever.retrieve(query, k=top_n_candidates, query_expansion=True)
            tfidf_ranked_ids = code_snippets_df.iloc[tfidf_indices]['code_id'].tolist()

            dense_indices, _ = dense_retriever.retrieve(query, k=top_n_candidates)
            dense_ranked_ids = code_snippets_df.iloc[dense_indices]['code_id'].tolist()

            fused_ranked_list = reciprocal_rank_fusion([tfidf_ranked_ids, dense_ranked_ids])
            top_10_code_ids = fused_ranked_list[:10]

            final_results.append({
                'query_id': query_id,
                'code_id': ' '.join(map(str, top_10_code_ids))
            })

        # 儲存提交檔案
        submission_df = pd.DataFrame(final_results)
        output_path = 'submission_hybrid_rrf.csv'
        submission_df.to_csv(output_path, index=False)
        print(f"\nHybrid RRF submission file saved to {output_path}")


--- Running in Local Validation Mode (Test Mode: rrf) ---
Loading and splitting train_queries.csv for validation...
Using 50 queries for validation against a corpus of 500 code snippets.

Initializing retrievers for validation...
Loading dense model from: ./microsoft-unixcoder-base


Creating document embeddings: 100%|██████████| 16/16 [00:14<00:00,  1.09it/s]



Evaluating rrf retrieval (top_n=100)...


100%|██████████| 50/50 [00:01<00:00, 25.28it/s]


--- Validation Complete ---
Model: rrf, Local Recall@10: 0.8400



