In [None]:
!pip install pandas scikit-learn openpyxl numpy nltk
!pip install rouge-score
!pip install sentence-transformers
!pip install bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c6332ccb8db1cddb8f07853e5dffa69bb6cd1f31c383e96004e1311160a9f44e
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, recall_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from bert_score import score as bert_score

# Load the evaluation sheet and discard the placeholder "Unnamed: N" columns.
df = pd.read_excel('/content/1030 turtleqa.xlsx')
# Cast the labels to str first so a non-string header cannot break the filter.
df = df.loc[:, ~df.columns.astype(str).str.startswith('Unnamed')]

# Rows without a reference answer cannot be scored, so drop them before any
# metric is computed. A blank cell is read as NaN and str() would turn it into
# the literal text "nan", which would then be evaluated as if it were an answer.
df = df.dropna(subset=['Truth Answer'])

# Every column other than the question and the reference answer is treated as
# one model's answers.
model_cols = [col for col in df.columns if col not in ['Question', 'Truth Answer']]

# Sentence-embedding model used by the semantic-similarity metric; the
# multilingual checkpoint is the recommended choice for Chinese text.
sbert_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

def simple_acc(truth, pred):
    """Exact-match score for one reference/prediction pair.

    Both values are converted with str() and stripped of surrounding
    whitespace before comparison.

    Args:
        truth: Reference answer (any scalar; compared by its string form).
        pred: Model answer (any scalar; compared by its string form).

    Returns:
        1 if the stripped strings are identical, otherwise 0. A missing value
        (None / NaN / pd.NA) on either side always yields 0.
    """
    # str(float('nan')) is "nan", so two empty spreadsheet cells would
    # otherwise compare equal and be counted as a correct answer.
    if pd.isna(truth) or pd.isna(pred):
        return 0
    return int(str(truth).strip() == str(pred).strip())

def bleu_score(truth, pred):
    """Character-level sentence BLEU of `pred` against the single reference `truth`.

    Both values are converted with str() and split into individual characters,
    which serve as the tokens. NLTK's smoothing method 4 is applied.

    Args:
        truth: Reference answer.
        pred: Model answer.

    Returns:
        The score returned by nltk's sentence_bleu.
    """
    references = [[ch for ch in str(truth)]]
    hypothesis = [ch for ch in str(pred)]
    return sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method4,
    )

class _CharTokenizer:
    """Tokenizer for RougeScorer: one token per alphanumeric character."""

    def tokenize(self, text):
        # Lowercase and keep only alphanumerics, like the default tokenizer,
        # but Unicode-aware: str.isalnum() is True for CJK characters.
        return [ch for ch in text.lower() if ch.isalnum()]


def rouge_l_score(truth, pred):
    """ROUGE-L F-measure of `pred` against `truth`, computed per character.

    rouge_score's default tokenizer keeps only ASCII letters and digits, so
    Chinese answers are reduced to zero tokens and always score 0. A
    character-level tokenizer is supplied instead so that the longest common
    subsequence is computed over the actual characters of both answers.

    Args:
        truth: Reference answer (converted with str()).
        pred: Model answer (converted with str()).

    Returns:
        The 'rougeL' fmeasure reported by RougeScorer.
    """
    # use_stemmer only affects the default tokenizer, so it is not passed here.
    scorer = rouge_scorer.RougeScorer(['rougeL'], tokenizer=_CharTokenizer())
    return scorer.score(str(truth), str(pred))['rougeL'].fmeasure

def char_level_metrics(truth_list, pred_list):
    """Mean character-overlap F1 and recall over paired answers.

    Each answer is converted with str(), stripped, and reduced to its set of
    distinct characters (repeat counts are ignored). For every pair, with
    T = reference characters and P = predicted characters:

        recall = |T & P| / |T|
        F1     = 2 * |T & P| / (|T| + |P|)

    A ratio whose denominator is zero is scored as 0. These are the values a
    binary F1/recall over per-character presence indicators yields, computed
    directly from the sets instead of materialising indicator vectors.

    Args:
        truth_list: Iterable of reference answers.
        pred_list: Iterable of model answers, aligned with `truth_list`.

    Returns:
        Tuple (mean_f1, mean_recall), each averaged with np.mean over all pairs.
    """
    f1_scores, recall_scores = [], []
    for t, p in zip(truth_list, pred_list):
        truth_chars = set(str(t).strip())
        pred_chars = set(str(p).strip())
        overlap = len(truth_chars & pred_chars)
        total = len(truth_chars) + len(pred_chars)
        f1_scores.append(2 * overlap / total if total else 0.0)
        recall_scores.append(overlap / len(truth_chars) if truth_chars else 0.0)
    return np.mean(f1_scores), np.mean(recall_scores)

def semantic_similarity(truth_list, pred_list):
    """Mean cosine similarity between paired reference and prediction embeddings.

    Every item is converted with str() and encoded with the module-level
    `sbert_model`; the i-th reference is compared with the i-th prediction.

    Args:
        truth_list: Iterable of reference answers.
        pred_list: Iterable of model answers, aligned with `truth_list`.

    Returns:
        The mean of the per-pair cosine similarities.
    """
    references = [str(item) for item in truth_list]
    ref_vecs = sbert_model.encode(references, convert_to_tensor=True, show_progress_bar=False)
    candidates = [str(item) for item in pred_list]
    cand_vecs = sbert_model.encode(candidates, convert_to_tensor=True, show_progress_bar=False)
    # Row-wise cosine similarity: dot product over the product of the norms.
    dot_products = (ref_vecs * cand_vecs).sum(axis=1)
    norm_products = ref_vecs.norm(dim=1) * cand_vecs.norm(dim=1)
    cosine = dot_products / norm_products
    return cosine.cpu().numpy().mean()

def bertscore_metric(truth_list, pred_list):
    """Mean BERTScore F1 of the predictions against the references.

    The whole batch is scored in a single bert_score call with lang='zh' and
    baseline rescaling enabled; only the F1 component is used.

    Args:
        truth_list: Iterable of reference answers (converted with str()).
        pred_list: Iterable of model answers (converted with str()).

    Returns:
        The batch-mean F1 as a Python float.
    """
    candidates = list(map(str, pred_list))
    references = list(map(str, truth_list))
    # bert_score returns (precision, recall, F1); only F1 is reported.
    _, _, f1_per_pair = bert_score(
        candidates,
        references,
        lang='zh',
        rescale_with_baseline=True,
    )
    return f1_per_pair.mean().item()

def compute_scores(truth_list, pred_list):
    """Compute every evaluation metric for one model's answers.

    Args:
        truth_list: Reference answers; must be iterable more than once.
        pred_list: Model answers aligned with `truth_list`; must be iterable
            more than once.

    Returns:
        Dict with the keys "Accuracy", "BLEU", "ROUGE-L", "F1", "Recall",
        "SemSim" and "BERTScore". The first three are np.mean averages of the
        per-pair scores; the remaining four come from the batch-level helpers.
    """
    char_f1, char_recall = char_level_metrics(truth_list, pred_list)

    # One (exact match, BLEU, ROUGE-L) triple per reference/prediction pair.
    per_pair = [
        (simple_acc(t, p), bleu_score(t, p), rouge_l_score(t, p))
        for t, p in zip(truth_list, pred_list)
    ]
    exact_matches = [row[0] for row in per_pair]
    bleu_values = [row[1] for row in per_pair]
    rouge_values = [row[2] for row in per_pair]

    embedding_similarity = semantic_similarity(truth_list, pred_list)
    bert_f1 = bertscore_metric(truth_list, pred_list)

    return {
        "Accuracy": np.mean(exact_matches),
        "BLEU": np.mean(bleu_values),
        "ROUGE-L": np.mean(rouge_values),
        "F1": char_f1,
        "Recall": char_recall,
        "SemSim": embedding_similarity,
        "BERTScore": bert_f1,
    }

# Score every model column against the reference answers.
results = []
for model in model_cols:
    scores = {**compute_scores(df['Truth Answer'], df[model]), 'Model': model}
    results.append(scores)

# One row per model, with the model name first and values rounded to four decimals.
ordered_columns = ['Model', 'Accuracy', 'BLEU', 'ROUGE-L', 'F1', 'Recall', 'SemSim', 'BERTScore']
results_df = pd.DataFrame(results)[ordered_columns].round(4)

# Print the comparison ranked by ROUGE-L, best model first.
print("模型分數比較：")
print(results_df.sort_values(by='ROUGE-L', ascending=False))

# Save the table (in its original, unsorted order) as a spreadsheet.
results_df.to_excel('model_scores.xlsx', index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

模型分數比較：
                                 Model  Accuracy    BLEU  ROUGE-L      F1  \
0                 Turtlecare R1_Answer    0.6579  0.3988   0.6883  0.7736   
5                      Qwen2-7B_Answer    0.6579  0.3921   0.6706  0.7592   
6                       GPT-OSS_Answer    0.6579  0.3930   0.6703  0.7668   
1                      Llama3.1_Answer    0.6579  0.3928   0.6667  0.7732   
4  deepseek_r1_distill-llama-8b Answer    0.6579  0.3948   0.6617  0.7684   
2   deepseek_r1_distill qwen-7b_Answer    0.6579  0.3918   0.6604  0.7603   
3               Mistral-7B-v0.1_Answer    0.6579  0.3872   0.6579  0.7567   

   Recall  SemSim  BERTScore  
0  0.7600  0.9312     0.7714  
5  0.8898  0.9464     0.7420  
6  0.7725  0.8989     0.7527  
1  0.8187  0.8691     0.7459  
4  0.8306  0.9285     0.7495  
2  0.8093  0.9261     0.7421  
3  0.8136  0.8838     0.7170  
