In [None]:
from huggingface_hub import login
login("your_huggingface_token_here")

In [2]:
import torch
import numpy as np
import nltk
import sys # 新增：用于退出程序
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# --- NLTK 自动下载检查函数 ---
def check_and_download_nltk_resources():
    """
    检查并自动下载 NLTK 所需的 'punkt' 和 'punkt_tab' 资源。
    """
    required_resources = ['punkt', 'punkt_tab']
    all_ok = True

    print("Checking NLTK resources...")
    for resource in required_resources:
        try:
            # 尝试查找资源，如果找不到会抛出 LookupError
            nltk.data.find(f'tokenizers/{resource}')
            print(f"NLTK Resource '{resource}' is available.")
        except LookupError:
             print(f"NLTK Resource '{resource}' not found. Attempting automatic download...")
             try:
                 nltk.download(resource)
                 print(f"NLTK Resource '{resource}' downloaded successfully.")
             except Exception as e:
                 print(f"CRITICAL ERROR: Failed to download NLTK resource '{resource}'.")
                 print(f"Error: {e}")
                 print("Please ensure you have network access or run 'python -m nltk.downloader {resource}' manually.")
                 all_ok = False

    if not all_ok:
        # 如果下载失败，退出程序，避免后续崩溃
        sys.exit(1)
    print("-" * 50)


class NLIFactualityMetric:
    def __init__(self, model_name="roberta-large-mnli", device=None):
        """
        初始化 NLI 事实性评估指标。
        """

        # 在加载模型之前，首先检查并下载 NLTK 资源
        check_and_download_nltk_resources() # <--- 在这里调用自动检查和下载

        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loading NLI model: {model_name} on {self.device}...")

        # 1. 加载分词器和模型
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device)
        self.model.eval()

        # 2. 确定蕴含 (entailment) 标签的索引
        self.entailment_idx = self.model.config.label2id.get('entailment')
        if self.entailment_idx is None:
            self.entailment_idx = 2
            print("Warning: Could not find 'entailment' in label2id, assuming index 2.")

    # ... (score_single_pair, evaluate_summary, evaluate_batch, 以及 if __name__ == "__main__": 保持不变) ...
    # 为了完整性，这里只显示修改的部分，请确保您在本地保留所有方法。

    def score_single_pair(self, premise, hypothesis):
        """计算单句假设对前提的蕴含分数"""

        inputs = self.tokenizer(
            premise,
            hypothesis,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1)

        return probs[0][self.entailment_idx].item()

    def evaluate_summary(self, source_doc, summary):
        """
        评估摘要的事实一致性。
        """
        # 1. 分句
        summary_sents = nltk.tokenize.sent_tokenize(summary)
        if not summary_sents:
            return 0.0

        scores = []
        for sent in summary_sents:
            score = self.score_single_pair(source_doc, sent)
            scores.append(score)

        # 2. 聚合分数 (取平均值)
        return np.mean(scores)

    def evaluate_batch(self, examples):
        """
        批量评估。
        """
        results = []
        total = len(examples)
        for i, ex in enumerate(examples):
            score = self.evaluate_summary(ex['source'], ex['summary'])
            results.append(score)

            if (i + 1) % 10 == 0:
                print(f"Processed {i+1}/{total} examples. Current mean score: {np.mean(results):.4f}")
        return results

# --- 使用示例 --- (请确保 main.py 不再重复定义，这里只是一个独立运行的示例)
if __name__ == "__main__":

    metric = NLIFactualityMetric(device='cpu') # 强制使用 CPU 进行简单测试

    # 模拟数据测试
    test_data = [
        {
            "source": "Joe Biden met with congressional leaders at the White House today to discuss a new economic relief package.",
            "summary": "Joe Biden visited the White House today."
        },
        # ... (省略)
    ]

    print("\n=== Running Sanity Check Tests ===")
    scores = metric.evaluate_batch(test_data)
    print(f"Overall Mean Score: {np.mean(scores):.4f}")

Checking NLTK resources...
NLTK Resource 'punkt' not found. Attempting automatic download...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


NLTK Resource 'punkt' downloaded successfully.
NLTK Resource 'punkt_tab' not found. Attempting automatic download...
NLTK Resource 'punkt_tab' downloaded successfully.
--------------------------------------------------
Loading NLI model: roberta-large-mnli on cpu...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



=== Running Sanity Check Tests ===
Overall Mean Score: 0.9915


In [4]:
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer
import time
import numpy as np
import torch

# 你自己的 NLI 评估器（保持不变）
# from your_metric_file import NLIFactualityMetric
# 这里假设 NLIFactualityMetric 已经可以正常 import 使用

# --- 配置 ---
MODEL_HUB_ID = "mercuryujia/bart-large-multi-news"
DATASET_NAME = "Awesome075/multi_news_parquet"
SPLIT_NAME = "test"

# 仅评估前 N 个样本以进行快速测试。如果要评估全集，请设置为 None 或更大的数字。
MAX_SAMPLES = 50

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 一次性测试的 beam_size 列表
BEAM_SIZES = [1, 2, 4, 8, 16, 32, 64]

print(f"Using device: {DEVICE}")


def generate_summaries(model, tokenizer, documents, device,
                       max_length=150, batch_size=4, num_beams=4):
    """
    使用 BART 模型批量生成摘要。
    对于每个输入文档，返回 num_beams 个摘要。
    """
    all_generated_summaries = []

    # 按照 batch_size 进行分批
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]

        # 1. 对文档进行分词
        inputs = tokenizer(
            batch,
            max_length=1024,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # 2. 将输入移动到指定设备
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        # 3. 生成摘要
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,                 # 使用传入的 num_beams
                num_return_sequences=num_beams,      # 为每个输入返回 num_beams 个结果
                do_sample=False,
                length_penalty=2.0,
                early_stopping=True
            )

        # 4. 解码生成的 ID 为文本
        generated_summaries = tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        all_generated_summaries.extend(generated_summaries)

        # 打印进度（按 batch 粗略统计）
        if (i // batch_size + 1) % 10 == 0:
            print(f"[Generation] Processed {i + len(batch)}/{len(documents)} examples...")

    return all_generated_summaries


def main():
    total_start_time = time.time()

    # --- 阶段一：模型和数据准备 ---
    print("\n--- Phase 1: Loading Model and Data ---")

    # 1. 加载微调的 BART 模型和分词器
    try:
        tokenizer = BartTokenizer.from_pretrained(MODEL_HUB_ID)
        model = BartForConditionalGeneration.from_pretrained(MODEL_HUB_ID).to(DEVICE)
        model.eval()
        print(f"Successfully loaded model: {MODEL_HUB_ID}")
    except Exception as e:
        print(f"Error loading model from Hub: {e}")
        print("Please ensure the model name is correct and accessible.")
        return

    # 2. 加载测试数据集
    dataset = load_dataset(DATASET_NAME, split=SPLIT_NAME)

    # 限制样本数量
    if MAX_SAMPLES is not None:
        dataset = dataset.select(range(min(MAX_SAMPLES, len(dataset))))
        print(f"Loaded {len(dataset)} samples for quick testing.")
    else:
        print(f"Loaded {len(dataset)} samples from the full {SPLIT_NAME} split.")

    documents = dataset["document"]
    num_documents = len(documents)

    # 3. 初始化 NLI 评估器（只初始化一次）
    print("\n--- Phase 2: Initializing NLI Metric ---")
    nli_scorer = NLIFactualityMetric(device=DEVICE)

    # 用于记录不同 beam_size 的结果
    beam_results = []

    # === 对每个 BEAM_SIZE 依次跑完整流程 ===
    for beam_size in BEAM_SIZES:
        print("\n" + "=" * 70)
        print(f"      RUNNING EXPERIMENT WITH BEAM SIZE = {beam_size}      ")
        print("=" * 70)

        run_start_time = time.time()

        # --- 阶段三：摘要生成 ---
        print("\n--- Phase 3: Generating Summaries ---")
        print(f"Generating {beam_size} summaries per document...")

        generated_summaries = generate_summaries(
            model,
            tokenizer,
            documents,
            DEVICE,
            num_beams=beam_size
        )

        # 检查摘要数量是否符合预期
        expected_count = num_documents * beam_size
        if len(generated_summaries) != expected_count:
            print(
                f"ERROR: Generated summaries count ({len(generated_summaries)}) "
                f"does not match expected count ({expected_count}) for beam_size={beam_size}."
            )
            print("Please check the 'generate_summaries' function.")
            # 这里直接跳过这个 beam_size，进行下一个
            beam_results.append({
                "beam_size": beam_size,
                "mean_nli_score": None,
                "total_time": time.time() - run_start_time
            })
            continue

        # --- 阶段四：NLI 事实性评估 ---
        print("\n--- Phase 4: NLI Factuality Evaluation ---")

        all_document_avg_scores = []  # 存储每个文档在当前 beam_size 下的平均 NLI 分数

        print(
            f"\nStarting NLI evaluation, processing {num_documents} documents "
            f"({beam_size} summaries each)..."
        )

        for i in range(num_documents):
            doc = documents[i]

            # 提取当前文档的 beam_size 个摘要
            start_idx = i * beam_size
            current_summaries = generated_summaries[start_idx:start_idx + beam_size]

            individual_scores = []
            for gen_sum in current_summaries:
                # 返回单个摘要的句子平均分
                score = nli_scorer.evaluate_summary(doc, gen_sum)
                individual_scores.append(score)

            # 计算这 beam_size 个摘要的平均分，作为该文档的最终 NLI 分数
            doc_avg_nli = np.mean(individual_scores)
            all_document_avg_scores.append(doc_avg_nli)

            # 打印进度
            if (i + 1) % 10 == 0:
                current_overall_mean = np.mean(all_document_avg_scores)
                print(
                    f"[Document] Processed {i + 1}/{num_documents} documents. "
                    f"Current mean document score (beam={beam_size}): {current_overall_mean:.4f}"
                )

        # 当前 beam_size 的整体 NLI 分数
        mean_nli_score = float(np.mean(all_document_avg_scores))
        run_time = time.time() - run_start_time

        print("\n" + "-" * 50)
        print(f"Beam Size: {beam_size}")
        print(f"Number of Documents Evaluated: {num_documents}")
        print(f"Total Summaries Evaluated: {num_documents * beam_size}")
        print(f"MEAN DOCUMENT NLI SCORE: {mean_nli_score:.4f}")
        print(f"Time taken for this beam size: {run_time:.2f} seconds")
        print("-" * 50)

        # 记录结果
        beam_results.append({
            "beam_size": beam_size,
            "mean_nli_score": mean_nli_score,
            "total_time": run_time
        })

    total_time = time.time() - total_start_time

    # --- 阶段五：整体结果输出 ---
    print("\n" + "=" * 70)
    print("               FINAL NLI FACTUALITY SCORE REPORT               ")
    print("=" * 70)
    print(f"Model ID: {MODEL_HUB_ID}")
    print(f"Dataset Split: {DATASET_NAME}/{SPLIT_NAME}")
    print(f"Number of Documents Evaluated: {num_documents}")
    print(f"Beam Sizes Tested: {BEAM_SIZES}")
    print("-" * 70)

    # 输出每个 beam_size 的结果表
    for res in beam_results:
        bs = res["beam_size"]
        score = res["mean_nli_score"]
        t = res["total_time"]
        if score is None:
            print(f"Beam {bs:>2}: ERROR (summary count mismatch), Time = {t:.2f}s")
        else:
            print(f"Beam {bs:>2}: Mean NLI = {score:.4f}, Time = {t:.2f}s")

    print("-" * 70)
    print(f"Total wall-clock time for all experiments: {total_time:.2f} seconds")
    print("=" * 70)


if __name__ == "__main__":
    main()


Using device: cuda

--- Phase 1: Loading Model and Data ---
Successfully loaded model: mercuryujia/bart-large-multi-news
Loaded 50 samples for quick testing.

--- Phase 2: Initializing NLI Metric ---
Checking NLTK resources...
NLTK Resource 'punkt' is available.
NLTK Resource 'punkt_tab' is available.
--------------------------------------------------
Loading NLI model: roberta-large-mnli on cuda...


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The following generation flags are not valid and may be ignored: ['early_stopping', 'length_penalty']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



      RUNNING EXPERIMENT WITH BEAM SIZE = 1      

--- Phase 3: Generating Summaries ---
Generating 1 summaries per document...
[Generation] Processed 40/50 examples...

--- Phase 4: NLI Factuality Evaluation ---

Starting NLI evaluation, processing 50 documents (1 summaries each)...
[Document] Processed 10/50 documents. Current mean document score (beam=1): 0.0493
[Document] Processed 20/50 documents. Current mean document score (beam=1): 0.0246
[Document] Processed 30/50 documents. Current mean document score (beam=1): 0.0164
[Document] Processed 40/50 documents. Current mean document score (beam=1): 0.0123
[Document] Processed 50/50 documents. Current mean document score (beam=1): 0.0099

--------------------------------------------------
Beam Size: 1
Number of Documents Evaluated: 50
Total Summaries Evaluated: 50
MEAN DOCUMENT NLI SCORE: 0.0099
Time taken for this beam size: 24.67 seconds
--------------------------------------------------

      RUNNING EXPERIMENT WITH BEAM SIZE =

In [None]:
import torch
import gc

def clear_cuda():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    print(f"Cleared CUDA. Allocated = {torch.cuda.memory_allocated()/1024**2:.2f} MB")

clear_cuda()