In [1]:
!pip install -q transformers sentence-transformers torch tqdm


In [2]:
import json
from pathlib import Path

import torch
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)


Using device: cuda


In [9]:
# Input file with the candidate summaries to rerank. It is indexed below as a
# sequence of examples, each carrying a "candidates" list.
data_path = Path("val_candidates_constrained.json")
assert data_path.exists(), f"File not found: {data_path}"

# Read the whole file as UTF-8 text and parse it in one step.
data = json.loads(data_path.read_text(encoding="utf-8"))

# Quick sanity check of the dataset size and the layout of the first example.
print("Sample number:", len(data))
first_sample = data[0]
print("The first sample field:", first_sample.keys())
print("The number of candidates for the first sample:", len(first_sample["candidates"]))


Sample number: 1000
The first sample field: dict_keys(['id', 'document', 'reference', 'candidates'])
The number of candidates for the first sample: 8


In [4]:
# Checkpoint used for NLI scoring; replace it with whichever NLI model was
# used in earlier experiments if needed.
NLI_MODEL_NAME = "roberta-large-mnli"

nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)

# Load the classifier, move it to the selected device and switch it to
# inference mode (both .to() and .eval() return the module itself).
nli_model = (
    AutoModelForSequenceClassification
    .from_pretrained(NLI_MODEL_NAME)
    .to(device)
    .eval()
)

# Class index -> label name, in the order this notebook assumes for the model.
label_map = dict(enumerate(("contradiction", "neutral", "entailment")))

def _entailment_label_index(default: int = 2) -> int:
    """
    Find which output class of `nli_model` is the entailment class.

    The index is looked up in the model config's `id2label` mapping (matched
    case-insensitively on a label starting with "entail"), so the score stays
    correct when NLI_MODEL_NAME is switched to a checkpoint whose label order
    differs from [contradiction, neutral, entailment]. If no such label is
    declared (e.g. generic "LABEL_0" names), `default` is returned.
    """
    id2label = getattr(nli_model.config, "id2label", None) or {}
    for index, label in id2label.items():
        if str(label).lower().startswith("entail"):
            return int(index)
    return default


@torch.no_grad()
def nli_entailment_score(premise: str, hypothesis: str, max_length: int = 512) -> float:
    """
    Compute the probability that `premise` entails `hypothesis` with the NLI model.

    Args:
        premise: Text treated as the premise (first sequence of the pair).
        hypothesis: Text treated as the hypothesis (second sequence of the pair).
        max_length: Maximum number of tokens for the encoded pair; longer
            inputs are truncated by the tokenizer.

    Returns:
        The softmax probability of the entailment class, in [0, 1].
    """
    encoded = nli_tokenizer(
        premise,
        hypothesis,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)

    logits = nli_model(**encoded).logits
    # A single pair is scored per call, so take row 0 of the batch.
    probs = torch.softmax(logits, dim=-1)[0]

    # Do not hard-code the class position: resolve it from the model config.
    return float(probs[_entailment_label_index()].item())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Sentence-embedding checkpoint used by sas_score for the semantic-alignment score.
SAS_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Instantiating the model may download the checkpoint files on first use.
sas_model = SentenceTransformer(SAS_MODEL_NAME)

def sas_score(doc: str, summary: str) -> float:
    """
    Semantic-alignment score between a document and a summary.

    Both texts are embedded with the sentence-embedding model and their
    cosine similarity is returned. The value lies in [-1, 1].
    """
    # Encode both texts in a single call; embeddings are L2-normalised.
    pair_embeddings = sas_model.encode(
        [doc, summary],
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    similarity = util.cos_sim(pair_embeddings[0], pair_embeddings[1])
    return float(similarity.item())


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Mixing weights for the final ranking score computed in combined_score:
# combined = W_NLI * nli + W_SAS * sas.
W_NLI = 0.6  # weight on the NLI entailment probability
W_SAS = 0.4  # weight on the sentence-embedding cosine similarity

def combined_score(doc: str, summary: str) -> dict:
    """
    Score a summary against its document.

    Returns a dict with three float entries:
      "nli":      entailment probability from nli_entailment_score(doc, summary)
      "sas":      cosine similarity from sas_score(doc, summary)
      "combined": W_NLI * nli + W_SAS * sas
    """
    result = {
        "nli": nli_entailment_score(doc, summary),
        "sas": sas_score(doc, summary),
    }
    # Weighted sum of the two component scores.
    result["combined"] = W_NLI * result["nli"] + W_SAS * result["sas"]
    return result


In [7]:
reranked_data = []

for example in tqdm(data, desc="Scoring and reranking"):
    doc = example["document"]

    # Score every candidate summary of this example against its document.
    scored_candidates = [
        {"text": cand, "scores": combined_score(doc, cand)}
        for cand in example["candidates"]
    ]

    # Highest combined score first; the sort is stable, so ties keep input order.
    scored_candidates = sorted(
        scored_candidates,
        key=lambda item: item["scores"]["combined"],
        reverse=True,
    )

    reranked_data.append({
        "id": example["id"],
        "document": doc,
        "reference": example["reference"],
        # Only the reranked candidate texts are stored.
        "candidates": [item["text"] for item in scored_candidates],
        # Optional, for later analysis: also keep the per-candidate scores, e.g.
        # "candidates_with_scores": scored_candidates
    })

print("已完成所有样本 rerank。")


Scoring and reranking: 100%|██████████| 1000/1000 [22:12<00:00,  1.33s/it]

已完成所有样本 rerank。





In [8]:
# Write the reranked examples as pretty-printed JSON, keeping non-ASCII
# characters unescaped.
output_path = Path("val_candidates_constrained_reranked.json")
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(reranked_data, out_file, ensure_ascii=False, indent=2)

print("已保存到:", output_path.resolve())


已保存到: /content/val_candidates_constrained_reranked.json
