In [30]:
import spacy
import re
import pandas as pd
from nltk.corpus import wordnet as wn
from datasets import load_dataset
import nltk

# Fetch the NLTK resources requested by this script: the WordNet corpus
# first, then the 'omw-1.4' package.
for _nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

def load_semeval_data():
    """
    Load the SemEval-2010 Task 8 dataset and return its training sentences.

    The dataset is fetched with `load_dataset`; the value returned is the
    'sentence' column of the 'train' split.
    """
    train_split = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")['train']
    return train_split['sentence']

def extract_entities(sentences):
    """
    Extract the text enclosed by the <e1> and <e2> tags of each sentence.

    Returns a list with one (e1, e2) tuple per input sentence, in input
    order. If either tag pair is missing from a sentence, that sentence's
    entry is (None, None).
    """
    e1_pattern = re.compile(r'<e1>(.*?)</e1>')
    e2_pattern = re.compile(r'<e2>(.*?)</e2>')

    def _pair_for(text):
        # Both tags must be present; otherwise the whole pair is reported
        # as missing.
        first = e1_pattern.search(text)
        second = e2_pattern.search(text)
        if first is None or second is None:
            return (None, None)
        return (first.group(1), second.group(1))

    return [_pair_for(text) for text in sentences]

def get_best_hypernym(entity):
    """
    Return the best-scoring direct WordNet hypernym of a noun entity.

    Every noun synset of `entity` is inspected; each of its direct
    hypernyms is scored by its Wu-Palmer similarity to that synset, and the
    highest-scoring hypernym wins (the first one encountered wins ties).

    Args:
        entity: Entity text. Surrounding whitespace is ignored and inner
            whitespace is converted to underscores, so a multi-word entity
            such as "ice cream" is looked up as "ice_cream".

    Returns:
        A one-element list holding the name of the first lemma of the
        chosen hypernym, or ["No hypernym found"] when the entity has no
        noun synsets or none of them yields a scored hypernym.
    """
    # WordNet stores multi-word lemmas joined by underscores; querying with
    # spaces would miss them.
    lemma = "_".join(entity.split())
    if not lemma:
        return ["No hypernym found"]

    synsets = wn.synsets(lemma, pos=wn.NOUN)
    if not synsets:
        return ["No hypernym found"]

    best_hypernym = None
    highest_similarity = 0

    # Compare every (synset, direct hypernym) pair and keep the best one.
    for syn in synsets:
        for hypernym in syn.hypernyms():
            similarity = syn.wup_similarity(hypernym)  # Wu-Palmer similarity
            # A falsy score (e.g. None) is skipped rather than compared.
            if similarity and similarity > highest_similarity:
                highest_similarity = similarity
                best_hypernym = hypernym

    if best_hypernym is None:
        return ["No hypernym found"]
    return [best_hypernym.lemmas()[0].name()]

if __name__ == "__main__":
    # Pipeline: load sentences -> extract tagged entity pairs -> look up a
    # hypernym for each entity -> export one row per sentence to CSV.
    print(" 加载 SemEval-2010 Task 8 数据集...")
    sentences = load_semeval_data()
    print(f" 数据集中共有 {len(sentences)} 条句子。")

    print("\n 提取实体 <e1> 和 <e2>...")
    entity_pairs = extract_entities(sentences)

    print("\n 提取实体的上位词（使用 WordNet）...")
    results = []
    preview_limit = 5  # number of leading sentences echoed to the console

    for sentence_id, (e1, e2) in enumerate(entity_pairs, start=1):
        # A missing (or empty) entity gets a placeholder instead of a lookup.
        if e1:
            hypernyms_e1 = get_best_hypernym(e1)
        else:
            hypernyms_e1 = ["No entity found"]
        if e2:
            hypernyms_e2 = get_best_hypernym(e2)
        else:
            hypernyms_e2 = ["No entity found"]

        row = {
            "Sentence ID": sentence_id,
            "Entity e1": e1,
            "Entity e1 Hypernyms": ", ".join(hypernyms_e1),
            "Entity e2": e2,
            "Entity e2 Hypernyms": ", ".join(hypernyms_e2),
        }
        results.append(row)

        if sentence_id <= preview_limit:
            print(f"句子 {sentence_id}:")
            print(f"   实体 e1: {e1} → 上位词: {hypernyms_e1}")
            print(f"   实体 e2: {e2} → 上位词: {hypernyms_e2}")

    output_file = "semeval_entity_hypernyms.csv"
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False, encoding='utf-8-sig')

    print(f"\n 结果已成功导出为: {output_file}")

[nltk_data] Downloading package wordnet to /Users/gavin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/gavin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


📥 加载 SemEval-2010 Task 8 数据集...
📊 数据集中共有 8000 条句子。

✂️ 提取实体 <e1> 和 <e2>...

🟢 提取实体的上位词（使用 WordNet）...
句子 1:
   实体 e1: configuration → 上位词: ['design']
   实体 e2: elements → 上位词: ['weather']
句子 2:
   实体 e1: child → 上位词: ['offspring']
   实体 e2: cradle → 上位词: ['baby_bed']
句子 3:
   实体 e1: author → 上位词: ['maker']
   实体 e2: disassembler → 上位词: ['No hypernym found']
句子 4:
   实体 e1: ridge → 上位词: ['convex_shape']
   实体 e2: surge → 上位词: ['increase']
句子 5:
   实体 e1: student → 上位词: ['enrollee']
   实体 e2: association → 上位词: ['social_activity']

💾 结果已成功导出为: semeval_entity_hypernyms.csv
