In [8]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'relation'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence', 'relation'],
        num_rows: 2717
    })
})


In [21]:
import spacy
import re
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extract_context_words(sentences):
    """
    提取 <e1> 和 <e2> 之间的上下文词，支持多词实体
    """
    context_words_list = []
    nlp = spacy.load("en_core_web_sm")  # 加载 spaCy 仅一次

    for sentence in sentences:
        # 提取 <e1> 和 <e2> 实体
        e1_match = re.search(r'<e1>(.*?)</e1>', sentence)
        e2_match = re.search(r'<e2>(.*?)</e2>', sentence)

        if e1_match and e2_match:
            e1 = e1_match.group(1)
            e2 = e2_match.group(1)

            # 清除实体标签
            clean_sentence = re.sub(r'</?e1>', '', sentence)
            clean_sentence = re.sub(r'</?e2>', '', clean_sentence)

            # 使用 spaCy 分词
            doc = nlp(clean_sentence)
            tokens = [token.text for token in doc]

            # 将多词实体拆分成单词列表
            e1_tokens = e1.split()
            e2_tokens = e2.split()

            # 🔎 找到 e1 和 e2 的索引（支持多词实体）
            def find_sublist(sublist, lst):
                for i in range(len(lst) - len(sublist) + 1):
                    if lst[i:i + len(sublist)] == sublist:
                        return i
                return -1  # 未找到

            e1_index = find_sublist(e1_tokens, tokens)
            e2_index = find_sublist(e2_tokens, tokens)

            if e1_index == -1 or e2_index == -1:
                # 如果找不到实体索引，跳过该句子
                context_words_list.append([])
                continue

            # 确保 e1 在 e2 之前
            if e1_index > e2_index:
                e1_index, e2_index = e2_index, e1_index

            # 提取上下文词
            context_words = tokens[e1_index + len(e1_tokens): e2_index]
            context_words_list.append(context_words)
        else:
            context_words_list.append([])

    return context_words_list

def tfidf_weighted_w2v(context_words_list, vector_size=100, window=5, min_count=1):
    """
    📌 对上下文词进行 TF-IDF 加权的 Word2Vec 向量化
    ----------------------------------------------
    :param context_words_list: List[List[str]], 上下文词列表
    :param vector_size: int, Word2Vec 向量维度
    :param window: int, Word2Vec 窗口大小
    :param min_count: int, Word2Vec 最小词频
    :return: List[np.array], 每个上下文的向量表示
    """

    # 训练 Word2Vec 模型
    w2v_model = Word2Vec(sentences=context_words_list, vector_size=vector_size, window=window, min_count=min_count, workers=4)

    # 将上下文词拼接为字符串以适配 TF-IDF
    joined_contexts = [" ".join(context) for context in context_words_list]

    # 计算 TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(joined_contexts)
    tfidf_vocab = tfidf_vectorizer.vocabulary_

    # 生成上下文词向量
    context_vectors = []
    for context in context_words_list:
        tfidf_weights = tfidf_vectorizer.transform([" ".join(context)]).toarray()[0]
        weighted_vectors = []

        for word in context:
            if word in w2v_model.wv and word in tfidf_vocab:
                tfidf_weight = tfidf_weights[tfidf_vocab[word]]
                weighted_vector = tfidf_weight * w2v_model.wv[word]
                weighted_vectors.append(weighted_vector)

        # 取平均向量作为该上下文的最终表示
        if weighted_vectors:
            context_vectors.append(np.mean(weighted_vectors, axis=0))
        else:
            context_vectors.append(np.zeros(vector_size))

    return context_vectors

In [22]:
# 示例数据 (符合 SemEval-2010 Task 8 格式)
sentences = dataset['train']['sentence']
# 1️⃣ 提取上下文词
context_words_list = extract_context_words(sentences)
for idx, context in enumerate(context_words_list):
    print(f"句子 {idx+1} 的上下文词: {context}")

# 2️⃣ 进行 TF-IDF 加权的 Word2Vec 向量化
context_vectors = tfidf_weighted_w2v(context_words_list)

# 3️⃣ 输出上下文词向量
for idx, vec in enumerate(context_vectors):
    print(f"句子 {idx+1} 的上下文向量:\n", vec)

句子 1 的上下文词: ['of', 'antenna']
句子 2 的上下文词: ['was', 'carefully', 'wrapped', 'and', 'bound', 'into', 'the']
句子 3 的上下文词: ['of', 'a', 'keygen', 'uses', 'a']
句子 4 的上下文词: ['uprises', 'from', 'the']
句子 5 的上下文词: []
句子 6 的上下文词: ['that', 'is', 'Peru', "'s", 'largest']
句子 7 的上下文词: ['in', 'the', 'distal', 'part', 'of', 'the', 'stomach', 'caused', 'by', 'Helicobacter', 'pylori']
句子 8 的上下文词: ['have', 'been', 'moving', 'back', 'into']
句子 9 的上下文词: ['was', 'contained', 'in', 'a']
句子 10 的上下文词: ['was', 'pipetted', 'into', 'a', '25', 'mL', 'glass']
句子 11 的上下文词: ['collected', 'in', 'this']
句子 12 的上下文词: ['has', 'sunk', 'into']
句子 13 的上下文词: ['explaining', 'the']
句子 14 的上下文词: ['has', 'been', 'caused', 'by', 'water', 'hammer']
句子 15 的上下文词: []
句子 16 的上下文词: ['to', 'remind', 'them', 'about', 'the']
句子 17 的上下文词: ['finds', 'harmony', ',', 'sophistication', 'in', 'Appalachian']
句子 18 的上下文词: ["'s", 'products', 'have', 'included', 'flower', 'pots', ',', 'Finnish', 'rooster', '-', 'whistles', ',', 'pans', ',']
句子 19 的上下

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

