In [1]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('reuters')
from nltk.corpus import reuters
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [26]:
# 1. Data preprocessing
def load_data(raw_sentences=None):
    """Return the corpus as lists of lowercase, purely alphabetic tokens.

    Args:
        raw_sentences: Optional iterable of tokenized sentences (each an
            iterable of strings). When omitted, ``reuters.sents()`` is used,
            which is what the zero-argument call has always meant.

    Returns:
        A list of sentences, each a list of lowercased tokens. Tokens for
        which ``str.isalpha()`` is false (numbers, punctuation, mixed tokens
        such as "U.S.") are removed, and sentences left with no tokens are
        dropped.
    """
    if raw_sentences is None:
        raw_sentences = reuters.sents()
    sentences = []
    for sentence in raw_sentences:
        # Lowercase each word and keep only tokens made entirely of letters.
        tokens = [word.lower() for word in sentence if word.isalpha()]
        if tokens:
            sentences.append(tokens)
    return sentences

def build_vocab(sentences, min_count=5):
    """Build the word <-> index lookup tables for the vocabulary.

    Args:
        sentences: Iterable of tokenized sentences (iterables of strings).
        min_count: Minimum number of occurrences a word needs in order to be
            kept in the vocabulary.

    Returns:
        A ``(word_to_idx, idx_to_word)`` tuple of dicts. Indices are assigned
        in order of decreasing frequency (ties keep first-seen order), so the
        lowest indices belong to the most frequent words.
    """
    word_counts = Counter()
    for sentence in sentences:
        word_counts.update(sentence)
    # Drop low-frequency words.
    vocab = [word for word, count in word_counts.most_common() if count >= min_count]
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    return word_to_idx, idx_to_word

In [33]:
# Load the preprocessed corpus, then build the word <-> index lookup tables from it.
sentences = load_data()
word_to_idx, idx_to_word = build_vocab(sentences)

In [30]:
# Example sentence: "I love natural language processing class"
# 2. Build the training data (Skip-gram model)
def create_training_data(sentences, word_to_idx, window_size=2):
    """Generate (center, context) index pairs for skip-gram training.

    Args:
        sentences: Iterable of tokenized sentences (iterables of strings).
        word_to_idx: Mapping from word to vocabulary index; words missing
            from it are skipped.
        window_size: Number of positions to look at on each side of the
            center word.

    Returns:
        A list of ``(center_idx, context_idx)`` tuples of vocabulary indices.
    """
    pairs = []
    for sentence in sentences:
        # Out-of-vocabulary words are removed before windowing, so the window
        # is measured over the remaining in-vocabulary words.
        sentence = [word for word in sentence if word in word_to_idx]
        for idx, center_word in enumerate(sentence):
            # Context window: clipped to the sentence bounds, center excluded.
            start = max(0, idx - window_size)
            stop = min(len(sentence), idx + window_size + 1)
            context_range = [pos for pos in range(start, stop) if pos != idx]
            for context_idx in context_range:
                context_word = sentence[context_idx]
                pairs.append((word_to_idx[center_word], word_to_idx[context_word]))
    return pairs

In [34]:
# Build the (center, context) index pairs using the default window_size of 2.
pairs = create_training_data(sentences, word_to_idx)

In [31]:
# Negative sampling
def get_negatives(pairs, vocab_size, neg_count=5):
    """Draw negative context words for every training pair.

    Words are sampled from the context-word frequency distribution raised to
    the 0.75 power and normalised to sum to 1. Sampling is independent of the
    pair it is drawn for, so a negative may occasionally coincide with the
    pair's real context word.

    Args:
        pairs: Sequence of ``(center_idx, context_idx)`` tuples.
        vocab_size: Number of words in the vocabulary.
        neg_count: Number of negative samples to draw per pair.

    Returns:
        An integer array of shape ``(len(pairs), neg_count)``; row ``i`` holds
        the negatives for ``pairs[i]``.
    """
    if len(pairs) == 0:
        # No counts means no valid distribution to sample from.
        return np.empty((0, neg_count), dtype=np.int64)
    word_freq = np.zeros(vocab_size)
    for _, context_word in pairs:
        word_freq[context_word] += 1
    word_freq = word_freq ** 0.75
    # Normalise into a probability distribution, as required by the `p` argument.
    word_freq = word_freq / word_freq.sum()
    # One row of neg_count samples per training pair, so the matrix can be
    # sliced in lock-step with `pairs`.
    negatives = np.random.choice(vocab_size, size=(len(pairs), neg_count), p=word_freq)
    return negatives

In [35]:
# Draw negative samples, passing the number of vocabulary entries as vocab_size.
negatives = get_negatives(pairs, vocab_size=len(word_to_idx))

In [32]:
# 3. Define the Word2Vec model
class Word2Vec(nn.Module):
    """Skip-gram Word2Vec trained with negative sampling.

    Two embedding tables are kept: ``embedding`` is looked up for center
    words and ``output`` is looked up for (positive and negative) context
    words.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words, pos_context_words, neg_context_words):
        """Compute the mean negative-sampling loss for a batch.

        Args:
            center_words: Long tensor of shape ``[batch_size]``.
            pos_context_words: Long tensor of shape ``[batch_size]``.
            neg_context_words: Long tensor of shape ``[batch_size, neg_count]``.

        Returns:
            A scalar tensor: the batch mean of
            ``-(log sigmoid(c.p) + sum_k log sigmoid(-c.n_k))``.
        """
        # Look up the embeddings.
        center_embeds = self.embedding(center_words)   # [batch_size, embed_dim]
        pos_embeds = self.output(pos_context_words)    # [batch_size, embed_dim]
        neg_embeds = self.output(neg_context_words)    # [batch_size, neg_count, embed_dim]

        # Positive-sample loss: dot product of each center/context pair.
        pos_score = torch.sum(center_embeds * pos_embeds, dim=1)  # [batch_size]
        # logsigmoid stays finite where log(sigmoid(x)) would underflow to -inf.
        pos_loss = nn.functional.logsigmoid(pos_score)

        # Negative-sample loss: dot product of the center with each negative.
        neg_score = torch.bmm(neg_embeds, center_embeds.unsqueeze(2)).squeeze(2)  # [batch_size, neg_count]
        neg_loss = nn.functional.logsigmoid(-neg_score).sum(dim=1)

        # Total loss.
        loss = -(pos_loss + neg_loss)
        return loss.mean()

In [36]:
# Instantiate the model with 100-dimensional vectors, one row per vocabulary entry.
model = Word2Vec(vocab_size=len(word_to_idx), embedding_dim=100)

In [7]:
# 4. Train the model
def train_model(model, pairs, negatives, epochs, learning_rate, batch_size):
    """Train ``model`` with Adam on the skip-gram pairs and their negatives.

    Batches are taken in order (no shuffling). Every pair is used each epoch,
    including a final batch smaller than ``batch_size``.

    Args:
        model: Module called as ``model(center, pos_context, neg_context)``
            that returns a scalar loss tensor.
        pairs: Sequence of ``(center_idx, context_idx)`` tuples.
        negatives: Array-like with one row of negative indices per pair, in
            the same order as ``pairs``.
        epochs: Number of passes over ``pairs``.
        learning_rate: Learning rate passed to Adam.
        batch_size: Maximum number of pairs per optimisation step.

    Returns:
        A list with the average per-pair loss of each epoch.

    Raises:
        ValueError: If ``pairs`` is empty, ``batch_size`` is not positive, or
            ``negatives`` does not have one row per pair.
    """
    num_pairs = len(pairs)
    if num_pairs == 0:
        raise ValueError("pairs is empty; nothing to train on")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if len(negatives) != num_pairs:
        raise ValueError("negatives must contain exactly one row per pair")

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_history = []
    for epoch in range(epochs):
        total_loss = 0.0
        for start in range(0, num_pairs, batch_size):
            batch_pairs = pairs[start:start + batch_size]
            batch_negatives = negatives[start:start + batch_size]
            center_words = torch.tensor([pair[0] for pair in batch_pairs], dtype=torch.long)
            pos_context_words = torch.tensor([pair[1] for pair in batch_pairs], dtype=torch.long)
            neg_context_words = torch.as_tensor(batch_negatives, dtype=torch.long)

            # Forward pass.
            optimizer.zero_grad()
            loss = model(center_words, pos_context_words, neg_context_words)
            # Backpropagation.
            loss.backward()
            # Parameter update.
            optimizer.step()
            # Weight by batch length so a short final batch does not skew the mean.
            total_loss += loss.item() * len(batch_pairs)
        avg_loss = total_loss / num_pairs
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")
        loss_history.append(avg_loss)
    return loss_history

In [None]:
# Train for 5 epochs with learning rate 0.001 and batches of 1024 pairs.
train_model(model, pairs, negatives, epochs=5, learning_rate=0.001, batch_size=1024)

In [38]:
# 5. Evaluation and visualisation
def get_embedding_weights(model):
    """Return the model's center-word embedding matrix as a NumPy array.

    Args:
        model: Object exposing an ``embedding`` layer with a ``weight`` tensor.

    Returns:
        The contents of ``model.embedding.weight`` moved to the CPU and
        converted with ``.numpy()``.
    """
    # detach() leaves the autograd graph without bypassing its safety checks,
    # unlike the legacy `.data` attribute.
    return model.embedding.weight.detach().cpu().numpy()

def visualize_embeddings(embeddings, idx_to_word, num_points=500):
    """Project the first embeddings to 2-D with t-SNE and plot them with labels.

    Args:
        embeddings: 2-D array with one embedding per row; row ``i`` is
            labelled with ``idx_to_word[i]``.
        idx_to_word: Mapping from row index to the word to annotate.
        num_points: Maximum number of leading rows to plot. Fewer are plotted
            when ``embeddings`` has fewer rows.

    Raises:
        ValueError: If fewer than two embeddings are available.
    """
    # Never index past the rows that actually exist.
    num_points = min(num_points, len(embeddings))
    if num_points < 2:
        raise ValueError("need at least two embeddings to run t-SNE")
    # scikit-learn requires perplexity < n_samples; 30 is its default value.
    tsne = TSNE(n_components=2, perplexity=min(30.0, num_points - 1))
    reduced_embeddings = tsne.fit_transform(embeddings[:num_points])
    plt.figure(figsize=(14, 14))
    # One vectorised scatter call instead of one call per point.
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1])
    for i in range(num_points):
        plt.annotate(idx_to_word[i], xy=(reduced_embeddings[i, 0], reduced_embeddings[i, 1]))
    plt.show()

In [None]:
# Pull the embedding matrix out of the model and plot it with word labels.
embeddings = get_embedding_weights(model)
visualize_embeddings(embeddings, idx_to_word)

完成了以上代码后，请尝试在报告中回答以下几个问题：

1. 为什么我们需要做负采样？为什么负采样时依据概率分布进行？

2. 除了以上的负采样方法外，请自己再实现1-2种负采样方法，并给出具体的效果。

3. 替换Skip-gram模型为CBOW模型，修改Word2Vec()模型，给出具体的代码和实现效果。