# Skip Gram

先准备句子，进行分词，形成词汇表

In [None]:
# sentences = [
#     "The New York Times is a daily newspaper based in New York City",
#     "The effort failed once local California newspapers came into prominence",
#     "Shortly after assuming control of the paper Ochs coined the paper slogan"
# ]
# sentences = [
#     "Kage is Teacher",
#     "Niuzong is Boss",
#     "Mazong is Boss",
#     "Xiaoxue is Student",
#     "Xiaobing is Student"
# ]
# Toy corpus: every sentence has the form "<name> is <category>".
sentences = [
    "Cat is animal","Dog is animal","Lion is animal","Merlin is bird","Pidgin is bird"
]
# Build the vocabulary of unique whitespace-separated tokens.
# A bare set has no stable iteration order for strings across interpreter
# runs (hash randomisation), which would give every word a different index
# on each kernel restart; sorting makes the vocabulary order reproducible.
words = sorted(set(' '.join(sentences).split()))
print(f"words :{words}")
print(f"vocabular size:{len(words)}")

建立一个 word => index 的映射字典

In [None]:
# Give every vocabulary word an integer id equal to its position in `words`.
word_to_idx = dict(zip(words, range(len(words))))
print(word_to_idx)

`window_size` 是窗口大小，取目标词前后各 `window_size` 个词作为上下文: `[max(idx-window_size,0):min(idx+window_size+1,len(splitted_sentence))]`

In [None]:
# Number of words taken on each side of the centre word as its context.
window_size = 2
# Training pairs, stored as (context_word, centre_word).
data = []
for sentence in sentences:
    splitted_sentence = sentence.split()
    for idx,word in enumerate(splitted_sentence):
        # Clamp the window to the sentence boundaries.
        window_start = max(idx-window_size,0)
        window_end = min(idx+window_size+1,len(splitted_sentence))
        for neighbor_idx in range(window_start,window_end):
            # Skip only the centre position itself. Comparing positions
            # rather than strings keeps a pair when the same word occurs
            # again elsewhere inside the window.
            if neighbor_idx != idx:
                data.append((splitted_sentence[neighbor_idx],word))
print(data)

进行独热编码，用`pytorch`中的`tensor`表示

In [None]:
import torch
def one_hot_encoding(word,word_to_idx):
    """Return the one-hot vector of `word` as a 1-D tensor.

    Args:
        word: Key to look up in `word_to_idx`.
        word_to_idx: Mapping from word to its integer position.

    Returns:
        A tensor of length ``len(word_to_idx)`` that is zero everywhere
        except for a 1 at position ``word_to_idx[word]``.

    Raises:
        KeyError: If `word` is not a key of `word_to_idx`.
    """
    hot_position = word_to_idx[word]
    encoded = torch.zeros(len(word_to_idx))
    encoded[hot_position] = 1
    return encoded


# One-hot encode every pair in `data`: the first element of each pair becomes
# a one-hot tensor and the second element is replaced by its vocabulary index.
# NOTE(review): the training loop further down builds its own tensors from
# `data` (with the roles of the two elements swapped), so this list looks like
# it is for inspection only — confirm before relying on it.
encoded_data = [
    (one_hot_encoding(first_word,word_to_idx), word_to_idx[second_word])
    for first_word,second_word in data
]
print(encoded_data)


定义神经网络准备训练

In [None]:
import torch.nn as nn

class SkipGram(nn.Module):
    """Two-layer linear network without biases: vocabulary -> embedding -> vocabulary.

    `input_to_hidden` projects a vocabulary-sized vector down to
    `embedding_size` dimensions and `hidden_to_output` projects it back to one
    score per vocabulary word. With a one-hot input row, the hidden activation
    equals the matching column of `input_to_hidden.weight`.
    """

    def __init__(self,voc_size,embedding_size):
        """Create the two projection layers.

        Args:
            voc_size: Size of the input and output vectors.
            embedding_size: Size of the hidden (embedding) vector.
        """
        super().__init__()
        # Keep the creation order: it fixes the order of random initialisation.
        self.input_to_hidden = nn.Linear(
            in_features=voc_size, out_features=embedding_size, bias=False
        )
        self.hidden_to_output = nn.Linear(
            in_features=embedding_size, out_features=voc_size, bias=False
        )

    def forward(self,X):
        """Return raw (un-normalised) scores over the vocabulary for input `X`."""
        return self.hidden_to_output(self.input_to_hidden(X))

# Seed PyTorch's random number generator before the model is created so the
# random weight initialisation is the same on every run.
torch.manual_seed(42)

# Two embedding dimensions so each word vector can be drawn on a 2-D scatter plot.
embedding_size = 2
skipgram_model = SkipGram(voc_size=len(words),embedding_size=embedding_size)
print("Skip-Gram:",skipgram_model)

训练

In [None]:
import torch.optim as optim

learning_rate = 1e-3
epochs = 1000
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(skipgram_model.parameters(),lr=learning_rate)

# Average loss per pair, recorded once every 100 epochs.
loss_valus = []

# The training examples are identical in every epoch, so build the input and
# label tensors once instead of re-creating them on each iteration.
# Each pair in `data` is (context, target): the one-hot target word is the
# model input and the context word's index is the class label.
training_examples = [
    (
        one_hot_encoding(target,word_to_idx).float().unsqueeze(0),
        torch.tensor([word_to_idx[context]],dtype=torch.long),
    )
    for context,target in data
]

for epoch in range(epochs):
    loss_sum = 0
    for X,y_true in training_examples:
        y_pred = skipgram_model(X)
        loss = criterion(y_pred,y_true)
        # Accumulate the loss for this epoch.
        loss_sum += loss.item()
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Back-propagate.
        loss.backward()
        # Update the parameters.
        optimizer.step()
    if (epoch+1) % 100 == 0:
        avg_loss = loss_sum/len(data)
        print(f"Epoch:{epoch+1},Loss:{avg_loss}")
        loss_valus.append(avg_loss)

import matplotlib.pyplot as plt
# Render the Chinese labels with SimHei when it is installed, otherwise fall
# back to other CJK-capable fonts (and finally DejaVu Sans) instead of forcing
# a single font family that may be missing on this machine.
plt.rcParams["font.family"] = ["sans-serif"]
plt.rcParams["font.sans-serif"] = [
    "SimHei", "Microsoft YaHei", "PingFang SC", "Noto Sans CJK SC",
    "WenQuanYi Micro Hei", "Arial Unicode MS", "DejaVu Sans",
]
# Use the ASCII hyphen for minus signs; many CJK fonts lack the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False
# One loss value is stored per 100 epochs, so place the points at the real
# epoch numbers (100, 200, ...) to match the "epoch" axis label.
logged_epochs = range(100, epochs + 1, 100)
plt.plot(logged_epochs, loss_valus)
plt.title("训练损失曲线")
plt.xlabel("轮次")
plt.ylabel("损失")
plt.show()

获取词嵌入信息

In [None]:
# The weight matrix has one column per vocabulary word; column `idx` is the
# learned embedding of the word with that index.
embedding_matrix = skipgram_model.input_to_hidden.weight.detach().numpy()
for word,idx in word_to_idx.items():
    word_vector = embedding_matrix[:,idx]
    print(f"{word}:{word_vector}")

# print("\nSkip-Gram词嵌入:")
# for word, idx in word_to_idx.items(): # 输出每个单词的嵌入向量
#     print(f"{word}: \
#     {skipgram_model.input_to_hidden.weight[:, idx].detach().numpy()}")  

In [None]:
# nn.Linear stores its weight as (out_features, in_features), so this prints
# (embedding_size, vocabulary size).
embedding_weight = skipgram_model.input_to_hidden.weight
print(embedding_weight.shape)

生成词嵌入图像

In [None]:
# Scatter every word at the position given by the first two components of its
# embedding and label each point with the word.
fig,ax = plt.subplots()
embedding_matrix = skipgram_model.input_to_hidden.weight.detach().numpy()
for word,idx in word_to_idx.items():
    x_coord = embedding_matrix[0,idx]
    y_coord = embedding_matrix[1,idx]
    ax.scatter(x_coord,y_coord)
    ax.annotate(word,(x_coord,y_coord),fontsize=9)
ax.set_title('二维词嵌入')
ax.set_xlabel('向量维度1')
ax.set_ylabel('向量维度2')
plt.show()