# NPLM
Neural Probabilistic Language Model

神经概率语言模型

<img src="attachments/nplm-structure.jpg" width="300">


## 构建语料库



In [1]:
sentences = [
    "我 喜欢 玩具",
    "我 爱 爸爸",
    "我 讨厌 挨打"
] # 这里的中文用了空格来简化分词lol

word_list = list(set(' '.join(sentences).split()))
word_to_idx = {word:idx for idx,word in enumerate(word_list)}
voc_size = len(word_list)
idx_to_word =  {idx:word for idx,word in enumerate(word_list)}

print(word_to_idx)



{'喜欢': 0, '玩具': 1, '我': 2, '讨厌': 3, '爸爸': 4, '挨打': 5, '爱': 6}


## 准备数据集

In [2]:
import torch
import random

batch_size = 2

def make_batch():
    input_batch = []
    target_batch = []
    selected_sentences = random.sample(sentences,batch_size)
    
    for sentence in selected_sentences:
        words = sentence.split()
        input = [word_to_idx[n] for n in words[:-1]] # 最后一个词以外的词作为输入
        target = word_to_idx[words[-1]] # 最后一个词作为目标
        input_batch.append(input)
        target_batch.append(target)
    
    input_batch = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)
    
    return input_batch,target_batch

input_batch,target_batch = make_batch()
print(f"输入批处理数据:{input_batch}")


input_words = []
for input_idx in input_batch:
    input_words.append([idx_to_word[idx.item()] for idx in input_idx])
    # for idx in input_idx:
    #     input_words.append([idx_to_word[idx.item()]])
        
print(f"输入批处理数据对应的原始词:{input_words}")
print(f"目标批处理数据:{target_batch}")

target_words = [idx_to_word[idx.item()] for idx in target_batch]
print(f"目标批处理数据对应的原始词:{target_words}")
        

输入批处理数据:tensor([[2, 6],
        [2, 0]])
输入批处理数据对应的原始词:[['我', '爱'], ['我', '喜欢']]
目标批处理数据:tensor([4, 1])
目标批处理数据对应的原始词:['爸爸', '玩具']


## 定义模型

In [3]:
import torch.nn as nn

embedding_size = 2
n_step = 2
n_hidden = 2

class NPLM(nn.Module):
    def __init__(self):
        super(NPLM,self).__init__()
        self.C = nn.Embedding(voc_size,embedding_size)
        self.linear1 = nn.Linear(n_step*embedding_size,n_hidden)
        self.linear2 = nn.Linear(n_hidden,voc_size)
        
    def forward(self,X):
        X = self.C(X)
        X = X.view(-1,n_step*embedding_size)
        hidden = torch.tanh(self.linear1(X))
        output = self.linear2(hidden)
        return output
    
nplm_model = NPLM()
print(nplm_model)

NPLM(
  (C): Embedding(7, 2)
  (linear1): Linear(in_features=4, out_features=2, bias=True)
  (linear2): Linear(in_features=2, out_features=7, bias=True)
)


## 训练

In [4]:
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(nplm_model.parameters(),lr=0.1)

epochs = 5000

for epoch in range(epochs):
    optimizer.zero_grad()
    input_batch,target_batch = make_batch()
    output = nplm_model(input_batch)
    loss = criterion(output,target_batch)
    if (epoch+1) % 1000 == 0:
        print('Epoch:','%4d' % (epoch+1),'cost=','{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

Epoch: 1000 cost= 0.246516
Epoch: 2000 cost= 0.001458
Epoch: 3000 cost= 0.000286
Epoch: 4000 cost= 0.000135
Epoch: 5000 cost= 0.000074


## 预测

In [5]:
input_strs = [['我','讨厌'],['我','喜欢']]
input_indices = [[word_to_idx[word] for word in seq]for seq in input_strs]
input_batch = torch.LongTensor(input_indices)
predict = nplm_model(input_batch).data.max(1)[1]
predict_strs = [idx_to_word[n.item()]for n in predict.squeeze()]
for input_seq,pred in zip(input_strs,predict_strs):
    print(input_seq,'->',pred)

['我', '讨厌'] -> 挨打
['我', '喜欢'] -> 玩具
