# NPLM
Neural Probabilistic Language Model

神经概率语言模型

<img src="attachments/nplm-structure.jpg" width="300">


## 构建语料库



In [12]:
sentences = [
    "我 喜欢 玩具",
    "我 爱 爸爸",
    "我 讨厌 挨打"
] # 这里的中文用了空格来简化分词lol

word_list = list(set(' '.join(sentences).split()))
word_to_idx = {word:idx for idx,word in enumerate(word_list)}
voc_size = len(word_list)
idx_to_word =  {idx:word for idx,word in enumerate(word_list)}

print(word_to_idx)



{'讨厌': 0, '爱': 1, '喜欢': 2, '我': 3, '爸爸': 4, '挨打': 5, '玩具': 6}


## 准备数据集

In [21]:
import torch
import random

batch_size = 2

def make_batch():
    input_batch = []
    target_batch = []
    selected_sentences = random.sample(sentences,batch_size)
    
    for sentence in selected_sentences:
        words = sentence.split()
        input = [word_to_idx[n] for n in words[:-1]] # 最后一个词以外的词作为输入
        target = word_to_idx[words[-1]] # 最后一个词作为目标
        input_batch.append(input)
        target_batch.append(target)
    
    input_batch = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)
    
    return input_batch,target_batch

input_batch,target_batch = make_batch()
print(f"输入批处理数据:{input_batch}")


input_words = []
for input_idx in input_batch:
    input_words.append([idx_to_word[idx.item()] for idx in input_idx])
    # for idx in input_idx:
    #     input_words.append([idx_to_word[idx.item()]])
        
print(f"输入批处理数据对应的原始词:{input_words}")
print(f"目标批处理数据:{target_batch}")

target_words = [idx_to_word[idx.item()] for idx in target_batch]
print(f"目标批处理数据对应的原始词:{target_words}")
        

输入批处理数据:tensor([[3, 1],
        [3, 2]])
输入批处理数据对应的原始词:[['我', '爱'], ['我', '喜欢']]
目标批处理数据:tensor([4, 6])
目标批处理数据对应的原始词:['爸爸', '玩具']
