# content
- 任务描述
- bilstm原理
- pytorch BiLstm使用
- 构建数据集
- BiLSTM 网络拓扑
- 模型训练
- 预测

# 任务描述
给定句子中从开头到当前单词的前缀，预测下一个单词，例如（第一个样本）：

sentence =  'Lorem ipsum dolor sit amet consectetur adipisicing elit '

input: Lorem

target: ipsum
# bilstm原理
![bilstm原理](pic/bilstm原理.png)
# pytorch BiLstm使用
       ——————————————————————
       rnn = nn.LSTM(input_size=10, hidden_size=20, num_layers=2,bidirectional=True)#(input_size,hidden_size,num_layers)
       input = torch.randn(5, 3, 10)#(seq_len, batch, input_size)
       h0 = torch.randn(4, 3, 20) #(num_layers*2, batch, hidden_size)
       c0 = torch.randn(4, 3, 20) #(num_layers*2, batch, hidden_size)
       output, (hn, cn) = rnn(input, (h0, c0))
       ——————————————————————
       output.shape #(seq_len, batch, hidden_size*2)
       torch.Size([5, 3, 40])
       ——————————————————————
       hn.shape #(num_layers*2, batch, hidden_size)
       torch.Size([4, 3, 20])
       ——————————————————————
       cn.shape #(num_layers*2, batch, hidden_size)
       torch.Size([4, 3, 20])

In [38]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

dtype = torch.FloatTensor

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

sentence = (
    'Lorem ipsum dolor sit amet consectetur adipisicing elit '
    'sed do eiusmod tempor incididunt ut labore et dolore magna '
    'aliqua Ut enim ad minim veniam quis nostrud exercitation'
)

# Build the dataset.
# The vocabulary is sorted so that the word <-> index mapping is identical on
# every run; iterating a bare set of strings depends on per-process hash
# randomisation, which would silently change every index between runs.
vocab = sorted(set(sentence.split()))
word_dict = {w: i for i, w in enumerate(vocab)}  # word  -> index
number_dict = dict(enumerate(vocab))             # index -> word

n_class = len(word_dict)          # vocabulary size (width of a one-hot row)
max_len = len(sentence.split())   # every input is padded to this many steps

def make_batch(sentence):
    """Build next-word training pairs from a whitespace-separated sentence.

    One sample is produced for every word except the last one:

    * input  -- the prefix ``words[:i + 1]`` converted to vocabulary indices,
      right-padded with index 0 up to ``max_len`` and one-hot encoded into a
      ``(max_len, n_class)`` matrix;
    * target -- the vocabulary index of the following word ``words[i + 1]``.

    Relies on the module-level ``word_dict``, ``n_class`` and ``max_len``.

    NOTE(review): the padding index 0 is also the index of a real vocabulary
    word, so padded steps look exactly like that word to the model. A
    dedicated padding class would remove the ambiguity but changes ``n_class``.

    Args:
        sentence: text whose words are all keys of ``word_dict``.

    Returns:
        Tuple ``(inputs, targets)``: a float32 tensor of shape
        ``(len(words) - 1, max_len, n_class)`` and an int64 tensor of shape
        ``(len(words) - 1,)``.

    Raises:
        KeyError: if a word of ``sentence`` is not in ``word_dict``.
    """
    words = sentence.split()

    # Row k of the identity matrix is the one-hot vector of index k; build it
    # once instead of once per sample.
    one_hot_rows = np.eye(n_class, dtype=np.float32)

    input_batch = []
    target_batch = []
    for i in range(len(words) - 1):
        prefix = [word_dict[w] for w in words[:i + 1]]
        padded = prefix + [0] * (max_len - len(prefix))
        input_batch.append(one_hot_rows[padded])
        target_batch.append(word_dict[words[i + 1]])

    if not input_batch:
        # Fewer than two words: no (prefix, next word) pair exists.
        return (torch.zeros(0, max_len, n_class),
                torch.zeros(0, dtype=torch.long))

    # Stack into a single ndarray first: building a tensor from a Python list
    # of ndarrays is very slow.
    inputs = torch.from_numpy(np.stack(input_batch))
    targets = torch.tensor(target_batch, dtype=torch.long)
    return inputs, targets

In [39]:
# BiLSTM 网络拓扑
class BiLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear layer on the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: hidden units per direction.
        output_size: number of output logits.
        num_layers: number of stacked LSTM layers.
        batch_size: kept for backward compatibility and stored on the
            instance; ``forward`` takes the batch size from its input.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, batch_size):
        super(BiLSTM, self).__init__()

        # Hyper-parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.batch_size = batch_size

        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers,
                            bidirectional=True)

        # Forward and backward hidden states are concatenated, hence "* 2".
        self.fc = nn.Linear(self.hidden_size * 2, self.output_size)

    def forward(self, inputs):
        """Return logits of shape ``(batch, output_size)``.

        Args:
            inputs: tensor of shape ``(batch, n_step, input_size)``.
        """
        # nn.LSTM (batch_first=False) expects (n_step, batch, input_size).
        inputs = inputs.transpose(0, 1)

        # Size the initial state from the actual batch, and create it with the
        # input's dtype and device, so any batch size and device works.
        h0 = inputs.new_zeros(self.num_layers * 2,
                              inputs.size(1), self.hidden_size)
        c0 = torch.zeros_like(h0)

        # out: (n_step, batch, hidden_size * 2)
        out, (_, _) = self.lstm(inputs, (h0, c0))

        # Decode only the output of the last time step: (batch, hidden_size * 2).
        out = self.fc(out[-1])
        return out

In [44]:
# 模型训练

# Fetch the data
input_batch, target_batch = make_batch(sentence)
# input_batch:  (26, 27, 27) = (number of samples, max_len, n_class)
# target_batch: (26,) index of the word that follows each prefix

# Hyper-parameters
input_size = n_class
hidden_size = 5
learning_rate = 0.003
num_layers = 1  # number of stacked LSTM layers
batch_size = input_batch.size(0)  # full-batch training: all samples at once (26 here)
output_size = n_class  # one logit per vocabulary word (27 here)

# Model initialisation
model = BiLSTM(input_size, hidden_size, output_size, num_layers, batch_size).to(device)

# Loss function and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# The training data never changes, so move it to the device once instead of
# once per epoch.
train_inputs = input_batch.to(device)
train_targets = target_batch.to(device)

for epoch in range(10000):

    # Reset gradients accumulated by the previous step
    optimizer.zero_grad()
    output = model(train_inputs)
    loss = criterion(output, train_targets)

    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d'%(epoch+1), 'cost=', '{:.6f}'.format(loss.item()))
    loss.backward()
    optimizer.step()

# Prediction
# No gradients are needed for inference; take the arg-max class per sample.
with torch.no_grad():
    predict = model(train_inputs).max(1, keepdim=True)[1]
print(sentence)
# squeeze(1) drops only the keepdim axis, so a single-sample batch stays iterable.
print([number_dict[n.item()] for n in predict.squeeze(1)])

Epoch: 1000 cost= 1.000267
Epoch: 2000 cost= 0.558328
Epoch: 3000 cost= 0.466403
Epoch: 4000 cost= 0.417631
Epoch: 5000 cost= 0.394575
Epoch: 6000 cost= 0.380743
Epoch: 7000 cost= 0.374061
Epoch: 8000 cost= 0.426788
Epoch: 9000 cost= 0.421817
Epoch: 10000 cost= 0.369939
Lorem ipsum dolor sit amet consectetur adipisicing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam quis nostrud exercitation
['dolor', 'ipsum', 'dolor', 'dolor', 'dolor', 'adipisicing', 'elit', 'sed', 'do', 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua', 'Ut', 'enim', 'ad', 'minim', 'veniam', 'quis', 'quis', 'exercitation']
