# Encoder
## Feed Forward前馈神经网络层

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
class PositionwiseFeedForward(nn.Module):
    """Feed-forward sub-layer: Linear -> ReLU -> Dropout -> Linear.

    :d_model: input and output feature dimension
    :hidden: width of the inner (hidden) linear layer
    :dropout: dropout probability applied to the hidden activations;
        a basic regularizer that helps prevent overfitting
    """

    def __init__(self, d_model, hidden, dropout=0.1):
        super().__init__()
        # fc1 expands d_model -> hidden, fc2 projects hidden -> d_model.
        self.fc1 = nn.Linear(d_model, hidden)
        self.fc2 = nn.Linear(hidden, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x is expected to be (batch_size, seq_len, d_model); the linear
        # layers act on the last dimension only.
        activated = F.relu(self.fc1(x))
        # Dropout on the hidden activations guards against overfitting.
        regularized = self.dropout(activated)
        return self.fc2(regularized)

## EncoderLayer

In [4]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward.

    Each of the two sub-layers is followed by dropout, a residual
    connection and layer normalization (post-norm ordering).

    ``MultiHeadAttention`` and ``LayerNorm`` are not defined in this section
    and must already be in scope when the class is instantiated.

    :d_model: feature dimension passed to every sub-module
    :ffn_hidden: hidden width of the position-wise feed-forward network
    :n_head: number of attention heads passed to MultiHeadAttention
    :dropout: dropout probability used after both sub-layers and inside the
        feed-forward network
    """

    def __init__(self, d_model, ffn_hidden, n_head, dropout=0.1):
        # nn.Module must be initialized before any sub-module is assigned;
        # without this call the first assignment below raises AttributeError.
        super().__init__()

        self.attention = MultiHeadAttention(d_model, n_head)
        self.norm1 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)

        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, dropout)
        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the layer to ``x``.

        :x: input tensor; the same tensor is used as query, key and value
        :mask: optional mask forwarded to the attention module so that some
            positions can be ignored
        :return: output of the second normalization
        """
        # Multi-head self-attention sub-layer.
        residual = x  # keep the input for the residual connection
        x = self.attention(x, x, x, mask)
        x = self.dropout1(x)
        x = self.norm1(x + residual)  # residual connection + normalization

        # Position-wise feed-forward sub-layer.
        residual = x  # keep the attention sub-layer output
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + residual)  # residual connection + normalization

        return x

In [None]:
class Encoder(nn.Module):
    """Embedding followed by a stack of ``n_layers`` encoder layers.

    ``TransformerEmbedding`` and ``EncoderLayer`` are looked up at
    instantiation time and must be in scope by then.

    :enc_voc_size: size of the input vocabulary
    :max_len: maximum input sequence length
    :d_model: feature dimension
    :ffn_hidden: hidden width of the feed-forward network in each layer
    :n_head: number of attention heads in each layer
    :n_layers: number of encoder layers
    :dropout: dropout probability
    :device: device forwarded unchanged to TransformerEmbedding. It defaults
        to None only because a parameter without a default may not follow
        ``dropout=0.1``; whether TransformerEmbedding accepts None must be
        confirmed against its definition, so callers should normally pass it
    """

    def __init__(self, enc_voc_size, max_len, d_model,
                 ffn_hidden, n_head, n_layers, dropout=0.1, device=None):
        # Required before assigning sub-modules on an nn.Module.
        super().__init__()

        # Maps token indices into the d_model-dimensional feature space.
        self.embedding = TransformerEmbedding(
            enc_voc_size, d_model, max_len, dropout, device
        )
        # ModuleList registers every layer so its parameters are tracked.
        self.layers = nn.ModuleList(
            [
                EncoderLayer(d_model, ffn_hidden, n_head, dropout)
                for _ in range(n_layers)
            ]
        )

    def forward(self, x, s_mask=None):
        """Encode a batch of token indices.

        :x: token indices, expected shape (batch_size, seq_len)
        :s_mask: optional source mask handed to every encoder layer
        :return: output of the last encoder layer, expected shape
            (batch_size, seq_len, d_model)
        """
        x = self.embedding(x)  # indices -> (batch_size, seq_len, d_model)
        for layer in self.layers:
            x = layer(x, s_mask)  # shape is preserved by each layer
        return x
