In [11]:
!pip install importnb
import os
import sys
import importnb
from torch import nn
import torch
import numpy as np

Collecting importnb
  Downloading importnb-2023.11.1-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: importnb
Successfully installed importnb-2023.11.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
# Put the parent of the current working directory on sys.path so that the
# `utils` package can be resolved from this notebook.
notebook_path = os.getcwd()
parent_dir = os.path.dirname(notebook_path)
# Guarded so that re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Imports executed inside importnb's Notebook context may resolve to notebooks.
# NOTE(review): presumably utils/tools is an .ipynb file -- confirm.
with importnb.Notebook():
    from utils.tools import AddPositionalEncoding, MultiHeadAttention, TransformerFFN

In [13]:
class TransformerEncoderLayer(nn.Module):
    """One Transformer encoder layer: self-attention followed by a feed-forward net.

    Each sublayer's output goes through dropout, is added to that sublayer's
    own input (residual connection) and is then layer-normalized.

    Args:
        d_model: Size handed to MultiHeadAttention, TransformerFFN and LayerNorm.
        d_ff: Second argument forwarded to TransformerFFN.
        num_head: First argument forwarded to MultiHeadAttention.
        dropout_rate: Dropout probability used after each sublayer.
        layer_norm_eps: ``eps`` used by both LayerNorm modules.
    """

    def __init__(
        self,
        d_model: int,
        d_ff: int,
        num_head: int,
        dropout_rate: float,
        layer_norm_eps: float,
    ) -> None:
        super().__init__()
        # Self-attention sublayer.
        self.mha = MultiHeadAttention(num_head, d_model)
        self.layernorm_mha = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.dropout_mha = nn.Dropout(dropout_rate)

        # Feed-forward sublayer.
        self.ffn = TransformerFFN(d_model, d_ff)
        self.dropout_ffn = nn.Dropout(dropout_rate)
        self.layernorm_ffn = nn.LayerNorm(d_model, eps=layer_norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        mask: torch.Tensor = None,
    ) -> torch.Tensor:
        """Run both sublayers on ``x``.

        Args:
            x: Input tensor; used as query, key and value of the attention.
            mask: Optional mask forwarded unchanged to the attention module.

        Returns:
            The layer-normalized output of the feed-forward sublayer.
        """
        # Attention sublayer, then add + layer norm.
        attn_out = self.layernorm_mha(self.__get_mha_output(x, mask) + x)

        # FFN sublayer, then add + layer norm. The residual must be the FFN's
        # own input (attn_out); adding the original x here would bypass the
        # attention result in the skip path.
        ffn_out = self.__get_ffn_output(attn_out)
        return self.layernorm_ffn(ffn_out + attn_out)

    def __get_mha_output(
        self,
        x: torch.Tensor,
        mask: torch.Tensor = None,
    ) -> torch.Tensor:
        """Self-attention (query = key = value = ``x``) followed by dropout."""
        return self.dropout_mha(self.mha(x, x, x, mask))

    def __get_ffn_output(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """Feed-forward network followed by dropout."""
        return self.dropout_ffn(self.ffn(x))

In [14]:
"""
Decoder layer
"""

class TransformerDecoderLayer(nn.Module):
    """One Transformer decoder layer with three sublayers.

    The sublayers are: masked self-attention over ``x``, attention whose keys
    and values come from ``y``, and a feed-forward network. Each sublayer is
    followed by dropout, a residual add with the sublayer's input, and layer
    normalization.

    Args:
        d_model: Size handed to MultiHeadAttention, TransformerFFN and LayerNorm.
        d_ff: Second argument forwarded to TransformerFFN.
        num_head: First argument forwarded to MultiHeadAttention.
        dropout_rate: Dropout probability used after each sublayer.
        layer_norm_eps: ``eps`` used by all three LayerNorm modules.
    """

    def __init__(
        self,
        d_model: int,
        d_ff: int,
        num_head: int,
        dropout_rate: float,
        layer_norm_eps: float,
    ) -> None:
        super().__init__()
        self.mmha = MultiHeadAttention(num_head, d_model)
        self.mha = MultiHeadAttention(num_head, d_model)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=layer_norm_eps)

        # TransformerFFN is the feed-forward class this notebook imports and
        # the encoder layer uses; no `FeedForward` name exists here.
        self.FF = TransformerFFN(d_model, d_ff)
        self.dropout_1 = nn.Dropout(dropout_rate)
        self.dropout_2 = nn.Dropout(dropout_rate)
        self.dropout_3 = nn.Dropout(dropout_rate)

    def forward(
        self,
        x: torch.Tensor,
        y: torch.Tensor,
        mask: torch.Tensor = None,
    ) -> torch.Tensor:
        """Run the three sublayers.

        Args:
            x: Decoder-side input; query, key and value of the first attention.
            y: Tensor used as key and value of the second attention.
            mask: Optional mask forwarded to the first attention only.

        Returns:
            The layer-normalized output of the feed-forward sublayer.
        """
        # 1) Masked self-attention + dropout + residual + norm.
        self_attn = self.dropout_1(self.mmha(x, x, x, mask))
        x = self.layer_norm_1(self_attn + x)

        # 2) Attention with queries from x and keys/values from y.
        cross_attn = self.dropout_2(self.mha(x, y, y))
        x = self.layer_norm_2(cross_attn + x)

        # 3) Feed-forward + dropout + residual + norm.
        ffn_out = self.dropout_3(self.FF(x))
        return self.layer_norm_3(ffn_out + x)

In [None]:
class Decoder(nn.Module):
    """Embedding, positional encoding, a stack of decoder layers and an output projection.

    Args:
        dec_vocab_size: Number of embeddings and size of the output projection.
        dim: Embedding size; also passed to every decoder layer as ``d_model``.
        d_ff: Forwarded to every TransformerDecoderLayer.
        num_head: Forwarded to every TransformerDecoderLayer.
        dropout_rate: Dropout probability for the embedding dropout and the layers.
        layer_norm_eps: Forwarded to every TransformerDecoderLayer.
        num_layers: Number of stacked decoder layers (defaults to 6, the value
            that was previously hard-coded).
    """

    def __init__(
        self,
        dec_vocab_size: int,
        dim: int,
        d_ff: int,
        num_head: int,
        dropout_rate: float,
        layer_norm_eps: float,
        num_layers: int = 6,
    ) -> None:
        super().__init__()

        self.dim = dim
        self.embed = nn.Embedding(dec_vocab_size, dim)
        self.PE = AddPositionalEncoding(dim)
        # nn.ModuleList registers each layer so its parameters are tracked.
        self.decoder_layers = nn.ModuleList(
            [
                TransformerDecoderLayer(dim, d_ff, num_head, dropout_rate, layer_norm_eps)
                for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(dim, dec_vocab_size)

    def forward(self, x, y, mask=None):
        """Decode ``x`` while passing ``y`` and ``mask`` to every layer.

        Args:
            x: Indices fed to the embedding table.
            y: Tensor handed to each decoder layer as its second argument.
            mask: Optional mask handed to each decoder layer.

        Returns:
            Output of the final linear projection.
        """
        # Scale embeddings by sqrt(dim) before adding positional information.
        x = self.embed(x) * (self.dim ** 0.5)
        x = self.dropout(self.PE(x))
        # Iterate over the stored layers instead of a fixed range(6).
        for layer in self.decoder_layers:
            x = layer(x, y, mask)
        return self.linear(x)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder wrapper: encodes ``enc_input`` and decodes ``dec_input`` against it.

    Args:
        enc_vocab_size: Vocabulary size passed to the encoder.
        dec_vocab_size: Vocabulary size passed to the decoder.
        dim: Model dimension passed to encoder and decoder.
        head_num: Number of attention heads passed to encoder and decoder.
        d_ff: Feed-forward size passed to the decoder.
        dropout_rate: Dropout probability (0.1 was previously hard-coded).
        layer_norm_eps: LayerNorm epsilon passed to the decoder.

    The last three parameters have defaults so the original four-argument
    constructor call keeps working.
    """

    def __init__(
        self,
        enc_vocab_size,
        dec_vocab_size,
        dim,
        head_num,
        d_ff: int = 2048,
        dropout_rate: float = 0.1,
        layer_norm_eps: float = 1e-5,
    ) -> None:
        super().__init__()
        # NOTE(review): `Encoder` is not defined in the visible part of this
        # notebook; the call is kept as originally written -- confirm its signature.
        self.encoder = Encoder(enc_vocab_size, dim, head_num, dropout_rate=dropout_rate)
        # Keyword arguments so each value lands in the Decoder parameter it is
        # meant for (d_ff / num_head / layer_norm_eps are all required there).
        self.decoder = Decoder(
            dec_vocab_size,
            dim,
            d_ff=d_ff,
            num_head=head_num,
            dropout_rate=dropout_rate,
            layer_norm_eps=layer_norm_eps,
        )

    def forward(self, enc_input, dec_input, mask=None):
        """Encode ``enc_input`` and decode ``dec_input`` against the result.

        Args:
            enc_input: Input handed to the encoder.
            dec_input: Input handed to the decoder.
            mask: Optional mask handed to the decoder.

        Returns:
            Whatever the decoder returns.
        """
        enc_output = self.encoder(enc_input)
        return self.decoder(dec_input, enc_output, mask)