<a href="https://colab.research.google.com/github/naoya526/jpn2ita/blob/main/Bert_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Module

I used the following materials:
[1]`06_Attention_and_Transformers_in_BERT.ipynb`
[2]`English_to_italian_automatic_translation.ipynb`

In [None]:
import os
import re
import random
import itertools
import math
from pathlib import Path

import tqdm

import numpy as np
print(np.__version__)

import torch
print(torch.__version__)

import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

import transformers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, BertConfig

### Suppress useless warnings
import warnings
warnings.filterwarnings("ignore", message="The secret `HF_TOKEN` does not exist")

from collections import defaultdict
from transformers import AutoTokenizer

2.2.6


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x77011f836c20>>
Traceback (most recent call last):
  File "/home/naoya/pv2/deeplearning/jpn2ita/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


2.7.1+cu126


  from .autonotebook import tqdm as notebook_tqdm


## Define Model
---


### Encoder (Bert) part
This section contains the building blocks of the encoder (BERT). I implemented them with reference to [1]`06_Attention_and_Transformers_in_BERT.ipynb` and the paper.
- `MultiHeadAttention`
- `PositionwiseFeedForward`
- `Encoder Block`
- `BertEmbeddings` (Embedding for words)
- `Bert`
BERT is very good at understanding the meaning of a sentence, but on its own it is not enough to produce a translation.
Hence, in the next part I implement a decoder. It is quite similar to BERT.

In [2]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input and the
    attention is computed as ``softmax(Q K^T / sqrt(head_dim)) V`` per head.

    Args:
        d_model: Width of the input/output features; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.dropout = torch.nn.Dropout(dropout)

        # Input projections for Q, K and V.
        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)

        # Projection applied after the heads are concatenated again.
        self.out_proj = torch.nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).
            mask: Optional *additive* mask that is summed onto the raw scores
                (0 keeps a position, a large negative value hides it). It must
                broadcast against (batch, num_heads, seq_len, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        n_batch, n_tokens, _ = x.shape
        per_head_shape = (n_batch, n_tokens, self.num_heads, self.head_dim)

        # Project, split the feature axis into heads and move the head axis
        # forward: (batch, seq_len, d_model) -> (batch, heads, seq_len, head_dim).
        q, k, v = (
            proj(x).view(per_head_shape).transpose(1, 2)
            for proj in (self.query, self.key, self.value)
        )

        # Raw attention scores: Q @ K^T / sqrt(d_k).
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            # Additive masking: hidden positions carry a large negative value
            # so that softmax drives their weight to zero.
            scores = scores + mask

        attn = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values, then merge the heads back together:
        # (batch, heads, seq_len, head_dim) -> (batch, seq_len, d_model).
        merged = torch.matmul(attn, v).transpose(1, 2).contiguous()
        merged = merged.view(n_batch, n_tokens, self.num_heads * self.head_dim)

        return self.out_proj(merged)

class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied independently at every position.

    The input is expanded from ``d_model`` to ``d_ff`` features, passed
    through GELU, and projected back to ``d_model``. Dropout is applied after
    the activation and again after the output projection.

    Args:
        d_model: Input and output feature width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability.
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)  # d_model -> d_ff
        self.linear2 = nn.Linear(d_ff, d_model)  # d_ff -> d_model
        self.dropout = nn.Dropout(dropout)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Return the transformed tensor; the shape of ``x`` is preserved."""
        hidden = self.dropout(self.gelu(self.linear1(x)))
        return self.dropout(self.linear2(hidden))

class EncoderBlock(nn.Module):
    """Pre-LayerNorm Transformer encoder block.

    Two residual sub-layers are applied in sequence:
    ``x + Dropout(SelfAttention(LayerNorm(x)))`` followed by
    ``x + Dropout(FeedForward(LayerNorm(x)))``.

    Args:
        d_model: Feature width of the block.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability. It is used for the residual branches and
            is also forwarded to the attention and feed-forward sub-modules,
            so a non-default value takes effect everywhere in the block.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Forward `dropout` explicitly; otherwise the sub-modules would fall
        # back to their own default of 0.1 regardless of what was requested.
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)

        self.layer_norm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block.

        Args:
            x: Input tensor whose last dimension is ``d_model``.
            mask: Optional additive attention mask handed to the
                self-attention sub-layer unchanged.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention sub-layer (normalise first, then add the residual).
        x = x + self.dropout(self.attention(self.layer_norm1(x), mask))
        # Feed-forward sub-layer.
        # NOTE: the feed-forward module already applies dropout to its own
        # output, so this branch is dropped out twice during training.
        x = x + self.dropout(self.ffn(self.layer_norm2(x)))
        return x


class BertEmbeddings(nn.Module):
    """Input embeddings: token + position + segment, then LayerNorm and dropout.

    Args:
        vocab_size: Number of rows in the token embedding table. Index 0 is
            the padding index.
        d_model: Embedding width.
        max_seq_len: Number of rows in the learned position embedding table.
        dropout: Dropout probability applied after the layer norm.
    """
    def __init__(self, vocab_size, d_model, max_seq_len=512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        # Lookup tables mapping ids to vectors.
        self.token = torch.nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.position = torch.nn.Embedding(max_seq_len, d_model)
        self.segment = torch.nn.Embedding(2, d_model)  # two segments: 0 and 1
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, token_type_ids=None):
        """Embed a batch of token ids.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len).
            token_type_ids: Optional integer tensor of the same shape holding
                segment ids; defaults to all zeros (single segment).

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        n_batch, n_tokens = input_ids.shape

        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Positions 0..seq_len-1, repeated for every sequence in the batch.
        positions = torch.arange(n_tokens, device=input_ids.device)
        positions = positions.unsqueeze(0).expand(n_batch, -1)

        summed = (
            self.token(input_ids)
            + self.position(positions)
            + self.segment(token_type_ids)
        )
        return self.dropout(self.layer_norm(summed))

class Bert(nn.Module):
    """BERT-style encoder: input embeddings followed by a stack of encoder blocks.

    Args:
        vocab_size: Size of the token vocabulary.
        d_model: Hidden width.
        num_layers: Number of encoder blocks.
        num_heads: Attention heads per block.
        d_ff: Accepted for interface compatibility but currently unused; the
            feed-forward width is always ``4 * d_model`` (the ratio used in
            the BERT paper). It is left that way so existing layer sizes and
            checkpoints do not change.
        max_seq_len: Size of the position embedding table.
        dropout: Dropout probability passed to the embeddings and the blocks.
    """

    def __init__(self, vocab_size, d_model=768, num_layers=12, num_heads=12, d_ff=3072, max_seq_len=512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.heads = num_heads
        # The paper uses 4 * d_model for the feed-forward hidden size.
        self.feed_forward_hidden = d_model * 4
        # Sum of token, position and segment embeddings.
        self.embedding = BertEmbeddings(vocab_size, d_model, max_seq_len, dropout)

        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderBlock(d_model, num_heads, d_model * 4, dropout) for _ in range(num_layers)])

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Encode a batch of token ids.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len).
            attention_mask: Optional *binary* keep-mask (1/True = attend,
                0/False = ignore), either 2D (batch, seq_len) or 4D and
                broadcastable to (batch, heads, seq_len, seq_len). When
                omitted, every token whose id is not 0 is attended to.
            token_type_ids: Optional segment ids forwarded to the embeddings.

        Returns:
            Tensor of shape (batch, seq_len, d_model).

        Raises:
            ValueError: If ``attention_mask`` is neither 2D nor 4D.
        """
        if attention_mask is None:
            attention_mask = (input_ids != 0).float()
        elif not attention_mask.is_floating_point():
            # Bool masks cannot be subtracted from a float; integer masks are
            # converted too so the arithmetic below is done in floating point.
            attention_mask = attention_mask.float()

        if attention_mask.dim() == 2:
            # (batch, seq_len) -> (batch, 1, 1, seq_len)
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        elif attention_mask.dim() == 4:
            # Already broadcastable against the attention scores.
            extended_attention_mask = attention_mask
        else:
            raise ValueError(f"Attention mask should be 2D or 4D, but got {attention_mask.dim()}D")

        # Turn the keep-mask into an additive mask: 1 -> 0, 0 -> -1e9, so that
        # ignored positions receive (almost) zero weight after the softmax.
        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9

        # Embed the ids, then run the encoder stack.
        x = self.embedding(input_ids, token_type_ids)
        for encoder in self.encoder_blocks:
            # Call the module itself rather than .forward() so that any
            # registered nn.Module hooks are honoured.
            x = encoder(x, extended_attention_mask)
        return x

### Decoder part
In this part, I implemented these modules:
- `CrossAttention` (Italian query, English key, English value)
- `DecoderBlock`
- `BertTranslationModel`(Bert + Decoder Embedding + DecoderBlock*`num_layers`)

In [3]:
class CrossAttention(nn.Module):
    """Multi-head scaled dot-product attention between two sequences.

    This is ``MultiHeadAttention`` with separate inputs: queries are projected
    from one sequence and keys/values from another. The decoder block in this
    file passes its own hidden states as ``query_input`` and the encoder
    output as ``key_value_input``.

    Args:
        d_model: Feature width of both inputs and of the output; must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads  # feature width of a single head
        self.dropout = torch.nn.Dropout(dropout)

        # Input projections for Q, K and V.
        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)

        # Projection applied after the heads are concatenated again.
        self.out_proj = torch.nn.Linear(d_model, d_model)

    def _to_heads(self, tensor, n_batch, n_tokens):
        """Reshape (batch, len, d_model) to (batch, heads, len, head_dim)."""
        return tensor.view(n_batch, n_tokens, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, query_input, key_value_input, mask=None):
        """Attend from ``query_input`` over ``key_value_input``.

        Args:
            query_input: Tensor of shape (batch, q_len, d_model).
            key_value_input: Tensor of shape (batch, kv_len, d_model).
            mask: Optional *additive* mask summed onto the raw scores; it must
                broadcast against (batch, num_heads, q_len, kv_len).

        Returns:
            Tensor of shape (batch, q_len, d_model).
        """
        n_batch, q_len, _ = query_input.shape
        kv_len = key_value_input.shape[1]

        q = self._to_heads(self.query(query_input), n_batch, q_len)
        k = self._to_heads(self.key(key_value_input), n_batch, kv_len)
        v = self._to_heads(self.value(key_value_input), n_batch, kv_len)

        # Raw attention scores: Q @ K^T / sqrt(d_k).
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores + mask  # broadcast add

        attn = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values, then merge the heads back together.
        merged = torch.matmul(attn, v).transpose(1, 2).contiguous()
        merged = merged.view(n_batch, q_len, self.num_heads * self.head_dim)

        return self.out_proj(merged)

class DecoderBlock(nn.Module):
    """Pre-LayerNorm Transformer decoder block.

    Three residual sub-layers are applied in sequence: self-attention over the
    decoder states, cross-attention from the decoder states to the encoder
    output, and a position-wise feed-forward network.

    Args:
        d_model: Feature width of the block.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability. It is used for the residual branches and
            is also forwarded to every sub-module, so a non-default value
            takes effect everywhere in the block.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Forward `dropout` explicitly; otherwise the sub-modules would fall
        # back to their own default of 0.1 regardless of what was requested.
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attention = CrossAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)

        self.layer_norm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layer_norm3 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, self_mask=None, cross_mask=None):
        """Run the block.

        Args:
            x: Decoder hidden states; last dimension is ``d_model``.
            encoder_output: Encoder hidden states used as keys and values in
                the cross-attention sub-layer.
            self_mask: Optional additive mask for the self-attention scores
                (e.g. a causal mask).
            cross_mask: Optional additive mask for the cross-attention scores
                (e.g. source padding).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention over the decoder states.
        residual = x
        x = self.self_attention(self.layer_norm1(x), mask=self_mask)
        x = self.dropout(x) + residual

        # Cross-attention: queries from the decoder, keys/values from the encoder.
        residual = x
        x = self.cross_attention(
            query_input=self.layer_norm2(x),
            key_value_input=encoder_output,
            mask=cross_mask,
        )
        x = self.dropout(x) + residual

        # Position-wise feed-forward network.
        residual = x
        x = self.ffn(self.layer_norm3(x))
        x = self.dropout(x) + residual
        return x

class BertTranslationModel(nn.Module):
    """English-to-Italian translation model.

    Encoder: ``Bert`` over the English tokens.
    Decoder: ``BertEmbeddings`` over the Italian tokens, ``num_layers``
    ``DecoderBlock`` layers, and a linear projection to the Italian
    vocabulary.

    Args:
        ita_vocab_size: Size of the Italian (target) vocabulary.
        eng_vocab_size: Size of the English (source) vocabulary.
        max_seq_len: Size of the position embedding tables.
        d_model: Hidden width shared by encoder and decoder.
        num_layers: Number of encoder blocks and of decoder blocks.
        num_heads: Attention heads per block.
        dropout: Dropout probability.
    """
    def __init__(self,
                 ita_vocab_size,  # Italian vocabulary size
                 eng_vocab_size,  # English vocabulary size
                 max_seq_len,
                 d_model=512,
                 num_layers=6,
                 num_heads=8,
                 dropout=0.1):
        super().__init__()

        self.encoder = Bert(
            vocab_size=eng_vocab_size,
            d_model=d_model,
            num_layers=num_layers,
            num_heads=num_heads,
            max_seq_len=max_seq_len,
            dropout=dropout
        )

        self.decoder_embeddings = BertEmbeddings(
            vocab_size=ita_vocab_size,
            d_model=d_model,
            max_seq_len=max_seq_len,
            dropout=dropout
        )

        self.decoder_blocks = nn.ModuleList([
            DecoderBlock(
                d_model=d_model,
                num_heads=num_heads,
                d_ff=d_model * 4,  # ratio used in the BERT paper
                dropout=dropout)
            for _ in range(num_layers)
        ])

        self.output_proj = nn.Linear(d_model, ita_vocab_size)

    @staticmethod
    def _split_source_mask(eng_mask):
        """Return ``(encoder_keep_mask, cross_additive_mask)`` for ``eng_mask``.

        The encoder expects a binary keep-mask (1 = attend) and converts it to
        an additive mask itself, whereas the decoder's cross-attention adds
        its mask to the scores directly. Passing one tensor to both is
        therefore wrong: an already-additive mask would be transformed a
        second time by the encoder, pushing every valid position to -1e9,
        where float32 rounding swallows the real attention scores.
        """
        if eng_mask is None:
            return None, None
        if eng_mask.dim() == 2:
            # Binary padding mask of shape (batch, eng_len).
            keep = eng_mask if eng_mask.is_floating_point() else eng_mask.float()
            keep = keep[:, None, None, :]
            return keep, (1.0 - keep) * -1e9
        if eng_mask.dim() == 4:
            # Additive mask (0 = attend, large negative = ignore). Recover the
            # per-key keep-mask from the first head / first query row.
            # Assumes a padding mask that is identical for all heads and
            # queries, which is how the training loop in this file builds it.
            keep = (eng_mask[:, :1, :1, :] == 0).float()
            return keep, eng_mask
        raise ValueError(f"eng_mask should be 2D or 4D, but got {eng_mask.dim()}D")

    def forward(self,
                eng_ids,
                ita_ids,
                eng_mask=None,
                ita_mask=None,
                eng_token_type_ids=None,
                ita_token_type_ids=None):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            eng_ids: Source token ids, shape (batch, eng_len).
            ita_ids: Decoder input token ids, shape (batch, ita_len).
            eng_mask: Source padding mask. Either a 2D binary mask
                (batch, eng_len) with 1 = real token, or a 4D additive mask
                broadcastable to (batch, heads, ita_len, eng_len).
            ita_mask: Optional additive mask for decoder self-attention
                (typically the causal mask), broadcastable to
                (batch, heads, ita_len, ita_len).
            eng_token_type_ids: Optional segment ids for the source.
            ita_token_type_ids: Optional segment ids for the target.

        Returns:
            Tensor of shape (batch, ita_len, ita_vocab_size).

        Raises:
            ValueError: If ``eng_mask`` is neither 2D nor 4D.
        """
        encoder_mask, cross_mask = self._split_source_mask(eng_mask)

        # Encode the English sentence.
        encoder_output = self.encoder(
            input_ids=eng_ids,
            attention_mask=encoder_mask,
            token_type_ids=eng_token_type_ids,
        )

        # Decode Italian, conditioned on the encoder output.
        hidden = self.decoder_embeddings(input_ids=ita_ids, token_type_ids=ita_token_type_ids)
        for decoder_block in self.decoder_blocks:
            hidden = decoder_block(
                x=hidden,
                encoder_output=encoder_output,
                self_mask=ita_mask,      # causal mask over the Italian sequence
                cross_mask=cross_mask,   # hides English padding
            )
        return self.output_proj(hidden)

# Translation Dataset

In [None]:
def download_scentence(eng_path="text-eng.txt", ita_path="text-ita.txt"):
    """Load the English and Italian sentence files.

    Each file is read line by line as UTF-8. ``<sos>`` / ``<eos>`` markers and
    surrounding whitespace are removed, and lines that end up empty are
    dropped.

    Args:
        eng_path: Path of the English sentence file.
        ita_path: Path of the Italian sentence file.

    Returns:
        A tuple ``(eng_sentences, ita_sentences)`` of lists of strings.

    Raises:
        OSError: If either file cannot be opened.
    """
    def _read_sentences(path):
        # An explicit encoding keeps accented characters intact regardless of
        # the platform's default locale.
        with open(path, "r", encoding="utf-8") as f:
            cleaned = (
                line.strip().replace('<sos>', '').replace('<eos>', '').strip()
                for line in f
            )
            return [sentence for sentence in cleaned if sentence]

    # NOTE: empty lines are filtered per file, so the two lists stay aligned
    # only if both files have empty lines at the same positions.
    return _read_sentences(eng_path), _read_sentences(ita_path)


class TranslationDataset(Dataset):
    """Dataset of (English, Italian) sentence pairs encoded as fixed-length id tensors.

    Args:
        data_pair: Sequence of ``(eng_sentence, ita_sentence)`` string pairs.
        eng_vocab: ``{word: id}`` dictionary for English.
        ita_vocab: ``{word: id}`` dictionary for Italian.
        seq_len: Length every sequence is padded or truncated to.
    """
    def __init__(self, data_pair, eng_vocab, ita_vocab, seq_len=64):
        self.data_pair = data_pair
        self.eng_vocab = eng_vocab  # {word: id} dictionary
        self.ita_vocab = ita_vocab  # {word: id} dictionary
        self.seq_len = seq_len

        # Reverse mappings (id -> word).
        self.eng_id_to_word = {v: k for k, v in eng_vocab.items()}
        self.ita_id_to_word = {v: k for k, v in ita_vocab.items()}

        # Special token ids, with fallbacks when a vocabulary lacks the token.
        self.eng_pad_id = eng_vocab.get('[PAD]', 0)
        self.eng_cls_id = eng_vocab.get('[CLS]', 1)
        self.eng_sep_id = eng_vocab.get('[SEP]', 2)
        self.eng_unk_id = eng_vocab.get('[UNK]', 3)

        self.ita_pad_id = ita_vocab.get('[PAD]', 0)
        self.ita_cls_id = ita_vocab.get('[CLS]', 1)
        self.ita_sep_id = ita_vocab.get('[SEP]', 2)
        self.ita_unk_id = ita_vocab.get('[UNK]', 3)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data_pair)

    def tokenize_sentence(self, sentence, vocab, unk_id):
        """Convert a sentence to a list of token ids.

        The sentence is lower-cased and split on whitespace; leading and
        trailing punctuation is stripped from every word before the lookup.
        Words missing from ``vocab`` map to ``unk_id``.
        """
        return [
            vocab.get(word.strip('.,!?;:"()[]{}'), unk_id)
            for word in sentence.lower().split()
        ]

    def _pad_or_truncate(self, token_ids, pad_id):
        """Return ``token_ids`` cut or right-padded to exactly ``seq_len`` entries."""
        if len(token_ids) > self.seq_len:
            return token_ids[:self.seq_len]
        return token_ids + [pad_id] * (self.seq_len - len(token_ids))

    def __getitem__(self, item):
        """Encode one sentence pair.

        Returns:
            A dict of tensors. Every id/mask tensor has shape ``(seq_len,)``
            and dtype long; ``ita_causal_mask`` is a float tensor of shape
            ``(seq_len, seq_len)`` holding 0 on and below the diagonal and
            ``-inf`` above it.
        """
        eng_sentence, ita_sentence = self.data_pair[item]

        # English: [CLS] tokens [SEP], padded/truncated to seq_len.
        eng_token_ids = self.tokenize_sentence(eng_sentence, self.eng_vocab, self.eng_unk_id)
        eng_token_ids = self._pad_or_truncate(
            [self.eng_cls_id] + eng_token_ids + [self.eng_sep_id], self.eng_pad_id)
        eng_attention_mask = [1 if token_id != self.eng_pad_id else 0 for token_id in eng_token_ids]

        # Italian: [CLS] tokens [SEP], padded/truncated to seq_len.
        ita_token_ids = self.tokenize_sentence(ita_sentence, self.ita_vocab, self.ita_unk_id)
        ita_full_ids = self._pad_or_truncate(
            [self.ita_cls_id] + ita_token_ids + [self.ita_sep_id], self.ita_pad_id)

        # Decoder input is the target shifted right by one: [CLS] + target[:-1].
        decoder_input_ids = [self.ita_cls_id] + ita_full_ids[:-1]

        # The target is the unshifted sequence (what the decoder must predict).
        ita_target_ids = ita_full_ids

        ita_attention_mask = [1 if token_id != self.ita_pad_id else 0 for token_id in ita_full_ids]

        # Additive causal mask for decoder self-attention. It must be a float
        # tensor: the attention modules add it to the scores, and a bool mask
        # can never hold -inf, so it would not hide future positions.
        causal_mask = torch.triu(
            torch.full((self.seq_len, self.seq_len), float('-inf')), diagonal=1)

        output = {
            "eng_ids": torch.tensor(eng_token_ids, dtype=torch.long),
            "eng_mask": torch.tensor(eng_attention_mask, dtype=torch.long),
            "eng_token_type_ids": torch.zeros(self.seq_len, dtype=torch.long),
            "ita_ids": torch.tensor(decoder_input_ids, dtype=torch.long),
            "ita_mask": torch.tensor(ita_attention_mask, dtype=torch.long),
            "ita_token_type_ids": torch.zeros(self.seq_len, dtype=torch.long),
            "ita_causal_mask": causal_mask,
            "ita_target_ids": torch.tensor(ita_target_ids, dtype=torch.long),
        }

        return output

## Use model
In this part, I followed the configuration of [2]`English_to_italian_automatic_translation.ipynb`.

---
### Prepare Dataset
For BERT, the `<sos>` and `<eos>` markers are not required, so these tokens are stripped from the sentences.

In [4]:
# Download the files
# URL is a Google Drive share link to the dataset archive.
URL = "https://drive.google.com/file/d/1_npGYZk13fs5hE0kAggiSrmKkqW3OrLT/view?usp=sharing"
# Shell escape: gdown writes the download to stdout (-O-), which is piped into
# tar to unpack the gzip-compressed archive into the current directory.
!gdown --fuzzy $URL -O- | tar -xz

Traceback (most recent call last):
  File "/home/naoya/pv2/deeplearning/jpn2ita/.venv/bin/gdown", line 8, in <module>
    sys.exit(main())
  File "/home/naoya/pv2/deeplearning/jpn2ita/.venv/lib/python3.10/site-packages/gdown/__main__.py", line 172, in main
    download(
  File "/home/naoya/pv2/deeplearning/jpn2ita/.venv/lib/python3.10/site-packages/gdown/download.py", line 202, in download
    res = sess.get(url, stream=True, verify=verify)
  File "/home/naoya/pv2/deeplearning/jpn2ita/.venv/lib/python3.10/site-packages/requests/sessions.py", line 602, in get
    return self.request("GET", url, **kwargs)
  File "/home/naoya/pv2/deeplearning/jpn2ita/.venv/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
    resp = self.send(prep, **send_kwargs)
  File "/home/naoya/pv2/deeplearning/jpn2ita/.venv/lib/python3.10/site-packages/requests/sessions.py", line 724, in send
    history = [resp for resp in gen]
  File "/home/naoya/pv2/deeplearning/jpn2ita/.venv/lib/python3.10

### Incremental approach to token vocabulary building
In the lesson of Deep Learning, I learned the sophisticated way of tokenizing words called WordPiece tokenization.

The WordPiece tokenization algorithm builds its vocabulary incrementally, starting from a basic alphabet and iteratively merging subword units based on their frequency and co-occurrence patterns. (cited from [1]`06_Attention_and_Transformers_in_Bert.ipynb`)

---

#### The demonstration of pretrained tokenizer

In [1], the tokenizer `bert-base-cased` was used for English (it is used for English tokenization in this project as well). For Italian, this project uses the tokenizer [3]`dbmdz/bert-base-italian-cased`.
[3]https://huggingface.co/dbmdz/bert-base-italian-cased  
\
In this section, the procedure is explained using a few short sentences.

The following steps are iterated:
- Compute word frequencies
- Split Words into Alphabet
- Compute score of each pair
- Merge the pair

In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
from collections import defaultdict

from TranslationDataset import TranslationDataset, download_sentence, build_vocabulary
from Decoder import BertTranslationModel

def _compute_batch_loss(model, batch, criterion, device):
    """Move one batch to ``device``, run the model and return the loss tensor."""
    eng_ids = batch['eng_ids'].to(device)
    eng_mask = batch['eng_mask'].to(device)
    eng_token_type_ids = batch['eng_token_type_ids'].to(device)
    ita_ids = batch['ita_ids'].to(device)
    ita_target_ids = batch['ita_target_ids'].to(device)
    ita_token_type_ids = batch['ita_token_type_ids'].to(device)
    ita_causal_mask = batch['ita_causal_mask'].to(device)

    batch_size, ita_seq_len = ita_ids.shape
    _, eng_seq_len = eng_ids.shape
    num_heads = model.decoder_blocks[0].cross_attention.num_heads

    # English padding mask for decoder cross-attention:
    # binary (batch, eng_len) -> additive (batch, heads, ita_len, eng_len).
    eng_cross_mask = eng_mask.unsqueeze(1).unsqueeze(2)
    eng_cross_mask = eng_cross_mask.expand(batch_size, num_heads, ita_seq_len, eng_seq_len)
    eng_cross_mask = (1.0 - eng_cross_mask.float()) * -1e9

    # Italian causal mask for decoder self-attention. The attention modules
    # add the mask to the scores, so a bool mask (True = blocked) is first
    # converted to its additive form (0 / -inf).
    if ita_causal_mask.dtype == torch.bool:
        blocked = ita_causal_mask
        ita_causal_mask = torch.zeros(
            blocked.shape, dtype=torch.float32, device=blocked.device
        ).masked_fill(blocked, float('-inf'))
    ita_causal_mask = ita_causal_mask.unsqueeze(1)
    ita_causal_mask = ita_causal_mask.expand(batch_size, num_heads, ita_seq_len, ita_seq_len)

    logits = model(
        eng_ids=eng_ids,
        ita_ids=ita_ids,
        eng_mask=eng_cross_mask,
        ita_mask=ita_causal_mask,
        eng_token_type_ids=eng_token_type_ids,
        ita_token_type_ids=ita_token_type_ids
    )

    # Flatten (batch, seq, vocab) and (batch, seq) for the token-level loss.
    return criterion(logits.view(-1, logits.size(-1)), ita_target_ids.view(-1))


def train_model(model, train_dataloader, val_dataloader, criterion, optimizer, device, epochs=5):
    """Train the BertTranslationModel and report the validation loss per epoch.

    Args:
        model: Model to train; it is switched between train and eval mode.
        train_dataloader: Iterable of training batches (dicts of tensors as
            produced by ``TranslationDataset``).
        val_dataloader: Iterable of validation batches of the same form.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: Optimizer over the model parameters.
        device: Device the batches are moved to.
        epochs: Number of passes over the training data.

    Returns:
        None. Progress and average losses are printed.
    """
    print(f"Starting training for {epochs} epochs...")

    for epoch in range(epochs):
        # Training phase
        model.train()
        total_loss = 0
        train_iterator = tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for batch in train_iterator:
            loss = _compute_batch_loss(model, batch, criterion, device)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            train_iterator.set_postfix({'loss': loss.item()})

        # max(..., 1) avoids a ZeroDivisionError for an empty dataloader.
        avg_train_loss = total_loss / max(len(train_dataloader), 1)
        print(f"Epoch {epoch+1} finished. Average training loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        total_val_loss = 0
        val_iterator = tqdm.tqdm(val_dataloader, desc=f"Validation Epoch {epoch+1}")

        with torch.no_grad():
            for batch in val_iterator:
                val_loss = _compute_batch_loss(model, batch, criterion, device)
                total_val_loss += val_loss.item()
                val_iterator.set_postfix({'val_loss': val_loss.item()})

        avg_val_loss = total_val_loss / max(len(val_dataloader), 1)
        print(f"Epoch {epoch+1} validation loss: {avg_val_loss:.4f}")
        print("-" * 50)

def main():
    """Build the data pipeline, train the translation model, save it and print sample predictions."""
    # Configuration
    MAXLEN = 10
    BATCH_SIZE = 32
    EPOCHS = 5
    LEARNING_RATE = 1e-4

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load and prepare data
    print("Loading dataset...")
    eng_sentences, ita_sentences = download_sentence()
    data_pair = list(zip(eng_sentences, ita_sentences))

    # Build vocabularies
    print("Building vocabularies...")
    eng_vocab = build_vocabulary(eng_sentences, min_freq=2)
    ita_vocab = build_vocabulary(ita_sentences, min_freq=2)

    print(f"English vocabulary size: {len(eng_vocab)}")
    print(f"Italian vocabulary size: {len(ita_vocab)}")

    # Split data (first 80% for training, the rest for validation)
    train_size = int(0.8 * len(data_pair))
    train_data = data_pair[:train_size]
    val_data = data_pair[train_size:]

    print(f"Total pairs: {len(data_pair)}")
    print(f"Training pairs: {len(train_data)}")
    print(f"Validation pairs: {len(val_data)}")

    # Create datasets and dataloaders
    train_dataset = TranslationDataset(train_data, eng_vocab, ita_vocab, seq_len=MAXLEN)
    val_dataset = TranslationDataset(val_data, eng_vocab, ita_vocab, seq_len=MAXLEN)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    print(f"Number of training batches: {len(train_dataloader)}")
    print(f"Number of validation batches: {len(val_dataloader)}")

    # Initialize model
    print("Initializing model...")
    model = BertTranslationModel(
        ita_vocab_size=len(ita_vocab),
        eng_vocab_size=len(eng_vocab),
        max_seq_len=MAXLEN,
        d_model=768,
        num_layers=6,
        num_heads=12,
        dropout=0.1
    )

    model.to(device)
    print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")

    # Setup loss function and optimizer; padded target positions are ignored.
    criterion = nn.CrossEntropyLoss(ignore_index=ita_vocab['[PAD]'])
    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

    print("Loss function:", criterion)
    print("Optimizer:", optimizer)

    # Train the model
    train_model(model, train_dataloader, val_dataloader, criterion, optimizer, device, EPOCHS)

    # Save the model
    model_save_path = "bert_translation_model.pth"
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    # Test with a sample
    print("\n=== Testing with sample translations ===")
    model.eval()
    test_samples = [
        ("Hello world", "Ciao mondo"),
        ("Good morning", "Buongiorno"),
        ("Thank you", "Grazie")
    ]

    # Everything below that does not depend on the sample is built once.
    eng_pad_id = eng_vocab['[PAD]']
    ita_id_to_word = {v: k for k, v in ita_vocab.items()}
    num_heads = model.decoder_blocks[0].cross_attention.num_heads

    # Decoder input: [CLS] followed by padding.
    # NOTE: this is a single forward pass, not autoregressive decoding, so
    # only the first positions see a meaningful decoder input.
    ita_input = torch.tensor([[ita_vocab['[CLS]']] + [ita_vocab['[PAD]']] * (MAXLEN-1)], device=device)
    batch_size, ita_seq_len = ita_input.shape

    # Additive causal mask (0 on/below the diagonal, -inf above). It has to be
    # a float tensor: a bool mask cannot hold -inf and would not hide future
    # positions when added to the attention scores.
    causal_mask = torch.triu(torch.full((MAXLEN, MAXLEN), float('-inf')), diagonal=1)
    ita_causal_mask = causal_mask.unsqueeze(0).unsqueeze(0).expand(batch_size, num_heads, ita_seq_len, ita_seq_len).to(device)

    with torch.no_grad():
        for eng_text, expected_ita in test_samples:
            print(f"English: {eng_text}")
            print(f"Expected Italian: {expected_ita}")

            # Simple tokenization for testing
            eng_words = eng_text.lower().split()
            eng_ids = [eng_vocab.get(word.strip('.,!?;:"()[]{}'), eng_vocab['[UNK]']) for word in eng_words]
            eng_ids = [eng_vocab['[CLS]']] + eng_ids + [eng_vocab['[SEP]']]

            if len(eng_ids) < MAXLEN:
                eng_ids += [eng_pad_id] * (MAXLEN - len(eng_ids))
            else:
                eng_ids = eng_ids[:MAXLEN]

            eng_tensor = torch.tensor([eng_ids], device=device)
            eng_mask = torch.tensor(
                [[1 if token_id != eng_pad_id else 0 for token_id in eng_ids]], device=device)

            _, eng_seq_len = eng_tensor.shape

            # Binary padding mask -> additive cross-attention mask.
            eng_cross_mask = eng_mask.unsqueeze(1).unsqueeze(2)
            eng_cross_mask = eng_cross_mask.expand(batch_size, num_heads, ita_seq_len, eng_seq_len)
            eng_cross_mask = (1.0 - eng_cross_mask.float()) * -1e9

            logits = model(
                eng_ids=eng_tensor,
                ita_ids=ita_input,
                eng_mask=eng_cross_mask,
                ita_mask=ita_causal_mask,
                eng_token_type_ids=torch.zeros_like(eng_tensor),
                ita_token_type_ids=torch.zeros_like(ita_input)
            )

            predictions = torch.argmax(logits[0], dim=-1)

            # Decode the first five predicted ids, dropping special tokens.
            predicted_words = [ita_id_to_word.get(token_id.item(), '[UNK]') for token_id in predictions[:5]]
            predicted_text = ' '.join([w for w in predicted_words if w not in ['[PAD]', '[CLS]', '[SEP]']])

            print(f"Predicted Italian: {predicted_text}")
            print()
if __name__ == "__main__":
    main()