<a href="https://colab.research.google.com/github/naoya526/jpn2ita/blob/main/Bert_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Module

I used materials below:
[1]`06_Attention_and_Transformers_in_BERT.ipynb`
[2]`English_to_italian_automatic_translation.ipynb`

In [1]:
import os
import re
import random
import itertools
import math
from pathlib import Path

import tqdm

import numpy as np
print(np.__version__)

import torch
print(torch.__version__)

import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

import transformers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, BertConfig

### Suppress useless warnings
import warnings
warnings.filterwarnings("ignore", message="The secret `HF_TOKEN` does not exist")

from collections import defaultdict
from transformers import AutoTokenizer

2.0.2
2.6.0+cu124


## Define Model
---


### Encoder (Bert) part
Here, There's the function for implementing Encoder(Bert). I implemented with refering to [1]`06_Attention_and_Transformers_in_BERT.ipynb` and the paper.
- `MultiHeadAttention`
- `PositionwiseFeedForward`
- `Encoder Block`
- `BertEmbeddings` (Embedding for words)
- `Bert`
Bert is highly possible to understand meaning, but it is not enough for produce translation.
Hence, In the next part, I implement Decoder. It is quite similar to Bert.

In [2]:
class MultiHeadAttention(nn.Module):
    """
    - Query, Key, Value
    - Scaled Dot Product Attention: softmax(QK^T / sqrt(d_k))V
    """
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.dropout = torch.nn.Dropout(dropout)

        # Q, K, V linear Conversion
        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)

        self.out_proj = torch.nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        batch_size, seq_len, d_model = x.shape

        # step1: Q, K, V
        query = self.query(x)  # (batch, seq_len, d_model)
        key = self.key(x)      # (batch, seq_len, d_model)
        value = self.value(x)  # (batch, seq_len, d_model)

        # step2: Multi-Head
        query = query.view(batch_size, seq_len, self.num_heads, self.head_dim)
        key = key.view(batch_size, seq_len, self.num_heads, self.head_dim)  # 修正: query.shape → batch_size
        value = value.view(batch_size, seq_len, self.num_heads, self.head_dim)

        # step3: Change Dimention for Calclate Efficiently
        query = query.permute(0, 2, 1, 3)  # (batch, num_heads, seq_len, head_dim)
        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)

        # ステップ4: Scaled Dot-Product Attention
        # scores = Q @ K^T / sqrt(d_k)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # ステップ5: マスク処理（オプション）
        if mask is not None:
            # mask形状: (batch, 1, 1, seq_len) または (batch, 1, seq_len, seq_len) など、scores形状にブロードキャスト可能な形状
            # scores形状: (batch, num_heads, seq_len, seq_len)
            # 0を-1e9に変換（Softmaxで0になるように）→ 加算によるマスキングに変更
            # scores = scores.masked_fill(mask == 0, -1e9) # 元のコード
            scores = scores + mask # 加算によるマスキングに変更

        # ステップ6: Softmax + Dropout
        weights = F.softmax(scores, dim=-1)  # (batch, num_heads, seq_len, seq_len)
        weights = self.dropout(weights)
        # ステップ7: Value との積
        context = torch.matmul(weights, value)
        # ステップ8: ヘッドを結合して元の形状に戻す
        context = context.permute(0, 2, 1, 3)
        # → (batch, seq_len, d_model)
        context = context.contiguous().view(batch_size, seq_len, self.num_heads * self.head_dim)

        # ステップ9: 最終的な線形変換
        return self.out_proj(context)  # 修正: output_linear → out_proj

class PositionwiseFeedForward(nn.Module):
    """
    ヒント:
    - 2層のフィードフォワードネットワーク
    - 中間層では次元を拡張（通常4倍）
    - GELU活性化関数を使用
    - ドロップアウトも忘れずに
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)  # 入力次元 → 中間次元
        self.linear2 = nn.Linear(d_ff, d_model)  # 中間次元
        self.dropout = nn.Dropout(dropout)
        self.gelu = nn.GELU()

    def forward(self, x):
        x = self.linear1(x)
        x = self.gelu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        x = self.dropout(x)
        return x

class EncoderBlock(nn.Module):
    """
    ヒント:
    - Multi-Head Attention + Residual Connection + Layer Norm
    - Feed Forward + Residual Connection + Layer Norm
    - Which is better??: Pre-LN vs Post-LN
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model,num_heads)
        self.ffn = PositionwiseFeedForward(d_model,d_ff)

        self.layer_norm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        #Attention block
        #TODO implement transformer block
        residual = x
        #print("Took Residual...",x.shape)
        x = self.layer_norm1(x)
        #print("calculating layer norm...",x.shape)
        x = self.dropout(self.attention(x,mask))
        #print("calculating Attention...",x.shape)
        x = x + residual
        #print("calculating Residual Connection...",x.shape)
        #ffnn
        residual = x
        x = self.layer_norm2(x)
        #print("calculating layer norm...",x.shape)
        x = self.dropout(self.ffn(x))
        #print("calculating ffn...",x.shape)
        x = x + residual
        return x


class BertEmbeddings(nn.Module):
    """
    - Token Embeddings (語彙サイズ × d_model)
    - Position Embeddings (最大系列長 × d_model)
    - Segment Embeddings (2 × d_model, NSPタスク用)
    - 3つを足し合わせてLayerNormとDropout
    """
    def __init__(self, vocab_size, d_model, max_seq_len=512, dropout=0.1):
        super().__init__()
        # TODO: 3種類の埋め込みを実装
        self.d_model = d_model
        self.token = torch.nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.position = torch.nn.Embedding(max_seq_len, d_model)
        self.segment = torch.nn.Embedding(2, d_model)  # 2つのセグメント（0と1）
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        #Embedding: Lookup table that keep meaning vector of words
    def forward(self, input_ids, token_type_ids=None):
        # TODO: 埋め込みの計算を実装
        batch_size, seq_len = input_ids.shape
        # Step 1: Token Embeddings
        token_embeddings = self.token(input_ids)
        # Step 2: Position Embeddings
        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        position_ids = position_ids.expand(batch_size, -1)  # 🔧 バッチ次元を拡張
        position_embeddings = self.position(position_ids)
        # Step 3: Segment Embeddings
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)  # 全て0（単一文）
        segment_embeddings = self.segment(token_type_ids)  # (batch, seq_len, d_model)
        embeddings = token_embeddings + position_embeddings + segment_embeddings
        embeddings = self.dropout(self.layer_norm(embeddings))

        return embeddings

class Bert(nn.Module):
    """
    BERT実装の最終形

    学習のヒント:
    1. 論文を読んで全体像を理解
    2. 小さな部品から実装（Attention → FFN → Block → Full Model）
    3. 各層で print(tensor.shape) してサイズを確認
    4. 簡単なダミーデータでテスト
    5. 事前学習は計算量が大きいので、小さいモデルから開始

    重要な概念:
    - Bidirectional: 左右両方向の文脈を見る
    - Masked Language Model: ランダムにマスクした単語を予測
    - Next Sentence Prediction: 2つの文が連続するかを予測
    - Attention Weights: どの単語に注目しているかの可視化
    """

    def __init__(self, vocab_size, d_model=768, num_layers=12, num_heads=12, d_ff=3072, max_seq_len=512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.heads = num_heads
        # paper noted 4*d_model size for ff
        self.feed_forward_hidden = d_model * 4
        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BertEmbeddings(vocab_size, d_model, max_seq_len, dropout)

        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderBlock(d_model, num_heads, d_model * 4, dropout) for _ in range(num_layers)])

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # TODO: BERT全体のforward passを実装
        if attention_mask is None:
            attention_mask = (input_ids != 0).float()
        if attention_mask.dim() == 2:
            # (batch, seq_len) → (batch, 1, 1, seq_len)
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            # print("squeeze is required") # デバッグプリントを削除
        elif attention_mask.dim() == 4:
            # 既に正しい形状の場合はそのまま使用
            extended_attention_mask = attention_mask
            # print("squeeze is not required") # デバッグプリントを削除
        else:
             raise ValueError(f"Attention mask should be 2D or 4D, but got {attention_mask.dim()}D")

        # 0を-1e9に変換（Softmaxで0になるように） - 加算によるマスキングのために値を調整
        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9


        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(input_ids, token_type_ids)
        # running over multiple transformer blocks
        for encoder in self.encoder_blocks:
            x = encoder.forward(x, extended_attention_mask) # 修正後のMultiHeadAttentionは加算マスクを期待
        return x

### Decoder part
This part, I implemented these functions:
- `CrossAttention`(English Queue, Italian Key, Italian Value)
- `DecoderBlock`
- `BertTranslationModel`(Bert + Decoder Embedding + DecoderBlock*`num_layers`)

In [3]:
class CrossAttention(nn.Module):
    """
    this module is implemented with modifying MultiHeadAttention.
    Query: English
    Key, Value: Italian
    You can see the difference in forward input
    """
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__() # initialization
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads  # dimention of each head
        self.dropout = torch.nn.Dropout(dropout)

        # Q, K, V の線形変換（修正：torch.nn.linear → torch.nn.Linear）
        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)

        # 最終的な出力変換
        self.out_proj = torch.nn.Linear(d_model, d_model)

    def forward(self, query_input, key_value_input, mask=None): # here is the difference
        batch_size, q_len, _ = query_input.shape
        _, kv_len, _ = key_value_input.shape
        # ステップ1: Q, K, V を線形変換で生成
        query = self.query(query_input)  # (batch, seq_len, d_model)
        key = self.key(key_value_input)      # (batch, seq_len, d_model)
        value = self.value(key_value_input)  # (batch, seq_len, d_model)

        # ステップ2: Multi-Head用に次元を変形
        query = query.view(batch_size, q_len, self.num_heads, self.head_dim)
        key = key.view(batch_size, kv_len, self.num_heads, self.head_dim)  # 修正: query.shape → batch_size
        value = value.view(batch_size, kv_len, self.num_heads, self.head_dim)

        query = query.permute(0, 2, 1, 3)  # (batch, num_heads, seq_len, head_dim)
        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)

        # ステップ4: Scaled Dot-Product Attention
        # scores = Q @ K^T / sqrt(d_k)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # ステップ5: マスク処理（オプション）
        if mask is not None:
            # mask形状: (batch, 1, 1, seq_len) → scores形状: (batch, num_heads, seq_len, seq_len)
            scores = scores + mask  # ブロードキャストで加算

        # ステップ6: Softmax + Dropout
        weights = F.softmax(scores, dim=-1)  # (batch, num_heads, seq_len, seq_len)
        weights = self.dropout(weights)
        # ステップ7: Value との積
        context = torch.matmul(weights, value)
        # ステップ8: ヘッドを結合して元の形状に戻す
        context = context.permute(0, 2, 1, 3)
        # → (batch, seq_len, d_model)
        context = context.contiguous().view(batch_size, q_len, self.num_heads * self.head_dim)

        # ステップ9: 最終的な線形変換
        return self.out_proj(context)  # 修正: output_linear → out_proj

class DecoderBlock(nn.Module):
    """
    Basically similar to EncoderBlock, but refer to the infomation of Input(English context)
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        #First, implement Self Attention
        self.self_attention = MultiHeadAttention(d_model,num_heads)
        #Second, implement Cross Attention
        self.cross_attention = CrossAttention(d_model, num_heads)
        #Third, FFNN
        self.ffn = PositionwiseFeedForward(d_model,d_ff)

        self.layer_norm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layer_norm3 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, self_mask=None, cross_mask=None):
        #Self Attention
        residual = x
        x = self.layer_norm1(x)
        x = self.self_attention(x,mask=self_mask)
        x = self.dropout(x) + residual

        #Cross Attention
        residual = x
        x = self.layer_norm2(x)
        x = self.cross_attention(
            query_input=x,
            key_value_input=encoder_output,
            mask=cross_mask
        )
        x = self.dropout(x) + residual

        residual = x
        x = self.layer_norm3(x)
        x = self.ffn(x)
        x = self.dropout(x) + residual
        return x

class BertTranslationModel(nn.Module):
    """
    Ita2Eng Translation Model
    Encoder: Bert
    Decoder: BertEmbedding, DecoderBlock*N, FFN
    """
    def __init__(self,
                 ita_vocab_size,  # イタリア語語彙サイズ
                 eng_vocab_size,  # 英語語彙サイズ
                 max_seq_len,
                 d_model=512,
                 num_layers=6,
                 num_heads=8,
                 dropout=0.1):
        super().__init__()

        self.encoder = Bert(
            vocab_size=eng_vocab_size,
            d_model=d_model,
            num_layers=num_layers,
            num_heads=num_heads,
            max_seq_len=max_seq_len,
            dropout=dropout
        )

        self.decoder_embeddings = BertEmbeddings(
            vocab_size=ita_vocab_size,
            d_model=d_model,
            max_seq_len=max_seq_len,
            dropout=dropout
        )

        self.decoder_blocks = nn.ModuleList([
            DecoderBlock(
                d_model=d_model,
                num_heads=num_heads,
                d_ff=d_model * 4, #based on the paper of Bert
                dropout=dropout)
            for _ in range(num_layers)
        ])

        self.output_proj = nn.Linear(d_model, ita_vocab_size)

    def forward(self,
                eng_ids,
                ita_ids,
                eng_mask=None,
                ita_mask=None,
                eng_token_type_ids=None,
                ita_token_type_ids=None):
        # understand english
        encoder_output = self.encoder(input_ids=eng_ids, attention_mask=eng_mask, token_type_ids=eng_token_type_ids)
        # produce Italian
        decoder_input = self.decoder_embeddings(input_ids=ita_ids, token_type_ids=ita_token_type_ids)

        for decoder_block in self.decoder_blocks:
            decoder_input = decoder_block(
                x=decoder_input,
                encoder_output=encoder_output,
                self_mask=ita_mask,               # 英語のCausal mask
                cross_mask=eng_mask
                )
        logits = self.output_proj(decoder_input)
        return logits

## Use model
In this part, I followed the configuration of [2]`English_to_italian_automatic_translation.ipynb`.

---
### Prepare Dataset
for Bert, `<sos>`and `<eos>` are not required. Hence, ignore these token.

In [4]:
# Download the files
URL = "https://drive.google.com/file/d/1_npGYZk13fs5hE0kAggiSrmKkqW3OrLT/view?usp=sharing"
!gdown --fuzzy $URL -O- | tar -xz

Downloading...
From: https://drive.google.com/uc?id=1_npGYZk13fs5hE0kAggiSrmKkqW3OrLT
To: <_io.BufferedWriter name='<stdout>'>
100% 3.92M/3.92M [00:00<00:00, 15.4MB/s]


In [5]:
# for Bert, <sos> and <eos> are not required
#SPECIAL = ["<sos>", "<eos>", "<pad>"]
SPECIAL = ["<pad>"]
MAXLEN = 20

f = open("text-eng.txt")
# Define the list of all tokens in the English set ...
ENG_VOCABULARY = []
for line in f:
    line = line.strip()
    # Remove <sos> and <eos>
    line = line.replace('<sos>', '').replace('<eos>', '').strip()
    if line == "":
        continue

    ENG_VOCABULARY.append(line)
f.close()
print(ENG_VOCABULARY[:50])

f = open("text-ita.txt")
# Define the list of all tokens in the Italian set ...
ITA_VOCABULARY = []
for line in f:
    line = line.strip()
    # Remove <sos> and <eos>
    line = line.replace('<sos>', '').replace('<eos>', '').strip()
    if line == "":
        continue
    ITA_VOCABULARY.append(line)
f.close()
print(ITA_VOCABULARY[:50])
# Make sure that the three special tokens have the same indices in the two vocabularies.
# Assign here the three indices...

PAD = SPECIAL[0]

# Inverse mappings.
ENG_INVERSE = {w: n for n, w in enumerate(ENG_VOCABULARY)}
ITA_INVERSE = {w: n for n, w in enumerate(ITA_VOCABULARY)}
#print(ENG_INVERSE)
print(len(ENG_VOCABULARY), len(ITA_VOCABULARY))

['hi .', 'hi .', 'run !', 'run !', 'run !', 'who ?', 'wow !', 'duck !', 'duck !', 'jump !', 'jump !', 'jump !', 'jump .', 'jump .', 'jump .', 'stay .', 'stay .', 'stay .', 'stay .', 'stay .', 'stay .', 'stay .', 'stay .', 'stay .', 'stop !', 'stop !', 'stop !', 'wait !', 'wait !', 'wait !', 'wait .', 'wait .', 'wait .', 'do it .', 'do it .', 'do it .', 'do it .', 'do it .', 'do it .', 'go on .', 'go on .', 'go on .', 'go on .', 'go on .', 'go on .', 'hello !', 'hello !', 'hello !', 'hello .', 'i hid .']
['ciao !', 'ciao .', 'corri !', 'corra !', 'correte !', 'chi ?', 'wow !', 'amore !', 'tesoro !', 'salta !', 'salti !', 'saltate !', 'salta .', 'salti .', 'saltate .', 'resta .', 'stai .', 'stia .', 'state .', 'resti .', 'restate .', 'rimani .', 'rimanga .', 'rimanete .', 'fermati !', 'fermatevi !', 'si fermi !', 'aspetta !', 'aspettate !', 'aspetti !', 'aspetta .', 'aspetti .', 'aspettate .', 'fallo .', 'falla .', 'lo faccia .', 'la faccia .', 'fatelo .', 'fatela .', 'vai avanti .', 'co

### Incremental approach to token vocabulary building
In the lesson of Deep Learning, I learned the sophisticated way of tokenizing words called WordPiece tokenization.

The WordPiece tokenization algorithm builds its vocabulary incrementally, starting from a basic alphabet and iteratively merging subword units based on their frequency and co-occurrence patterns. (cited from [1]`06_Attention_and_Transformers_in_Bert.ipynb`)

---

#### The demonstration of pretrained tokenizer

In [1], Tokenizer `bert-base-cased` was used for English(For tokenization of English, it's used in this project as well). In this project, Tokenizier [3]`dbmdz/bert-base-italian-cased` for italian is used.
[3]https://huggingface.co/dbmdz/bert-base-italian-cased  
\
In this section, With using small scentence, The procedure will be explained.

These procedure will be iterated:
- Compute word frequencies
- Split Words into Alphabet
- Compute score of each pair
- Merge the pair

In [6]:
eng_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # English
ita_tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")  # Italian

### Example bilingual corpus
eng_corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

ita_corpus = [
    "Questo è il corso di Hugging Face.",
    "Questo capitolo riguarda la tokenizzazione.",
    "Questa sezione mostra diversi algoritmi di tokenizzazione.",
    "Speriamo che tu sia in grado di capire come vengono addestrati e generano token.",
]

### Get frequency for English
eng_word_freqs = defaultdict(int)
for text in eng_corpus:
    words_with_offsets = eng_tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    print(f"English: {new_words}")
    for word in new_words:
        eng_word_freqs[word] += 1

### Get frequency for Italian
ita_word_freqs = defaultdict(int)
for text in ita_corpus:
    words_with_offsets = ita_tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    print(f"Italian: {new_words}")
    for word in new_words:
        ita_word_freqs[word] += 1

print(f"\nEnglish Word Frequency: {eng_word_freqs}")
print(f"Italian Word Frequency: {ita_word_freqs}")

# Get vocabulary sizes for model initialization
eng_vocab_size = eng_tokenizer.vocab_size
ita_vocab_size = ita_tokenizer.vocab_size
print(f"\nEnglish vocab size: {eng_vocab_size}")
print(f"Italian vocab size: {ita_vocab_size}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

English: ['This', 'is', 'the', 'Hugging', 'Face', 'Course', '.']
English: ['This', 'chapter', 'is', 'about', 'tokenization', '.']
English: ['This', 'section', 'shows', 'several', 'tokenizer', 'algorithms', '.']
English: ['Hopefully', ',', 'you', 'will', 'be', 'able', 'to', 'understand', 'how', 'they', 'are', 'trained', 'and', 'generate', 'tokens', '.']
Italian: ['Questo', 'è', 'il', 'corso', 'di', 'Hugging', 'Face', '.']
Italian: ['Questo', 'capitolo', 'riguarda', 'la', 'tokenizzazione', '.']
Italian: ['Questa', 'sezione', 'mostra', 'diversi', 'algoritmi', 'di', 'tokenizzazione', '.']
Italian: ['Speriamo', 'che', 'tu', 'sia', 'in', 'grado', 'di', 'capire', 'come', 'vengono', 'addestrati', 'e', 'generano', 'token', '.']

English Word Frequency: defaultdict(<class 'int'>, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'Course': 1, '.': 4, 'chapter': 1, 'about': 1, 'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'Hopefully': 1, ',': 1, 

In [7]:
### split all word into alphabet
alphabet = []
for word in ita_word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")

alphabet.sort()
print(f'All alphabets: {alphabet}')

### insert special token and subword
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()
splits = {word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)] for word in ita_word_freqs.keys()}
print(f'\nSplitted Words: {splits}')

All alphabets: ['##a', '##c', '##d', '##e', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##z', '.', 'F', 'H', 'Q', 'S', 'a', 'c', 'd', 'e', 'g', 'i', 'l', 'm', 'r', 's', 't', 'v', 'è']

Splitted Words: {'Questo': ['Q', '##u', '##e', '##s', '##t', '##o'], 'è': ['è'], 'il': ['i', '##l'], 'corso': ['c', '##o', '##r', '##s', '##o'], 'di': ['d', '##i'], 'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'], 'Face': ['F', '##a', '##c', '##e'], '.': ['.'], 'capitolo': ['c', '##a', '##p', '##i', '##t', '##o', '##l', '##o'], 'riguarda': ['r', '##i', '##g', '##u', '##a', '##r', '##d', '##a'], 'la': ['l', '##a'], 'tokenizzazione': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##z', '##a', '##z', '##i', '##o', '##n', '##e'], 'Questa': ['Q', '##u', '##e', '##s', '##t', '##a'], 'sezione': ['s', '##e', '##z', '##i', '##o', '##n', '##e'], 'mostra': ['m', '##o', '##s', '##t', '##r', '##a'], 'diversi': ['d', '##i', '##v', '##e', '##r', '##s',

In [8]:
 ### compute score for merging

def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)

    for word, freq in ita_word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores

pair_scores = compute_pair_scores(splits)
print(f'Scores for each Pair: {pair_scores}')

Scores for each Pair: {('Q', '##u'): 0.16666666666666666, ('##u', '##e'): 0.025, ('##e', '##s'): 0.02857142857142857, ('##s', '##t'): 0.08928571428571429, ('##t', '##o'): 0.01875, ('i', '##l'): 0.16666666666666666, ('c', '##o'): 0.02, ('##o', '##r'): 0.01, ('##r', '##s'): 0.02857142857142857, ('##s', '##o'): 0.007142857142857143, ('d', '##i'): 0.05263157894736842, ('H', '##u'): 0.16666666666666666, ('##u', '##g'): 0.027777777777777776, ('##g', '##g'): 0.027777777777777776, ('##g', '##i'): 0.008771929824561403, ('##i', '##n'): 0.0043859649122807015, ('##n', '##g'): 0.027777777777777776, ('F', '##a'): 0.06666666666666667, ('##a', '##c'): 0.06666666666666667, ('##c', '##e'): 0.05, ('c', '##a'): 0.02666666666666667, ('##a', '##p'): 0.044444444444444446, ('##p', '##i'): 0.03508771929824561, ('##i', '##t'): 0.013157894736842105, ('##o', '##l'): 0.016666666666666666, ('##l', '##o'): 0.016666666666666666, ('r', '##i'): 0.05263157894736842, ('##i', '##g'): 0.008771929824561403, ('##g', '##u'): 

In [9]:
### finding pair with best score

best_pair = ""
max_score = None
for pair, score in pair_scores.items():
    if max_score is None or max_score < score:
        best_pair = pair
        max_score = score

print(best_pair, max_score)
vocab.append("ab")

### merge pair ###
def merge_pair(a, b, splits):
    for word in ita_word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

splits = merge_pair("Q", "##u", splits)
print(splits["Questo"])

('S', '##p') 0.3333333333333333
['Qu', '##e', '##s', '##t', '##o']


In [10]:
### keep looping to merge more pair

vocab_size = 70
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)

print(f'Final Vocab: {vocab}')

Final Vocab: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##c', '##d', '##e', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##z', '.', 'F', 'H', 'Q', 'S', 'a', 'c', 'd', 'e', 'g', 'i', 'l', 'm', 'r', 's', 't', 'v', 'è', 'ab', 'Hu', 'Sp', 'ch', 'il', 'al', 'ad', 'add', 'Hug', 'Hugg', 'alg', '##gu', 'tu', '##st', '##tm', '##rs', '##ng', 'in', 'Fa', 'Fac', '##ap', 'cap', '##gua', '##guar', '##guard', '##guarda', 'la', '##ad']


In [11]:
### encode a word ###
def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

print(encode_word("Questo"))
# This one should be unknown (within this corpus)
print(encode_word("Qaesto"))

['Q', '##u', '##e', '##st', '##o']
['Q', '##a', '##e', '##st', '##o']


## BERTDataset

In [12]:
class BERTDataset(Dataset):
    def __init__(self, data_pair, tokenizer, seq_len=64):

        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.corpus_lines = len(data_pair)
        self.lines = data_pair

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):

        # Step 1: get random sentence pair, either negative or positive (saved as is_next_label)
        t1, t2, is_next_label = self.get_sent(item)

        # Step 2: replace random words in sentence with mask / random words
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # Step 3: Adding CLS and SEP tokens to the start and end of sentences
        # Adding PAD token for labels
        t1 = [self.tokenizer.vocab['[CLS]']] + t1_random + [self.tokenizer.vocab['[SEP]']]
        t2 = t2_random + [self.tokenizer.vocab['[SEP]']]
        t1_label = [self.tokenizer.vocab['[PAD]']] + t1_label + [self.tokenizer.vocab['[PAD]']]
        t2_label = t2_label + [self.tokenizer.vocab['[PAD]']]

        # Step 4: combine sentence 1 and 2 as one input
        # adding PAD tokens to make the sentence same length as seq_len
        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}

    def random_word(self, sentence):
        tokens = sentence.split()
        output_label = []
        output = []

        # 15% of the tokens would be replaced
        for i, token in enumerate(tokens):
            prob = random.random()

            # remove cls and sep token
            token_id = self.tokenizer(token)['input_ids'][1:-1]

            # 15% chance of altering token
            if prob < 0.15:
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    for i in range(len(token_id)):
                        output.append(self.tokenizer.vocab['[MASK]'])

                # 10% chance change token to random token
                elif prob < 0.9:
                    for i in range(len(token_id)):
                        output.append(random.randrange(len(self.tokenizer.vocab)))

                # 10% chance change token to current token
                else:
                    output.append(token_id)

                output_label.append(token_id)

            else:
                output.append(token_id)
                for i in range(len(token_id)):
                    output_label.append(0)

        # flattening
        output = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output]))
        output_label = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output_label]))
        assert len(output) == len(output_label)
        return output, output_label

    def get_sent(self, index):
        '''return random sentence pair'''
        t1, t2 = self.get_corpus_line(index)

        # negative or positive pair, for next sentence prediction
        if random.random() > 0.5:
            return t1, t2, 1
        else:
            return t1, self.get_random_line(), 0

    def get_corpus_line(self, item):
        '''return sentence pair'''
        return self.lines[item][0], self.lines[item][1]

    def get_random_line(self):
        '''return random single sentence'''
        return self.lines[random.randrange(len(self.lines))][1]

# Task
Create a translation model using `BertTranslationModel` based on loaded text data. The model should translate from English to Italian. Outline the steps for Tokenization, Word Embedding, and training.

## データセットの準備

### Subtask:
翻訳モデル用に、英語とイタリア語のペアを含むデータセットを準備します。これには、テキストデータの読み込み、トークン化、パディング、テンソルへの変換などが含まれます。`BERTDataset` クラスを翻訳タスクに合わせて修正または新しく定義する必要があるかもしれません。


**Reasoning**:
The first step is to load the English and Italian sentence pairs from the text files and store them as a list of tuples.



In [13]:
eng_sentences = []
with open("text-eng.txt", "r") as f:
    for line in f:
        line = line.strip().replace('<sos>', '').replace('<eos>', '').strip()
        if line:
            eng_sentences.append(line)

ita_sentences = []
with open("text-ita.txt", "r") as f:
    for line in f:
        line = line.strip().replace('<sos>', '').replace('<eos>', '').strip()
        if line:
            ita_sentences.append(line)

data_pair = list(zip(eng_sentences, ita_sentences))

# Split data into training and validation sets
# Using a simple split for now, can use train_test_split later if needed
train_size = int(0.8 * len(data_pair))
train_data = data_pair[:train_size]
val_data = data_pair[train_size:]

print(f"Total pairs: {len(data_pair)}")
print(f"Training pairs: {len(train_data)}")
print(f"Validation pairs: {len(val_data)}")

Total pairs: 333112
Training pairs: 266489
Validation pairs: 66623


In [14]:
class TranslationDataset(Dataset):
    def __init__(self, data_pair, eng_tokenizer, ita_tokenizer, seq_len=64):
        self.data_pair = data_pair
        self.eng_tokenizer = eng_tokenizer
        self.ita_tokenizer = ita_tokenizer
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data_pair)

    def __getitem__(self, item):
        eng_sentence, ita_sentence = self.data_pair[item]

        # Tokenize English sentence
        eng_tokens = self.eng_tokenizer(
            eng_sentence,
            padding='max_length',
            truncation=True,
            max_length=self.seq_len,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        # Tokenize Italian sentence for decoder input and target
        # Need to add [CLS] and [SEP] for decoder input, and [SEP] for target
        # The target is the input shifted by one token, effectively predicting the next token
        ita_input_tokens = self.ita_tokenizer(
            ita_sentence,
            padding='max_length',
            truncation=True,
            max_length=self.seq_len,
            return_attention_mask=True, # This mask is for cross-attention from encoder
            return_token_type_ids=True,
            return_tensors='pt'
        )

        # Create decoder input by adding [CLS] and shifting
        ita_input_ids = ita_input_tokens['input_ids'].squeeze(0)
        ita_attention_mask = ita_input_tokens['attention_mask'].squeeze(0)
        ita_token_type_ids = ita_input_tokens['token_type_ids'].squeeze(0)

        # Create the causal mask for the decoder self-attention
        # This ensures the decoder at a given position only attends to previous positions
        seq_len = ita_input_ids.size(0)
        casual_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
        casual_mask = casual_mask.masked_fill(casual_mask, float('-inf'))
        casual_mask = casual_mask.masked_fill(~casual_mask, 0.0)


        # Create the target labels for Italian (shifted input)
        # The target is the input sequence shifted by one position, with the first token ([CLS]) removed.
        # The padding token at the end of the input sequence will correspond to the last real token in the target.
        ita_target_ids = ita_input_ids.clone()
        # Shift target sequence by one position
        ita_target_ids = torch.cat([ita_target_ids[1:], torch.tensor([self.ita_tokenizer.pad_token_id])])

        output = {
            "eng_ids": eng_tokens['input_ids'].squeeze(0),
            "eng_mask": eng_tokens['attention_mask'].squeeze(0),
            "eng_token_type_ids": eng_tokens['token_type_ids'].squeeze(0),
            "ita_ids": ita_input_ids,
            "ita_mask": ita_attention_mask, # This will be used for cross attention.
            "ita_token_type_ids": ita_token_type_ids,
            "ita_casual_mask": casual_mask, # This will be used for self attention in decoder.
            "ita_target_ids": ita_target_ids,
        }

        return output


**Reasoning**:
Create Dataset objects for the training and validation sets using the defined `TranslationDataset` class and then create DataLoaders for both datasets to handle batching and shuffling during training and evaluation.



In [15]:
train_dataset = TranslationDataset(train_data, eng_tokenizer, ita_tokenizer, seq_len=MAXLEN)
val_dataset = TranslationDataset(val_data, eng_tokenizer, ita_tokenizer, seq_len=MAXLEN)

BATCH_SIZE = 32

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

print(f"Number of batches in training DataLoader: {len(train_dataloader)}")
print(f"Number of batches in validation DataLoader: {len(val_dataloader)}")

Number of batches in training DataLoader: 8328
Number of batches in validation DataLoader: 2082


## モデルの定義

### Subtask:
`BertTranslationModel` クラスを使用してモデルをインスタンス化します。この際、適切な語彙サイズ、モデルの次元、レイヤー数、ヘッド数などを指定します。


**Reasoning**:
BertTranslationModel クラスを適切な引数でインスタンス化し、その構造を確認します。



In [16]:
model = BertTranslationModel(
    ita_vocab_size=ita_tokenizer.vocab_size,
    eng_vocab_size=eng_tokenizer.vocab_size,
    max_seq_len=MAXLEN,
    d_model=768, # 使用するBERTモデルの一般的な次元
    num_layers=6, # レイヤー数は適宜設定
    num_heads=12, # ヘッド数は適宜設定 (d_modelで割り切れるように)
    dropout=0.1
)

print(model)

BertTranslationModel(
  (encoder): Bert(
    (embedding): BertEmbeddings(
      (token): Embedding(28996, 768, padding_idx=0)
      (position): Embedding(20, 768)
      (segment): Embedding(2, 768)
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder_blocks): ModuleList(
      (0-5): 6 x EncoderBlock(
        (attention): MultiHeadAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=768, out_features=3072, bias=True)
          (linear2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(

## 損失関数とオプティマイザの設定

### Subtask:
翻訳タスクに適した損失関数（例: `CrossEntropyLoss`）とオプティマイザ（例: `Adam`）を定義します。


**Reasoning**:
Define the loss function and the optimizer for the translation task as instructed.



In [17]:
criterion = nn.CrossEntropyLoss(ignore_index=ita_tokenizer.pad_token_id)
optimizer = Adam(model.parameters(), lr=1e-4)

print("Loss function (criterion):", criterion)
print("Optimizer:", optimizer)

Loss function (criterion): CrossEntropyLoss()
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 0
)


## 訓練ループの実装

### Subtask:
モデルを訓練するためのループを作成します。これには、データのバッチ処理、モデルのフォワードパス、損失の計算、バックプロパゲーション、パラメータの更新などが含まれます。


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

EPOCHS = 5 # Define the number of training epochs

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    train_iterator = tqdm.tqdm(train_dataloader, desc=f"Epoch {epoch+1}")

    for i, batch in enumerate(train_iterator):
        # Move batch to device
        eng_ids = batch['eng_ids'].to(device)
        eng_mask = batch['eng_mask'].to(device)
        eng_token_type_ids = batch['eng_token_type_ids'].to(device)
        ita_ids = batch['ita_ids'].to(device)
        # ita_mask from dataset is for cross-attention, shape (batch_size, seq_len)
        ita_mask = batch['ita_mask'].to(device)
        ita_target_ids = batch['ita_target_ids'].to(device)
        ita_token_type_ids = batch['ita_token_type_ids'].to(device)
        # ita_casual_mask from dataset is for self-attention, shape (seq_len, seq_len)
        ita_casual_mask = batch['ita_casual_mask'].to(device)


        # Prepare masks for the model forward pass
        # Eng mask for encoder self-attention and decoder cross-attention
        # Needs to be broadcastable to (batch, num_heads, q_len/seq_len, kv_len/seq_len)
        batch_size, ita_seq_len = ita_ids.shape
        _, eng_seq_len = eng_ids.shape
        num_heads = model.decoder_blocks[0].cross_attention.num_heads # Assuming same number of heads for self and cross attention

        # Eng mask for encoder self-attention
        # Reshape to (batch, 1, 1, eng_seq_len) for broadcasting
        eng_encoder_mask = eng_mask.unsqueeze(1).unsqueeze(2)
        # Convert to additive mask format (0.0 and -1e9)
        eng_encoder_mask = (1.0 - eng_encoder_mask.float()) * -1e9 # Ensure float for calculation


        # Eng mask for decoder cross-attention
        # Needs shape (batch, num_heads, ita_seq_len, eng_seq_len) for addition
        # Start with (batch, 1, 1, eng_seq_len) and expand
        eng_cross_mask = eng_mask.unsqueeze(1).unsqueeze(2)
        eng_cross_mask = eng_cross_mask.expand(batch_size, num_heads, ita_seq_len, eng_seq_len)
        # Convert to additive mask format (0.0 and -1e9)
        eng_cross_mask = (1.0 - eng_cross_mask.float()) * -1e9 # Ensure float for calculation


        # Ita causal mask for decoder self-attention
        # Dataset provides (seq_len, seq_len) mask. After DataLoader, it's (batch_size, seq_len, seq_len).
        # Needs shape (batch_size, num_heads, seq_len, seq_len) for addition.
        # Add head dimension and expand.
        ita_casual_mask = ita_casual_mask.unsqueeze(1) # shape (batch_size, 1, seq_len, seq_len)
        ita_casual_mask = ita_casual_mask.expand(batch_size, num_heads, ita_seq_len, ita_seq_len)
        # The mask from dataset is already in additive format (-inf and 0.0)


        # Forward pass
        # Pass appropriate masks to the model
        # The model expects eng_mask to be the cross-attention mask and ita_mask to be the self-attention mask for the decoder
        logits = model(
            eng_ids=eng_ids,
            ita_ids=ita_ids,
            eng_mask=eng_cross_mask, # Use eng_cross_mask for cross-attention in the model
            ita_mask=ita_casual_mask, # Use ita_casual_mask for self-attention in the model
            eng_token_type_ids=eng_token_type_ids,
            ita_token_type_ids=ita_token_type_ids
            )

        # Calculate loss, ignoring padding tokens
        # Reshape logits and target for CrossEntropyLoss
        loss = criterion(logits.view(-1, logits.size(-1)), ita_target_ids.view(-1))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_iterator.set_postfix({'loss': loss.item()})

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} finished. Average training loss: {avg_train_loss:.4f}")

    # Optional: Add validation loop here
    # model.eval()
    # with torch.no_grad():
    #     total_val_loss = 0
    #     for batch in val_dataloader:
    #         ... calculate validation loss ...
    #     avg_val_loss = total_val_loss / len(val_dataloader)
    #     print(f"Epoch {epoch+1} validation loss: {avg_val_loss:.4f}")

Using device: cuda


Epoch 1: 100%|██████████| 8328/8328 [31:44<00:00,  4.37it/s, loss=0.00184]


Epoch 1 finished. Average training loss: 0.2999


Epoch 2:  79%|███████▉  | 6615/8328 [25:07<06:19,  4.52it/s, loss=0.00198]