In [None]:
# -*- coding: utf-8 -*-
"""
Tugas Individu: Machine Translation dengan PyTorch
Mata Kuliah: Pembelajaran Mesin 2
Topik: Penerapan Deep Learning dalam NLP dan Skenario Nyata (Machine Translation)

Tujuan Pembelajaran:
- Membangun sistem penerjemah otomatis berbasis deep learning.
- Membandingkan baseline RNN+Attention dengan Transformer.
- Mengevaluasi performa dengan metrik standar (mis. SacreBLEU, chrF) serta menulis laporan ilmiah sesuai format IEEE.
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from collections import Counter
import io
import os
import math
import time
import random
import re
import unicodedata
import sentencepiece as spm
from sacrebleu.metrics import BLEU, CHRF
from tqdm.notebook import tqdm
import torch.nn.functional as F

# --- 0. Konfigurasi Awal ---
# Set seed untuk reproduktifitas
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Menggunakan perangkat: {device}")

# Path ke dataset yang diunggah
DATA_PATH = '/content/ind.txt'
MODEL_DIR = 'spt_models'
os.makedirs(MODEL_DIR, exist_ok=True)

# --- 1. Persiapan Data ---

print("\n--- 1. Persiapan Data ---")

# Fungsi untuk membersihkan teks
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def preprocess_sentence(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
    return s.strip()

# Memuat dan membersihkan dataset
def load_data(path):
    en_sentences = []
    id_sentences = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                en_sentences.append(preprocess_sentence(parts[0]))
                id_sentences.append(preprocess_sentence(parts[1]))
    return en_sentences, id_sentences

en_data, id_data = load_data(DATA_PATH)
print(f"Jumlah pasangan kalimat yang dimuat: {len(en_data)}")

# Membagi data menjadi train, valid, test
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1

assert TRAIN_RATIO + VAL_RATIO + TEST_RATIO == 1.0

# Gabungkan data untuk shuffling
combined_data = list(zip(en_data, id_data))
random.shuffle(combined_data)
en_data, id_data = zip(*combined_data)

total_samples = len(en_data)
train_split = int(total_samples * TRAIN_RATIO)
val_split = int(total_samples * VAL_RATIO)

train_en, train_id = en_data[:train_split], id_data[:train_split]
val_en, val_id = en_data[train_split:train_split + val_split], id_data[train_split:train_split + val_split]
test_en, test_id = en_data[train_split + val_split:], id_data[train_split + val_split:]

print(f"Jumlah data train: {len(train_en)}")
print(f"Jumlah data validasi: {len(val_en)}")
print(f"Jumlah data test: {len(test_en)}")

# Tokenisasi Subword dengan SentencePiece
# Membangun model SentencePiece untuk bahasa Inggris
def train_sentencepiece(data, model_prefix, vocab_size=8000):
    temp_file = f"{model_prefix}_temp.txt"
    with open(temp_file, 'w', encoding='utf-8') as f:
        for sentence in data:
            f.write(sentence + '\n')

    spm.SentencePieceTrainer.train(
        f'--input={temp_file} --model_prefix={MODEL_DIR}/{model_prefix} '
        f'--vocab_size={vocab_size} --model_type=bpe '
        f'--character_coverage=1.0 --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3'
    )
    os.remove(temp_file)
    print(f"Model SentencePiece '{model_prefix}' dilatih dan disimpan di {MODEL_DIR}/.")

# Check if models exist before training
if not os.path.exists(f'{MODEL_DIR}/en_bpe.model'):
    train_sentencepiece(en_data, 'en_bpe', vocab_size=8000)
if not os.path.exists(f'{MODEL_DIR}/id_bpe.model'):
    train_sentencepiece(id_data, 'id_bpe', vocab_size=8000)


# Memuat model SentencePiece
sp_en = spm.SentencePieceProcessor()
sp_en.load(f'{MODEL_DIR}/en_bpe.model')

sp_id = spm.SentencePieceProcessor()
sp_id.load(f'{MODEL_DIR}/id_bpe.model')

# Tambahkan token khusus secara manually jika SentencePiece tidak otomatis menyertakannya
# Cek apakah token sudah ada ( biasanya SentencePiece akan menyertakan pad, unk, bos, eos)
# Jika tidak, kita perlu memetakan ID ini ke token SentencePiece
PAD_IDX_EN = sp_en.pad_id() # 0
UNK_IDX_EN = sp_en.unk_id() # 1
BOS_IDX_EN = sp_en.bos_id() # 2
EOS_IDX_EN = sp_en.eos_id() # 3

PAD_IDX_ID = sp_id.pad_id() # 0
UNK_IDX_ID = sp_id.unk_id() # 1
BOS_IDX_ID = sp_id.bos_id() # 2
EOS_IDX_ID = sp_id.eos_id() # 3

print(f"Ukuran vocab EN: {sp_en.get_piece_size()}")
print(f"Ukuran vocab ID: {sp_id.get_piece_size()}")

# Dataset PyTorch
class TranslationDataset(Dataset):
    def __init__(self, src_sentences, trg_sentences, src_tokenizer, trg_tokenizer):
        self.src_sentences = src_sentences
        self.trg_sentences = trg_sentences
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, idx):
        src_tokenized = [self.src_tokenizer.bos_id()] + self.src_tokenizer.encode(self.src_sentences[idx]) + [self.src_tokenizer.eos_id()]
        trg_tokenized = [self.trg_tokenizer.bos_id()] + self.trg_tokenizer.encode(self.trg_sentences[idx]) + [self.trg_tokenizer.eos_id()]
        return torch.tensor(src_tokenized, dtype=torch.long), torch.tensor(trg_tokenized, dtype=torch.long)

# Fungsi collate_fn untuk DataLoader
def collate_fn(batch):
    src_batch, trg_batch = [], []
    for src_sample, trg_sample in batch:
        src_batch.append(src_sample)
        trg_batch.append(trg_sample)

    src_padded = pad_sequence(src_batch, padding_value=PAD_IDX_EN, batch_first=True)
    trg_padded = pad_sequence(trg_batch, padding_value=PAD_IDX_ID, batch_first=True)

    src_lengths = torch.tensor([len(s) for s in src_batch], dtype=torch.long)
    trg_lengths = torch.tensor([len(t) for t in trg_batch], dtype=torch.long)

    return src_padded, src_lengths, trg_padded, trg_lengths

# Membuat instance Dataset dan DataLoader
BATCH_SIZE = 128

train_dataset = TranslationDataset(list(train_en), list(train_id), sp_en, sp_id)
val_dataset = TranslationDataset(list(val_en), list(val_id), sp_en, sp_id)
test_dataset = TranslationDataset(list(test_en), list(test_id), sp_en, sp_id)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("\nData loader siap.")


# --- 2. Implementasi Baseline (RNN + Attention) ---

print("\n--- 2. Implementasi Baseline (RNN + Attention) ---")

class EncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX_EN)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True) # Made bidirectional
        self.fc = nn.Linear(hid_dim * 2, hid_dim) # Linear layer to combine bidirectional hidden states
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        # src = [batch size, src len]
        # src_len = [batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [batch size, src len, emb dim]

        # Pack padded sequence
        packed_embedded = pack_padded_sequence(embedded, src_len.cpu(), batch_first=True, enforce_sorted=False)

        packed_output, hidden = self.rnn(packed_embedded)

        # Unpack sequence
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        # output = [batch size, src len, hid dim * 2]
        # hidden = [n layers * 2, batch size, hid dim]

        # Combine bidirectional hidden states
        # hidden[-2,:,:] is the final forward hidden state
        # hidden[-1,:,:] is the final backward hidden state
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        # hidden = [batch size, hid dim]

        # Unsqueeze to add a sequence length dimension for compatibility with decoder's initial hidden state expectation
        hidden = hidden.unsqueeze(0) # [1, batch size, hid dim]

        return output, hidden

class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        # Input to attn is concatenation of encoder output (bidirectional) and decoder hidden state (unidirectional)
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [n layers, batch size, dec hid dim] -> Decoder's hidden state at current timestep
        # encoder_outputs = [batch size, src len, enc hid dim * 2] -> From bidirectional encoder

        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]

        # Use the decoder's hidden state from the last layer and repeat it src_len times
        hidden = hidden[-1].unsqueeze(1) # [batch size, 1, dec hid dim]
        hidden = hidden.repeat(1, src_len, 1) # [batch size, src len, dec hid dim]

        # concatenate hidden state with encoder outputs
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        # attention = [batch size, src len]

        return F.softmax(attention, dim=1)


class DecoderRNN(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=PAD_IDX_ID)
        # Input to decoder GRU is concatenation of embedded input, weighted encoder output (bidirectional)
        self.rnn = nn.GRU((hid_dim * 2) + emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True) # GRU input size matches concatenated input
        self.fc_out = nn.Linear(hid_dim + (hid_dim * 2) + emb_dim, output_dim) # Corrected FC input size
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size]
        # hidden = [n layers, batch size, hid dim] -> Decoder's hidden state at previous timestep
        # encoder_outputs = [batch size, src len, enc hid dim * 2] -> From bidirectional encoder

        input = input.unsqueeze(1) # [batch size, 1]

        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb dim]

        # Calculate attention weights
        attn_weights = self.attention(hidden, encoder_outputs) # [batch size, src len]
        attn_weights = attn_weights.unsqueeze(1) # [batch size, 1, src len]

        # Calculate weighted sum of encoder outputs
        weighted_encoder_outputs = torch.bmm(attn_weights, encoder_outputs)
        # weighted_encoder_outputs = [batch size, 1, enc hid dim * 2]

        rnn_input = torch.cat((embedded, weighted_encoder_outputs), dim=2)
        # rnn_input = [batch size, 1, (enc hid dim * 2) + emb dim]

        # GRU expects [seq_len, batch_size, input_size] when batch_first=False
        # or [batch_size, seq_len, input_size] when batch_first=True
        # Since batch_first=True in GRU definition, input should be [batch_size, 1, input_size]
        output, hidden = self.rnn(rnn_input, hidden)
        # output = [batch size, 1, hid dim]
        # hidden = [n layers, batch size, hid dim]

        # Squeeze output to remove the sequence length dimension of 1
        output = output.squeeze(1) # [batch size, hid dim]

        embedded = embedded.squeeze(1) # [batch size, emb dim]
        weighted_encoder_outputs = weighted_encoder_outputs.squeeze(1) # [batch size, enc hid dim * 2]

        print(f"Shape of output before fc_out: {output.shape}")
        print(f"Shape of weighted_encoder_outputs before fc_out: {weighted_encoder_outputs.shape}")
        print(f"Shape of embedded before fc_out: {embedded.shape}")


        # Concatenate decoder output, weighted encoder outputs, and embedded input for final prediction
        prediction = self.fc_out(torch.cat((output, weighted_encoder_outputs, embedded), dim=1))
        # prediction = [batch size, output dim]

        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        # src = [batch size, src len]
        # src_len = [batch size]
        # trg = [batch size, trg len]
        # teacher_forcing_ratio is probability to use teacher forcing

        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        # encoder_outputs is all hidden states of the input sequence, back and forwards
        # hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)
        # encoder_outputs = [batch size, src len, enc hid dim * 2]
        # hidden = [1, batch size, hid dim] - after combining and unsqueezing in EncoderRNN

        # first input to the decoder is the <bos> token
        input = trg[:, 0] # [batch size] (first token of each target sentence)

        # The initial hidden state for the decoder should be the final combined hidden state from the encoder
        # The decoder GRU has n_layers, but the combined encoder hidden state is [1, batch size, hid dim]
        # We need to repeat this hidden state for each layer of the decoder GRU
        decoder_hidden = hidden.repeat(self.decoder.n_layers, 1, 1) # [n layers, batch size, hid dim]


        for t in range(1, trg_len):
            # insert input token embedding, previous hidden state and all encoder outputs
            # then get next output token prediction and new hidden state
            output, decoder_hidden = self.decoder(input, decoder_hidden, encoder_outputs)

            # store output prediction
            outputs[:, t, :] = output

            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            # get the highest predicted token from our predictions
            top1 = output.argmax(1)

            # if teacher forcing, use actual next token as next input
            # else, use predicted token
            input = trg[:, t] if teacher_force else top1

        return outputs

# Inisialisasi model RNN
INPUT_DIM_RNN = sp_en.get_piece_size()
OUTPUT_DIM_RNN = sp_id.get_piece_size()
ENC_EMB_DIM_RNN = 256
DEC_EMB_DIM_RNN = 256
HID_DIM_RNN = 512
N_LAYERS_RNN = 2
ENC_DROPOUT_RNN = 0.5
DEC_DROPOUT_RNN = 0.5

attn = Attention(HID_DIM_RNN, HID_DIM_RNN) # enc_hid_dim*2 is handled internally in Attention
enc = EncoderRNN(INPUT_DIM_RNN, ENC_EMB_DIM_RNN, HID_DIM_RNN, N_LAYERS_RNN, ENC_DROPOUT_RNN)
dec = DecoderRNN(OUTPUT_DIM_RNN, DEC_EMB_DIM_RNN, HID_DIM_RNN, N_LAYERS_RNN, DEC_DROPOUT_RNN, attn)

model_rnn = Seq2Seq(enc, dec, device).to(device)

def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            if name.find('rnn.') != -1: # Initialize RNN weights with orthogonal initialization
                 nn.init.orthogonal_(param.data)
            else: # Initialize other weights with normal distribution
                nn.init.normal_(param.data, mean=0, std=0.01)
        else: # Initialize biases to zero
            nn.init.constant_(param.data, 0)

model_rnn.apply(init_weights)

TRG_PAD_IDX_RNN = PAD_IDX_ID
criterion_rnn = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX_RNN)
optimizer_rnn = optim.Adam(model_rnn.parameters())

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model RNN memiliki {count_parameters(model_rnn):,} parameter.")

# Fungsi pelatihan dan evaluasi
def train_epoch(model, dataloader, optimizer, criterion, clip, teacher_forcing_ratio):
    model.train()
    epoch_loss = 0
    for i, (src, src_len, trg, trg_len) in enumerate(dataloader):
        src, trg = src.to(device), trg.to(device)
        src_len, trg_len = src_len.to(device), trg_len.to(device)

        optimizer.zero_grad()

        output = model(src, src_len, trg, teacher_forcing_ratio)
        # trg = [batch size, trg len]
        # output = [batch size, trg len, output dim]

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim) # exclude <bos>
        trg = trg[:, 1:].reshape(-1) # exclude <bos>

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

def evaluate_epoch(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (src, src_len, trg, trg_len) in enumerate(dataloader):
            src, trg = src.to(device), trg.to(device)
            src_len, trg_len = src_len.to(device), trg_len.to(device)

            output = model(src, src_len, trg, 0) # turn off teacher forcing

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

N_EPOCHS = 10
CLIP = 1
best_valid_loss = float('inf')

print("\nMemulai pelatihan model RNN...")
for epoch in tqdm(range(N_EPOCHS)):
    start_time = time.time()

    train_loss = train_epoch(model_rnn, train_dataloader, optimizer_rnn, criterion_rnn, CLIP, 0.5)
    valid_loss = evaluate_epoch(model_rnn, val_dataloader, criterion_rnn)

    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_rnn.state_dict(), 'rnn_attention_model.pt')

    print(f'Epoch: {epoch+1:02} | Waktu: {epoch_mins:.0f}m {epoch_secs:.0f}s')
    print(f'\tKerugian Latihan: {train_loss:.3f} | PPL Latihan: {math.exp(train_loss):.3f}')
    print(f'\tKerugian Validasi: {valid_loss:.3f} | PPL Validasi: {math.exp(valid_loss):.3f}')

print("\nPelatihan model RNN selesai.")
model_rnn.load_state_dict(torch.load('rnn_attention_model.pt'))

Menggunakan perangkat: cuda

--- 1. Persiapan Data ---
Jumlah pasangan kalimat yang dimuat: 14881
Jumlah data train: 11904
Jumlah data validasi: 1488
Jumlah data test: 1489
Ukuran vocab EN: 8000
Ukuran vocab ID: 8000

Data loader siap.

--- 2. Implementasi Baseline (RNN + Attention) ---
Model RNN memiliki 31,173,952 parameter.

Memulai pelatihan model RNN...


  0%|          | 0/10 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Shape of output before fc_out: torch.Size([128, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([128, 1024])
Shape of embedded before fc_out: torch.Size([128, 256])
Shape of output before fc_out: torch.Size([128, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([128, 1024])
Shape of embedded before fc_out: torch.Size([128, 256])
Shape of output before fc_out: torch.Size([128, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([128, 1024])
Shape of embedded before fc_out: torch.Size([128, 256])
Shape of output before fc_out: torch.Size([128, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([128, 1024])
Shape of embedded before fc_out: torch.Size([128, 256])
Shape of output before fc_out: torch.Size([128, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([128, 1024])
Shape of embedded before fc_out: torch.Size([128, 256])
Shape of output befo

<All keys matched successfully>

In [None]:
# --- 3. Implementasi Transformer (Wajib) ---

print("\n--- 3. Implementasi Transformer ---")

class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim, padding_idx=PAD_IDX_EN)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.tensor(hid_dim, dtype=torch.float32))

    def forward(self, src, src_mask):
        # src = [batch size, src len]
        # src_mask = [batch size, 1, 1, src len]

        batch_size = src.shape[0]
        src_len = src.shape[1]

        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        # pos = [batch size, src len]

        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        # src = [batch size, src len, hid dim]

        for layer in self.layers:
            src = layer(src, src_mask)
        # src = [batch size, src len, hid dim]

        return src

class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforward(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        # src = [batch size, src len, hid dim]
        # src_mask = [batch size, 1, 1, src len]

        # self attention
        _src = self.self_attention(src, src, src, src_mask)

        # dropout, add and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))

        # positionwise feedforward
        _src = self.positionwise_feedforward(src)

        # dropout, add and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))

        return src

class MultiHeadAttention(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_heads = n_heads

        assert hid_dim % n_heads == 0

        self.w_q = nn.Linear(hid_dim, hid_dim)
        self.w_k = nn.Linear(hid_dim, hid_dim)
        self.w_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.tensor(hid_dim // n_heads, dtype=torch.float32))

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        # query, key, value = [batch size, seq len, hid dim]

        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)

        # Q, K, V = [batch size, seq len, hid dim]

        Q = Q.view(batch_size, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)

        # Q, K, V = [batch size, n heads, seq len, hid dim // n heads]

        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        # energy = [batch size, n heads, seq len, seq len]

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)
        # attention = [batch size, n heads, seq len, seq len]

        x = torch.matmul(self.dropout(attention), V)
        # x = [batch size, n heads, seq len, hid dim // n heads]

        x = x.permute(0, 2, 1, 3).contiguous()
        # x = [batch size, seq len, n heads, hid dim // n heads]

        x = x.view(batch_size, -1, self.hid_dim)
        # x = [batch size, seq len, hid dim]

        x = self.fc_o(x)
        # x = [batch size, seq len, hid dim]

        return x

class PositionwiseFeedforward(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x = [batch size, seq len, hid dim]
        x = self.dropout(torch.relu(self.fc_1(x)))
        # x = [batch size, seq len, pf dim]
        x = self.fc_2(x)
        # x = [batch size, seq len, hid dim]
        return x

class TransformerDecoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(output_dim, hid_dim, padding_idx=PAD_IDX_ID)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.tensor(hid_dim, dtype=torch.float32))

    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg = [batch size, trg len]
        # enc_src = [batch size, src len, hid dim]
        # trg_mask = [batch size, 1, trg len, trg len]
        # src_mask = [batch size, 1, 1, src len]

        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        # pos = [batch size, trg len]

        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        # trg = [batch size, trg len, hid dim]

        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)
        # trg = [batch size, trg len, hid dim]

        output = self.fc_out(trg)
        # output = [batch size, trg len, output dim]

        return output

class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforward(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg = [batch size, trg len, hid dim]
        # enc_src = [batch size, src len, hid dim]
        # trg_mask = [batch size, 1, trg len, trg len]
        # src_mask = [batch size, 1, 1, src len]

        # self attention
        _trg = self.self_attention(trg, trg, trg, trg_mask)

        # dropout, add and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))

        # encoder attention
        _trg = self.encoder_attention(trg, enc_src, enc_src, src_mask)

        # dropout, add and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))

        # positionwise feedforward
        _trg = self.positionwise_feedforward(trg)

        # dropout, add and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))

        return trg

class Transformer(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        # src = [batch size, src len]
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # src_mask = [batch size, 1, 1, src len]
        return src_mask

    def make_trg_mask(self, trg):
        # trg = [batch size, trg len]
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        # trg_pad_mask = [batch size, 1, 1, trg len]

        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        # trg_sub_mask = [trg len, trg len]

        trg_mask = trg_pad_mask & trg_sub_mask
        # trg_mask = [batch size, 1, trg len, trg len]
        return trg_mask

    def forward(self, src, trg):
        # src = [batch size, src len]
        # trg = [batch size, trg len]

        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        enc_src = self.encoder(src, src_mask)
        # enc_src = [batch size, src len, hid dim]

        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        # output = [batch size, trg len, output dim]

        return output

# Inisialisasi model Transformer
INPUT_DIM_TRANSFORMER = sp_en.get_piece_size()
OUTPUT_DIM_TRANSFORMER = sp_id.get_piece_size()
HID_DIM_TRANSFORMER = 256
ENC_LAYERS_TRANSFORMER = 3
DEC_LAYERS_TRANSFORMER = 3
ENC_HEADS_TRANSFORMER = 8
DEC_HEADS_TRANSFORMER = 8
ENC_PF_DIM_TRANSFORMER = 512
DEC_PF_DIM_TRANSFORMER = 512
ENC_DROPOUT_TRANSFORMER = 0.1
DEC_DROPOUT_TRANSFORMER = 0.1

enc_transformer = TransformerEncoder(INPUT_DIM_TRANSFORMER, HID_DIM_TRANSFORMER, ENC_LAYERS_TRANSFORMER, ENC_HEADS_TRANSFORMER, ENC_PF_DIM_TRANSFORMER, ENC_DROPOUT_TRANSFORMER, device)
dec_transformer = TransformerDecoder(OUTPUT_DIM_TRANSFORMER, HID_DIM_TRANSFORMER, DEC_LAYERS_TRANSFORMER, DEC_HEADS_TRANSFORMER, DEC_PF_DIM_TRANSFORMER, DEC_DROPOUT_TRANSFORMER, device)

model_transformer = Transformer(enc_transformer, dec_transformer, PAD_IDX_EN, PAD_IDX_ID, device).to(device)

model_transformer.apply(init_weights) # menggunakan inisialisasi bobot yang sama

TRG_PAD_IDX_TRANSFORMER = PAD_IDX_ID
criterion_transformer = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX_TRANSFORMER)
optimizer_transformer = optim.Adam(model_transformer.parameters(), lr=0.0005) # Menggunakan learning rate yang lebih kecil

print(f"Model Transformer memiliki {count_parameters(model_transformer):,} parameter.")

# Fungsi pelatihan dan evaluasi untuk Transformer
def train_epoch_transformer(model, dataloader, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, (src, _, trg, _) in enumerate(dataloader):
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()

        output = model(src, trg[:, :-1]) # decoder input tidak termasuk token EOS
        # output = [batch size, trg len - 1, output dim]
        # trg = [batch size, trg len]

        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1) # target tidak termasuk token BOS

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

def evaluate_epoch_transformer(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (src, _, trg, _) in enumerate(dataloader):
            src, trg = src.to(device), trg.to(device)

            output = model(src, trg[:, :-1]) # turn off teacher forcing

            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

N_EPOCHS_TRANSFORMER = 10
CLIP_TRANSFORMER = 1
best_valid_loss_transformer = float('inf')

print("\nMemulai pelatihan model Transformer...")
for epoch in tqdm(range(N_EPOCHS_TRANSFORMER)):
    start_time = time.time()

    train_loss = train_epoch_transformer(model_transformer, train_dataloader, optimizer_transformer, criterion_transformer, CLIP_TRANSFORMER)
    valid_loss = evaluate_epoch_transformer(model_transformer, val_dataloader, criterion_transformer)

    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    if valid_loss < best_valid_loss_transformer:
        best_valid_loss_transformer = valid_loss
        torch.save(model_transformer.state_dict(), 'transformer_model.pt')

    print(f'Epoch: {epoch+1:02} | Waktu: {epoch_mins:.0f}m {epoch_secs:.0f}s')
    print(f'\tKerugian Latihan: {train_loss:.3f} | PPL Latihan: {math.exp(train_loss):.3f}')
    print(f'\tKerugian Validasi: {valid_loss:.3f} | PPL Validasi: {math.exp(valid_loss):.3f}')

print("\nPelatihan model Transformer selesai.")
model_transformer.load_state_dict(torch.load('transformer_model.pt'))


--- 3. Implementasi Transformer ---
Model Transformer memiliki 10,156,864 parameter.

Memulai pelatihan model Transformer...


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 01 | Waktu: 0m 5s
	Kerugian Latihan: 8.248 | PPL Latihan: 3820.928
	Kerugian Validasi: 6.861 | PPL Validasi: 954.720
Epoch: 02 | Waktu: 0m 5s
	Kerugian Latihan: 5.884 | PPL Latihan: 359.189
	Kerugian Validasi: 5.380 | PPL Validasi: 217.082
Epoch: 03 | Waktu: 0m 5s
	Kerugian Latihan: 5.127 | PPL Latihan: 168.586
	Kerugian Validasi: 4.923 | PPL Validasi: 137.448
Epoch: 04 | Waktu: 0m 5s
	Kerugian Latihan: 4.650 | PPL Latihan: 104.611
	Kerugian Validasi: 4.553 | PPL Validasi: 94.920
Epoch: 05 | Waktu: 0m 5s
	Kerugian Latihan: 4.293 | PPL Latihan: 73.204
	Kerugian Validasi: 4.315 | PPL Validasi: 74.838
Epoch: 06 | Waktu: 0m 5s
	Kerugian Latihan: 4.031 | PPL Latihan: 56.325
	Kerugian Validasi: 4.147 | PPL Validasi: 63.238
Epoch: 07 | Waktu: 0m 5s
	Kerugian Latihan: 3.822 | PPL Latihan: 45.676
	Kerugian Validasi: 4.008 | PPL Validasi: 55.036
Epoch: 08 | Waktu: 0m 5s
	Kerugian Latihan: 3.637 | PPL Latihan: 37.959
	Kerugian Validasi: 3.913 | PPL Validasi: 50.052
Epoch: 09 | Waktu: 0m 5s

<All keys matched successfully>

In [None]:
# --- 4. Evaluasi & Analisis ---

print("\n--- 4. Evaluasi & Analisis ---")

# Fungsi translasi untuk model RNN
def translate_sentence_rnn(sentence, src_tokenizer, trg_tokenizer, model, device, max_len=50):
    model.eval()
    if isinstance(sentence, str):
        processed_sentence = preprocess_sentence(sentence)
        tokens = [src_tokenizer.bos_id()] + src_tokenizer.encode(processed_sentence) + [src_tokenizer.eos_id()]
    else: # already tokenized
        tokens = sentence

    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    src_len = torch.LongTensor([len(tokens)]).to(device)

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor, src_len)

    # The initial hidden state for the decoder should be the final combined hidden state from the encoder
    # Repeat this hidden state for each layer of the decoder GRU
    decoder_hidden = hidden.repeat(model.decoder.n_layers, 1, 1) # [n layers, batch size, hid dim]

    # First input to the decoder is the <bos> token
    trg_indexes = [trg_tokenizer.bos_id()]

    for i in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, decoder_hidden = model.decoder(trg_tensor, decoder_hidden, encoder_outputs)

        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)

        if pred_token == trg_tokenizer.eos_id():
            break

    # Convert token IDs to words and remove special tokens
    trg_tokens = [trg_tokenizer.id_to_piece(idx) for idx in trg_indexes if idx not in [trg_tokenizer.bos_id(), trg_tokenizer.eos_id(), trg_tokenizer.pad_id()]]
    return "".join(trg_tokens).replace(" ", " ").strip()


# Fungsi translasi untuk model Transformer
def translate_sentence_transformer(sentence, src_tokenizer, trg_tokenizer, model, device, max_len=50):
    model.eval()
    if isinstance(sentence, str):
        processed_sentence = preprocess_sentence(sentence)
        src_tokens = [src_tokenizer.bos_id()] + src_tokenizer.encode(processed_sentence) + [src_tokenizer.eos_id()]
    else: # already tokenized IDs
        src_tokens = sentence

    src_tensor = torch.LongTensor(src_tokens).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    # First input to the decoder is the <bos> token
    trg_indexes = [trg_tokenizer.bos_id()]

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)

        with torch.no_grad():
            output = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

        pred_token = output[:, -1, :].argmax(1).item()
        trg_indexes.append(pred_token)

        if pred_token == trg_tokenizer.eos_id():
            break

    # Convert token IDs to words and remove special tokens
    trg_tokens = [trg_tokenizer.id_to_piece(idx) for idx in trg_indexes if idx not in [trg_tokenizer.bos_id(), trg_tokenizer.eos_id(), trg_tokenizer.pad_id()]]
    return "".join(trg_tokens).replace(" ", " ").strip()

# Menyiapkan metrik
bleu_metric = BLEU()
chrf_metric = CHRF()

def calculate_metrics(model_name, model, test_dataloader, src_tokenizer, trg_tokenizer, translate_fn, device):
    references = []
    candidates = []

    for i, (src, src_len, trg, trg_len) in tqdm(enumerate(test_dataloader), total=len(test_dataloader), desc=f"Mengevaluasi {model_name}"):
        for j in range(src.shape[0]):
            src_sentence_ids = src[j].tolist()
            # Remove padding and EOS from source for translation, keep BOS
            # Need to handle padding and EOS correctly when getting the original token IDs
            src_for_translation = [idx for idx in src_sentence_ids if idx != src_tokenizer.pad_id() and idx != src_tokenizer.eos_id()]
            # Ensure BOS is the first token if it was included
            if src_sentence_ids[0] == src_tokenizer.bos_id():
                 src_for_translation = [src_tokenizer.bos_id()] + [idx for idx in src_sentence_ids[1:] if idx != src_tokenizer.pad_id() and idx != src_tokenizer.eos_id()]
            else:
                 src_for_translation = [idx for idx in src_sentence_ids if idx != src_tokenizer.pad_id() and idx != src_tokenizer.eos_id()]


            # Translate the sentence
            translated_sentence = translate_fn(src_for_translation, src_tokenizer, trg_tokenizer, model, device)
            candidates.append(translated_sentence)

            # Convert target sentence to original text for reference
            trg_sentence_ids = trg[j].tolist()
            reference_sentence_pieces = [trg_tokenizer.id_to_piece(idx) for idx in trg_sentence_ids if idx not in [trg_tokenizer.bos_id(), trg_tokenizer.eos_id(), trg_tokenizer.pad_id()]]
            reference_sentence = "".join(reference_sentence_pieces).replace(" ", " ").strip()
            references.append([reference_sentence]) # SacreBLEU expects list of references for each candidate

    bleu_score = bleu_metric.corpus_score(candidates, references).score
    chrf_score = chrf_metric.corpus_score(candidates, references).score

    return bleu_score, chrf_score, candidates, references

# Evaluasi model RNN
print("\nEvaluasi model RNN (baseline)...")
rnn_bleu, rnn_chrf, rnn_candidates, rnn_references = calculate_metrics("RNN", model_rnn, test_dataloader, sp_en, sp_id, translate_sentence_rnn, device)
print(f"RNN BLEU Score: {rnn_bleu:.2f}")
print(f"RNN chrF Score: {rnn_chrf:.2f}")

# Evaluasi model Transformer
print("\nEvaluasi model Transformer...")
transformer_bleu, transformer_chrf, transformer_candidates, transformer_references = calculate_metrics("Transformer", model_transformer, test_dataloader, sp_en, sp_id, translate_sentence_transformer, device)
print(f"Transformer BLEU Score: {transformer_bleu:.2f}")
print(f"Transformer chrF Score: {transformer_chrf:.2f}")

# Sajikan contoh hasil terjemahan dan analisis kesalahan
print("\n--- Contoh Hasil Terjemahan dan Analisis Kesalahan ---")
sample_sentences_en = [
    "I am learning machine translation.",
    "This is a very interesting project.",
    "How are you today?",
    "She likes to read books.",
    "He doesn't know much about anything.",
    "I want to go to the beach with you."
]

for i, eng_sentence in enumerate(sample_sentences_en):
    # Dapatkan referensi dari dataset test jika kalimat ada di sana
    ref_idx = -1
    # Preprocess the sample English sentence for comparison
    processed_eng_sentence = preprocess_sentence(eng_sentence)
    for k, (src_test, trg_test) in enumerate(zip(test_en, test_id)):
        if preprocess_sentence(src_test) == processed_eng_sentence:
            ref_idx = k
            break

    if ref_idx != -1:
        original_id = test_id[ref_idx]
    else:
        original_id = "[Tidak ditemukan di dataset test]"

    translated_rnn = translate_sentence_rnn(eng_sentence, sp_en, sp_id, model_rnn, device)
    translated_transformer = translate_sentence_transformer(eng_sentence, sp_en, sp_id, model_transformer, device)

    print(f"\nContoh {i+1}:")
    print(f"Asli (EN): {eng_sentence}")
    print(f"Referensi (ID): {original_id}")
    print(f"Terjemahan RNN: {translated_rnn}")
    print(f"Terjemahan Transformer: {translated_transformer}")

    # Analisis sederhana: Bandingkan panjang, keberadaan OOV (Out-of-Vocabulary)
    # Anda bisa memperluas ini dengan analisis lebih dalam (misalnya, kesalahan morfologi, tata bahasa)

    # Untuk OOV, kita bisa tokenize ulang dan cek keberadaan UNK token
    rnn_tokens = sp_id.encode(translated_rnn)
    transformer_tokens = sp_id.encode(translated_transformer)

    if UNK_IDX_ID in rnn_tokens:
        print("\tAnalisis RNN: Mungkin ada token OOV.")
    if UNK_IDX_ID in transformer_tokens:
        print("\tAnalisis Transformer: Mungkin ada token OOV.")

    print(f"\tPanjang Terjemahan RNN: {len(rnn_tokens)} token.")
    print(f"\tPanjang Terjemahan Transformer: {len(transformer_tokens)} token.")

# --- Ablation Study (Contoh: Ukuran Vocab) ---
# Bagian ini hanya kerangka, untuk menjalankannya secara penuh,
# Anda perlu melatih ulang model dengan parameter yang berbeda.

print("\n--- 5. Ablation Study (Contoh: Ukuran Vocab) ---")
print("Untuk melakukan Ablation Study, Anda perlu melatih ulang model dengan pengaturan parameter yang berbeda.")
print("Berikut adalah contoh bagaimana Anda bisa mengubah ukuran vocab:")

# Contoh perubahan parameter untuk SentencePiece
# Misal, mengubah vocab_size menjadi 4000 atau 16000 dan melatih ulang.

# # Contoh untuk ukuran vocab 4000
# print("\nMelatih ulang SentencePiece dengan vocab_size=4000...")
# train_sentencepiece(en_data, 'en_bpe_v4000', vocab_size=4000)
# train_sentencepiece(id_data, 'id_bpe_v4000', vocab_size=4000)
# sp_en_v4000 = spm.SentencePieceProcessor()
# sp_en_v4000.load(f'{MODEL_DIR}/en_bpe_v4000.model')
# sp_id_v4000 = spm.SentencePieceProcessor()
# sp_id_v4000.load(f'{MODEL_DIR}/id_bpe_v4000.model')

# # Re-initialize datasets and dataloaders with new tokenizers
# train_dataset_v4000 = TranslationDataset(list(train_en), list(train_id), sp_en_v4000, sp_id_v4000)
# train_dataloader_v4000 = DataLoader(train_dataset_v4000, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
# # Latih ulang dan evaluasi model_rnn_v4000 dan model_transformer_v4000
# # Lalu bandingkan metriknya.

# --- Kesimpulan ---
print("\n--- Kesimpulan dan Saran ---")
# The final conclusions will be printed here after evaluation
# print("Model RNN (Baseline) dan Transformer telah dilatih dan dievaluasi.")
# print(f"Hasil Metrik RNN: BLEU={rnn_bleu:.2f}, chrF={rnn_chrf:.2f}")
# print(f"Hasil Metrik Transformer: BLEU={transformer_bleu:.2f}, chrF={transformer_chrf:.2f}")

# print("\nAnalisis awal menunjukkan bahwa Transformer umumnya memiliki potensi untuk menghasilkan terjemahan yang lebih baik, meskipun kinerja aktual sangat bergantung pada ukuran dataset, hyperparameter, dan waktu pelatihan.")
# print("Untuk tugas Anda, fokuslah pada membandingkan kedua arsitektur ini secara mendalam.")
# print("Pastikan untuk mendokumentasikan semua percobaan dan hasilnya dalam laporan IEEE Anda.")


--- 4. Evaluasi & Analisis ---

Evaluasi model RNN (baseline)...


Mengevaluasi RNN:   0%|          | 0/12 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 25

Mengevaluasi Transformer:   0%|          | 0/12 [00:00<?, ?it/s]

Transformer BLEU Score: 0.00
Transformer chrF Score: 55.58

--- Contoh Hasil Terjemahan dan Analisis Kesalahan ---
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])
Shape of output before fc_out: torch.Size([1, 512])
Shape of weighted_encoder_outputs before fc_out: torch.Size([1, 1024])
Shape of embedded before fc_out: torch.Size([1, 256])


## Ablation Study: Ukuran Vocab

Sekarang, mari kita lakukan studi ablasi untuk melihat dampak ukuran kosakata terhadap kinerja model. Kita akan melatih ulang model Transformer dengan ukuran kosakata yang berbeda dan membandingkan hasilnya.

In [56]:
# --- 5. Ablation Study (Contoh: Ukuran Vocab) ---

print("\n--- 5. Ablation Study (Contoh: Ukuran Vocab) ---")

# Adjusted VOCAB_SIZES based on the error message (max 14695 for this dataset)
VOCAB_SIZES = [4000, 8000, 14000] # Ukuran vocab yang akan diuji, 8000 adalah baseline

results = {}

for vocab_size in VOCAB_SIZES:
    print(f"\nMelakukan Ablation Study dengan vocab_size={vocab_size}...")

    # Melatih ulang model SentencePiece dengan ukuran vocab baru
    en_model_prefix = f'en_bpe_v{vocab_size}'
    id_model_prefix = f'id_bpe_v{vocab_size}'

    # Check if models exist before training
    if not os.path.exists(f'{MODEL_DIR}/{en_model_prefix}.model'):
        train_sentencepiece(en_data, en_model_prefix, vocab_size=vocab_size)
    else:
        print(f"Model SentencePiece '{en_model_prefix}' sudah ada.")
    if not os.path.exists(f'{MODEL_DIR}/{id_model_prefix}.model'):
        train_sentencepiece(id_data, id_model_prefix, vocab_size=vocab_size)
    else:
        print(f"Model SentencePiece '{id_model_prefix}' sudah ada.")


    sp_en_ablation = spm.SentencePieceProcessor()
    sp_en_ablation.load(f'{MODEL_DIR}/{en_model_prefix}.model')

    sp_id_ablation = spm.SentencePieceProcessor()
    sp_id_ablation.load(f'{MODEL_DIR}/{id_model_prefix}.model')

    # Re-initialize datasets and dataloaders with new tokenizers
    # Use the same train, val, test data splits as before
    train_dataset_ablation = TranslationDataset(list(train_en), list(train_id), sp_en_ablation, sp_id_ablation)
    val_dataset_ablation = TranslationDataset(list(val_en), list(val_id), sp_en_ablation, sp_id_ablation)
    test_dataset_ablation = TranslationDataset(list(test_en), list(test_id), sp_en_ablation, sp_id_ablation)


    # Need a collate_fn that uses the correct pad indices for the current vocab size
    def collate_fn_ablation(batch):
        src_batch, trg_batch = [], []
        for src_sample, trg_sample in batch:
            src_batch.append(src_sample)
            trg_batch.append(trg_sample)

        # Use the correct pad indices for the current SentencePiece processors
        src_padded = pad_sequence(src_batch, padding_value=sp_en_ablation.pad_id(), batch_first=True)
        trg_padded = pad_sequence(trg_batch, padding_value=sp_id_ablation.pad_id(), batch_first=True)

        src_lengths = torch.tensor([len(s) for s in src_batch], dtype=torch.long)
        trg_lengths = torch.tensor([len(t) for t in trg_batch], dtype=torch.long)

        return src_padded, src_lengths, trg_padded, trg_lengths


    train_dataloader_ablation = DataLoader(train_dataset_ablation, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_ablation)
    val_dataloader_ablation = DataLoader(val_dataset_ablation, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn_ablation)
    test_dataloader_ablation = DataLoader(test_dataset_ablation, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn_ablation)


    # Inisialisasi dan latih model Transformer dengan ukuran vocab baru
    INPUT_DIM_ABLATION = sp_en_ablation.get_piece_size()
    OUTPUT_DIM_ABLATION = sp_id_ablation.get_piece_size()

    enc_transformer_ablation = TransformerEncoder(INPUT_DIM_ABLATION, HID_DIM_TRANSFORMER, ENC_LAYERS_TRANSFORMER, ENC_HEADS_TRANSFORMER, ENC_PF_DIM_TRANSFORMER, ENC_DROPOUT_TRANSFORMER, device)
    dec_transformer_ablation = TransformerDecoder(OUTPUT_DIM_ABLATION, HID_DIM_TRANSFORMER, DEC_LAYERS_TRANSFORMER, DEC_HEADS_TRANSFORMER, DEC_PF_DIM_TRANSFORMER, DEC_DROPOUT_TRANSFORMER, device)

    model_transformer_ablation = Transformer(enc_transformer_ablation, dec_transformer_ablation, sp_en_ablation.pad_id(), sp_id_ablation.pad_id(), device).to(device)

    model_transformer_ablation.apply(init_weights)

    criterion_transformer_ablation = nn.CrossEntropyLoss(ignore_index=sp_id_ablation.pad_id())
    optimizer_transformer_ablation = optim.Adam(model_transformer_ablation.parameters(), lr=0.0005)


    print(f"Model Transformer (vocab_size={vocab_size}) memiliki {count_parameters(model_transformer_ablation):,} parameter.")

    best_valid_loss_ablation = float('inf')
    N_EPOCHS_ABLATION = 5 # Reduced epochs for faster ablation study

    print(f"Memulai pelatihan model Transformer (vocab_size={vocab_size})...")
    for epoch in tqdm(range(N_EPOCHS_ABLATION)):
        start_time = time.time()

        train_loss = train_epoch_transformer(model_transformer_ablation, train_dataloader_ablation, optimizer_transformer_ablation, criterion_transformer_ablation, CLIP_TRANSFORMER)
        valid_loss = evaluate_epoch_transformer(model_transformer_ablation, val_dataloader_ablation, criterion_transformer_ablation)

        end_time = time.time()
        epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

        if valid_loss < best_valid_loss_ablation:
            best_valid_loss_ablation = valid_loss
            torch.save(model_transformer_ablation.state_dict(), f'transformer_model_v{vocab_size}.pt')

        print(f'Epoch: {epoch+1:02} | Waktu: {epoch_mins:.0f}m {epoch_secs:.0f}s')
        print(f'\tKerugian Latihan: {train_loss:.3f} | PPL Latihan: {math.exp(train_loss):.3f}')
        print(f'\tKerugian Validasi: {valid_loss:.3f} | PPL Validasi: {math.exp(valid_loss):.3f}')

    print(f"Pelatihan model Transformer (vocab_size={vocab_size}) selesai.")
    model_transformer_ablation.load_state_dict(torch.load(f'transformer_model_v{vocab_size}.pt'))

    # Evaluasi model dengan ukuran vocab baru
    bleu_score, chrf_score, _, _ = calculate_metrics(f"Transformer (vocab_size={vocab_size})", model_transformer_ablation, test_dataloader_ablation, sp_en_ablation, sp_id_ablation, translate_sentence_transformer, device)
    results[vocab_size] = {'BLEU': bleu_score, 'chrF': chrf_score}

# Tampilkan hasil ablation study
print("\nHasil Ablation Study (Ukuran Vocab):")
for vocab_size, metrics in results.items():
    print(f"Vocab Size {vocab_size}:")
    print(f"  BLEU Score: {metrics['BLEU']:.2f}")
    print(f"  chrF Score: {metrics['chrF']:.2f}")

# Anda dapat menambahkan analisis lebih lanjut di sini berdasarkan hasil


--- 5. Ablation Study (Contoh: Ukuran Vocab) ---

Melakukan Ablation Study dengan vocab_size=4000...
Model SentencePiece 'en_bpe_v4000' sudah ada.
Model SentencePiece 'id_bpe_v4000' sudah ada.
Model Transformer (vocab_size=4000) memiliki 7,080,864 parameter.
Memulai pelatihan model Transformer (vocab_size=4000)...


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 01 | Waktu: 0m 5s
	Kerugian Latihan: 7.668 | PPL Latihan: 2139.344
	Kerugian Validasi: 6.513 | PPL Validasi: 673.688
Epoch: 02 | Waktu: 0m 4s
	Kerugian Latihan: 5.793 | PPL Latihan: 328.067
	Kerugian Validasi: 5.389 | PPL Validasi: 219.034
Epoch: 03 | Waktu: 0m 5s
	Kerugian Latihan: 5.157 | PPL Latihan: 173.601
	Kerugian Validasi: 4.929 | PPL Validasi: 138.179
Epoch: 04 | Waktu: 0m 4s
	Kerugian Latihan: 4.717 | PPL Latihan: 111.798
	Kerugian Validasi: 4.570 | PPL Validasi: 96.505
Epoch: 05 | Waktu: 0m 5s
	Kerugian Latihan: 4.391 | PPL Latihan: 80.719
	Kerugian Validasi: 4.344 | PPL Validasi: 77.051
Pelatihan model Transformer (vocab_size=4000) selesai.


Mengevaluasi Transformer (vocab_size=4000):   0%|          | 0/12 [00:00<?, ?it/s]


Melakukan Ablation Study dengan vocab_size=8000...
Model SentencePiece 'en_bpe_v8000' dilatih dan disimpan di spt_models/.
Model SentencePiece 'id_bpe_v8000' dilatih dan disimpan di spt_models/.
Model Transformer (vocab_size=8000) memiliki 10,156,864 parameter.
Memulai pelatihan model Transformer (vocab_size=8000)...


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 01 | Waktu: 0m 5s
	Kerugian Latihan: 8.252 | PPL Latihan: 3836.576
	Kerugian Validasi: 6.866 | PPL Validasi: 959.475
Epoch: 02 | Waktu: 0m 5s
	Kerugian Latihan: 5.879 | PPL Latihan: 357.544
	Kerugian Validasi: 5.369 | PPL Validasi: 214.627
Epoch: 03 | Waktu: 0m 5s
	Kerugian Latihan: 5.116 | PPL Latihan: 166.692
	Kerugian Validasi: 4.916 | PPL Validasi: 136.400
Epoch: 04 | Waktu: 0m 5s
	Kerugian Latihan: 4.665 | PPL Latihan: 106.126
	Kerugian Validasi: 4.581 | PPL Validasi: 97.588
Epoch: 05 | Waktu: 0m 5s
	Kerugian Latihan: 4.354 | PPL Latihan: 77.787
	Kerugian Validasi: 4.360 | PPL Validasi: 78.227
Pelatihan model Transformer (vocab_size=8000) selesai.


Mengevaluasi Transformer (vocab_size=8000):   0%|          | 0/12 [00:00<?, ?it/s]


Melakukan Ablation Study dengan vocab_size=14000...
Model SentencePiece 'en_bpe_v14000' dilatih dan disimpan di spt_models/.
Model SentencePiece 'id_bpe_v14000' dilatih dan disimpan di spt_models/.
Model Transformer (vocab_size=14000) memiliki 14,770,864 parameter.
Memulai pelatihan model Transformer (vocab_size=14000)...


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 01 | Waktu: 0m 6s
	Kerugian Latihan: 8.730 | PPL Latihan: 6184.275
	Kerugian Validasi: 7.187 | PPL Validasi: 1322.726
Epoch: 02 | Waktu: 0m 5s
	Kerugian Latihan: 5.984 | PPL Latihan: 397.160
	Kerugian Validasi: 5.372 | PPL Validasi: 215.257
Epoch: 03 | Waktu: 0m 6s
	Kerugian Latihan: 5.106 | PPL Latihan: 164.984
	Kerugian Validasi: 4.892 | PPL Validasi: 133.178
Epoch: 04 | Waktu: 0m 5s
	Kerugian Latihan: 4.598 | PPL Latihan: 99.318
	Kerugian Validasi: 4.507 | PPL Validasi: 90.681
Epoch: 05 | Waktu: 0m 6s
	Kerugian Latihan: 4.241 | PPL Latihan: 69.483
	Kerugian Validasi: 4.285 | PPL Validasi: 72.632
Pelatihan model Transformer (vocab_size=14000) selesai.


Mengevaluasi Transformer (vocab_size=14000):   0%|          | 0/12 [00:00<?, ?it/s]


Hasil Ablation Study (Ukuran Vocab):
Vocab Size 4000:
  BLEU Score: 0.00
  chrF Score: 64.95
Vocab Size 8000:
  BLEU Score: 0.00
  chrF Score: 61.62
Vocab Size 14000:
  BLEU Score: 0.00
  chrF Score: 52.74
