# **Machine Translation dengan PyTorch**

## 1. Install library yang dibutuhkan di Kaggle Notebook

In [1]:
# Install the deep-learning stack plus the tokenization and evaluation helpers.
!pip install torch torchvision torchaudio
!pip install torchtext
!pip install spacy
!pip install sacrebleu
!python -m spacy download en_core_web_sm
# NOTE(review): the recorded output of this cell reports "No compatible package
# found for 'id_core_web_sm'", so this download does not succeed as written.
!python -m spacy download id_core_web_sm
# NOTE(review): --force-reinstall replaces the scikit-learn/numpy/pandas builds
# already present in the image; a kernel restart is presumably required
# afterwards - TODO confirm.
!pip install --upgrade --force-reinstall scikit-learn numpy pandas

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

[38;5;1m✘ No compatible package found for 'id_core_web_sm' (spaCy v3.8.7)[0m

Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting numpy
  Using cached numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

## 2. Persiapan Data (Preprocessing)
Langkah ini sangat krusial. Dataset dari ManyThings.org biasanya berupa file teks dengan pasangan kalimat per baris.

### A. Memuat dan Membersihkan Data
Dataset yang Anda unduh mungkin memiliki beberapa baris yang tidak relevan (misalnya, baris yang berisi informasi hak cipta). Anda perlu menghapusnya.

In [2]:
import pandas as pd

data_path = "/kaggle/input/eng-ind-dataset/ind.txt"

# Read the file manually. The context manager guarantees the handle is closed
# even if reading fails (the bare open(...).read() never closed it).
with open(data_path, "r", encoding="utf-8") as corpus_file:
    lines = corpus_file.read().splitlines()

# Each usable line is tab-separated: English sentence, Indonesian sentence.
# Lines with fewer than two columns are skipped; any further columns are ignored.
eng, indo = [], []
for line in lines:
    parts = line.split("\t")
    if len(parts) >= 2:
        eng.append(parts[0])
        indo.append(parts[1])

df = pd.DataFrame({"eng": eng, "id": indo})
print(df.head())

    eng      id
0   Hi.    Hai.
1  Run!   Lari!
2  Run.   Lari!
3  Who?  Siapa?
4  Wow!    Wow!


### B. Tokenisasi
Tokenisasi adalah proses memecah kalimat menjadi unit-unit yang lebih kecil (token). Untuk tokenisasi subword, Anda bisa menggunakan library seperti SentencePiece atau Hugging Face Tokenizers. Metode ini sangat efektif untuk menangani kata-kata yang tidak dikenal (Out-of-Vocabulary / OOV).

In [3]:
# Example using SentencePiece (it has to be installed first)
!pip install sentencepiece

import sentencepiece as spm

# Merge all data into a single file used to train the tokenizer.
# English and Indonesian are joined per row, so one shared tokenizer is trained
# on both languages. Note that this adds a 'text' column to df in place.
df['text'] = df['eng'] + ' ' + df['id']
with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(df['text'].tolist()))

# Train the SentencePiece model (BPE, 8000 pieces); writes files prefixed 'spm_bpe'
spm.SentencePieceTrainer.train(
    '--input=corpus.txt --model_prefix=spm_bpe --vocab_size=8000 --model_type=bpe')

# Load the tokenizer model that was just trained
sp = spm.SentencePieceProcessor(model_file='spm_bpe.model')

# Tokenization example
sentence = "Saya suka belajar Machine Learning."
tokens = sp.encode_as_pieces(sentence)
print(tokens) # Recorded output: ['▁Saya', '▁suka', '▁belajar', '▁M', 'ach', 'ine', '▁Lear', 'ning', '.']

# Keep the piece -> id vocabulary for later use
vocab = {sp.id_to_piece(i): i for i in range(sp.get_piece_size())}
print(f"Ukuran vocab: {len(vocab)}")

['▁Saya', '▁suka', '▁belajar', '▁M', 'ach', 'ine', '▁Lear', 'ning', '.']
Ukuran vocab: 8000


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=corpus.txt --model_prefix=spm_bpe --vocab_size=8000 --model_type=bpe
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: corpus.txt
  input_format: 
  model_prefix: spm_bpe
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk

### C. Pembagian Data
Gunakan sklearn untuk membagi data menjadi set pelatihan, validasi, dan pengujian. Rasio yang umum digunakan adalah 80-10-10.

In [4]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: carve off 20% as a hold-out first, then cut that hold-out in
# half to obtain the validation and test sets. Fixed seeds keep it reproducible.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

for split_name, split_df in (("train", train_df), ("validation", val_df), ("test", test_df)):
    print(f"Ukuran data {split_name}: {len(split_df)}")

Ukuran data train: 11904
Ukuran data validation: 1488
Ukuran data test: 1489


## 3. Implementasi Model (Baseline: RNN + Attention)
### A. Konsep Dasar
Arsitektur ini terdiri dari dua bagian: Encoder (untuk memahami kalimat sumber) dan Decoder (untuk menghasilkan kalimat target). Attention Mechanism memungkinkan decoder untuk "melihat" bagian-bagian yang relevan dari kalimat sumber saat menghasilkan setiap token target, mengatasi masalah bottleneck pada RNN standar.

### B. Implementasi di PyTorch
Anda akan membuat kelas EncoderRNN, AttnDecoderRNN, dan Seq2Seq (model utama).

In [5]:
import torch
import torch.nn as nn
from torch import optim

# Contoh pseudo-code untuk arsitektur
# Encoder
class EncoderRNN(nn.Module):
    """GRU encoder: embeds source token ids and runs them through a single GRU.

    Args:
        input_size: size of the source vocabulary (number of embedding rows).
        hidden_size: width of both the embeddings and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)  # an LSTM would also work here

    def forward(self, input, hidden=None):
        """Encode a sequence of token ids.

        Args:
            input: LongTensor of token ids. The GRU is created with the default
                ``batch_first=False``, so the layout is (seq_len, batch), or
                (seq_len,) for a single unbatched sequence.
            hidden: optional initial hidden state; ``None`` lets the GRU start
                from zeros.

        Returns:
            Tuple ``(output, hidden)`` exactly as returned by ``nn.GRU``: the
            per-step outputs and the final hidden state.
        """
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

# Decoder dengan Attention
class AttnDecoderRNN(nn.Module):
    """Placeholder for an attention-based RNN decoder.

    Only the constructor signature is sketched here: no layers are created from
    the arguments and no ``forward`` method is defined yet.
    """
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=50):
        super(AttnDecoderRNN, self).__init__()
        # TODO: build the layers in __init__ and add the forward pass
        pass

# Model gabungan
class Seq2Seq(nn.Module):
    """Placeholder for the combined encoder-decoder model.

    The constructor accepts an encoder, a decoder and a device but does not
    store them, and no ``forward`` method is defined yet.
    """
    def __init__(self, encoder, decoder, device):
        super().__init__()
        # TODO: store the sub-modules in __init__ and add the forward pass
        pass

## 4. Implementasi Model (Transformer)
### A. Konsep Dasar
Transformer adalah arsitektur yang sepenuhnya berbasis Attention (Self-Attention). Ia tidak menggunakan RNN. Ini memungkinkan pemrosesan paralel yang lebih cepat dan efektif dalam menangkap dependensi jarak jauh.

### B. Implementasi di PyTorch
Anda akan membuat blok-blok pembangun Transformer: Multi-Head Attention, Feed-Forward Network, dan kemudian menggabungkannya ke dalam Encoder dan Decoder.

In [23]:
import torch
import torch.nn as nn
import math

# Contoh pseudo-code untuk arsitektur
class MultiHeadAttention(nn.Module):
    """Thin batch-first wrapper around ``nn.MultiheadAttention``.

    Args:
        d_model: embedding width of queries, keys and values.
        n_heads: number of attention heads.
        dropout: dropout rate applied to the attention weights.
    """

    def __init__(self, d_model, n_heads, dropout):
        super(MultiHeadAttention, self).__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)

    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query: (batch, q_len, d_model) tensor.
            key: (batch, k_len, d_model) tensor.
            value: (batch, k_len, d_model) tensor.
            attn_mask: optional (q_len, k_len) mask; for a boolean mask ``True``
                marks positions that may NOT be attended to.
            key_padding_mask: optional (batch, k_len) boolean mask where
                ``True`` marks key positions (e.g. padding) to ignore.

        Returns:
            (batch, q_len, d_model) tensor of attended values.
        """
        # The averaged attention weights were computed and then thrown away;
        # need_weights=False skips that work and returns the same output.
        out, _ = self.attn(
            query,
            key,
            value,
            key_padding_mask=key_padding_mask,
            need_weights=False,
            attn_mask=attn_mask,
        )
        return out


class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension.

    Computes ``linear2(dropout(relu(linear1(x))))``, expanding from ``d_model``
    to ``d_ff`` and projecting back to ``d_model``.
    """

    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to batch-first embeddings.

    Args:
        d_model: embedding width (odd values are supported).
        dropout: dropout rate applied after the encoding is added.
        max_len: longest sequence length the precomputed table covers.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so
        # the frequency vector is trimmed to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        # Buffer: moves with .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add position encodings to ``x`` of shape (batch, seq_len, d_model).

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` the table was built for.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional encoding max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)


class EncoderLayer(nn.Module):
    """One post-norm Transformer encoder block: self-attention, then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model=d_model, n_heads=n_heads, dropout=dropout)
        self.ffn = PositionwiseFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Transform ``src`` and return a tensor of the same shape."""
        # Sub-layer 1: every position attends over the whole source sequence.
        attended = self.attn(src, src, src)
        src = self.norm1(src + self.dropout(attended))
        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.ffn(src)
        return self.norm2(src + self.dropout(transformed))


class Encoder(nn.Module):
    """Source-side stack: token embedding, positional encoding, one encoder layer.

    ``device`` is stored on the instance but is not read by this class.
    """

    def __init__(self, input_dim, d_model, n_heads, d_ff, device, dropout):
        super().__init__()
        self.device = device
        self.embed = nn.Embedding(num_embeddings=input_dim, embedding_dim=d_model)
        self.pos_enc = PositionalEncoding(d_model=d_model, dropout=dropout)
        self.layer = EncoderLayer(d_model=d_model, n_heads=n_heads, d_ff=d_ff, dropout=dropout)

    def forward(self, src):
        """Map source token ids to the encoder layer's output."""
        embedded = self.embed(src)
        positioned = self.pos_enc(embedded)
        return self.layer(positioned)


class DecoderLayer(nn.Module):
    """One post-norm Transformer decoder block.

    Sub-layers, each wrapped as ``LayerNorm(x + dropout(sublayer(x)))``:
    masked self-attention over the target, cross-attention over the encoder
    output, and a position-wise feed-forward network.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_out, trg_mask=None):
        """Run the decoder block.

        Args:
            trg: (batch, trg_len, d_model) target-side activations.
            enc_out: (batch, src_len, d_model) encoder output.
            trg_mask: optional (trg_len, trg_len) self-attention mask; for a
                boolean mask ``True`` marks positions that may not be attended
                to. When omitted, a causal mask is built.

        Returns:
            (batch, trg_len, d_model) tensor.
        """
        if trg_mask is None:
            # Without a causal mask every position can attend to later target
            # tokens, so a model trained with teacher forcing could simply
            # copy the answer. Mask everything above the diagonal.
            trg_len = trg.size(1)
            trg_mask = torch.triu(
                torch.ones(trg_len, trg_len, dtype=torch.bool, device=trg.device),
                diagonal=1,
            )
        trg2 = self.self_attn(trg, trg, trg, attn_mask=trg_mask)
        trg = self.norm1(trg + self.dropout(trg2))
        trg2 = self.cross_attn(trg, enc_out, enc_out)
        trg = self.norm2(trg + self.dropout(trg2))
        ffn_out = self.ffn(trg)
        trg = self.norm3(trg + self.dropout(ffn_out))
        return trg


class Decoder(nn.Module):
    """Target-side stack: embedding, positional encoding, one decoder layer, vocabulary projection.

    ``device`` is stored on the instance but is not read by this class.
    """

    def __init__(self, output_dim, d_model, n_heads, d_ff, device, dropout):
        super().__init__()
        self.device = device
        self.embed = nn.Embedding(num_embeddings=output_dim, embedding_dim=d_model)
        self.pos_enc = PositionalEncoding(d_model=d_model, dropout=dropout)
        self.layer = DecoderLayer(d_model=d_model, n_heads=n_heads, d_ff=d_ff, dropout=dropout)
        self.fc_out = nn.Linear(in_features=d_model, out_features=output_dim)

    def forward(self, trg, enc_out):
        """Return unnormalized vocabulary scores for every target position."""
        embedded = self.embed(trg)
        positioned = self.pos_enc(embedded)
        decoded = self.layer(positioned, enc_out)
        return self.fc_out(decoded)


class Transformer(nn.Module):
    """Encoder-decoder wrapper: encodes the source, then decodes the target against it.

    The padding indices and the device are kept as attributes; ``forward`` does
    not read them, so no padding mask is applied by this class.
    """

    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder
        self.src_pad_idx, self.trg_pad_idx = src_pad_idx, trg_pad_idx
        self.device = device

    def forward(self, src, trg):
        """Return the decoder output for ``trg`` conditioned on the encoded ``src``."""
        return self.decoder(trg, self.encoder(src))

## 5. Evaluasi dan Analisis
### A. Metrik SacreBLEU
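SacreBLEU adalah implementasi standar metrik BLEU untuk machine translation. Skornya lebih mudah dibandingkan antar-eksperimen daripada BLEU yang dihitung sendiri, karena tokenisasi dilakukan secara baku di dalam library, bukan oleh masing-masing pengguna. BLEU mengukur seberapa mirip terjemahan model Anda dengan terjemahan referensi berdasarkan kecocokan n-gram.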

In [24]:
# Contoh penggunaan SacreBLEU
from sacrebleu.metrics import BLEU, CHRF

# Model translations: one string per segment
hypotheses = ["the cat is on the mat"]
# Reference translations: a list of reference streams, each holding one string
# per segment (a single stream with a single segment here)
references = [["a cat is on a mat"]]

bleu = BLEU()
score = bleu.corpus_score(hypotheses, references)
print(score.score)

# Same comparison with chrF
chrf = CHRF()
score_chrf = chrf.corpus_score(hypotheses, references)
print(score_chrf.score)

32.46679154750991
50.36239285689067


## 6. Ablation Study
### A. Ukuran Vocab
Uji coba ini membandingkan performa model dengan ukuran vocab yang berbeda. Anda perlu melatih dua tokenizer SentencePiece (vocab_size=4000 dan vocab_size=8000), lalu melatih satu model terjemahan secara terpisah untuk masing-masing tokenizer.

In [25]:
import sentencepiece as spm

# Train one BPE tokenizer per vocabulary size under study. Each run gets its
# own model_prefix (spm_bpe_<size>) so the variants can be told apart.
for vocab_size in (4000, 8000):
    spm.SentencePieceTrainer.train(
        f'--input=/kaggle/working/corpus.txt --model_prefix=spm_bpe_{vocab_size} '
        f'--vocab_size={vocab_size} --model_type=bpe')

    print(f"Model SentencePiece dengan vocab {vocab_size} berhasil dilatih dan disimpan.")

Model SentencePiece dengan vocab 4000 berhasil dilatih dan disimpan.
Model SentencePiece dengan vocab 8000 berhasil dilatih dan disimpan.


### B. Ablation Study: Dropout
Untuk uji coba ini, kamu bisa menggunakan salah satu model dari eksperimen vocab (misalnya, model dengan vocab 8000).

In [26]:
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
import sacrebleu

# Assumes the data loader and the training helpers (train_model, translate_sentences) already exist
# train_loader = ...
# test_data = ...
# sp_tokenizer = ...
# references_test = ...

# Shared model hyper-parameters
d_model = 512
n_heads = 8
d_ff = 2048
src_vocab_size = 8000 # Use the best vocab size from the previous experiment
trg_vocab_size = 8000
# NOTE(review): the SentencePiece training log earlier in this notebook shows
# unk_id: 0 and pad_id: -1, so index 0 looks like the unknown piece rather than
# a padding piece - confirm before relying on these as padding indices.
src_pad_idx = 0
trg_pad_idx = 0
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Experiment: model with dropout 0.1 ---
print("Memulai pelatihan untuk model dengan dropout 0.1")
encoder_d01 = Encoder(src_vocab_size, d_model, n_heads, d_ff, device, dropout=0.1)
decoder_d01 = Decoder(trg_vocab_size, d_model, n_heads, d_ff, device, dropout=0.1)
model_dropout_01 = Transformer(encoder_d01, decoder_d01, src_pad_idx, trg_pad_idx, device).to(device)

# Train this model (replace with your own training code)
# NOTE(review): the training call is commented out, so the state_dict saved
# below holds the freshly initialized, untrained weights.
# train_model(model_dropout_01, train_loader, ...)
torch.save(model_dropout_01.state_dict(), 'transformer_dropout_01.pth')

# Evaluate the model
# hypotheses_d01 = translate_sentences(model_dropout_01, test_data, sp_tokenizer)
# bleu_score_d01 = sacrebleu.corpus_bleu(hypotheses_d01, [references_test]).score
# print(f"Hasil Dropout 0.1: BLEU={bleu_score_d01:.2f}")

# --- Experiment: model with dropout 0.5 ---
print("\nMemulai pelatihan untuk model dengan dropout 0.5")
encoder_d05 = Encoder(src_vocab_size, d_model, n_heads, d_ff, device, dropout=0.5)
decoder_d05 = Decoder(trg_vocab_size, d_model, n_heads, d_ff, device, dropout=0.5)
model_dropout_05 = Transformer(encoder_d05, decoder_d05, src_pad_idx, trg_pad_idx, device).to(device)

# Train this model (replace with your own training code)
# NOTE(review): as above, training is commented out, so untrained weights are saved.
# train_model(model_dropout_05, train_loader, ...)
torch.save(model_dropout_05.state_dict(), 'transformer_dropout_05.pth')

# Evaluate the model
# hypotheses_d05 = translate_sentences(model_dropout_05, test_data, sp_tokenizer)
# bleu_score_d05 = sacrebleu.corpus_bleu(hypotheses_d05, [references_test]).score
# print(f"Hasil Dropout 0.5: BLEU={bleu_score_d05:.2f}")

Memulai pelatihan untuk model dengan dropout 0.1

Memulai pelatihan untuk model dengan dropout 0.5


### C. Ablation Study: Beam Search Size

Ini adalah uji coba paling mudah karena tidak memerlukan pelatihan model baru. Cukup gunakan model Transformer terbaik yang sudah kamu latih.

In [28]:
import torch
import sacrebleu
import pandas as pd
import sentencepiece as spm

# ---- MODEL SETUP ----
# Rebuild the architecture that produced the checkpoint, then load its weights.
model_path = '/kaggle/working/transformer_dropout_05.pth'
d_model = 512
n_heads = 8
d_ff = 2048
src_vocab_size = 8000
trg_vocab_size = 8000
src_pad_idx = 0
trg_pad_idx = 0
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The checkpoint was saved from the dropout=0.5 experiment, so the model is
# rebuilt with the same rate instead of 0.1. Dropout has no weights and eval()
# disables it, so decoding scores are unchanged; the rate only matters if this
# model is ever switched back to training mode.
encoder = Encoder(src_vocab_size, d_model, n_heads, d_ff, device, dropout=0.5)
decoder = Decoder(trg_vocab_size, d_model, n_heads, d_ff, device, dropout=0.5)
best_model = Transformer(encoder, decoder, src_pad_idx, trg_pad_idx, device).to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
best_model.load_state_dict(torch.load(model_path, map_location=device))
best_model.eval()

# Tokenizer trained with vocab_size=8000, matching src/trg_vocab_size above.
sp_tokenizer = spm.SentencePieceProcessor(model_file='spm_bpe_8000.model')


# ---- TRANSLATE FUNCTION ----
def _beam_search_single(model, src_tensor, beam_size, max_len, bos_id, eos_id):
    """Return the generated token ids (BOS/EOS stripped) of the best hypothesis for one source."""
    live = [([bos_id], 0.0)]  # unfinished hypotheses as (token ids, summed log-prob)
    finished = []
    for _ in range(max_len):
        # All live hypotheses have the same length, so they are scored in one batch.
        prefixes = torch.tensor(
            [tokens for tokens, _ in live], dtype=torch.long, device=src_tensor.device
        )
        logits = model(src_tensor.expand(len(live), -1), prefixes)[:, -1, :]
        log_probs = torch.log_softmax(logits, dim=-1)
        top_scores, top_ids = log_probs.topk(min(beam_size, log_probs.size(-1)), dim=-1)

        candidates = []
        for (tokens, score), row_scores, row_ids in zip(live, top_scores.tolist(), top_ids.tolist()):
            for step_score, token_id in zip(row_scores, row_ids):
                candidates.append((tokens + [token_id], score + step_score))
        candidates.sort(key=lambda cand: cand[1], reverse=True)

        live = []
        for tokens, score in candidates[:beam_size]:
            if tokens[-1] == eos_id:
                finished.append((tokens, score))
            else:
                live.append((tokens, score))
        if not live:
            break

    # Compare by average log-prob per generated token so that short hypotheses
    # are not favoured merely for having fewer factors.
    best_tokens, _ = max(finished + live, key=lambda cand: cand[1] / max(len(cand[0]) - 1, 1))
    generated = best_tokens[1:]  # drop BOS
    if generated and generated[-1] == eos_id:
        generated = generated[:-1]
    return generated


def translate_with_beam_search(model, test_data, tokenizer, beam_size=1, max_len=50):
    """Translate every sentence in ``test_data`` with beam search.

    ``beam_size=1`` is greedy decoding. Previously ``beam_size`` was accepted
    but ignored, so every call decoded greedily.

    Args:
        model: module called as ``model(src, trg)`` with batch-first id tensors
            and returning scores of shape (batch, trg_len, vocab); only the
            last target position is read.
        test_data: iterable of source sentences (strings).
        tokenizer: object exposing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        beam_size: number of partial hypotheses kept at each step (>= 1).
        max_len: maximum number of tokens generated per sentence.

    Returns:
        List of translated strings, one per input sentence, in input order.
        A sentence that encodes to no tokens yields ``""``.

    Raises:
        ValueError: if ``beam_size`` is smaller than 1.
    """
    if beam_size < 1:
        raise ValueError(f"beam_size must be >= 1, got {beam_size}")

    # Assumes id 1 = <s> and id 2 = </s> in the tokenizer's vocabulary.
    bos_id, eos_id = 1, 2

    model.eval()
    # Decode on the device the model lives on instead of relying on a global.
    first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    hypotheses = []
    with torch.no_grad():  # inference only: skip building autograd graphs
        for sentence in test_data:
            src_tokens = tokenizer.encode(sentence)
            if not src_tokens:
                # Nothing to translate; avoid feeding an empty sequence to the model.
                hypotheses.append("")
                continue
            src_tensor = torch.tensor([src_tokens], dtype=torch.long, device=run_device)  # [1, src_len]
            best_ids = _beam_search_single(model, src_tensor, beam_size, max_len, bos_id, eos_id)
            # BOS/EOS are stripped before decoding so the text does not depend
            # on how the tokenizer renders control ids.
            hypotheses.append(tokenizer.decode(best_ids))

    return hypotheses


# ---- EVALUATION ----
# Assumes test_df already exists
# test_df = pd.read_csv("test.csv")

# Translate the English test sentences; beam_size=1 requests greedy decoding.
hypotheses_greedy = translate_with_beam_search(best_model, test_df['eng'].tolist(), sp_tokenizer, beam_size=1)
# A single reference stream (the Indonesian side), wrapped in an outer list.
references = [test_df['id'].tolist()]

# NOTE(review): the recorded output is BLEU=0.00. The checkpoint loaded for
# best_model is saved in a cell whose train_model call is commented out, so the
# weights are presumably untrained - confirm before interpreting this score.
bleu_score_greedy = sacrebleu.corpus_bleu(hypotheses_greedy, references).score
print(f"Hasil Greedy Search (beam_size=1): BLEU={bleu_score_greedy:.2f}")

Hasil Greedy Search (beam_size=1): BLEU=0.00
