In [None]:
# !rm -rf /kaggle/working/*

In [None]:
!pip install evaluate rouge_score

In [None]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from datasets import Dataset
import evaluate
from tqdm import tqdm
import sys

In [4]:
# Load parallel data
def load_parallel_data(en_file, vi_file):
    with open(en_file, 'r', encoding='utf-8') as f:
        en_lines = f.readlines()
    with open(vi_file, 'r', encoding='utf-8') as f:
        vi_lines = f.readlines()
    
    assert len(en_lines) == len(vi_lines), "Mismatched number of lines"
    data = []
    for en, vi in zip(en_lines, vi_lines):
        en, vi = en.strip(), vi.strip()
        if en and vi:
            data.append(f"English: {en} [SEP] Vietnamese: {vi} <EOS>")
    
    return pd.DataFrame({"text": data})

# File paths
en_file = "/kaggle/input/en-vi-data/TED2020.en-vi.en"
vi_file = "/kaggle/input/en-vi-data/TED2020.en-vi.vi"

# Load and reduce dataset
df = load_parallel_data(en_file, vi_file)
# df = df.sample(frac=0.8, random_state=42)
dataset = Dataset.from_pandas(df)

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, len(dataset)))
print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

Train: 258633, Val: 32329, Test: 32330


In [7]:
# Set up tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Fix padding token issue
tokenizer.pad_token = tokenizer.eos_token  # Use <EOS> as pad token
tokenizer.pad_token_id = tokenizer.eos_token_id  # Explicitly set pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id  # Sync with model config

# Add special tokens
tokenizer.add_special_tokens({'sep_token': '[SEP]', 'eos_token': '<EOS>'})
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50259, bias=False)
)

In [8]:
class TranslationDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=128):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        text = self.dataset[idx]["text"]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "text": text
        }

# Create datasets
train_data = TranslationDataset(train_dataset, tokenizer)
val_data = TranslationDataset(val_dataset, tokenizer)
test_data = TranslationDataset(test_dataset, tokenizer)

# Create DataLoaders
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=32)
test_dataloader = DataLoader(test_data, batch_size=8)

In [None]:
from tqdm import tqdm

# Training configuration
num_epochs = 10
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop with tqdm
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        progress_bar.set_postfix({'loss': loss.item()})
    
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    # Save model and tokenizer
    model.save_pretrained("/kaggle/working/finetuned_gpt2_en_vi")
    tokenizer.save_pretrained("/kaggle/working/finetuned_gpt2_en_vi")

In [22]:
def translate_batch(texts, model, tokenizer, max_new_tokens=128):
    model.eval()
    input_texts = [f"English: {text} [SEP] Vietnamese:" for text in texts]
    inputs = tokenizer(
        input_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    ).to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,  # Generate up to 64 new tokens
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )
    translations = []
    for output in outputs:
        translation = tokenizer.decode(output, skip_special_tokens=True)
        if "Vietnamese:" in translation:
            translations.append(translation.split("Vietnamese:")[1].strip())
        else:
            translations.append(translation)
    return translations

In [None]:
!kaggle kernels output tneduvn/gpt-mt-pre -p /kaggle/working

In [10]:
# Load saved model and tokenizer
model_path = "/kaggle/working/finetuned_gpt2_en_vi"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token  # Ensure pad_token is set
tokenizer.pad_token_id = tokenizer.eos_token_id  # Ensure pad_token_id is set

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained(model_path)
model.config.pad_token_id = tokenizer.pad_token_id  # Sync model's config
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50259, bias=False)
)

In [23]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

references = []
hypotheses = []

# Generate translations and collect references in batches
for batch in tqdm(test_dataloader, desc="Generating translations", leave=False, file=sys.stdout):
    texts = [text.split("[SEP]")[0].replace("English: ", "").strip() for text in batch["text"]]
    ref_texts = [text.split("[SEP]")[1].replace("Vietnamese: ", "").replace("<EOS>", "").strip() for text in batch["text"]]
    translations = translate_batch(texts, model, tokenizer)
    hypotheses.extend(translations)
    references.extend(ref_texts)

# Compute BLEU and ROUGE
bleu_results = bleu_metric.compute(predictions=hypotheses, references=references)
rouge_results = rouge_metric.compute(predictions=hypotheses, references=references)

print(f"BLEU Score: {bleu_results['bleu']:.4f}")
print(f"ROUGE Scores: ROUGE-1: {rouge_results['rouge1']:.4f}, ROUGE-2: {rouge_results['rouge2']:.4f}, ROUGE-L: {rouge_results['rougeL']:.4f}")

# Save results
with open("/kaggle/working/eval_results.txt", "w") as f:
    f.write(f"BLEU Score: {bleu_results['bleu']:.4f}\n")
    f.write(f"ROUGE-1: {rouge_results['rouge1']:.4f}\n")
    f.write(f"ROUGE-2: {rouge_results['rouge2']:.4f}\n")
    f.write(f"ROUGE-L: {rouge_results['rougeL']:.4f}\n")

BLEU Score: 0.2386                                                              
ROUGE Scores: ROUGE-1: 0.6786, ROUGE-2: 0.4468, ROUGE-L: 0.5943


In [24]:
# Example translation
sample_text = "If you don’t eat breakfast, you might feel tired before lunchtime"
print(f"Input: {sample_text}")
print(f"Translation: {translate_batch([sample_text], model, tokenizer)[0]}")

Input: If you don’t eat breakfast, you might feel tired before lunchtime
Translation: Nếu bạn không ăn sáng, bạn có thể cảm thấy mệt mỏi trước bữa trưa.
