In [1]:
# Recursively list the Kaggle input directory to see which dataset files are available.
!ls -R /kaggle/input

/kaggle/input:
vlsp-dataset

/kaggle/input/vlsp-dataset:
data

/kaggle/input/vlsp-dataset/data:
public_test.en.txt  public_test.vi.txt	train.en.txt  train.vi.txt


In [2]:
# Remove the preinstalled TensorFlow/Keras packages, then install the libraries
# used in the rest of the notebook. `%pip` (instead of `!pip`) guarantees the
# commands act on the environment of the running kernel.
# NOTE(review): only protobuf is pinned; consider pinning the other packages so
# a later re-run resolves to the same versions.
%pip uninstall -y tensorflow tensorflow-cpu tensorflow-gpu tensorflow-intel keras keras-nightly keras-preprocessing keras-vis tf-nightly tf-estimator-nightly tensorflow-estimator
%pip install -q transformers datasets sacrebleu sentencepiece accelerate evaluate protobuf==3.20.3

Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
[0mFound existing installation: keras 3.8.0
Uninstalling keras-3.8.0:
  Successfully uninstalled keras-3.8.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Directory that holds the parallel corpus files. read_parallel() appends the
# file names to this string verbatim, so they must start with "/".
base_path = "/kaggle/input/vlsp-dataset/data"

def read_parallel(src_file, tgt_file, base=None):
    """Load two line-aligned text files into a DataFrame with 'en' and 'vi' columns.

    Args:
        src_file: Name of the English-side file; appended verbatim to the base
            directory (so it is expected to start with "/").
        tgt_file: Name of the Vietnamese-side file; appended the same way.
        base: Directory prefix. When None (the default) the module-level
            ``base_path`` is used.

    Returns:
        pandas.DataFrame whose row i holds line i of ``src_file`` in column
        'en' and line i of ``tgt_file`` in column 'vi'. If the files have a
        different number of lines, the longer one is truncated and a warning
        is emitted.

    Raises:
        OSError: if either file cannot be opened.
    """
    import warnings

    prefix = base_path if base is None else base

    def _read_lines(path):
        # Split on "\n" only. str.splitlines() also breaks on characters such
        # as U+2028, U+0085 or form feed; one of those inside a sentence would
        # add an extra "line" and shift every following sentence pair.
        with open(path, encoding="utf-8") as handle:
            text = handle.read().strip()
        # "".split("\n") yields [""], so map an empty file to zero rows.
        return text.split("\n") if text else []

    src = _read_lines(prefix + src_file)
    tgt = _read_lines(prefix + tgt_file)

    if len(src) != len(tgt):
        # Truncation keeps the function usable, but unequal lengths usually
        # mean the corpus is misaligned, so make it visible.
        warnings.warn(
            f"Line count mismatch: {src_file} has {len(src)} lines, "
            f"{tgt_file} has {len(tgt)}; truncating to the shorter file."
        )

    n = min(len(src), len(tgt))
    return pd.DataFrame({'en': src[:n], 'vi': tgt[:n]})

# Read both splits from disk as en/vi DataFrames.
train_df = read_parallel("/train.en.txt", "/train.vi.txt")
test_df = read_parallel("/public_test.en.txt", "/public_test.vi.txt")

# Wrap each DataFrame in a Dataset and bundle them under the split names
# "train" and "test".
raw = DatasetDict({
    split: Dataset.from_pandas(frame)
    for split, frame in (("train", train_df), ("test", test_df))
})
print(raw)

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 500000
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 3000
    })
})


In [4]:
from transformers import AutoTokenizer

# Checkpoint identifier; used here for the tokenizer and again later to load the model.
model_name = "Helsinki-NLP/opus-mt-en-vi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Passed to the tokenizer as max_length (together with truncation=True) in preprocess().
max_length = 128

def preprocess(batch):
    """Tokenize one batch of sentence pairs.

    Args:
        batch: Mapping with an "en" entry (source sentences) and a "vi" entry
            (target sentences), each an iterable of strings.

    Returns:
        Whatever the global ``tokenizer`` returns when called with the "en"
        texts as inputs and the "vi" texts as ``text_target``, truncated to
        the global ``max_length``.
    """
    return tokenizer(
        list(batch["en"]),
        text_target=list(batch["vi"]),
        truncation=True,
        max_length=max_length,
    )

# Tokenize every split in batches, drop the raw text columns, and have the
# resulting datasets hand back torch tensors.
tokenized = raw.map(
    preprocess,
    batched=True,
    remove_columns=raw["train"].column_names,
).with_format("torch")

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import numpy as np

# Load the pretrained weights for the same checkpoint the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Collator configured with padding="longest"; it also receives the model instance.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest")
# sacreBLEU metric object; compute_metrics() calls its .compute() and reads "score".
metric = evaluate.load("sacrebleu")

def postprocess(preds, labels):
    """Normalise decoded texts into the layout handed to the metric.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A tuple ``(preds, labels)`` where ``preds`` is a list of
        whitespace-stripped strings and ``labels`` is a list of one-element
        lists, each holding one stripped reference.
    """
    stripped_preds = []
    for prediction in preds:
        stripped_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        # One reference per prediction, wrapped in its own list.
        wrapped_labels.append([reference.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with the global ``metric``.

    Args:
        eval_pred: A pair ``(preds, labels)`` of token-id arrays. ``preds``
            may also be a tuple whose first element is the id array.

    Returns:
        ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        ``metric.compute(...)``.
    """
    preds, labels = eval_pred
    # Some models hand back a tuple; the token ids are its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. It can show up in predictions as well as labels (batches of
    # different lengths are padded with it when concatenated), so swap it for
    # the pad token in both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# Training configuration, grouped by purpose. All values are passed as keywords,
# so their order has no effect.
args = Seq2SeqTrainingArguments(
    # Where run outputs are written, and how many checkpoints to keep.
    output_dir="/kaggle/working/envi_results",
    save_total_limit=5,

    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    fp16=True,

    # Batch sizes: 16 per device, with gradients accumulated over 2 steps.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,

    # Evaluate once per epoch, generating sequences so compute_metrics()
    # receives token ids it can decode.
    eval_strategy="epoch",
    predict_with_generate=True,

    # Logging and reporting.
    logging_steps=100,
    report_to="none",

    # Distributed-training option.
    ddp_find_unused_parameters=False,
)

pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/289M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [6]:
from accelerate import notebook_launcher

def train_fn():
    """Fine-tune the global ``model`` and return the trainer that ran the job.

    Uses the notebook-level ``model``, ``args``, ``tokenized``, ``tokenizer``,
    ``data_collator`` and ``compute_metrics``. The "train" split is used for
    training and the "test" split for evaluation.

    Returns:
        The ``Seq2SeqTrainer`` instance after ``train()`` has finished, so it
        can still be used afterwards (for example to evaluate or save).
    """
    import inspect

    # Newer transformers releases deprecate the `tokenizer=` keyword in favour
    # of `processing_class=`. Pick whichever the installed version accepts so
    # the cell runs without the deprecation warning and keeps working once the
    # old keyword is removed.
    init_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in init_params else "tokenizer"

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )
    trainer.train()
    return trainer

# Keep a handle on the trainer; assigning also avoids echoing its repr as cell output.
trainer = train_fn()

  trainer = Seq2SeqTrainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss,Bleu
1,1.6663,1.581191,41.643524
2,1.4605,1.422846,43.652587
3,1.3567,1.353307,44.394005
4,1.32,1.319914,44.68785
5,1.2966,1.308489,45.063901


