In [None]:
!pip uninstall -y tensorflow tensorflow-cpu tensorflow-gpu tensorflow-intel keras keras-nightly keras-preprocessing keras-vis tf-nightly tf-estimator-nightly tensorflow-estimator
!pip install -q transformers datasets sacrebleu sentencepiece accelerate evaluate protobuf==3.20.3

In [None]:
# Recursively list everything under the Kaggle input directory so the dataset
# folder and file names used in the next cell can be checked.
!ls -R /kaggle/input

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Root folder of the IWSLT'15 English-Vietnamese dataset on Kaggle.
base_path = "/kaggle/input/iwslt15-englishvietnamese/IWSLT'15 en-vi"

def read_parallel(src_file, tgt_file, base=None):
    """Load a line-aligned parallel corpus into a two-column DataFrame.

    Args:
        src_file: Path of the English file, appended verbatim to the base
            folder (so it is expected to start with "/").
        tgt_file: Path of the Vietnamese file, appended the same way.
        base: Optional folder prefix; defaults to the module-level
            ``base_path`` when omitted.

    Returns:
        pandas.DataFrame with columns ``en`` and ``vi``; row ``i`` holds line
        ``i`` of each file.

    Notes:
        Lines are split on "\\n" only. ``str.splitlines()`` also breaks on
        characters such as U+2028, U+0085 or form feed, so a single such
        character inside a sentence would shift every following pair out of
        alignment. If the two files still differ in length, the longer one is
        truncated (as before) and a warning is emitted so the mismatch is
        visible.
    """
    import warnings

    root = base_path if base is None else base

    def _read_sentences(path):
        # Text mode already normalises \r\n and \r to \n.
        with open(path, encoding="utf-8") as handle:
            text = handle.read().strip()
        return text.split("\n") if text else []

    src = _read_sentences(root + src_file)
    tgt = _read_sentences(root + tgt_file)

    if len(src) != len(tgt):
        warnings.warn(
            f"Parallel files differ in length ({src_file}: {len(src)} lines, "
            f"{tgt_file}: {len(tgt)} lines); truncating to the shorter one.",
            stacklevel=2,
        )

    n = min(len(src), len(tgt))
    return pd.DataFrame({'en': src[:n], 'vi': tgt[:n]})

# Read each split once, keyed by the split name used in the DatasetDict.
split_frames = {
    "train": read_parallel("/train.en.txt", "/train.vi.txt"),
    "validation": read_parallel("/tst2012.en.txt", "/tst2012.vi.txt"),
    "test": read_parallel("/tst2013.en.txt", "/tst2013.vi.txt"),
}
train_df = split_frames["train"]
val_df = split_frames["validation"]
test_df = split_frames["test"]

# Wrap the DataFrames as Hugging Face datasets, one per split.
raw = DatasetDict(
    {split: Dataset.from_pandas(frame) for split, frame in split_frames.items()}
)
print(raw)

In [None]:
# Show the training record at index 1 (an 'en'/'vi' pair) as a quick visual check.
raw["train"][1]

In [None]:
# Show the validation record at index 1 (an 'en'/'vi' pair) as a quick visual check.
raw["validation"][1]

In [None]:
# Show the test record at index 1 (an 'en'/'vi' pair) as a quick visual check.
raw["test"][1]

In [None]:
# Maximum number of tokens kept per sequence when the data is tokenized.
max_length = 128

# Pretrained English -> Vietnamese checkpoint: tokenizer and seq2seq model.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "Helsinki-NLP/opus-mt-en-vi"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Display the validation record at index 1 again; its English side is
# translated in the following cell.
raw["validation"][1]

In [None]:
# Translate one validation sentence with the checkpoint as loaded, before any
# fine-tuning, and display the decoded output.
article = raw["validation"][1]["en"]
inputs = tokenizer(article, return_tensors="pt")

translated_tokens = model.generate(**inputs, max_length=256)
decoded_translations = tokenizer.batch_decode(
    translated_tokens, skip_special_tokens=True
)
decoded_translations[0]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of English/Vietnamese pairs for seq2seq training.

    Args:
        examples: Batch mapping with an ``"en"`` list (source sentences) and a
            ``"vi"`` list (target sentences), as passed by ``Dataset.map`` with
            ``batched=True``.

    Returns:
        The tokenizer output for the source sentences with an extra
        ``"labels"`` entry holding the target token ids. Both sides are
        truncated to the module-level ``max_length``.
    """
    # Targets must go through `text_target` so the tokenizer encodes them in
    # target-language mode. OPUS-MT (Marian) tokenizers use a different
    # SentencePiece model for the target side, so encoding Vietnamese through
    # the plain source-side call produces sub-optimal label ids.
    return tokenizer(
        examples["en"],
        text_target=examples["vi"],
        max_length=max_length,
        truncation=True,
    )


In [None]:
def _tokenize_split(split_name):
    """Run preprocess_function over one split of `raw`, dropping its original columns."""
    split = raw[split_name]
    return split.map(
        preprocess_function,
        batched=True,
        batch_size=128,
        remove_columns=split.column_names,
    )


# Same tokenization settings for every split.
tokenized_datasets_train = _tokenize_split("train")
tokenized_datasets_validation = _tokenize_split("validation")
tokenized_datasets_test = _tokenize_split("test")

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Partial freezing of the encoder and decoder stacks.
#
# Every parameter is first reset to trainable so re-running this cell always
# starts from the same state.
for parameter in model.parameters():
    parameter.requires_grad = True

# NOTE: despite its name, this value acts as the number of *trailing* layers in
# each stack that stay trainable; all layers before them are frozen.
# NOTE(review): if a stack has 10 layers or fewer, the cutoff below is <= 0 and
# nothing is frozen - confirm that this is the intended setting for this model.
num_layers_to_freeze = 10  # Adjust as needed


def _freeze_leading_layers(layers, num_trailing_trainable):
    """Disable gradients for every layer except the last `num_trailing_trainable`."""
    cutoff = len(layers) - num_trailing_trainable
    for layer_index, layer in enumerate(layers):
        if layer_index < cutoff:
            for parameter in layer.parameters():
                parameter.requires_grad = False


# Each stack is measured against its own length (the decoder was previously
# compared against the encoder's layer count).
_freeze_leading_layers(model.model.encoder.layers, num_layers_to_freeze)
_freeze_leading_layers(model.model.decoder.layers, num_layers_to_freeze)

In [None]:
import evaluate

# SacreBLEU scorer from the `evaluate` library; compute_metrics below calls
# metric.compute(...) and reports its "score" field.
metric = evaluate.load("sacrebleu")

import numpy as np


def compute_metrics(eval_preds):
    """Compute corpus-level BLEU for generated translations.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may be a tuple, in which case only its first
            element is used.

    Returns:
        ``{"bleu": <score>}`` where the score is the ``"score"`` field returned
        by the module-level ``metric``.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id. It can appear in
    # the predictions as well as in the labels (batches of generated ids are
    # padded with it when concatenated), so replace it on both sides before
    # decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing; references are wrapped in a list because
    # the metric accepts several references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [None]:
import torch

# Check if a GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import Seq2SeqTrainingArguments

model.to(device)
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/envi_results",
    eval_strategy="epoch",
    # NOTE: max_steps below takes precedence over num_train_epochs, so training
    # stops after max_steps optimizer updates regardless of this value.
    num_train_epochs=5,
    gradient_checkpointing=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # effective train batch: 32 * 2 per device
    learning_rate=2e-5,
    warmup_steps=2,
    max_steps=2000,
    # Mixed precision is only enabled when CUDA is present; requesting fp16 on
    # the CPU fallback above makes the arguments fail validation.
    fp16=torch.cuda.is_available(),
    optim='adafactor',
    # "eval_bleu" is the "bleu" key returned by compute_metrics with the
    # default "eval" prefix.
    metric_for_best_model="eval_bleu",
    predict_with_generate=True,  # evaluate on generated text, needed for BLEU
    logging_steps=100,
    save_total_limit=2,
    push_to_hub=False,
    ddp_find_unused_parameters=False,
    report_to="none"
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, arguments, tokenized splits, collator and BLEU metric together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_validation,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune; evaluation on the validation split follows training_args.
trainer.train()

In [None]:
# Score the fine-tuned model on the held-out test split and print the metrics dict.
print("Evaluating on test set:")
test_metrics = trainer.evaluate(tokenized_datasets_test)
print(test_metrics)