In [None]:
!pip install transformers datasets evaluate accelerate

In [None]:
!pip install rouge_score

In [None]:
!pip install nltk

# Dataset

In [None]:
import os

def read_data(root, folder):
    """Load a parallel English/Vietnamese corpus from ``root/folder``.

    Every entry of the directory is opened as a text file and read line by
    line, with surrounding whitespace stripped from each line. Lines of files
    whose extension is ``.en`` are collected under ``'en'``; lines of every
    other file are collected under ``'vi'``.

    Args:
        root: Base directory that contains the split folders.
        folder: Name of the split folder inside ``root``.

    Returns:
        dict: ``{'en': [...], 'vi': [...]}`` with one string per line.
    """
    data = {'en': [], 'vi': []}
    path = os.path.join(root, folder)
    # Sort the listing: os.listdir order is arbitrary, and with several files
    # per language an unstable order could misalign the 'en' and 'vi' lists.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        # Only the final extension of the file name decides the language, so
        # dots elsewhere in the path (or a name without a dot) cannot break
        # the lookup.
        extension = os.path.splitext(file_name)[1]
        language = 'en' if extension == '.en' else 'vi'
        # Explicit UTF-8 so Vietnamese text is decoded the same way regardless
        # of the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data[language].extend(line.strip() for line in f)

    return data

In [None]:
# Base directory that holds the train / test / validation corpus folders
root = "/kaggle/input/"

# Read every split with the same loader; each result is a dict with
# 'en' and 'vi' lists of lines.
train_data, test_data, valid_data = (
    read_data(root, split_folder)
    for split_folder in ('trainnew', 'testnew', 'validd')
)

In [None]:
from datasets import Dataset
# Wrap each raw {'en': [...], 'vi': [...]} dict in a Dataset object
train_dataset, test_dataset, valid_dataset = (
    Dataset.from_dict(raw_split)
    for raw_split in (train_data, test_data, valid_data)
)

In [None]:
from datasets import DatasetDict
# Group the three splits under the names used by the rest of the notebook
dataset = DatasetDict(
    train=train_dataset,
    test=test_dataset,
    valid=valid_dataset,
)

In [None]:
# Bare expression: the notebook displays the DatasetDict built above
dataset

# Tokenizer

In [None]:
from transformers import AutoTokenizer, EncoderDecoderModel, AutoModel

# Checkpoint names for the two halves of the translation model
encoder_model_name = "bert-base-uncased"
decoder_model_name = "vinai/bartpho-word"

# One tokenizer per side, each loaded from its own checkpoint
encoder_tokenizer, decoder_tokenizer = (
    AutoTokenizer.from_pretrained(checkpoint_name)
    for checkpoint_name in (encoder_model_name, decoder_model_name)
)

In [None]:
# Fixed sequence lengths (in tokens) for the source and target sides
encoder_max_length = 64
decoder_max_length = 64


def process_data_to_model_inputs(batch):
    """Tokenize a batch of sentence pairs into encoder inputs and labels.

    The ``"en"`` texts are tokenized with ``encoder_tokenizer`` and the
    ``"vi"`` texts with ``decoder_tokenizer``, both padded/truncated to the
    fixed lengths defined above. The batch gains ``input_ids``,
    ``attention_mask`` and ``labels``; in ``labels`` every decoder padding
    token id is replaced by -100 (presumably so the loss skips padded
    positions — confirm against the model's loss).

    Args:
        batch: Mapping with ``"en"`` and ``"vi"`` lists of strings.

    Returns:
        The same ``batch`` object with the three new entries added.
    """
    source = encoder_tokenizer(
        batch["en"], padding="max_length", truncation=True, max_length=encoder_max_length
    )
    target = decoder_tokenizer(
        batch["vi"], padding="max_length", truncation=True, max_length=decoder_max_length
    )
    pad_id = decoder_tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Swap each padding id for -100, leave every other id untouched
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    batch["input_ids"] = source.input_ids
    batch["attention_mask"] = source.attention_mask
    batch["labels"] = list(map(_mask_padding, target.input_ids))
    return batch

In [None]:
# Number of examples handed to the tokenization function per call
batch_size = 128

# Tokenize every split; the raw text columns are dropped from the result
tokenizer_dataset = dataset.map(
    process_data_to_model_inputs,
    remove_columns=["en", "vi"],
    batch_size=batch_size,
    batched=True,
)

In [None]:
# Bare expression: the notebook displays the result of dataset.map above
tokenizer_dataset

# Metric

In [None]:
import evaluate
# Load the ROUGE metric through the `evaluate` library; compute_metrics calls rouge.compute
rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(pred):
    """Compute ROUGE-2 between generated and reference translations.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 marking positions
            to ignore), as passed by the trainer to ``compute_metrics``.

    Returns:
        dict: ``rouge2_precision``, ``rouge2_recall`` and ``rouge2_fmeasure``,
        each rounded to 4 decimals.

        NOTE(review): ``rouge.compute`` yields a single ``rouge2`` value here,
        so all three keys carry that same number. The keys are kept for
        backward compatibility; only one ROUGE-2 score is actually measured
        (with the `evaluate` ROUGE metric this is expected to be the
        aggregated F-measure — confirm against the metric card).
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    pad_id = decoder_tokenizer.pad_token_id

    # np.where builds new arrays, so pred.label_ids is not modified in place.
    # -100 is not a valid token id; map it to the pad id in BOTH arrays so
    # batch_decode can handle them (the trainer may also pad gathered
    # predictions with -100).
    labels_ids = np.asarray(pred.label_ids)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)
    pred_ids = np.asarray(pred.predictions)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)

    # Decode predictions and labels
    pred_str = decoder_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = decoder_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])
    rouge2_score = round(rouge_output["rouge2"], 4)

    return {
        "rouge2_precision": rouge2_score,
        "rouge2_recall": rouge2_score,
        "rouge2_fmeasure": rouge2_score,
    }


# Model

In [None]:
# Combine into an EncoderDecoderModel
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_model_name,
    decoder_model_name
)

# Special-token ids come from the decoder tokenizer because the decoder side
# produces the target text.
model.config.decoder_start_token_id = decoder_tokenizer.cls_token_id
model.config.pad_token_id = decoder_tokenizer.pad_token_id
# Also expose the end-of-sequence id on the combined config; without it
# generate() may not recognise a finished sequence and keep producing tokens
# until max_length. NOTE(review): some library versions already inherit this
# from the decoder config, in which case the assignment is a harmless no-op.
model.config.eos_token_id = decoder_tokenizer.eos_token_id

In [None]:
# Pick the tokenized splits used during training.
# Validation uses the dedicated 'valid' split: per-epoch evaluation drives
# early stopping and best-checkpoint selection, so using 'test' here would
# leak the held-out test set into model selection.
train_dataset = tokenizer_dataset['train']
val_dataset = tokenizer_dataset['valid']

# Training

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    save_total_limit=2,
    # Evaluate with generate() so compute_metrics receives generated token ids
    predict_with_generate=True,
    # Allow evaluation-time generation to be as long as the tokenized labels;
    # otherwise the library's default generation length applies and ROUGE is
    # computed on truncated outputs.
    generation_max_length=decoder_max_length,
    report_to="none",
    # Best checkpoint = lowest evaluation loss
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Checkpoints are saved on the same schedule as evaluation so the best
    # one can be reloaded at the end.
    save_strategy="epoch",
)


In [None]:
# Halt training once there has been no improvement for 2 evaluations in a row
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

In [None]:
# Wire the model, training arguments, data splits, metric function and
# early-stopping callback into a Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    compute_metrics=compute_metrics, callbacks=[early_stopping_callback],
)

# Run fine-tuning
trainer.train()

In [None]:
# Persist the model to ./translation_model via save_pretrained
model.save_pretrained("./translation_model")

In [None]:
# Tokenize a single English example sentence into PyTorch tensors
inputs = encoder_tokenizer(
    "On Sunday, September 1, 2019, Hurricane Dorian, one of the strongest hurricanes ever recorded in the Atlantic Ocean, with winds of 362 km/h, made landfall on Great Abaco Island, northern Bahamas.",
    max_length=64, truncation=True, padding=True, return_tensors="pt",
)

# Put every input tensor on the device the model lives on
inputs = dict((key, value.to(model.device)) for key, value in inputs.items())

# Beam-search generation (4 beams, at most 64 tokens)
outputs = model.generate(inputs["input_ids"], num_beams=4, max_length=64)

# Turn the first generated sequence back into text
print(decoder_tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from transformers import AutoTokenizer
# Translate every example of a test set and collect outputs next to references
def evaluate_model(model, encoder_tokenizer, decoder_tokenizer, test_data):
    """Generate a translation for each example and pair it with its reference.

    Args:
        model: Model exposing ``device`` and ``generate``.
        encoder_tokenizer: Callable that tokenizes the source sentence.
        decoder_tokenizer: Tokenizer whose ``decode`` turns generated ids into text.
        test_data: Iterable of mappings with ``"en"`` (source) and ``"vi"``
            (reference) entries.

    Returns:
        tuple: ``(predictions, references)`` — two lists in the order of
        ``test_data``.
    """

    def _translate(sentence):
        # Tokenize one sentence, move it to the model's device, generate with
        # beam search and decode the first returned sequence.
        encoded = encoder_tokenizer(sentence,
                                    padding=True,
                                    truncation=True,
                                    max_length=64,
                                    return_tensors="pt")
        on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
        generated = model.generate(on_device["input_ids"], max_length=64, num_beams=4)
        return decoder_tokenizer.decode(generated[0], skip_special_tokens=True)

    predictions, references = [], []
    for example in test_data:
        source_text, reference_text = example["en"], example["vi"]
        predictions.append(_translate(source_text))
        references.append(reference_text)

    return predictions, references


In [None]:
# Fixed-seed shuffle of the raw test split, keeping the first 10 examples
small_test_dataset = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(10))
)

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Smoothing function handed to sentence_bleu inside calculate_bleu
smoothing = SmoothingFunction().method1

# Calculate BLEU score with smoothing
def calculate_bleu(predictions, references):
    """Return the mean smoothed sentence-level BLEU over aligned pairs.

    Each prediction and its reference are split on whitespace and scored with
    ``sentence_bleu`` using the module-level ``smoothing`` function; the
    per-sentence scores are then averaged.

    Args:
        predictions: Iterable of predicted sentences (strings).
        references: Iterable of reference sentences (strings), aligned with
            ``predictions``.

    Returns:
        float: Average sentence BLEU, or ``0.0`` when there is nothing to score.
    """
    scores = [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
        for pred, ref in zip(predictions, references)
    ]
    # An empty evaluation set would otherwise divide by zero
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

# No earlier cell defines `predictions` / `references`, so produce them here by
# translating the small test sample before scoring it.
predictions, references = evaluate_model(
    model, encoder_tokenizer, decoder_tokenizer, small_test_dataset
)

bleu_score = calculate_bleu(predictions, references)
print(f"BLEU Score with Smoothing: {bleu_score:.4f}")
