In [2]:
from transformers import MarianTokenizer, MarianMTModel, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

train_data = pd.read_csv('../../data/OurDataEn-Vi/train.txt', sep='\t')
val_data = pd.read_csv('../../data/OurDataEn-Vi/validation.txt', sep='\t')
test_data = pd.read_csv('../../data/OurDataEn-Vi/test.txt', sep='\t')

train_data.columns = ['input_text', 'target_text']
val_data.columns = ['input_text', 'target_text']
test_data.columns = ['input_text', 'target_text']


In [3]:
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

In [4]:
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-vi")

def preprocess_function(examples):
    model_inputs = tokenizer(examples['input_text'], max_length=64, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



Map:   0%|          | 0/28999 [00:00<?, ? examples/s]



Map:   0%|          | 0/1013 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [5]:
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-vi").to('cuda')

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=3,
    push_to_hub=False,
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Huấn luyện
trainer.train()


pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

  trainer = Trainer(


model.safetensors:   0%|          | 0.00/289M [00:00<?, ?B/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.1372,0.117859
2,0.1039,0.107922
3,0.0815,0.105261


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=2721, training_loss=0.12222607286481987, metrics={'train_runtime': 707.1061, 'train_samples_per_second': 123.032, 'train_steps_per_second': 3.848, 'total_flos': 1474528026820608.0, 'train_loss': 0.12222607286481987, 'epoch': 3.0})

In [6]:
import sacrebleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

def generate_predictions(dataset):
    predictions = []
    for example in dataset:
        input_ids = tokenizer(example['input_text'], return_tensors="pt").input_ids.to('cuda')
        output_ids = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(prediction)
    return predictions
    
predictions = generate_predictions(test_dataset)
references = test_dataset['target_text']

def compute_bleu(predictions, references):
    return sacrebleu.corpus_bleu(predictions, [references])

bleu_score = compute_bleu(predictions, references)
print(f"BLEU score: {bleu_score.score}")


BLEU score: 69.54124082131071


In [7]:
def compute_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        for key in rouge_scores:
            rouge_scores[key].append(score[key].fmeasure)
    
    avg_rouge_scores = {key: sum(value)/len(value) for key, value in rouge_scores.items()}
    return avg_rouge_scores
    
rouge_scores = compute_rouge(predictions, references)
print(f"ROUGE scores: {rouge_scores}")


ROUGE scores: {'rouge1': 0.8931077620868283, 'rouge2': 0.8062295825556249, 'rougeL': 0.8757833053635277}


In [8]:
def calculate_meteor(translated_texts, reference_texts):
    scores = [meteor_score([ref], trans) for ref, trans in zip(reference_texts, translated_texts)]
    return sum(scores) / len(scores)

meteor_score_avg = calculate_meteor(predictions, references)
print(f"Average METEOR score: {meteor_score_avg}")


Average METEOR score: 0.8476562426106298
