# The Backtranslation Notebook to Rule Over Them All
Please ignore the title; I just wanted it to sound cool. This notebook is a simple demonstration of how to use backtranslation to improve the performance of a machine translation model. The notebook is divided into the following sections:
1. Introduction and Simple Generation
2. Applying Data Preparation/Filtering
3. Investigating Iterative Backtranslation
4. Applying a similar pipeline to a more complex model (ALMA-R)
5. Conclusion

In [32]:
# # Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# # Navigate to your project directory on Google Drive
# import os
# project_dir = '/content/drive/MyDrive/nlp-backtranslation'
# os.chdir(project_dir)

# # Install required packages
# !pip install transformers
# !pip install pytorch-lightning
# !pip install peft
# !pip install evaluate
# !pip install sentencepiece
# !pip install -U sacremoses
# !pip install sacrebleu
# !pip install unbabel-comet

In [33]:
# First cell in the notebook to enable autoreload of modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import pandas as pd
from calculate_bleu import calculate_bleu, calculate_comet

In [26]:

from pathlib import Path
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from utils.data import TranslationDataModule
from utils.models import TranslationLightning
from pytorch_lightning import Trainer
from peft import LoraConfig, get_peft_model
from pytorch_lightning.callbacks import ModelCheckpoint
# Isn't it so nice and clean now? I went through FOUR different ways of doing this before I thought of this one.

# Translation direction (source -> target language) and batching limits.
SRC = "de"
TGT = "en"
BATCH_SIZE = 16
MAX_LENGTH = 128

# Paths are resolved relative to the directory the notebook is run from.
cwd = Path.cwd()
data_dir = cwd / "Data"

# Dataset / generation folder names (presumably sub-folders of data_dir -- confirm against utils.data).
it_parallel = "it-parallel"
news_dataset = "train-euro-news-big"
it_mono = "it-mono"
generation_folder = "generation+it-parallel-en-de"

test_folder = cwd / "tests"
output_folder = test_folder / f"generation-{TGT}-{SRC}"

# Pretrained FSMT checkpoint for the SRC -> TGT direction.
mname = f"facebook/wmt19-{SRC}-{TGT}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

# LoRA adapters on the attention query/value projections.
config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0.2,
    target_modules = ["v_proj", "q_proj"]
)

# NOTE(review): dirpath is a hard-coded Colab/Google Drive location while the Drive mount
# at the top of the notebook is commented out -- confirm this is where checkpoints should go.
checkpoint_callback = ModelCheckpoint(
    dirpath='/content/drive/MyDrive/checkpoints',
    save_top_k=1,
    monitor='val_loss',
    mode='min'
)

# it_parallel_data = TranslationDataModule(data_dir / it_parallel, SRC, TGT, tokenizer, None, batch_size=BATCH_SIZE, max_length=MAX_LENGTH)
# # Add news_dataset
# combined_data_module = TranslationDataModule(data_dir,
#                                              SRC,
#                                              TGT,
#                                              tokenizer,
#                                              None,
#                                              batch_size=BATCH_SIZE,
#                                              max_length=MAX_LENGTH,
#                                              use_combined_data=True,
#                                              generation_folder=generation_folder)
# combined_data_module.setup('fit')

model = get_peft_model(model, config)
model.print_trainable_parameters()

#model_pl = TranslationLightning(model, tokenizer, lr=3e-4, adam_beta=(0.9, 0.98), weight_decay=1e-4, output_dir = output_folder)

# check_val_every_n_epoch must not exceed max_epochs: with the previous value of 5 and a
# single training epoch, validation was never scheduled, so the 'val_loss' metric monitored
# by checkpoint_callback was never produced. Validation now runs every epoch (at each quarter
# of the epoch via val_check_interval), matching the Trainer built in train_and_evaluate.
trainer = Trainer(
    max_epochs=1,
    check_val_every_n_epoch=1,
    gradient_clip_val=0.3,
    val_check_interval=0.25,
    limit_val_batches=0.25,
    callbacks=[checkpoint_callback],
)

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


trainable params: 1,179,648 || all params: 273,027,072 || trainable%: 0.4321


In [27]:
import argparse
from evaluate import load
from statistics import stdev
import pandas as pd
from pathlib import Path
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from pytorch_lightning import Trainer
from utils.data import TranslationDataModule
from utils.models import TranslationLightning
from peft import LoraConfig, get_peft_model

# --- Experiment configuration ---------------------------------------------
SRC, TGT = "de", "en"
BATCH_SIZE, MAX_LENGTH = 16, 128

cwd = Path.cwd()
data_dir = cwd / "Data"
output_folder_base = cwd / "results"
generation_folder = "generation+it-parallel-en-de"

# --- Pretrained FSMT checkpoint wrapped with LoRA adapters -----------------
mname = f"facebook/wmt19-{SRC}-{TGT}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.2,
    target_modules=["v_proj", "q_proj"],
)
model = get_peft_model(model, config)
# Prints how many parameters remain trainable after the LoRA wrap.
model.print_trainable_parameters()

def calculate_bleu(hyps: list[str], refs: list[str]):
    """Score hypotheses against references with the ``sacrebleu`` metric.

    Note: this definition shadows the ``calculate_bleu`` imported from the
    ``calculate_bleu`` module in an earlier cell of this notebook.

    Args:
        hyps: Hypothesis strings, forwarded as ``predictions``.
        refs: Reference strings, forwarded as ``references``.

    Returns:
        The object returned by the metric's ``compute`` call, unchanged.
    """
    metric = load("sacrebleu")
    return metric.compute(predictions=hyps, references=refs)

def calculate_comet(hyps: list[str], refs: list[str], src: list[str]):
    """Compute the COMET metric and format it as ``mean±stdev``.

    Args:
        hyps: Hypothesis strings, forwarded as ``predictions``.
        refs: Reference strings, forwarded as ``references``.
        src: Source strings, forwarded as ``sources``.

    Returns:
        A string of the form ``"COMET score: <mean>±<stdev>"`` with three
        decimals each. The mean is the metric's ``mean_score``; the spread is
        the sample standard deviation of its per-segment ``scores``. When
        fewer than two segment scores are returned the spread is reported as
        ``nan`` instead of raising ``statistics.StatisticsError``.
    """
    result = load("comet").compute(predictions=hyps, references=refs, sources=src)
    segment_scores = result["scores"]
    # statistics.stdev needs at least two data points.
    spread = stdev(segment_scores) if len(segment_scores) > 1 else float("nan")
    return f"COMET score: {result['mean_score']:.3f}±{spread:.3f}"

def _read_lines(path):
    """Read a text file into a list of lines with the trailing newline removed."""
    with open(path, "r") as handle:
        return [line.rstrip("\n") for line in handle]


def _as_number(value):
    """Convert a logged metric exposing ``.item()`` (e.g. a 0-dim tensor) to a plain number."""
    return value.item() if hasattr(value, "item") else value


def train_and_evaluate(top_percentage=None, log_file='training_log.csv'):
    """Fine-tune on one data selection, score the predictions and log the run.

    Each call trains its own copy of the module-level ``model`` so that the
    runs being compared all start from the same weights.

    Args:
        top_percentage: Value forwarded to ``TranslationDataModule`` as
            ``top_percentage``. ``None`` disables ``use_combined_data`` and
            forwards ``1.0`` instead.
        log_file: CSV file that receives one row per call; it is created with
            a header on first use and appended to afterwards.

    Returns:
        The list returned by ``trainer.predict``.

    Side effects:
        Creates ``output_folder_base / f"{top_percentage}_percent"``, stores
        checkpoints there, and reads ``hypothesis.hyp`` / ``reference.ref``
        (and ``source.src`` when present) from that folder -- presumably
        written by ``TranslationLightning`` during prediction; confirm in
        ``utils.models``.
    """
    import copy  # local import so the cell does not depend on an extra import cell

    output_folder = output_folder_base / f"{top_percentage}_percent"
    output_folder.mkdir(parents=True, exist_ok=True)

    combined_data_module = TranslationDataModule(
        data_dir,
        SRC,
        TGT,
        tokenizer,
        None,
        batch_size=BATCH_SIZE,
        max_length=MAX_LENGTH,
        use_combined_data=(top_percentage is not None),
        generation_folder=generation_folder,
        # Only None means "everything"; an explicit 0 is no longer turned into 1.0.
        top_percentage=1.0 if top_percentage is None else top_percentage
    )
    combined_data_module.setup('fit')

    # Train a private copy: fitting the shared global model would make every run
    # continue from the previous run's adapter weights (assumes TranslationLightning
    # trains the module it is handed in place).
    run_model = copy.deepcopy(model)
    model_pl = TranslationLightning(run_model, tokenizer, lr=3e-4, adam_beta=(0.9, 0.98), weight_decay=1e-4, output_dir=output_folder)

    # Check the first few samples of the train dataset (fewer if the dataset is tiny)
    for i in range(min(5, len(combined_data_module.train))):
        print(f"Sample {i+1}:")
        print("Source:", combined_data_module.train.src_texts[i])
        print("Target:", combined_data_module.train.tgt_texts[i])

    # Check the dataset sizes
    print("Train dataset size:", len(combined_data_module.train))
    print("Validation dataset size:", len(combined_data_module.val))
    print("Test dataset size:", len(combined_data_module.test))

    # Setup the trainer
    checkpoint_callback = ModelCheckpoint(
        dirpath=output_folder,
        filename="{epoch}-{val_loss:.2f}",
        monitor="val_loss",
        save_top_k=1,
        mode="min",
    )
    trainer = Trainer(
        max_epochs=1,
        check_val_every_n_epoch=1,
        gradient_clip_val=0.3,
        val_check_interval=0.25,
        limit_val_batches=0.25,
        callbacks=[checkpoint_callback]
    )

    # Train and evaluate
    trainer.fit(model_pl, datamodule=combined_data_module)
    results = trainer.predict(model_pl, datamodule=combined_data_module)
    # Average of the values returned by predict; None when nothing was predicted.
    bleu = sum(results) / len(results) if results else None
    print("Average BLEU:", bleu)

    # Calculate BLEU score (lines are stripped of their newline before scoring)
    hyps = _read_lines(output_folder / "hypothesis.hyp")
    refs = _read_lines(output_folder / "reference.ref")
    bleu_score = calculate_bleu(hyps, refs)
    print("BLEU score:", bleu_score)

    # Calculate COMET score if source file exists
    comet_score = None
    src_path = output_folder / "source.src"
    if src_path.exists():
        src = _read_lines(src_path)
        comet_score = calculate_comet(hyps, refs, src)
        print("COMET score:", comet_score)

    # Log results to CSV. Every row carries the same columns (comet_score is empty
    # when COMET was not computed) so rows appended without a header stay aligned.
    logged = model_pl.trainer.logged_metrics
    log_data = {
        'top_percentage': top_percentage,
        'train_loss': _as_number(logged.get('train_loss', None)),
        'val_loss': _as_number(logged.get('val_loss', None)),
        'val_bleu': _as_number(logged.get('val_bleu', None)),
        'average_bleu': _as_number(bleu),
        'bleu_score': bleu_score['score'],
        'comet_score': comet_score,
    }

    df = pd.DataFrame([log_data])
    if not Path(log_file).exists():
        df.to_csv(log_file, index=False)
    else:
        df.to_csv(log_file, mode='a', header=False, index=False)

    return results

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,179,648 || all params: 273,027,072 || trainable%: 0.4321


In [34]:
# Run the experiment with top_percentage=0.5.
print("Training with 50% data:")
train_and_evaluate(0.5, log_file="training_log.csv")

In [35]:
# Run the experiment with top_percentage=0.3.
print("Training with 30% data:")
train_and_evaluate(0.3, log_file="training_log.csv")

In [36]:
# Run the experiment with top_percentage=0.7.
print("Training with 70% data:")
train_and_evaluate(0.7, log_file="training_log.csv")

In [37]:
# Ablation: top_percentage=None, i.e. the run is made on all data without applying the metric.
print("Ablation study: Using all data without applying the metric:")
train_and_evaluate(None, log_file="training_log.csv")