# The Backtranslation Notebook to Rule Over Them All
Please do ignore the title, I just wanted to make it sound cool. This notebook is a simple demonstration of how to use backtranslation to improve the performance of a machine learning model. The notebook is divided into the following sections:
1. Introduction and Simple Generation
2. Applying Data Preparation/Filtering
3. Investigating Iterative Backtranslation
4. Applying a similar pipeline to a more complex model (ALMA-R)
5. Conclusion

In [2]:
# First cell in the notebook to enable autoreload of modules
%load_ext autoreload
%autoreload 2

In [84]:
from pathlib import Path
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from utils.data import TranslationDataModule
from utils.models import TranslationLightning
from pytorch_lightning import Trainer
from peft import LoraConfig, get_peft_model
# Isn't it so nice and clean now? I went through FOUR different ways of doing this before I thought of this one. WDWFDGFEQWDQWFGA

SRC = "de"
TGT = "en"
BATCH_SIZE = 8
MAX_LENGTH = 512

cwd = Path.cwd()
data_dir = cwd / "Data"

it_parallel = "it-parallel"
news_dataset = "train-euro-news-big"
it_mono = "it-mono"

test_folder = cwd / "tests"

mname = f"facebook/wmt19-{SRC}-{TGT}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0.2,
    target_modules = ["v_proj", "q_proj"]
)

it_parallel_data = TranslationDataModule(data_dir / it_parallel, SRC, TGT, tokenizer, model, batch_size=BATCH_SIZE, max_length=MAX_LENGTH)
# Add news_dataset


model = get_peft_model(model, config)
model.print_trainable_parameters()

model_pl = TranslationLightning(model, tokenizer, lr=3e-4, adam_beta=(0.9, 0.98), weight_decay=1e-4, test_folder = test_folder)

trainer = Trainer(max_steps=10,  gradient_clip_val=0.1, val_check_interval = 0.25, limit_val_batches=0.25)

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,179,648 || all params: 273,027,072 || trainable%: 0.4320626490841172


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [85]:
it_parallel_data.setup('fit')
dataset = it_parallel_data.train

In [124]:
batch = next(iter(it_parallel_data.train_dataloader()))
print(batch['input_ids'].shape, batch['labels'].shape, batch['decoder_input_ids'].shape)

[{'input_ids': tensor([[26298,  3875,    93,  2381,   144,    95,  2217,   125,   504,     4,
           319,    13,  3296,   125,   368,    10, 27477,   183,    10,  5040,
          2220,  3966,  2567,  8789,  1871,    12,   308,  3689,    83,   101,
            17,  8627,   205,    28,    13,    59,  1606,  3746,     5,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[13484,  1697,    16,  3875,  9989,   485,    94,    24,   144,    95,
          2217,   125,   504,    24,     4,   262,  3296,   125,   368,    10,
         27477,   183,  5040,  2220,  3849, 21794,    10,    11,    10, 10478,
             4,    41,   387,    41,  4864,  1562,   427,     5,     2]])}, {'input_ids': tensor([[3257,  730,  948,   20, 4162, 9859,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[8882, 1089,   20, 4162,  325,  536,    2]])}, 

TypeError: list indices must be integers or slices, not BatchEncoding

In [83]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


def prep_fn(sample):
    sample['input_ids'] = sample['input_ids'].squeeze()
    sample['attention_mask'] = sample['attention_mask'].squeeze()
    sample['labels'] = sample['labels'].squeeze()
    return sample
this = [prep_fn(dataset[i]) for i in range(8)]

# print(this)
batch = data_collator(this)
batch.labels[1], batch.decoder_input_ids[1]

(tensor([1124, 8001,   19, 1647,   38,   22,  675,    7,    6, 1775, 6221,   48,
            6, 1867,    4,  399, 1203, 3263,  119,   99, 7555,    9,    6, 1647,
            7,    6, 6221,    4,   11,  755, 8001,   61,   48,    6, 1867,    5,
            2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1]),
 tensor([   2, 1124, 8001,   19, 1647,   38,   22,  675,    7,    6, 1775, 6221,
           48,    6, 1867,    4,  399, 1203, 3263,  119,   99, 7555,    9,    6,
         1647,    7,    6, 6221,    4,   11,  755, 8001,   61,   48,    6, 1867,
            5,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,

In [4]:
it_parallel_data.setup('fit')
# batch = next(iter(it_parallel_data.train_dataloader()))

# print(batch["labels"].)

## Simple Generation
Here we simply load a pre-trained model and run inference on the test set. The model loaded is a pre-trained model from the [Hugging Face Transformers](https://huggingface.co/transformers/) library. 

TODO: Evaluate the model on the NEWS set and report the BLEU score.

| Model | it-parallel | NEWS |
| --- | --- | --- |
| Base | 38.307622648987454 | 0.0 |
| FT on IT-parallel | 29.919114415962934 | 0.0 |
| FT on BT | 29.919114415962934 | 0.0 |
| FT on IT-parallel and BT | 29.919114415962934 | 0.0 |

In [3]:
# IT-parallel
results = trainer.predict(model_pl, datamodule=it_parallel_data)
bleu = sum(r[2]["score"] for r in results) / len(results)
print("Average BLEU:", bleu)

# News corpus
# ...

In [None]:
# Now we can train the model
trainer.fit(model_pl, datamodule=it_parallel_data)

# IT-parallel
trainer.predict(model_pl, datamodule=it_parallel_data)
bleu = sum(r[2]["score"] for r in results) / len(results)
print("Average BLEU:", bleu)

# News corpus

### Now the key ingredient: Backtranslation
Backtranslation uses a reverse model to generate synthetic data. This synthetic data is then used to train the model. The idea is that the synthetic data will help the model generalize better.

In [None]:
reverse_model = FSMTForConditionalGeneration.from_pretrained(f"facebook/wmt19-{TGT}-{SRC}")
reverse_tokenizer = FSMTTokenizer.from_pretrained(f"facebook/wmt19-{TGT}-{SRC}")

it_mono = TranslationDataModule(data_dir / it_mono, TGT, SRC, reverse_tokenizer, batch_size=BATCH_SIZE, max_length=MAX_LENGTH)

output_folder = test_folder / f"generation-{TGT}-{SRC}"
reverse_lightning = TranslationLightning(reverse_model, reverse_tokenizer, lr=3e-4, adam_beta=(0.9, 0.98), weight_decay=1e-4, test_folder = output_folder)

trainer.predict(reverse_lightning, it_mono.train_dataloader())

Now that we have generated the data, we can now copy in the generated data.

In [None]:
train_file = output_folder / "hypothesis.hyp"
train_file_target = output_folder / "source.src"
train_file.rename(data_dir / f"generated-{it_mono}" / f"train.{SRC}")
train_file_target.rename(data_dir / f"generated-{it_mono}" / f"train.{TGT}")

import shutil
bt_dir = data_dir / f"generated-{it_mono}"
shutil.copy(data_dir / it_mono / f"dev.{SRC}", bt_dir / f"dev.{SRC}")
shutil.copy(data_dir / it_mono / f"dev.{TGT}", bt_dir / f"dev.{TGT}")
shutil.copy(data_dir / it_mono / f"test.{SRC}", bt_dir / f"test.{SRC}")
shutil.copy(data_dir / it_mono / f"test.{TGT}", bt_dir / f"test.{TGT}")


In [None]:
# And now we can finally train...
it_mono_bt = TranslationDataModule(data_dir / f"generated-{it_mono}", SRC, TGT, tokenizer, batch_size=BATCH_SIZE, max_length=MAX_LENGTH)
trainer.fit(model_pl, datamodule=it_mono_bt)

In [None]:
model_pl.test_folder = test_folder / f"bt_{it_mono}"
# IT-parallel
results = trainer.predict(model_pl, datamodule=it_parallel_data)
print("IT-parallel", sum(r[2]["score"] for r in results) / len(results))
# News corpus
# ...

And now finally we can concatenate both the original and the generated data and train the model on it.

In [None]:
parallel_plus_bt = data_dir / f"parallel+bt_generated_{it_mono}"
shutil.copytree(bt_dir, parallel_plus_bt)
train_file = parallel_plus_bt /f"train.{SRC}"
with open(train_file, "r+") as f:
    with open(data_dir / it_parallel / f"train.{SRC}", "r") as f2:
        f.write("\n" + f2.read())
        
train_file_target = parallel_plus_bt / f"train.{TGT}"
with open(train_file_target, "r+") as f:
    with open(data_dir / it_parallel / f"train.{TGT}", "r") as f2:
        f.write("\n" + f2.read())


In [None]:
# Fitting the model on the combined data
parallel_plus_bt_data = TranslationDataModule(parallel_plus_bt, SRC, TGT, tokenizer, batch_size=BATCH_SIZE, max_length=MAX_LENGTH)
trainer.fit(model_pl, datamodule=parallel_plus_bt_data)

In [None]:
# IT-parallel
results = trainer.predict(model_pl, datamodule=it_parallel_data)
print("IT-parallel BLEU:", sum(r[2]["score"] for r in results) / len(results))
# News corpus
# ...