# NLP Backtranslation
Let's define some variables:

In [1]:
import shlex
from typing import Dict, List, Union, Tuple
from pathlib import Path
import subprocess

SRC = "de"
TGT = "en"


cwd = Path.cwd()
data_dir = cwd / "Data"
model_dir = cwd / "Models" / "hugging_face"

it_parallel = "it-parallel"
news_dataset = "train-euro-news-big"
it_mono = "it-mono"

test_folder = cwd / "tests"

sentencepiece_script = cwd / "spm_encode.py"

In [3]:
from huggingface_hub import hf_hub_download

base_model_en_de = hf_hub_download("rinto/transformer_wmt_en_de", "checkpoint_best-en-de.pt", local_dir=model_dir)
base_model_en_de = hf_hub_download("rinto/transformer_wmt_en_de", "checkpoint_best-de-en.pt", local_dir=model_dir)
dict_de = hf_hub_download("rinto/transformer_wmt_en_de", "dict.de.txt", local_dir=model_dir)
dict_en = hf_hub_download("rinto/transformer_wmt_en_de", "dict.en.txt", local_dir=model_dir)
sentencepiece_model = hf_hub_download("rinto/transformer_wmt_en_de", "spm.model", local_dir=model_dir)

from tokenizers import SentencePieceUnigramTokenizer
tokenizer = SentencePieceUnigramTokenizer.from_spm(
    sentencepiece_model
)

tokens = tokenizer.encode("Hello, how are you?")
tokenizer.decode(tokens.ids)

'Hello, how are you?'

In [76]:
def run_command(args: List[str]):
    with subprocess.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, shell=True
    ) as proc:
        for line in proc.stdout:
            print(line)
    return proc

def print_file(path: Path):
    with path.open() as f:
        print(f.read())

def get_train_model_args(
    path_to_data,
    arch="transformer_wmt_en_de",
    max_update=10,
    model_dir: Path | str ="Models",
    experiment_name="test-de-en",
    lr=6e-4,
):
    return [
        "fairseq-train",
        str(path_to_data),
        "--arch",
        arch,
        "--task translation",
        "--share-decoder-input-output-embed",
        "--optimizer adam",
        "--adam-betas '(0.9, 0.98)'",
        "--clip-norm 0.1",
        "--lr",
        lr,
        "--lr-scheduler inverse_sqrt",
        "--warmup-updates 2500",
        "--warmup-init-lr 1e-07",
        "--stop-min-lr 1e-09",
        "--dropout 0.3",
        "--weight-decay 0.0001",
        "--criterion label_smoothed_cross_entropy",
        "--label-smoothing 0.1",
        "--max-tokens 8192",
        "--max-update",
        max_update,
        "--update-freq 8",
        "--patience 10",
        "--scoring sacrebleu",
        "--eval-bleu",
        '--eval-bleu-args \'{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}\'',
        "--eval-bleu-detok moses",
        "--eval-bleu-remove-bpe",
        "--eval-bleu-print-samples",
        "--best-checkpoint-metric bleu",
        "--maximize-best-checkpoint-metric",
        "--save-interval-updates 2000",
        "--validate-interval-updates 2000",
        "--keep-best-checkpoints 1",
        "--encoder-learned-pos",
        "--save-dir",
        str(model_dir) + "/" + experiment_name,
        "--bpe sentencepiece",
    ]


def get_bleu_score(experiment: Path):
    """Returns the command to calculate the BLEU score. Final path component is the result file."""
    # cat $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.hyp | sacrebleu $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.ref -m bleu > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.sacrebleu
    return "cat {0}.hyp | sacrebleu {0}.ref -m bleu > {0}.sacrebleu".format(
        experiment
    ), experiment.with_suffix(".sacrebleu")


# [

#         "cat",
#         str(experiment.with_suffix(".hyp")),
#         "| sacrebleu",
#         str(experiment.with_suffix(".ref")),
#         "-m",
#         "bleu >",
#        str( experiment.with_suffix(".sacrebleu")),
#     ]


def generate_args(
    path_to_data: str | Path,
    subset: str,
    src: str,
    tgt: str,
    model_checkpoint: str | Path,
    save_dir: str,
) -> list[str]:
    """
    Generates the arguments for the fairseq-generate command

    Parameters
    ----------
    path_to_data : str
        Path to the data directory (the data must be binarized)
    subset : str
        The subset to generate the outputs for (e.g., test)
    src : str
        The source language
    tgt : str
        The target language
    model_checkpoint : str
        The path to the model checkpoint
    """
    # fairseq-generate Data/$TEST/bin \
    #  --gen-subset test --source-lang $SRC --target-lang $TGT \
    #  --path./Models/$MODEL/checkpoint_best.pt \
    #  --skip-invalid-size-inputs-valid-test \
    #  --batch-size 128 --beam 5 --remove-bpe sentencepiece > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT
    return f"fairseq-generate {path_to_data} --gen-subset {subset} --source-lang {src} --target-lang {tgt} --path {model_checkpoint} --skip-invalid-size-inputs-valid-test --batch-size 128 --beam 5 --remove-bpe sentencepiece > {save_dir}"


def process_outputs(experiment_name: Path):
    """Outputs the processed outputs to the output directory

    Parameters
    ----------
    experiment_name : Path
        The name of the experiment

    Returns
    -------
    Dict[str, str]
        A dictionary containing the keys `hyp`, `ref`, and `src` with the corresponding commands
    """
    # cat $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT | grep -p ^H | sort -V | cut -f3- | sacremoses detokenize > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.hyp
    # cat $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT | grep -p ^T | sort -V | cut -f2- | sacremoses detokenize > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.ref
    # cat $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT | grep -p ^S | sort -V | cut -f2- | sacremoses detokenize > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.src
    base = f"cat {experiment_name} | grep -p {{grep}} | sort -V | cut -f{{cut}}- | sacremoses detokenize > {experiment_name}.{{ext}}"
    return {
        "hyp": base.format(grep="^H", cut=3, ext="hyp"),
        "ref": base.format(grep="^T", cut=2, ext="ref"),
        "src": base.format(grep="^S", cut=2, ext="src"),
    }


def tokenize_input(
    source_file: Path, target_file: Path, src: str, tgt: str, sentencepiece_script: Path
) -> Dict[str, Union[Path, List[str]]]:
    """
    Tokenizes the input files using sacremoses and sentencepiece

    Parameters
    ----------
    source_file : Path
        The path to the source file
    target_file : Path
        The path to the target file
    src : str
        The source language
    tgt : str
        The target language
    sentencepiece_script : Path
        The path to the sentencepiece script

    Returns
    -------
    Dict[str, Union[Path, List[str]]]
        A dictionary containing the keys `tokenize_source_side`, `tokenize_target_side`, `encode`, and `output_files`

    """
    # # tokenize train-mono, dev, test
    # cat $src_train | sacremoses -l $SRC -j 4 normalize -c tokenize -a > $train_file.tok.$SRC
    # cat $tgt_train | sacremoses -l $TGT -j 4 normalize -c tokenize -a > $train_file.tok.$TGT
    # # separated for clarity
    # python ./spm_encode.py --model="$spm" \
    #     --output_format=piece \
    #     --inputs $train_file.tok.$SRC $train_file.tok.$TGT  \
    #     --outputs  $train_file.tok.spm.$SRC $train_file.tok.spm.$TGT
    intermediate_files = (
        str(source_file.with_suffix(".tok." + src)),
        str(target_file.with_suffix(".tok." + tgt)),
    )
    output_files = (
        source_file.with_suffix(".tok.spm." + src),
        source_file.with_suffix(".tok.spm." + tgt),
    )
    return {
        "tokenize_source_side":  f"cat {source_file} | sacremoses -l {src} -j 4 normalize -c tokenize -a > {intermediate_files[0]}",
        "tokenize_target_side": f"cat {target_file} | sacremoses -l {tgt} -j 4 normalize -c tokenize -a > {intermediate_files[1]}",
        "encode": [
            "python",
            str(sentencepiece_script),
            "--model",
            str(sentencepiece_script),
            "--output_format=piece",
            "--inputs",
            *intermediate_files,
            "--outputs",
            str(output_files[0]),
            str(output_files[1]),
        ],
        "output_files": output_files,
    }


def binarize_data(
    src: str,
    tgt: str,
    src_dict,
    tgt_dict,
    train_prefix_file: Path,
    valid_prefix_file: Path,
    test_prefix_file: Path,
    output_dir,
    only_source=False,
):
    """Binarizes the data. Note: if monolingual, use --only-source. Repeat in opposite direction if required, binary files are directional. Recommendation: Use different output directories for each direction."""
    # fairseq-preprocess \
    # --source-lang $SRC --target-lang $TGT \
    # --srcdict ./Data/it-mono/dict.$SRC.txt \
    # --tgtdict ./Data/it-mono/dict.$TGT.txt \
    # --trainpref $train_file.tok.spm \
    #     --validpref $dev_file.tok.spm \
    #     --testpref $test_file.tok.spm \
    # --destdir "$(dirname $train_file)/bin" \
    #     --thresholdtgt 0 --thresholdsrc 0 --workers 20 $only_source

    # NOTE: if monolingual, --only-source
    # repeat in opposite direction if required, binary files are directional
    return [
        "fairseq-preprocess",
        "--source-lang",
        src,
        "--target-lang",
        tgt,
        "--srcdict",
        str(src_dict),
        "--tgtdict",
        str(tgt_dict),
        "--trainpref",
        str(train_prefix_file.with_suffix(f".tok.spm")),
        "--validpref",
        str(valid_prefix_file.with_suffix(f".tok.spm")),
        "--testpref",
        str(test_prefix_file.with_suffix(f".tok.spm")),
        "--destdir",
        str(output_dir),
        "--thresholdtgt",
        "0",
        "--thresholdsrc",
        "0",
        "--workers",
        "20",
        *(["--only_source"] if only_source else [])
    ]

## Experiment 1
- Test base MODEL performance on the test set
- Test base MODEL performance on the it-parallel dataset
- Finetune the base MODEL on the it-parallel dataset and evaluate on both test sets

### Results
| Model |  News Corpus |  it-parallel |
|-------|--------------|--------------|
| Base  |  21.2      |  14.1      |
| Finetune |  0.0000  |  0.0000      |


In [47]:
base_model = model_dir / f"big-{SRC}-{TGT}" / "checkpoint_best.pt"

experiment_name = f"big-{SRC}-{TGT}-test-{it_mono}"

# Base evaluation on the news corpus first
evaluate_news = process_outputs(test_folder / experiment_name)
hyp_args, ref_args, src_args = evaluate_news.values()

# Extract the hypothesis, reference, and source
# res = subprocess.check_output(hyp_args, shell=True)
# print(res)
# res = subprocess.check_output(ref_args, shell=True)
# print(res)
# res = subprocess.check_output(src_args, shell=True)
# print(res)

args, file = get_bleu_score(test_folder / experiment_name)
run_command(args)
!cat $file

{
 "name": "BLEU",
 "score": 14.1,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.1",
 "verbose_score": "43.1/18.4/9.6/5.2 (BP = 1.000 ratio = 1.032 hyp_len = 23650 ref_len = 22924)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.1"
}


In [63]:
# Generate test results from news_dataset

model_name = f"big_de_en_{news_dataset}"

args = generate_args(data_dir / news_dataset / f"bin-{SRC}-{TGT}", "test", model_checkpoint=base_model_en_de, src=SRC, tgt=TGT, save_dir=test_folder / model_name)
output = args.rsplit(">", 1)[-1].strip()
print(args)
run_command(args)

evaluate_news = process_outputs(output)
hyp_args, ref_args, src_args = evaluate_news.values()
print(hyp_args, ref_args, src_args, sep='\n')

run_command(hyp_args)
run_command(ref_args)
run_command(src_args)

output = Path(output)
args, output = get_bleu_score(output)
print(args)
run_command(args)
print_file(output)



INFO:fairseq.tasks.text_to_speech:Please install tensorboardX: pip install tensorboardX

DEBUG:hydra.core.utils:Setting JobRuntime:name=UNKNOWN_NAME

DEBUG:hydra.core.utils:Setting JobRuntime:name=utils

INFO:fairseq_cli.generate:{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quan

KeyboardInterrupt: 

In [73]:
args = binarize_data("de", "en", dict_de, dict_en, train_prefix_file=data_dir / it_parallel / "train", valid_prefix_file=data_dir / it_parallel / "dev", test_prefix_file=data_dir / it_parallel / "test",  output_dir=data_dir / it_parallel / "bin-de-en")
print(shlex.join(args))
run_command(args)

fairseq-preprocess --source-lang de --target-lang en --srcdict /Users/Matey/project/nlp2/Models/dict.de.txt --tgtdict /Users/Matey/project/nlp2/Models/dict.en.txt --trainpref /Users/Matey/project/nlp2/Data/it-parallel/train.tok.spm --validpref /Users/Matey/project/nlp2/Data/it-parallel/dev.tok.spm --testpref /Users/Matey/project/nlp2/Data/it-parallel/test.tok.spm --destdir /Users/Matey/project/nlp2/Data/it-parallel/bin-de-en --thresholdtgt 0 --thresholdsrc 0 --workers 20
INFO:fairseq.tasks.text_to_speech:Please install tensorboardX: pip install tensorboardX

INFO:fairseq_cli.preprocess:Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_con

<Popen: returncode: 1 args: ['fairseq-preprocess', '--source-lang', 'de', '-...>

In [74]:
!fairseq-preprocess --source-lang de --target-lang en --srcdict /Users/Matey/project/nlp2/Models/dict.de.txt --tgtdict /Users/Matey/project/nlp2/Models/dict.en.txt --trainpref /Users/Matey/project/nlp2/Data/it-parallel/train.tok.spm --validpref /Users/Matey/project/nlp2/Data/it-parallel/dev.tok.spm --testpref /Users/Matey/project/nlp2/Data/it-parallel/test.tok.spm --destdir /Users/Matey/project/nlp2/Data/it-parallel/bin-de-en --thresholdtgt 0 --thresholdsrc 0 --workers 20


INFO:fairseq.tasks.text_to_speech:Please install tensorboardX: pip install tensorboardX
INFO:fairseq_cli.preprocess:Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', run_sanity_validation_steps=False, criterion='cross_entropy', tokenizer=None, bpe=None, optimizer=None,

In [80]:
experiment_name = f"big-{SRC}-{TGT}-ft-{it_parallel}"
train_args = get_train_model_args(data_dir / it_parallel / f"bin-{SRC}-{TGT}", experiment_name=experiment_name)
print(' '.join(map(str, train_args)))

fairseq-train /Users/Matey/project/nlp2/Data/it-parallel/bin-de-en --arch transformer_wmt_en_de --task translation --share-decoder-input-output-embed --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.1 --lr 0.0006 --lr-scheduler inverse_sqrt --warmup-updates 2500 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --dropout 0.3 --weight-decay 0.0001 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --max-tokens 8192 --max-update 10 --update-freq 8 --patience 10 --scoring sacrebleu --eval-bleu --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' --eval-bleu-detok moses --eval-bleu-remove-bpe --eval-bleu-print-samples --best-checkpoint-metric bleu --maximize-best-checkpoint-metric --save-interval-updates 2000 --validate-interval-updates 2000 --keep-best-checkpoints 1 --encoder-learned-pos --save-dir Models/big-de-en-ft-it-parallel --bpe sentencepiece


In [4]:
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "facebook/wmt19-de-en"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

input = "Maschinelles Lernen ist großartig, oder?"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # Machine Learning is great, isn't it?

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Machine learning is great, isn't it?


In [39]:
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, tokenizer, max_length=1024):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]
        return self.tokenizer(src_text, text_target=tgt_text, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

BATCH_SIZE = 16

it_parallel_src = data_dir / it_parallel / f"train.{SRC}"
it_parallel_tgt = data_dir / it_parallel / f"train.{TGT}"
# Create a dataset from the training parallel data
with open(it_parallel_src) as f:
    src = f.read().splitlines()
with open(it_parallel_tgt) as f:
    tgt = f.read().splitlines()
train_dataset = TranslationDataset(src, tgt, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Create a dataset from the validation parallel data
valid_src = data_dir / it_parallel / f"dev.{SRC}"
valid_tgt = data_dir / it_parallel / f"dev.{TGT}"
with open(valid_src) as f:
    src = f.read().splitlines()
with open(valid_tgt) as f:
    tgt = f.read().splitlines()
valid_dataset = TranslationDataset(src, tgt, tokenizer)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Create a dataset from the test parallel data
test_src = data_dir / it_parallel / f"test.{SRC}"
test_tgt = data_dir / it_parallel / f"test.{TGT}"
with open(test_src) as f:
    src = f.read().splitlines()
with open(test_tgt) as f:
    tgt = f.read().splitlines()
    
test_dataset = TranslationDataset(src, tgt, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [50]:
for batch in test_loader:
    tokens = batch.input_ids
    for i in range(tokens.size(0)):
        output = model.generate(tokens[i])
        print(output)
        src_text = tokenizer.decode(output[0], skip_special_tokens=True)
        true_label = tokenizer.decode(batch.labels[i][0], skip_special_tokens=True)
        print(src_text, true_label)
    break

tensor([[    2,   957,   328,  9841,    23,     6,  3520,  1123,  1250,     9,
             6, 19360,  1708,     2]])
Method for the calculation of the prediction Method to calculate forecast
tensor([[    2, 10708,  1463,     6,  2657,    47,   153,  4362,     5,    31,
          7197,    22,    36,  6149,    15,   153, 16316,     9,     6,  5617,
          2657,    14,     5,   512,   109,   157,   427,     6,    18,  2152,
            41,    19, 16228,  4670,     5,     2]])
Reduces the speed by one unit. The minimum is 2 units (one fifth of the normal speed). You can also use the - key as a shortcut. Decreases the game speed by one unit, down to a minimum of 2 units (a fifth of normal speed). You can use the - key as a shortcut.
tensor([[   2, 1337,  400,  359, 9543,  167,  780, 2266,    2]])
Piotr Szymanski Piotr Szymanski
tensor([[    2,    50,  1762,  4085, 12075,    47,     2]])
Sort & sort by Sort & By
tensor([[2, 2]])
 umbrello; Authors
tensor([[   2, 1838,  297, 4206,  483,  