# NLP Backtranslation
Let's define some variables:

In [35]:
import shlex
from typing import Dict, List, Union, Tuple
from pathlib import Path
import subprocess

# Translation direction used throughout the notebook: source and target language codes.
SRC, TGT = "de", "en"


# All project folders are anchored at the directory the notebook runs in.
cwd = Path.cwd()
data_dir = cwd.joinpath("Data")
model_dir = cwd.joinpath("Models")

# Names of the datasets referred to by the experiments.
it_parallel = "it-parallel"
news_dataset = "train-euro-news-big"
it_mono = "it-mono"

# Folder holding the generation outputs that the evaluation cells score.
test_folder = cwd.joinpath("tests")

# Location of the spm_encode.py helper script.
sentencepiece_script = cwd.joinpath("spm_encode.py")

In [None]:
from huggingface_hub import hf_hub_download

# Download the two checkpoints, both dictionaries and the SentencePiece model
# into Models/hugging_face. Each checkpoint gets its own variable: binding both
# downloads to `base_model_en_de` would leave only the de-en path reachable.
base_model_en_de = hf_hub_download("rinto/transformer_wmt_en_de", "checkpoint_best-en-de.pt", local_dir="Models/hugging_face")
base_model_de_en = hf_hub_download("rinto/transformer_wmt_en_de", "checkpoint_best-de-en.pt", local_dir="Models/hugging_face")
dict_de = hf_hub_download("rinto/transformer_wmt_en_de", "dict.de.txt", local_dir="Models/hugging_face")
dict_en = hf_hub_download("rinto/transformer_wmt_en_de", "dict.en.txt", local_dir="Models/hugging_face")
sentencepiece_model = hf_hub_download("rinto/transformer_wmt_en_de", "spm.model", local_dir="Models/hugging_face")

from tokenizers import SentencePieceUnigramTokenizer
tokenizer = SentencePieceUnigramTokenizer.from_spm(
    sentencepiece_model
)

# Round-trip a sample sentence through the tokenizer as a quick sanity check;
# the decoded text is the cell's displayed value.
tokens = tokenizer.encode("Hello, how are you?")
tokenizer.decode(tokens.ids)

In [46]:
def run_command(args: Union[str, List[str]]):
    """Run a command through the shell and echo its output line by line.

    Parameters
    ----------
    args : str or list
        Either a complete shell command line, or a list of fragments. Fragments
        are converted with ``str`` and joined with single spaces to form the
        command line, so shell syntax inside them (quotes, pipes, redirects)
        is interpreted by the shell.

    Returns
    -------
    subprocess.Popen
        The finished process (stderr is merged into stdout); its
        ``returncode`` attribute holds the exit status.
    """
    # With shell=True a list would make POSIX execute only its first element
    # (the remaining items become arguments of the shell itself), so the shell
    # is always handed one single string.
    if isinstance(args, str):
        command = args
    else:
        command = " ".join(str(part) for part in args)

    with subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, shell=True
    ) as proc:
        for line in proc.stdout:
            # Each line still carries its own newline; do not add a second one.
            print(line, end="")
    return proc


def get_train_model_args(
    path_to_data,
    arch="transformer_wmt_en_de",
    max_update=10,
    model_dir="Models",
    experiment_name="test-de-en",
    lr=6e-4,
):
    """Build the fragments of a ``fairseq-train`` command line.

    Parameters
    ----------
    path_to_data : str or Path
        Data directory passed as the positional argument of ``fairseq-train``.
    arch : str
        Value for ``--arch``.
    max_update : int
        Value for ``--max-update``.
    model_dir : str or Path
        Parent directory of the checkpoint folder.
    experiment_name : str
        Name of the checkpoint folder; ``--save-dir`` becomes
        ``<model_dir>/<experiment_name>``.
    lr : float
        Value for ``--lr``.

    Returns
    -------
    list of str
        Command fragments. Several entries hold a flag together with its value
        and some carry shell quoting, so the list is meant to be joined with
        spaces into one shell command line rather than passed as an argv list.
    """
    return [
        "fairseq-train",
        # Every caller-supplied value is stringified so the result can be
        # joined or handed to subprocess without a TypeError.
        str(path_to_data),
        "--arch",
        arch,
        "--task translation",
        "--share-decoder-input-output-embed",
        "--optimizer adam",
        "--adam-betas '(0.9, 0.98)'",
        "--clip-norm 0.1",
        "--lr",
        str(lr),
        "--lr-scheduler inverse_sqrt",
        "--warmup-updates 2500",
        "--warmup-init-lr 1e-07",
        "--stop-min-lr 1e-09",
        "--dropout 0.3",
        "--weight-decay 0.0001",
        "--criterion label_smoothed_cross_entropy",
        "--label-smoothing 0.1",
        "--max-tokens 8192",
        "--max-update",
        str(max_update),
        "--update-freq 8",
        "--patience 10",
        "--scoring sacrebleu",
        "--eval-bleu",
        '--eval-bleu-args \'{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}\'',
        "--eval-bleu-detok moses",
        "--eval-bleu-remove-bpe",
        "--eval-bleu-print-samples",
        "--best-checkpoint-metric bleu",
        "--maximize-best-checkpoint-metric",
        "--save-interval-updates 2000",
        "--validate-interval-updates 2000",
        "--keep-best-checkpoints 1",
        "--encoder-learned-pos",
        "--save-dir",
        # Formatting instead of `+` so that a Path model_dir works as well.
        f"{model_dir}/{experiment_name}",
        "--bpe sentencepiece",
    ]


def get_bleu_score(experiment: Path):
    """Build the shell command that scores an experiment with sacreBLEU.

    Parameters
    ----------
    experiment : Path
        Common prefix of the experiment files; ``.hyp`` and ``.ref`` are
        appended to it to locate the hypothesis and reference files.

    Returns
    -------
    Tuple[str, Path]
        The shell command and the path of the ``.sacrebleu`` file it writes.
    """
    # cat $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.hyp | sacrebleu $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.ref -m bleu > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.sacrebleu
    prefix = str(experiment)
    # Extensions are appended, never substituted: with_suffix() would cut off
    # everything after the last dot of names such as "model.test-x" and return
    # a path different from the one the command writes to.
    hyp, ref, result = (prefix + ext for ext in (".hyp", ".ref", ".sacrebleu"))
    # Quote the paths so that spaces in the working directory do not split them.
    command = "cat {} | sacrebleu {} -m bleu > {}".format(
        shlex.quote(hyp), shlex.quote(ref), shlex.quote(result)
    )
    return command, Path(result)


# [

#         "cat",
#         str(experiment.with_suffix(".hyp")),
#         "| sacrebleu",
#         str(experiment.with_suffix(".ref")),
#         "-m",
#         "bleu >",
#        str( experiment.with_suffix(".sacrebleu")),
#     ]


def generate_args(
    path_to_data: str | Path,
    subset: str,
    src: str,
    tgt: str,
    model_checkpoint: str | Path,
    save_dir: str,
) -> list[str]:
    """
    Generates the arguments for the fairseq-generate command

    Parameters
    ----------
    path_to_data : str
        Path to the data directory (the data must be binarized)
    subset : str
        The subset to generate the outputs for (e.g., test)
    src : str
        The source language
    tgt : str
        The target language
    model_checkpoint : str
        The path to the model checkpoint
    """
    # fairseq-generate Data/$TEST/bin \
    #  --gen-subset test --source-lang $SRC --target-lang $TGT \
    #  --path./Models/$MODEL/checkpoint_best.pt \
    #  --skip-invalid-size-inputs-valid-test \
    #  --batch-size 128 --beam 5 --remove-bpe sentencepiece > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT
    return f"fairseq-generate {path_to_data} --gen-subset {subset} --source-lang {src} --target-lang {tgt} --path {model_checkpoint} --skip-invalid-size-inputs-valid-test --batch-size 128 --beam 5 --remove-bpe sentencepiece > {save_dir}"


def process_outputs(experiment_name: Path) -> Dict[str, str]:
    """Builds the shell commands that split a generation output file into detokenized parts

    Parameters
    ----------
    experiment_name : Path
        Path of the generation output file; each result is written next to it
        with `.hyp`, `.ref` or `.src` appended

    Returns
    -------
    Dict[str, str]
        A dictionary containing the keys `hyp`, `ref`, and `src` (in that order) with the corresponding commands
    """
    # cat $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT | grep ^H | sort -V | cut -f3- | sacremoses detokenize > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.hyp
    # cat $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT | grep ^T | sort -V | cut -f2- | sacremoses detokenize > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.ref
    # cat $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT | grep ^S | sort -V | cut -f2- | sacremoses detokenize > $OUTPUT_DIR/$MODEL.test-$TEST-$SRC-$TGT.src
    source = shlex.quote(str(experiment_name))
    # extension -> (line prefix selected by grep, first tab-separated field kept by cut)
    parts = {"hyp": ("H", 3), "ref": ("T", 2), "src": ("S", 2)}

    commands = {}
    for ext, (line_prefix, first_field) in parts.items():
        destination = shlex.quote(f"{experiment_name}.{ext}")
        # Plain `grep` without `-p`: GNU grep rejects that flag, and the pattern
        # needs no special mode. The pattern is quoted so the shell never
        # treats `^` specially.
        commands[ext] = (
            f"cat {source} | grep '^{line_prefix}' | sort -V"
            f" | cut -f{first_field}- | sacremoses detokenize > {destination}"
        )
    return commands


def tokenize_input(
    source_file: Path,
    target_file: Path,
    src: str,
    tgt: str,
    sentencepiece_script: Path,
    sentencepiece_model: Union[str, Path, None] = None,
) -> Dict[str, Union[str, List[str], Tuple[Path, Path]]]:
    """
    Builds the commands that tokenize the input files using sacremoses and sentencepiece

    Parameters
    ----------
    source_file : Path
        The path to the source file
    target_file : Path
        The path to the target file
    src : str
        The source language
    tgt : str
        The target language
    sentencepiece_script : Path
        The path to the sentencepiece script
    sentencepiece_model : str or Path, optional
        The path to the sentencepiece model passed as `--model`. When omitted,
        the script path is used for backward compatibility and a warning is
        issued, because that value is not a model file.

    Returns
    -------
    Dict[str, Union[str, List[str], Tuple[Path, Path]]]
        A dictionary containing the keys `tokenize_source_side`, `tokenize_target_side`
        (shell command strings), `encode` (argument list) and `output_files`
        (the source-side and target-side encoded files)

    """
    # # tokenize train-mono, dev, test
    # cat $src_train | sacremoses -l $SRC -j 4 normalize -c tokenize -a > $train_file.tok.$SRC
    # cat $tgt_train | sacremoses -l $TGT -j 4 normalize -c tokenize -a > $train_file.tok.$TGT
    # # separated for clarity
    # python ./spm_encode.py --model="$spm" \
    #     --output_format=piece \
    #     --inputs $train_file.tok.$SRC $train_file.tok.$TGT  \
    #     --outputs  $train_file.tok.spm.$SRC $train_file.tok.spm.$TGT
    if sentencepiece_model is None:
        import warnings

        warnings.warn(
            "tokenize_input called without sentencepiece_model; "
            "falling back to the script path for --model",
            stacklevel=2,
        )
        sentencepiece_model = sentencepiece_script

    intermediate_files = (
        str(source_file.with_suffix(".tok." + src)),
        str(target_file.with_suffix(".tok." + tgt)),
    )
    # The target-side output must be derived from target_file; deriving both
    # from source_file only works when the two inputs share a stem.
    output_files = (
        source_file.with_suffix(".tok.spm." + src),
        target_file.with_suffix(".tok.spm." + tgt),
    )
    return {
        "tokenize_source_side": f"cat {source_file} | sacremoses -l {src} -j 4 normalize -c tokenize -a > {intermediate_files[0]}",
        "tokenize_target_side": f"cat {target_file} | sacremoses -l {tgt} -j 4 normalize -c tokenize -a > {intermediate_files[1]}",
        "encode": [
            "python",
            str(sentencepiece_script),
            "--model",
            str(sentencepiece_model),
            "--output_format=piece",
            "--inputs",
            *intermediate_files,
            "--outputs",
            str(output_files[0]),
            str(output_files[1]),
        ],
        "output_files": output_files,
    }


def binarize_data(
    src: str,
    tgt: str,
    src_dict,
    tgt_dict,
    train_prefix_file: Path,
    valid_prefix_file: Path,
    test_prefix_file: Path,
    output_dir,
    only_source=False,
):
    """Build the argument list of a ``fairseq-preprocess`` call that binarizes the data.

    Note: if monolingual, use --only-source. Repeat in opposite direction if
    required, binary files are directional. Recommendation: Use different
    output directories for each direction.

    Parameters
    ----------
    src, tgt : str
        Source and target language codes.
    src_dict, tgt_dict : str or Path
        Dictionary files passed as ``--srcdict`` / ``--tgtdict``.
    train_prefix_file, valid_prefix_file, test_prefix_file : Path
        Files of each split; the final suffix is replaced by ``.tok.spm`` to
        form the ``--trainpref`` / ``--validpref`` / ``--testpref`` prefixes.
    output_dir : str or Path
        Destination directory (``--destdir``).
    only_source : bool
        Append ``--only-source`` when true.

    Returns
    -------
    list of str
        The command and its arguments, one token per element.
    """
    # fairseq-preprocess \
    # --source-lang $SRC --target-lang $TGT \
    # --srcdict ./Data/it-mono/dict.$SRC.txt \
    # --tgtdict ./Data/it-mono/dict.$TGT.txt \
    # --trainpref $train_file.tok.spm \
    #     --validpref $dev_file.tok.spm \
    #     --testpref $test_file.tok.spm \
    # --destdir "$(dirname $train_file)/bin" \
    #     --thresholdtgt 0 --thresholdsrc 0 --workers 20 $only_source

    # NOTE: if monolingual, --only-source
    # repeat in opposite direction if required, binary files are directional
    command = [
        # The fairseq console scripts are hyphenated (compare "fairseq-train").
        "fairseq-preprocess",
        "--source-lang",
        src,
        "--target-lang",
        tgt,
        "--srcdict",
        str(src_dict),
        "--tgtdict",
        str(tgt_dict),
        "--trainpref",
        str(train_prefix_file.with_suffix(".tok.spm")),
        "--validpref",
        str(valid_prefix_file.with_suffix(".tok.spm")),
        "--testpref",
        str(test_prefix_file.with_suffix(".tok.spm")),
        "--destdir",
        str(output_dir),
        "--thresholdtgt",
        "0",
        "--thresholdsrc",
        "0",
        "--workers",
        "20",
    ]
    # Add the flag only when requested: an empty-string placeholder would be
    # passed to the program as a real (empty) positional argument.
    if only_source:
        command.append("--only-source")
    return command

## Experiment 1
- Evaluate the base model on the news-corpus test set
- Evaluate the base model on the it-parallel test set
- Fine-tune the base model on the it-parallel dataset and evaluate it on both test sets

### Results
| Model |  News Corpus |  it-parallel |
|-------|--------------|--------------|
| Base  |  0.0000      |  0.0000      |
| Finetune |  0.0000  |  0.0000      |


In [47]:
# Checkpoint path of the base model (defined here but not used further in this cell).
base_model = model_dir / f"big-{SRC}-{TGT}" / "checkpoint_best.pt"

# Prefix of the experiment files that the commands below read and write inside test_folder.
experiment_name = f"big-{SRC}-{TGT}-test-{it_mono}"

# Base evaluation on the news corpus first
# NOTE(review): experiment_name is built from it_mono, not news_dataset -- confirm which test set this cell scores.
evaluate_news = process_outputs(test_folder / experiment_name)
# Positional unpacking relies on the dict keeping its hyp, ref, src insertion order.
hyp_args, ref_args, src_args = evaluate_news.values()

# Extract the hypothesis, reference, and source
# (currently disabled -- presumably the .hyp/.ref/.src files already exist; TODO confirm)
# res = subprocess.check_output(hyp_args, shell=True)
# print(res)
# res = subprocess.check_output(ref_args, shell=True)
# print(res)
# res = subprocess.check_output(src_args, shell=True)
# print(res)

# Build the sacreBLEU command, run it, then show the result file (`!cat` is IPython shell syntax).
args, file = get_bleu_score(test_folder / experiment_name)
run_command(args)
!cat $file

{
 "name": "BLEU",
 "score": 14.1,
 "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.1",
 "verbose_score": "43.1/18.4/9.6/5.2 (BP = 1.000 ratio = 1.032 hyp_len = 23650 ref_len = 22924)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "13a",
 "smooth": "exp",
 "version": "2.4.1"
}
