### Installing Required Packages

In [None]:
# Install fairseq (from the One-sixth GitHub repository), sacrebleu and tensorboardX.
# %pip is used instead of !pip so the packages land in the environment of the
# running kernel rather than whatever `pip` happens to be first on PATH.
# TODO: pin a fairseq commit and package versions so results stay reproducible.
%pip install -q git+https://github.com/One-sixth/fairseq.git sacrebleu tensorboardX

### Pipeline A (Baseline, 10k BPE Operations)

#### Preprocessing Data

In [None]:
# Work from the project root; the evaluation cell uses paths relative to it.
%cd /content/drive/MyDrive/NLP_TRANSLATION
# -p keeps this cell re-runnable: plain mkdir fails once the directory exists.
# NOTE(review): the pipeline cells write to pipeline_A_10k/bin and
# pipeline_A_10k/checkpoints, not to these two directories - confirm they are needed.
!mkdir -p /content/drive/MyDrive/NLP_TRANSLATION/bin
!mkdir -p /content/drive/MyDrive/NLP_TRANSLATION/checkpoints

In [None]:
# Run fairseq-preprocess on the BPE-encoded fi->en train/dev/test files of
# Pipeline A (10k); its output goes to pipeline_A_10k/bin, which the training
# and generation cells read.
!fairseq-preprocess \
    --workers 20 \
    --source-lang fi \
    --target-lang en \
    --destdir /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/bin \
    --trainpref /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/data/train.bpe \
    --validpref /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/data/dev.bpe \
    --testpref /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/data/test.bpe

#### Training Fairseq

In [None]:
# Train the fi->en model for Pipeline A (10k) on the data in pipeline_A_10k/bin.
# Flags are grouped as: languages, model, loss, optimizer, learning-rate schedule,
# batching/precision, stopping criteria, BLEU validation, checkpointing.
# BLEU (maximized) is the metric used to pick the best checkpoint.
!fairseq-train /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/bin \
    --source-lang fi \
    --target-lang en \
    --arch transformer_iwslt_de_en \
    --share-decoder-input-output-embed \
    --dropout 0.3 \
    --criterion label_smoothed_cross_entropy \
    --label-smoothing 0.1 \
    --optimizer adam \
    --adam-betas '(0.9, 0.98)' \
    --weight-decay 0.0001 \
    --clip-norm 0.0 \
    --lr 5e-4 \
    --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 \
    --max-tokens 8192 \
    --update-freq 1 \
    --fp16 \
    --max-epoch 30 \
    --patience 10 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok space \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu \
    --maximize-best-checkpoint-metric \
    --save-dir /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/checkpoints \
    --no-epoch-checkpoints

#### Testing Fairseq

In [None]:
# Generate translations with the best Pipeline A (10k) checkpoint (beam 5, BPE
# removed). Stdout is redirected to results_A_10k.txt, which the evaluation
# cell parses.
!fairseq-generate /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/bin \
    --path /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/checkpoints/checkpoint_best.pt \
    --beam 5 \
    --remove-bpe \
    --batch-size 128 \
    > /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_10k/results_A_10k.txt

#### Evaluation (SacreBLEU)

> _NOTE:_ Make sure the current working directory is the project root folder (`NLP_TRANSLATION`); the script below uses paths relative to it.

In [None]:
%%bash
# Post-process the fairseq-generate output of Pipeline A (10k) and score the
# hypotheses with SacreBLEU. All paths are relative to the project root
# (NLP_TRANSLATION).

# Stop at the first failing command, unset variable or broken pipeline stage.
# Otherwise the script's exit status is that of the final echo, so a missing
# results file or Moses script would go unnoticed behind an empty score.
set -euo pipefail

moses_scripts="tools/mosesdecoder/scripts"
pipeline_dir="pipeline_A_10k"

input_file="$pipeline_dir/results_A_10k.txt"

# reference file the hypotheses are scored against
original_ref_file="data/test/test.en"

# temporary files
src_tok="$pipeline_dir/source.tok"
hypo_tok="$pipeline_dir/hypothesis.tok"
ref_tok="$pipeline_dir/reference.tok"

src_dtc="$pipeline_dir/source.dtc"
hypo_dtc="$pipeline_dir/hypothesis.dtc"
ref_dtc="$pipeline_dir/reference.dtc"

# output files
src_detok="$pipeline_dir/source.detok"
hypo_detok="$pipeline_dir/hypothesis.detok"
ref_detok="$pipeline_dir/reference.detok"

# extract the (still tokenized) texts, ordered by sentence id
# S-/T- lines: id <TAB> text; H- lines have one extra column before the text,
# hence cut -f3- for the hypotheses.
grep '^S-' "$input_file" | sed 's/^S-//' | sort -n | cut -f2- > "$src_tok"  # source
grep '^H-' "$input_file" | sed 's/^H-//' | sort -n | cut -f3- > "$hypo_tok" # hypothesis (translated text)
grep '^T-' "$input_file" | sed 's/^T-//' | sort -n | cut -f2- > "$ref_tok"  # reference (gold standard text)

# detruecasing
perl "$moses_scripts/recaser/detruecase.perl" < "$src_tok" > "$src_dtc"
perl "$moses_scripts/recaser/detruecase.perl" < "$hypo_tok" > "$hypo_dtc"
perl "$moses_scripts/recaser/detruecase.perl" < "$ref_tok" > "$ref_dtc"

# detokenizing
perl "$moses_scripts/tokenizer/detokenizer.perl" -l fi < "$src_dtc" > "$src_detok"
perl "$moses_scripts/tokenizer/detokenizer.perl" -l en < "$hypo_dtc" > "$hypo_detok"
perl "$moses_scripts/tokenizer/detokenizer.perl" -l en < "$ref_dtc" > "$ref_detok"

# BLEU of the detokenized hypotheses against the original reference file
# (-b: print only the score, -w 4: four decimal places)
score=$(sacrebleu "$original_ref_file" -i "$hypo_detok" -m bleu -b -w 4)

# cleanup (only the temporary .tok/.dtc files; the .detok outputs are kept)
rm "$src_tok" "$hypo_tok" "$ref_tok" "$src_dtc" "$hypo_dtc" "$ref_dtc"
echo "Pipeline A (10k) BLEU SCORE: $score"

### Pipeline A (Baseline, 20k BPE Operations)

#### Preprocessing Data

In [None]:
# Work from the project root; the evaluation cell uses paths relative to it.
%cd /content/drive/MyDrive/NLP_TRANSLATION
# -p keeps this cell re-runnable: plain mkdir fails when the directory already
# exists, which is the case here once the 10k section has been run.
# NOTE(review): the pipeline cells write to pipeline_A_20k/bin and
# pipeline_A_20k/checkpoints, not to these two directories - confirm they are needed.
!mkdir -p /content/drive/MyDrive/NLP_TRANSLATION/bin
!mkdir -p /content/drive/MyDrive/NLP_TRANSLATION/checkpoints

In [None]:
# Run fairseq-preprocess on the BPE-encoded fi->en train/dev/test files of
# Pipeline A (20k); its output goes to pipeline_A_20k/bin, which the training
# and generation cells read.
!fairseq-preprocess \
    --workers 20 \
    --source-lang fi \
    --target-lang en \
    --destdir /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/bin \
    --trainpref /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/data/train.bpe \
    --validpref /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/data/dev.bpe \
    --testpref /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/data/test.bpe

#### Training Fairseq

In [None]:
# Train the fi->en model for Pipeline A (20k) on the data in pipeline_A_20k/bin.
# Flags are grouped as: languages, model, loss, optimizer, learning-rate schedule,
# batching/precision, stopping criteria, BLEU validation, checkpointing.
# BLEU (maximized) is the metric used to pick the best checkpoint.
!fairseq-train /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/bin \
    --source-lang fi \
    --target-lang en \
    --arch transformer_iwslt_de_en \
    --share-decoder-input-output-embed \
    --dropout 0.3 \
    --criterion label_smoothed_cross_entropy \
    --label-smoothing 0.1 \
    --optimizer adam \
    --adam-betas '(0.9, 0.98)' \
    --weight-decay 0.0001 \
    --clip-norm 0.0 \
    --lr 5e-4 \
    --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 \
    --max-tokens 8192 \
    --update-freq 1 \
    --fp16 \
    --max-epoch 30 \
    --patience 10 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok space \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu \
    --maximize-best-checkpoint-metric \
    --save-dir /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/checkpoints \
    --no-epoch-checkpoints

#### Testing Fairseq

In [None]:
# Generate translations with the best Pipeline A (20k) checkpoint (beam 5, BPE
# removed). Stdout is redirected to results_A_20k.txt, which the evaluation
# cell parses.
!fairseq-generate /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/bin \
    --path /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/checkpoints/checkpoint_best.pt \
    --beam 5 \
    --remove-bpe \
    --batch-size 128 \
    > /content/drive/MyDrive/NLP_TRANSLATION/pipeline_A_20k/results_A_20k.txt

#### Evaluation (SacreBLEU)

> _NOTE:_ Make sure the current working directory is the project root folder (`NLP_TRANSLATION`); the script below uses paths relative to it.

In [None]:
%%bash
# Post-process the fairseq-generate output of Pipeline A (20k) and score the
# hypotheses with SacreBLEU. All paths are relative to the project root
# (NLP_TRANSLATION).

# Stop at the first failing command, unset variable or broken pipeline stage.
# Otherwise the script's exit status is that of the final echo, so a missing
# results file or Moses script would go unnoticed behind an empty score.
set -euo pipefail

moses_scripts="tools/mosesdecoder/scripts"
pipeline_dir="pipeline_A_20k"

input_file="$pipeline_dir/results_A_20k.txt"

# reference file the hypotheses are scored against
original_ref_file="data/test/test.en"

# temporary files
src_tok="$pipeline_dir/source.tok"
hypo_tok="$pipeline_dir/hypothesis.tok"
ref_tok="$pipeline_dir/reference.tok"

src_dtc="$pipeline_dir/source.dtc"
hypo_dtc="$pipeline_dir/hypothesis.dtc"
ref_dtc="$pipeline_dir/reference.dtc"

# output files
src_detok="$pipeline_dir/source.detok"
hypo_detok="$pipeline_dir/hypothesis.detok"
ref_detok="$pipeline_dir/reference.detok"

# extract the (still tokenized) texts, ordered by sentence id
# S-/T- lines: id <TAB> text; H- lines have one extra column before the text,
# hence cut -f3- for the hypotheses.
grep '^S-' "$input_file" | sed 's/^S-//' | sort -n | cut -f2- > "$src_tok"  # source
grep '^H-' "$input_file" | sed 's/^H-//' | sort -n | cut -f3- > "$hypo_tok" # hypothesis (translated text)
grep '^T-' "$input_file" | sed 's/^T-//' | sort -n | cut -f2- > "$ref_tok"  # reference (gold standard text)

# detruecasing
perl "$moses_scripts/recaser/detruecase.perl" < "$src_tok" > "$src_dtc"
perl "$moses_scripts/recaser/detruecase.perl" < "$hypo_tok" > "$hypo_dtc"
perl "$moses_scripts/recaser/detruecase.perl" < "$ref_tok" > "$ref_dtc"

# detokenizing
perl "$moses_scripts/tokenizer/detokenizer.perl" -l fi < "$src_dtc" > "$src_detok"
perl "$moses_scripts/tokenizer/detokenizer.perl" -l en < "$hypo_dtc" > "$hypo_detok"
perl "$moses_scripts/tokenizer/detokenizer.perl" -l en < "$ref_dtc" > "$ref_detok"

# BLEU of the detokenized hypotheses against the original reference file
# (-b: print only the score, -w 4: four decimal places)
score=$(sacrebleu "$original_ref_file" -i "$hypo_detok" -m bleu -b -w 4)

# cleanup (only the temporary .tok/.dtc files; the .detok outputs are kept)
rm "$src_tok" "$hypo_tok" "$ref_tok" "$src_dtc" "$hypo_dtc" "$ref_dtc"
echo "Pipeline A (20k) BLEU SCORE: $score"

---