In [1]:
!pip install fairseq bitarray fastBPE hydra-core omegaconf regex requests sacremoses subword_nmt sacrebleu==1.5.1
!pip install transformers==4.28.0

Collecting fairseq
  Downloading fairseq-0.12.2.tar.gz (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting bitarray
  Downloading bitarray-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.4/287.4 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastBPE
  Downloading fastBPE-0.1.0.tar.gz (35 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hydra-core
  Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting omega

In [2]:
! unzip preprocessed_data.zip

Archive:  preprocessed_data.zip
   creating: preprocessed_data/
  inflating: __MACOSX/._preprocessed_data  
  inflating: preprocessed_data/train.fa  
  inflating: __MACOSX/preprocessed_data/._train.fa  
  inflating: preprocessed_data/train.en  
  inflating: __MACOSX/preprocessed_data/._train.en  
  inflating: preprocessed_data/test.en  
  inflating: __MACOSX/preprocessed_data/._test.en  
  inflating: preprocessed_data/test.fa  
  inflating: __MACOSX/preprocessed_data/._test.fa  
  inflating: preprocessed_data/valid.en  
  inflating: __MACOSX/preprocessed_data/._valid.en  
  inflating: preprocessed_data/valid.fa  
  inflating: __MACOSX/preprocessed_data/._valid.fa  


In [3]:
def run_bash(shell_string):
    with open('script.sh', 'w') as file:
        file.write(shell_string)
    !chmod 755 ./script.sh
    !./script.sh

In [4]:
# Tokenization + BPE
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]"], continuing_subword_prefix="@")
for lang in ["en", "fa"]:
    tokenizer.train(files=[f"preprocessed_data/train.{lang}", f"preprocessed_data/valid.{lang}", f"preprocessed_data/test.{lang}"], trainer=trainer)
    text = "test text for BPE" if lang == "en" else "متن تست بی پی ای"
    output = tokenizer.encode(text)
    print(tokenizer.decode(output.ids))

test text for
متن تست بی پی ای


In [5]:
fairseq_preprocess = """
rm -r data-bin/
TEXT=/content/preprocessed_data
fairseq-preprocess --source-lang en --target-lang fa \
    --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
    --destdir data-bin/data.tokenized.en-fa \
    --workers 20 \
    --bpe bert \
    --log-format json \
"""
run_bash(fairseq_preprocess)

rm: cannot remove 'data-bin/': No such file or directory
2023-11-16 12:12:42.577296: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-16 12:12:42.577361: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-16 12:12:42.577406: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-16 12:12:42.585941: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler 

In [None]:
fairseq_train = """
fairseq-train \
    data-bin/data.tokenized.en-fa \
    --arch transformer --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --max-epoch 50 \
    --patience 10 \
    --save-dir checkpoints \
    # --bpe bert \
    # --fp16 \
    # --reset-optimizer \
    --batch-size 64
"""
run_bash(fairseq_train)

In [None]:
from tqdm.auto import tqdm
MAX_EPOCHS = 2
for i in tqdm(range(1, MAX_EPOCHS)):
    ! fairseq-generate data-bin/data.tokenized.en-fa --path checkpoints/checkpoint{i}.pt --batch-size 128 --beam 5 --remove-bpe --log-format json --tensorboard-logdir 123

In [None]:
! fairseq-generate data-bin/data.tokenized.en-fa --path checkpoints/checkpoint_best.pt --batch-size 128 --beam 5 --remove-bpe --eval-bleu --results-path generate_results

In [None]:
! fairseq-generate data-bin/data.tokenized.en-fa --path checkpoints/checkpoint_best.pt --batch-size 128 --beam 5 --remove-bpe --eval-bleu