In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `tokenization` module) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import csv
import logging
import os
import shutil
import sys
from pathlib import Path

sys.path.insert(0, '../')

# from swissdox import SwissdoxData
import tokenization
# import utils

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Raise the csv module's maximum field size to sys.maxsize.
# The call returns the previous limit, which is what the cell displays.
csv.field_size_limit(sys.maxsize)

131072

In [4]:
# Configure the root logger so that INFO-level messages and above are shown.
logging.basicConfig(level=logging.INFO)

In [3]:
# Set the variable on os.environ rather than via `!export`: a `!` command
# runs in its own short-lived subshell, so an export there never reaches the
# kernel process or any later `!` command. Values placed in os.environ are
# seen by this kernel and inherited by every subprocess it starts.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# Multi-language configuration, currently disabled: the notebook processes a
# single language instead of looping over this list.
# LANGUAGES = [
#     "de_CH",
#     "fr_CH",
#     "it_CH",
#     "rm_CH",
# ]

# Language code used below in the training-text file name and in the fairseq
# output directory (presumably Yoruba, given the spm.yoruba.* files used
# later -- confirm).
language = "yo_XX"

In [5]:
# Root directory holding the training data. It must already exist as a
# directory (a plain file with this name would only fail later, at mkdir),
# and the message shows the resolved location that was checked.
data_dir = Path("../data")
assert data_dir.is_dir(), f"data directory not found: {data_dir.resolve()}"

In [6]:
# Working directories under data_dir, one per vocabulary; each is created if
# it does not exist yet.
out_dir_xlm_vocab = data_dir.joinpath("xlm_vocab")
out_dir_new_vocab = data_dir.joinpath("new_vocab")
out_dir_xlm_vocab.mkdir(exist_ok=True)
out_dir_new_vocab.mkdir(exist_ok=True)

## Tokenization with XLM vocabulary

In [16]:
# Tokenize the train and validation text files with the XLM vocabulary.
# Each output is written next to its input, with ".txt" replaced by ".xlm.bpe".
for txt_path in map(out_dir_xlm_vocab.joinpath, ("train.txt", "valid.txt")):
    tokenization.tokenize_xlm(txt_path, txt_path.with_suffix(".xlm.bpe"))

## Tokenization with new vocabulary

In [None]:

# Create the new vocabulary from the training text of `language`.
# NOTE(review): this passes name="swissbert", while the following cell expects
# spm.yoruba.model / spm.yoruba.vocab in the working directory -- confirm
# which file names this call actually produces.
# NOTE(review): the input here is "{language}.train.txt", whereas the
# tokenization cells read "train.txt" -- confirm both files are intended.
tokenization.create_spm_vocabulary(
    name="swissbert",
    txt_paths=[out_dir_new_vocab.joinpath(f"{language}.train.txt")],
    vocab_size=50260,
    sampling_alpha=0.3,
    # user_defined_symbols=["</s>", "<medium>", "<year>", "<month>"],
)

In [7]:
# Move the spm model/vocab files from the working directory into ../vocab.
# The cell is safe to re-run: a file that has already been moved is accepted
# at its destination instead of failing the existence check.
model_path = Path("spm.yoruba.model")
vocab_path = Path("spm.yoruba.vocab")
vocab_dir = Path("../vocab")
assert vocab_dir.exists(), f"vocab directory not found: {vocab_dir}"
swissbert_model_path = vocab_dir / model_path.name
swissbert_vocab_path = vocab_dir / vocab_path.name

spm_moves = [
    (model_path, swissbert_model_path),
    (vocab_path, swissbert_vocab_path),
]
# Validate both files before touching the file system, so a missing file
# never leaves the pair half-moved.
for spm_src, spm_dst in spm_moves:
    assert spm_src.exists() or spm_dst.exists(), (
        f"{spm_src} not found and not already present at {spm_dst}"
    )
for spm_src, spm_dst in spm_moves:
    # Only move what is still in the working directory.
    if spm_src.exists():
        shutil.move(spm_src, spm_dst)

In [7]:
# Tokenize the train and validation text files with the tokenizer stored in
# ../tokenizer. Each output is written next to its input, with ".txt"
# replaced by ".new.bpe".
tokenizer = Path("../tokenizer")
for txt_path in map(out_dir_new_vocab.joinpath, ("train.txt", "valid.txt")):
    tokenization.tokenize_hf(tokenizer, txt_path, txt_path.with_suffix(".new.bpe"))

268051
['<s>', '<pad>', '</s>', '<unk>', ',', '.', '▁', 's', '▁de', '-']
268051
['<s>', '<pad>', '</s>', '<unk>', ',', '.', '▁', 's', '▁de', '-']


## Binarization

In [None]:
# Binarize the XLM-vocabulary BPE files with fairseq-preprocess.
# DATA_DIR and LANGUAGE are exported through os.environ so that the `!` shell
# commands below can expand them.
os.environ["DATA_DIR"] = str(out_dir_xlm_vocab.resolve())
# for language in LANGUAGES:
os.environ["LANGUAGE"] = language
!fairseq-preprocess \
  --only-source \
  --trainpref "$DATA_DIR/train.xlm.bpe" \
  --validpref "$DATA_DIR/valid.xlm.bpe" \
  --destdir "$DATA_DIR/bin/$LANGUAGE" \
  --bpe sentencepiece \
  --srcdict ../vocab/xlm.dict.txt \
  --workers 20
# Remove dict.txt from the per-language output directory and place a copy of
# the XLM dictionary one level up, directly in bin/.
!rm "$DATA_DIR/bin/$LANGUAGE/dict.txt"
!cp ../vocab/xlm.dict.txt "$DATA_DIR/bin/dict.txt"

In [15]:
# Convert the tokenizer vocabulary in ../tokenizer to a fairseq dictionary file.
from transformers import XLMRobertaTokenizer

swissbert_dict_path = Path("../tokenizer/yo.dict.txt")
# get_vocab() returns a dict mapping each token string to its integer id.
vocab = XLMRobertaTokenizer.from_pretrained("../tokenizer").get_vocab()

# The file is written explicitly as UTF-8: the tokens contain non-ASCII
# characters (e.g. the "▁" word-boundary marker), so the platform's default
# encoding must not be relied on.
with open(swissbert_dict_path, "w", encoding="utf-8") as f_out:
    # Emit tokens in ascending id order so that line order follows the
    # tokenizer's ids instead of depending on dict iteration order
    # (fairseq is expected to index dictionary entries by line order --
    # TODO confirm).
    for token, _token_id in sorted(vocab.items(), key=lambda item: item[1]):
        # These four special symbols are left out of the file.
        if token in {"<s>", "<pad>", "</s>", "<unk>"}:
            continue
        # Each line is "<token> <count>"; the count is a constant 1.
        f_out.write(f"{token} 1\n")

In [11]:
# Binarize the new-vocabulary BPE files with fairseq-preprocess, using the
# dictionary file at swissbert_dict_path as the source dictionary.
# DATA_DIR, DICT_PATH and LANGUAGE are exported through os.environ so that the
# `!` shell command below can expand them.
os.environ["DATA_DIR"] = str(out_dir_new_vocab.resolve())
os.environ["DICT_PATH"] = str(swissbert_dict_path.resolve())
# for language in LANGUAGES:
os.environ["LANGUAGE"] = language
!fairseq-preprocess \
  --only-source \
  --trainpref "$DATA_DIR/train.new.bpe" \
  --validpref "$DATA_DIR/valid.new.bpe" \
  --destdir "$DATA_DIR/bin/$LANGUAGE" \
  --bpe sentencepiece \
  --srcdict "$DICT_PATH" \
  --workers 20
# Unlike the XLM-vocabulary cell, the dict.txt removal/copy step is disabled here.
# !rm "$DATA_DIR/bin/$LANGUAGE/dict.txt"
# !cp $DICT_PATH "$DATA_DIR/bin/dict.txt"

2023-07-13 15:09:39 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe='sentencepiece', optimizer=None, lr_scheduler='fixed', scoring='bleu', task='translation', source_lang=None, target_la

In [None]:
# Convert from fairseq to huggingface format

! python3 ../convert_xmod_original_pytorch_checkpoint_to_pytorch.py \
    --xmod_checkpoint_path ../models/swissbert_yo_new_vocab/checkpoint_last.pt\
    --pytorch_dump_folder_path ../../models/xmod_fairseq 