A simple byte-pair encoding (BPE) tokenizer implemented in Nim. May also include BPE-Dropout.
- Figure out the special treatment of whitespace used by StarCoder's tokenizer and make sure it is supported.
% Reference for byte-level subwords in neural machine translation.
% NOTE(review): this paper appears to be published in the AAAI 2020 proceedings;
% year 2019 may reflect the preprint -- confirm before relying on the year.
@inproceedings{Wang2019NeuralMT,
  author    = {Wang, Changhan and Cho, Kyunghyun and Gu, Jiatao},
  title     = {Neural Machine Translation with Byte-Level Subwords},
  booktitle = {AAAI Conference on Artificial Intelligence},
  year      = {2019},
}
% BPE-Dropout subword regularization paper (ACL 2020).
@inproceedings{provilkov-etal-2020-bpe,
  author    = {Provilkov, Ivan and Emelianenko, Dmitrii and Voita, Elena},
  title     = {{BPE}-Dropout: Simple and Effective Subword Regularization},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jul,
  year      = {2020},
  pages     = {1882--1892},
  doi       = {10.18653/v1/2020.acl-main.170},
  url       = {https://aclanthology.org/2020.acl-main.170},
}