# HoogBERTa Conversion from subword-nmt and Fairseq to Huggingface

<a id="section-one"></a>
## Install Dependencies

In [1]:
!pip install -q transformers subword-nmt attacut tokenizers numpy==1.21.6 gdown torchmetrics==0.7

[0m

In [2]:
!git clone https://github.com/lstnlp/hoogberta
%cd hoogberta
!pip install -q torchtext==0.6.0 fairseq==0.10.2 pytorch-lightning==1.4.7 textsearch seqeval
!pip install -q --no-dependencies --editable  .

Cloning into 'hoogberta'...
remote: Enumerating objects: 267, done.[K
remote: Counting objects: 100% (106/106), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 267 (delta 63), reused 73 (delta 43), pack-reused 161[K
Receiving objects: 100% (267/267), 4.05 MiB | 800.00 KiB/s, done.
Resolving deltas: 100% (136/136), done.
/kaggle/working/hoogberta
[0m

In [3]:
from hoogberta import download
download()

Downloading...
From: https://drive.google.com/uc?id=1xQHDAE8nbFu2wAM6SAXtTjk890JWLUhy
To: /kaggle/working/hoogberta/dict.zip
100%|██████████| 1.15k/1.15k [00:00<00:00, 2.58MB/s]
Downloading...
From (uriginal): https://drive.google.com/uc?id=1bBSWQzzEt99mYd_EY5W-lQKW6L-D8axW
From (redirected): https://drive.google.com/uc?id=1bBSWQzzEt99mYd_EY5W-lQKW6L-D8axW&confirm=t&uuid=07fdf357-f672-4fc4-9e9e-3fef3b89208e
To: /kaggle/working/hoogberta/modelL12.pt
100%|██████████| 575M/575M [00:08<00:00, 67.6MB/s] 
Downloading...
From (uriginal): https://drive.google.com/uc?id=1fYtRAyh6d4W9LVCSJiSYKKM_CCPflBc9
From (redirected): https://drive.google.com/uc?id=1fYtRAyh6d4W9LVCSJiSYKKM_CCPflBc9&confirm=t&uuid=de0e2fcc-dd5e-4e94-86fa-a12d2fc4239e
To: /kaggle/working/hoogberta/checkpoint_best.pt
100%|██████████| 1.44G/1.44G [00:28<00:00, 51.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZNxpVHNZbAfdWA-wu7iMSUtcySzQCJam
To: /kaggle/working/hoogberta/dict.txt
100%|██████████| 1.09M/1.09M [00:00

<a id="section-two"></a>
## Convert subword-nmt to Huggingface Tokenizer

### Convert tokenizer

In [4]:
%cd /kaggle/working/hoogberta

/kaggle/working/hoogberta


In [5]:
base_path = '/kaggle/working/hoogberta'

In [6]:
from hoogberta.encoder import HoogBERTaEncoder
encoder = HoogBERTaEncoder(cuda=False)

In [7]:
encoder.model.encoder.decode(encoder.model.encoder.encode('ทดสอบการพิมพ์ภาษาไทย'))

'ทดสอบการพิมพ์ภาษาไทย'

In [8]:
from attacut import tokenize

In [9]:
def tokenize_original(sentence: str):
    """Encode ``sentence`` with the original (fairseq) HoogBERTa pipeline.

    The text is split on single spaces. Each piece is word-segmented with
    attacut's ``tokenize``, its words are joined by spaces, and every
    underscore is rewritten to ``[!und:]``. The pieces are then joined with
    `` _ `` (so a lone underscore stands where each original space was) and
    passed to the encoder held by the notebook-level ``encoder`` object.

    Returns:
        Whatever ``encoder.model.encoder.encode`` returns; in this notebook
        that is a tensor of token ids.
    """
    pieces = [
        " ".join(tokenize(chunk)).replace("_", "[!und:]")
        for chunk in sentence.split(" ")
    ]
    return encoder.model.encoder.encode(" _ ".join(pieces))

In [10]:
from fairseq.data.data_utils import collate_tokens
def tokenize_original_batch(sentenceL):
    """Encode a list of sentences with the original pipeline and pad them.

    Every sentence goes through the same pre-tokenization as
    ``tokenize_original`` (space split, attacut segmentation, ``_`` rewritten
    to ``[!und:]``, pieces joined with `` _ ``). All sentences are prepared
    first, then encoded, then padded into one batch by fairseq's
    ``collate_tokens`` using pad index 1.

    Args:
        sentenceL: iterable of raw sentence strings.

    Returns:
        The padded batch produced by ``collate_tokens``.
    """
    def _prepare(raw):
        # Segment each space-separated piece, then mark the spaces with "_".
        segmented = (
            " ".join(tokenize(chunk)).replace("_", "[!und:]")
            for chunk in raw.split(" ")
        )
        return " _ ".join(segmented)

    prepared = [_prepare(raw) for raw in sentenceL]
    encoded = [encoder.model.encoder.encode(item) for item in prepared]
    return collate_tokens(encoded, pad_idx=1)

In [11]:
tokenize_original('ทดสอบการพิมพ์ภาษาไทย 爨')

tensor([  0, 840,   6, 562, 338, 112,   4,   3,   2])

In [12]:
tokenize_original_batch(['ทดสอบการพิมพ์ภาษาไทย 爨', 'test'])

tensor([[   0,  840,    6,  562,  338,  112,    4,    3,    2],
        [   0, 9650,    2,    1,    1,    1,    1,    1,    1]])

In [13]:
encoder.model.encoder.decode(tokenize_original('ทดสอบการพิมพ์ภาษาไทย 爨'))

'ทดสอบ การ พิมพ์ ภาษา ไทย _ <unk>'

In [14]:
from tqdm.auto import tqdm

# Load the BPE merge rules as (left, right) pairs.
# The first line of the file is skipped — presumably the subword-nmt
# "#version" header; confirm against the codes file.
# The encoding is given explicitly because the file holds Thai text and the
# platform default encoding is not guaranteed to be UTF-8.
merges = []
with open(base_path + "/models/hoogberta_base/th_18M.50000.bpe", encoding="utf-8") as f:
    lines = f.readlines()
    for line in tqdm(lines[1:]):
        left, right = line.rstrip().split(' ')
        merges.append((left, right))

  0%|          | 0/50000 [00:00<?, ?it/s]

In [15]:
len(merges)

50000

In [16]:
from tqdm.auto import tqdm

# Rebuild the vocabulary as a list whose position is the token id.
# The four control tokens come first, then one entry per line of dict.txt
# (only the first space-separated field of each line is used), and "<mask>"
# is appended last.
vocabs_hoog = ["<s>", "<pad>", "</s>", "<unk>"]
# Same directory as the merge rules, so build the path from base_path; the
# encoding is explicit because the dictionary contains Thai text.
with open(base_path + '/models/hoogberta_base/dict.txt', encoding="utf-8") as f:
    lines = f.readlines()
    for line in tqdm(lines):
        subword = line.rstrip().split(' ')[0]
        if subword.endswith('@@'):
            # Entry carries the "@@" marker: keep the bare symbol.
            subword = subword[:-2]
        else:
            # No marker: tag the symbol with the "</w>" suffix instead.
            subword += '</w>'
        vocabs_hoog.append(subword)

vocabs_hoog.append('<mask>')

  0%|          | 0/74900 [00:00<?, ?it/s]

In [17]:
len(vocabs_hoog)

74905

In [18]:
# Token -> id lookup: a token's id is its position in vocabs_hoog
# (if a token occurs more than once, the last position wins).
vocab_converted_hoog = {token: index for index, token in enumerate(vocabs_hoog)}

In [19]:
# Some merge rules refer to symbols that are not registered in dict.txt.
# Collect the index of every rule whose left part, right part, or merged
# result is missing from the vocabulary, so those rules can be removed from
# the merge list afterwards.
error_index = [
    i
    for i, (left, right) in enumerate(merges)
    if left not in vocab_converted_hoog
    or right not in vocab_converted_hoog
    or left + right not in vocab_converted_hoog
]

In [20]:
len(error_index)

94

In [21]:
# Keep only the merge rules whose index was not flagged in error_index.
flagged_positions = set(error_index)
merges_fixed = [
    rule
    for position, rule in enumerate(merges)
    if position not in flagged_positions
]

In [22]:
len(merges), len(merges_fixed)

(50000, 49906)

In [23]:
from tokenizers.implementations.char_level_bpe import CharBPETokenizer
# Build the tokenizers-side BPE model straight from the converted vocabulary
# and the filtered merge rules. The BERT normalizer is switched off and the
# pre-tokenizer is asked to split on whitespace only, since the text is
# already word-segmented before it reaches this tokenizer.
huggingface_bpe = CharBPETokenizer(
    vocab=vocab_converted_hoog,
    merges=merges_fixed,
    unk_token='<unk>',
    bert_normalizer=False,
    split_on_whitespace_only=True,
)

In [24]:
# Flag the control tokens as special tokens on the tokenizer. All five are
# already entries of vocab_converted_hoog, which would explain the 0 shown
# as this cell's result (the value of the last call).
huggingface_bpe.add_special_tokens(["<s>", "<pad>", "</s>", "<unk>"])
huggingface_bpe.add_special_tokens(["<mask>"])

0

In [25]:
from tokenizers.processors import TemplateProcessing
# Wrap every encoded single sequence as "<s> ... </s>". The ids 0 and 2 are
# the positions of "<s>" and "</s>" at the head of vocabs_hoog.
huggingface_bpe.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[("<s>", 0), ("</s>", 2)],
)

In [26]:
def tokenize_new(sentence: str):
    """Encode ``sentence`` with the converted ``huggingface_bpe`` tokenizer.

    Pre-tokenization mirrors ``tokenize_original``: split on single spaces,
    word-segment each piece with attacut's ``tokenize``, rewrite ``_`` to
    ``[!und:]``, and join the pieces with `` _ ``.

    Returns:
        The ``ids`` list of the resulting encoding.
    """
    pieces = [
        " ".join(tokenize(chunk)).replace("_", "[!und:]")
        for chunk in sentence.split(" ")
    ]
    encoding = huggingface_bpe.encode(" _ ".join(pieces))
    return encoding.ids

In [27]:
import torch

def tokenize_new_batch(sentenceL):
    """Encode a list of sentences with ``huggingface_bpe`` and pad them.

    Each sentence is pre-tokenized like in ``tokenize_new`` (space split,
    attacut segmentation, ``_`` rewritten to ``[!und:]``, pieces joined with
    `` _ ``), encoded, and the id sequences are padded into one batch by
    fairseq's ``collate_tokens`` using pad index 1.

    Args:
        sentenceL: iterable of raw sentence strings.

    Returns:
        The padded batch produced by ``collate_tokens``.
    """
    prepared = []
    for raw in sentenceL:
        segmented = [
            " ".join(tokenize(chunk)).replace("_", "[!und:]")
            for chunk in raw.split(" ")
        ]
        prepared.append(" _ ".join(segmented))

    # Build integer tensors directly instead of creating a float32 tensor
    # and casting it: token ids should never pass through floating point.
    id_tensors = [
        torch.tensor(huggingface_bpe.encode(item).ids, dtype=torch.long)
        for item in prepared
    ]
    return collate_tokens(id_tensors, pad_idx=1)

### Parity Check

In [28]:
"""
爨 is used to test the handling of an unknown token
"""
text = 'ทดสอบการพิมพ์ภาษาไทย 爨'
tokenize_new(text) == tokenize_original(text).tolist()

True

In [29]:
"""
Testing for batching and padding
"""
text_batch = ['ทดสอบการพิมพ์ภาษาไทย 爨', 'test']
tokenize_original_batch(text_batch).tolist() == tokenize_new_batch(text_batch).tolist()

True

In [30]:
encoder.model.encoder.decode(tokenize_original('ทดสอบการพิมพ์ภาษาไทย 爨'))

'ทดสอบ การ พิมพ์ ภาษา ไทย _ <unk>'

In [31]:
huggingface_bpe.decode(tokenize_new('ทดสอบการพิมพ์ภาษาไทย 爨'), skip_special_tokens = True)

'ทดสอบ การ พิมพ์ ภาษา ไทย _'

In [32]:
huggingface_bpe.decode(tokenize_new('ทดสอบการพิมพ์ภาษาไทย 爨'), skip_special_tokens = False)

'<s>ทดสอบ การ พิมพ์ ภาษา ไทย _ <unk></s>'

In [33]:
huggingface_bpe.save('tokenizer.json')

### Convert huggingface/tokenizers to huggingface/transformers PreTrainedTokenizerFast

In [34]:
from transformers import PreTrainedTokenizerFast

# Wrap the saved tokenizers file in a transformers fast tokenizer and tell
# the wrapper which vocabulary entries play each special-token role.
# unk_token is declared as well: "<unk>" is already in the vocabulary, but
# without this argument the wrapper has no unk_token set and the
# special-tokens map written on save/push would omit it.
auto_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='tokenizer.json',
    cls_token='<s>',
    eos_token='</s>',
    bos_token='<s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
    model_max_length = 512
)

In [35]:
def tokenize_auto(sentence: str):
    """Encode ``sentence`` with the transformers ``auto_tokenizer``.

    Pre-tokenization is the same as in ``tokenize_original``: split on single
    spaces, word-segment each piece with attacut's ``tokenize``, rewrite
    ``_`` to ``[!und:]``, and join the pieces with `` _ ``.

    Returns:
        The ``input_ids`` of the tokenizer output.
    """
    pieces = [
        " ".join(tokenize(chunk)).replace("_", "[!und:]")
        for chunk in sentence.split(" ")
    ]
    model_inputs = auto_tokenizer(" _ ".join(pieces))
    return model_inputs.input_ids

In [36]:
import torch

def tokenize_auto_batch(sentenceL):
    """Encode a list of sentences with ``auto_tokenizer``, padded as a batch.

    Each sentence is pre-tokenized like in ``tokenize_auto`` (space split,
    attacut segmentation, ``_`` rewritten to ``[!und:]``, pieces joined with
    `` _ ``); the prepared strings are then tokenized together with
    ``padding=True``.

    Args:
        sentenceL: iterable of raw sentence strings.

    Returns:
        The ``input_ids`` of the batched tokenizer output.
    """
    prepared = [
        " _ ".join(
            " ".join(tokenize(chunk)).replace("_", "[!und:]")
            for chunk in raw.split(" ")
        )
        for raw in sentenceL
    ]
    return auto_tokenizer(prepared, padding = True).input_ids

### Parity Check for PreTrainedTokenizerFast

In [37]:
"""
爨 is used to test the handling of an unknown token
"""
text = 'ทดสอบการพิมพ์ภาษาไทย 爨'
tokenize_auto(text) == tokenize_original(text).tolist()

True

In [38]:
text_batch = ['ทดสอบการพิมพ์ภาษาไทย 爨', 'test']
tokenize_auto_batch(text_batch) == tokenize_original_batch(text_batch).tolist()

True

In [39]:
text = 'เราสามารถพิมภาษาไทยได้ดีมั้ย ทดสอบการทำงาน1223123 😋 ဍဍဍကဏ္ဇ'
tokenize_original(text).tolist() == tokenize_auto(text)

True

In [40]:
auto_tokenizer.decode(tokenize_auto(text), skip_special_tokens = True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'เรา สามารถ พิม ภาษา ไทย ได้ ดี มั้ย _ ทดสอบ การ ทำ งาน 1223123 _ 😋 _ က ဏ ္ ဇ'

In [41]:
auto_tokenizer.decode(tokenize_auto(text), skip_special_tokens = False)

'<s>เรา สามารถ พิม ภาษา ไทย ได้ ดี มั้ย _ ทดสอบ การ ทำ งาน 1223123 _ 😋 _ <unk><unk><unk>က ဏ ္ ဇ </s>'

In [42]:
encoder.model.encoder.decode(tokenize_original(text))

'เรา สามารถ พิม ภาษา ไทย ได้ ดี มั้ย _ ทดสอบ การ ทำ งาน 1223123 _ 😋 _ <unk> <unk> <unk> က ဏ ္ ဇ'

<a id="section-three"></a>
## Convert Fairseq Roberta to Huggingface Roberta

In [43]:
from transformers.models.roberta.convert_roberta_original_pytorch_checkpoint_to_pytorch import convert_roberta_checkpoint_to_pytorch

In [44]:
!cp models/hoogberta_base/checkpoint_best.pt models/hoogberta_base/model.pt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [45]:
convert_roberta_checkpoint_to_pytorch('models/hoogberta_base', 'converted_model', False)

1042301B [00:00, 63170684.97B/s]
456318B [00:00, 35514954.49B/s]


Our BERT config: RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 74905
}



Configuration saved in converted_model/config.json


torch.Size([1, 11, 74905]) torch.Size([1, 11, 74905])
max_absolute_diff = 1.9073486328125e-06
Do both models output the same tensors? 🔥
Saving model to converted_model


Model weights saved in converted_model/pytorch_model.bin


<a id="section-four"></a>
## End to End Testing

In [46]:
len(auto_tokenizer)

74905

In [47]:
from hoogberta.encoder import HoogBERTaEncoder
encoder = HoogBERTaEncoder(cuda=False)

In [48]:
from transformers import RobertaModel, RobertaForMaskedLM

# huggingface_model = RobertaModel.from_pretrained('converted_model')
huggingface_model = RobertaForMaskedLM.from_pretrained('converted_model')

loading configuration file converted_model/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.27.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 74905
}

loading weights file converted_model/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at converted_model.
If your task is similar to the task the model 

In [49]:
encoder.model = encoder.model.eval()

In [50]:
huggingface_model = huggingface_model.eval()

In [51]:
with torch.no_grad():
    # Same pre-tokenization as tokenize_auto: segment each space-separated
    # piece with attacut, rewrite "_" to "[!und:]", and mark the original
    # spaces with " _ ".
    sentence = " _ ".join(
        " ".join(tokenize(chunk)).replace("_", "[!und:]")
        for chunk in text.split(" ")
    )
    # Tokenize to PyTorch tensors and keep the last hidden layer as features.
    token_ids_hug = auto_tokenizer(sentence, return_tensors = 'pt')
    features_hug = huggingface_model(**token_ids_hug, output_hidden_states = True).hidden_states[-1]

In [52]:
inputText = ["วันที่ 12 มีนาคมนี้","ฉันจะไปเที่ยววัดพระแก้ว ที่กรุงเทพ"]

In [53]:
with torch.no_grad():
    # Pre-tokenize every input sentence the same way as tokenize_auto_batch:
    # attacut segmentation per space-separated piece, "_" rewritten to
    # "[!und:]", and " _ " standing where the original spaces were.
    inputList = [
        " _ ".join(
            " ".join(tokenize(chunk)).replace("_", "[!und:]")
            for chunk in raw.split(" ")
        )
        for raw in inputText
    ]
    # Tokenize as one padded batch and keep the last hidden layer as features.
    token_ids_hug_batch = auto_tokenizer(inputList, padding = True, return_tensors = 'pt')
    features_hug_batch = huggingface_model(**token_ids_hug_batch, output_hidden_states = True).hidden_states[-1]

In [54]:
token_ids_batch, features_batch = encoder.extract_features_batch(inputText)

In [55]:
token_ids, features = encoder.extract_features(text)

In [56]:
with torch.no_grad():
    mask = token_ids_hug_batch.attention_mask == 1
    print(torch.abs(features_hug_batch[mask] - features_batch[mask]).mean())

tensor(1.6408e-07)


In [57]:
token_ids_batch == token_ids_hug_batch.input_ids

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

In [58]:
with torch.no_grad():
    print(torch.abs(features - features_hug[0]).mean())

tensor(0.)


In [59]:
token_ids == token_ids_hug.input_ids

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True]])

<a id="section-five"></a>
## Push to Hub

In [60]:
# from huggingface_hub import notebook_login

# notebook_login()

In [61]:
# huggingface_model.push_to_hub("HoogBERTa")
# auto_tokenizer.push_to_hub("HoogBERTa")