In [20]:
from transformers import AutoTokenizer, AutoModel
# from tokenizers import Tokenizer
from tokenizer import Tokenizer
import json
from tokenizers import models

In [21]:
def read_data(path):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: One entry per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]
    

def read_data_json(path):
    """Read a JSON Lines file and return one decoded object per line.

    Args:
        path: Path of a file holding one JSON document per line.

    Returns:
        list: The result of ``json.loads`` for every non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding='utf-8') as f:
        # Whitespace-only lines (e.g. a trailing blank line) carry no record
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

In [22]:
def resize_tokenizer(tokenizer, execlude_len=5, path='tokenizer-5'):
    """Delete long tokens from a tokenizer's backend vocabulary, in place.

    Every non-special token whose text, with any ``##`` markers removed, is
    longer than ``execlude_len`` characters is dropped from the serialized
    backend model's ``vocab``; the backend model is then rebuilt from the
    remaining state and assigned back to the tokenizer.

    Args:
        tokenizer: Tokenizer exposing ``vocab``, ``special_tokens_map`` and
            ``backend_tokenizer.model``.
        execlude_len: Longest token length (not counting ``##``) that is kept.
        path: Target for ``save_pretrained``. Currently unused because the
            save call at the end of the function is commented out.

    Returns:
        None. ``tokenizer.backend_tokenizer.model`` is replaced; the number of
        selected tokens is printed.
    """
    # Collect the special tokens once. Entries of special_tokens_map can be
    # either a single token or a list of tokens; flattening them makes the
    # membership test below protect list-valued entries as well.
    protected = set()
    for value in tokenizer.special_tokens_map.values():
        if isinstance(value, (list, tuple)):
            protected.update(str(item) for item in value)
        else:
            protected.add(str(value))

    words_to_remove = [
        word
        for word in tokenizer.vocab
        if word not in protected and len(word.replace('##', '')) > execlude_len
    ]

    print(f'{len(words_to_remove)} have been removed')

    # The backend model serializes to JSON; edit that state and rebuild.
    model_state = json.loads(tokenizer.backend_tokenizer.model.__getstate__())

    backend_vocab = model_state['vocab']
    for word in words_to_remove:
        # Tolerate words that tokenizer.vocab lists but the backend model's
        # own vocab does not contain, instead of failing with KeyError.
        backend_vocab.pop(word, None)

    # NOTE(review): the ids of the remaining tokens are left as they were, so
    # the id space has gaps after removal.
    model_class = getattr(models, model_state.pop("type"))
    tokenizer.backend_tokenizer.model = model_class(**model_state)

    # tokenizer.save_pretrained(path)

In [23]:
def resize_tokenizer_unk(tokenizer, execlude_len=5, path='tokenizer-5'):
    """Redirect long tokens to the unknown token instead of deleting them.

    Every non-special token whose text, with any ``##`` markers removed, is
    longer than ``execlude_len`` characters keeps its vocabulary entry, but
    that entry is pointed at the unknown token's id, and the token's old id
    is relabelled as the unknown token in ``ids_to_tokens``.

    Args:
        tokenizer: Tokenizer exposing ``vocab``, ``ids_to_tokens``,
            ``special_tokens_map`` and ``convert_tokens_to_ids``.
        execlude_len: Longest token length (not counting ``##``) that is kept.
        path: Target for ``save_pretrained``. Currently unused because the
            save call at the end of the function is commented out.

    Returns:
        None. ``tokenizer.vocab`` and ``tokenizer.ids_to_tokens`` are mutated;
        the number of selected tokens is printed.
    """
    # NOTE(review): this writes into tokenizer.vocab / tokenizer.ids_to_tokens
    # directly, so it only has an effect when those attributes are the
    # tokenizer's own mutable mappings — confirm for the tokenizer class used.
    vocab = tokenizer.vocab

    # Collect the special tokens once. Entries of special_tokens_map can be
    # either a single token or a list of tokens; flattening them makes the
    # membership test below protect list-valued entries as well.
    protected = set()
    for value in tokenizer.special_tokens_map.values():
        if isinstance(value, (list, tuple)):
            protected.update(str(item) for item in value)
        else:
            protected.add(str(value))

    words_to_remove = [
        word
        for word in vocab
        if word not in protected and len(word.replace('##', '')) > execlude_len
    ]

    print(f'{len(words_to_remove)} have been removed')

    # Use the tokenizer's own unknown token when it declares one; fall back to
    # the literal "[UNK]" otherwise.
    unk_token = getattr(tokenizer, 'unk_token', None) or "[UNK]"
    unk_token_id = tokenizer.convert_tokens_to_ids(unk_token)

    for word in words_to_remove:
        # Relabel the token's old id as unknown, then point the token itself
        # at the unknown token's id.
        token_id = vocab[word]
        tokenizer.ids_to_tokens[token_id] = unk_token
        vocab[word] = unk_token_id

    # tokenizer.save_pretrained(path)



In [6]:
# Load the pretrained tokenizer published under this model id.
camel_tokenizer = AutoTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-msa')
# Vocabulary-reduction calls, left disabled; uncomment one to modify
# `camel_tokenizer` in place.
# NOTE(review): the save_pretrained line inside both resize functions is
# commented out, so `path` currently has no effect — confirm how the
# 'tokenizer-N' directories loaded later in this notebook were produced.
# resize_tokenizer_unk(camel_tokenizer)
# resize_tokenizer(camel_tokenizer, execlude_len=4, path='tokenizer-4')

In [15]:
# Baseline tokenizer plus the three variants stored under 'tokenizer-5',
# 'tokenizer-4' and 'tokenizer-3'.
tokenizer = Tokenizer('CAMeL-Lab/bert-base-arabic-camelbert-msa')
tokenizer_5 = Tokenizer('tokenizer-5')
tokenizer_4 = Tokenizer('tokenizer-4')
tokenizer_3 = Tokenizer('tokenizer-3')

# One JSON record per line; each record's 'raw' field is the text to tokenize.
qalb14_dev = read_data_json('../arabic-gec/data/gec/modeling/qalb14/wo_camelira/full/dev.json')
dev_raw_sentences = [sent['raw'] for sent in qalb14_dev]


def tokenize_all(tok, sentences):
    """Return ``tok.tokenize(sentence, flatten=True)`` for every sentence, in order."""
    return [tok.tokenize(sentence, flatten=True) for sentence in sentences]


# Same dev sentences under each tokenizer, so the results are comparable.
tokenized_dev = tokenize_all(tokenizer, dev_raw_sentences)
tokenized_dev_5 = tokenize_all(tokenizer_5, dev_raw_sentences)
tokenized_dev_4 = tokenize_all(tokenizer_4, dev_raw_sentences)
tokenized_dev_3 = tokenize_all(tokenizer_3, dev_raw_sentences)

In [16]:
# Length of the longest entry in `tokenized_dev`.
max(len(tokens) for tokens in tokenized_dev)

133

In [17]:
# Length of the longest entry in `tokenized_dev_5`.
max(len(tokens) for tokens in tokenized_dev_5)

152

In [18]:
# Length of the longest entry in `tokenized_dev_4`.
max(len(tokens) for tokens in tokenized_dev_4)

166

In [19]:
# Length of the longest entry in `tokenized_dev_3`.
max(len(tokens) for tokens in tokenized_dev_3)

186

In [8]:
# Take the 'raw' text of the first dev record as a sample for the cells below.
sentence = qalb14_dev[0]['raw']

In [9]:
# Encode the sample sentence by calling the `_tokenizer` attribute of the
# baseline wrapper; the displayed result holds input_ids, token_type_ids and
# attention_mask.
# NOTE(review): `_tokenizer` is a private attribute of the project's Tokenizer
# class — presumably the wrapped Hugging Face tokenizer; confirm.
tokenizer._tokenizer(sentence)

{'input_ids': [2, 2575, 1953, 13448, 3157, 10914, 2054, 1961, 13489, 2134, 5003, 1017, 2590, 4449, 1016, 2135, 1971, 378, 15524, 1906, 8047, 3866, 27527, 2075, 20126, 4872, 2155, 1908, 4034, 5660, 2059, 13983, 3827, 1956, 15707, 378, 2349, 8017, 21507, 2085, 2554, 21589, 28181, 8393, 20126, 10738, 2058, 13174, 378, 2038, 4133, 3766, 1006, 2790, 1011, 1912, 6030, 8920, 2134, 3629, 2249, 378, 6841, 2534, 2270, 5552, 7542, 2013, 2677, 9290, 1922, 13489, 18463, 22254, 13418, 2503, 2204, 1015, 26563, 1949, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [36]:
# Encode the same sample sentence with the 'tokenizer-5' variant for
# comparison; the displayed result holds input_ids, token_type_ids and
# attention_mask.
# NOTE(review): `_tokenizer` is a private attribute of the project's Tokenizer
# class — presumably the wrapped Hugging Face tokenizer; confirm.
tokenizer_5._tokenizer(sentence)

{'input_ids': [2, 2575, 1953, 14420, 1015, 3157, 10914, 2054, 1961, 6668, 1999, 2134, 5003, 1017, 2590, 4449, 1016, 2135, 1971, 378, 15524, 1906, 8047, 3866, 27527, 2075, 20126, 4872, 2155, 1908, 4034, 5660, 2059, 13983, 3827, 1956, 15707, 378, 2349, 8017, 21507, 2085, 2017, 2184, 21589, 28181, 8393, 20126, 10738, 2058, 2216, 4617, 378, 2038, 4133, 22989, 1911, 2790, 1011, 1912, 6030, 8920, 2134, 3629, 2249, 378, 6841, 2534, 2270, 5552, 7542, 2013, 2677, 9290, 1922, 6668, 1999, 18463, 22254, 13418, 2503, 2204, 1015, 26563, 1949, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,