In [1]:
from transformers import PreTrainedTokenizerFast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB

In [2]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file='bpe-400k')
vocabulary = list(tokenizer.vocab.keys())

In [3]:
%%time
strings = [
    'tak suka ayam123124123',
    'tak suka ayam'
]
subs = [' '.join(tokenizer.tokenize(s)) for s in strings]
subs[0]

CPU times: user 1 ms, sys: 918 µs, total: 1.92 ms
Wall time: 947 µs


'tak Ġsuka Ġayam 1 2 3 1 2 4 1 2 3'

In [4]:
bow = CountVectorizer(vocabulary = vocabulary, token_pattern = r'[\S]+').fit(subs)



In [5]:
import pickle

with open('bow.pkl', 'wb') as fopen:
    pickle.dump(bow, fopen)

In [6]:
lang_labels_v2 = {
    0: 'standard-english',
    1: 'local-english',
    2: 'manglish',
    3: 'standard-indonesian',
    4: 'socialmedia-indonesian',
    5: 'standard-malay',
    6: 'local-malay',
    7: 'standard-mandarin',
    8: 'local-mandarin',
    9: 'other',
}

lang_labels_v2_rev = {v: k for k, v in lang_labels_v2.items()}

In [8]:
!wc -l shuf-train-fasttext.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
10500345 shuf-train-fasttext.txt


In [9]:
from tqdm import tqdm

batch_size = 200000

x, y = [], []
train = False

nb = ComplementNB()

with open('shuf-train-fasttext.txt') as fopen:
    for l in tqdm(fopen):
        splitted = l.split()
        label = splitted[0].replace('__label__', '')
        x.append(' '.join(splitted[1:]))
        y.append(lang_labels_v2_rev[label])
        if len(x) >= batch_size:
            subs = [' '.join(tokenizer.tokenize(s)) for s in x]
            subs = bow.transform(subs)
            if not train:
                nb = nb.fit(subs, y)
                train = True
            else:
                nb = nb.partial_fit(subs, y)
            
            x, y = [], []

10500345it [35:49, 4886.03it/s] 


In [10]:
with open('nb.pkl', 'wb') as fopen:
    pickle.dump(nb, fopen)

In [13]:
subs[:10]

<10x400000 sparse matrix of type '<class 'numpy.int64'>'
	with 34 stored elements in Compressed Sparse Row format>

In [14]:
nb.predict(subs[:10])

array([3, 7, 3, 2, 3, 4, 4, 8, 0, 9])

In [15]:
y[:10]

[1, 0, 6, 8, 6, 7, 4, 5, 9, 1]