In [1]:
import json

In [2]:
# Load the pre-split dataset. JSON is UTF-8 by specification and the texts are
# multilingual, so the encoding is given explicitly instead of relying on the
# platform's locale default (which is not UTF-8 everywhere).
with open('train-test.json', encoding='utf-8') as fopen:
    train_test = json.load(fopen)

# Show which splits are available.
train_test.keys()

dict_keys(['train_X', 'test_X', 'train_Y', 'test_Y'])

In [3]:
# Dump every text (train followed by test), one per line, as the corpus file
# handed to the BPE trainer. The encoding is forced to UTF-8 so that non-ASCII
# text is written identically on every platform; with the locale default a
# non-UTF-8 system would either raise UnicodeEncodeError or write a corpus in
# a different encoding.
with open('combined.txt', 'w', encoding='utf-8') as fopen:
    fopen.write('\n'.join(train_test['train_X'] + train_test['test_X']))

In [4]:
import youtokentome as yttm

In [5]:
%%time

bpe = yttm.BPE.train(data='combined.txt', 
               vocab_size=400000, model='language-detection.model')

CPU times: user 3min 25s, sys: 36.7 s, total: 4min 2s
Wall time: 57 s


In [6]:
# Index -> subword, taken from the order in which the BPE model lists its vocabulary.
rev_vocab = dict(enumerate(bpe.vocab()))
# Subword -> index, the inverse mapping.
vocab = {token: index for index, token in rev_vocab.items()}
len(vocab)

400000

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
import re

# `r(text)` returns every maximal run of non-whitespace characters in `text`.
_non_space_run = re.compile(r'[\S]+')
r = _non_space_run.findall

In [9]:
# All texts in one list: training texts first, then test texts.
combined = [*train_test['train_X'], *train_test['test_X']]

In [10]:
# Quick preview on the first three texts only: encode to subword pieces and
# join each text's pieces with single spaces.
subs = list(map(' '.join, bpe.encode(combined[:3], output_type=yttm.OutputType.SUBWORD)))
subs[0]

'▁voi ▁avete ▁conosciuto ▁delle ▁canadesi ▁qua ▁a ▁boston'

In [11]:
# Encode the whole corpus: each text becomes one space-joined string of subwords.
subs = list(
    map(
        ' '.join,
        bpe.encode(combined, output_type=yttm.OutputType.SUBWORD),
    )
)

In [12]:
# Number of entries currently held in `subs`.
len(subs)

23648246

In [13]:
# Bag-of-subwords vectorizer over the fixed BPE vocabulary.
# token_pattern r'[\S]+' makes every whitespace-delimited piece one token, so
# the space-joined subword strings are split back into their subwords.
#
# Because `vocabulary` is supplied, fit() learns nothing from the data: it only
# validates the mapping and exposes it as `vocabulary_`. Fitting on the whole
# corpus would still tokenize and count every document and then throw the
# resulting matrix away, so a single document is enough to reach the same
# fitted state.
# NOTE(review): CountVectorizer lowercases its input by default
# (lowercase=True) — confirm that is intended for this vocabulary.
bow = CountVectorizer(vocabulary=vocab, token_pattern=r'[\S]+').fit(subs[:1])

In [14]:
# Encode a single probe string the same way as the corpus
# (subword pieces joined by spaces) and display the result.
tsubs = list(map(' '.join, bpe.encode(['我该'], output_type=yttm.OutputType.SUBWORD)))
tsubs

['▁我该']

In [15]:
# Vectorize the probe string; the displayed repr reports the sparse matrix
# shape and how many entries are stored.
bow.transform(tsubs)

<1x400000 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [16]:
# Inspect the space-joined subword string of the second corpus entry.
subs[1]

'▁= ▁= ▁ahli ▁ahli ▁= ▁= ▁= ▁= ▁= ▁ahli ▁ahli ▁semasa ▁= ▁= ▁= ▁sekarang ▁vokal ▁sekarang ▁gitar ▁vokal ▁sokongan ▁sekarang ▁sekarang ▁= ▁= ▁= ▁bekas ▁ahli ▁= ▁= ▁= ▁nik ▁vokal ▁o ▁gitar ▁gitar ▁f ▁fed eski ▁gitar ▁vokal ▁gitar ▁gitar ▁p ▁gitar ▁gitar ▁vokal ▁gitar ▁meninggal ▁dunia ▁vokal ▁meninggal ▁dunia ▁= ▁= ▁= ▁kegiatan ▁berkait ▁= ▁= ▁= ▁ahli ▁ahli ▁telah ▁bergerak ▁untuk ▁membentuk ▁seperti'

In [17]:
import numpy as np

# Dense count vector for the single probe document (row 0 of the 1 x vocab matrix).
v = bow.transform(tsubs).toarray()[0]
# Indices of the vocabulary entries that occur, together with their counts.
(np.where(v > 0), v[v > 0])

((array([342793]),), array([1]))

In [18]:
import pickle
# Persist the vectorizer to disk in binary pickle format.
with open('bow-language-detection.pkl', mode='wb') as fopen:
    pickle.dump(bow, fopen)