In [1]:
# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz
# !tar -zxf train-en-vi.tgz
# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/dev-2012-en-vi.tgz
# !tar -zxf dev-2012-en-vi.tgz
# !wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/test-2013-en-vi.tgz
# !tar -zxf test-2013-en-vi.tgz

In [2]:
# !pip3 install malaya --no-deps
# !pip3 install bert-tensorflow
# !pip3 install toolz
# !pip3 install pysastrawi
# !pip3 install fuzzywuzzy
# !pip3 install xgboost
# !pip3 install ftfy

In [3]:
import malaya
import re

# Bound `tokenize` method of malaya's SocialTokenizer; preprocessing() calls
# it to split a raw sentence into the tokens it then iterates over.
tokenizer = malaya.preprocessing.SocialTokenizer().tokenize

def is_number_regex(s):
    """Return True when the token ``s`` is written as a number.

    Two spellings are accepted: a decimal such as ``"3.14"`` (digits, one dot,
    digits) and a plain run of digits such as ``"42"`` (any string for which
    ``str.isdigit`` is true). Signs, exponents and thousands separators are
    not recognised, and the empty string is not a number.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about. fullmatch() anchors both ends,
    # so a trailing newline is rejected as well.
    if re.fullmatch(r"\d+\.\d+", s) is not None:
        return True
    return s.isdigit()

def preprocessing(string):
    """Tokenize ``string`` and mask its numeric tokens.

    The text is split with the module-level ``tokenizer``; every token that
    ``is_number_regex`` accepts is replaced by the placeholder ``'<NUM>'``.
    Returns the resulting list of tokens.
    """
    masked = []
    for token in tokenizer(string):
        masked.append('<NUM>' if is_number_regex(token) else token)
    return masked

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
# Load the parallel training corpus, one sentence per line. The encoding is
# passed explicitly because the Vietnamese side is non-ASCII and the platform
# default encoding is not guaranteed to be UTF-8 (files assumed to be UTF-8).
# split('\n')[:-1] drops whatever follows the last newline in the file.
with open('train.en', encoding='utf-8') as fopen:
    train_english = fopen.read().split('\n')[:-1]

with open('train.vi', encoding='utf-8') as fopen:
    train_vietnam = fopen.read().split('\n')[:-1]

# The two lists are paired by index further down, so they must line up.
assert len(train_english) == len(train_vietnam), 'train.en / train.vi line counts differ'

len(train_english), len(train_vietnam)

(133317, 133317)

In [5]:
# Show the first English sentence next to the Vietnamese sentence at the same index.
train_english[0], train_vietnam[0]

('Rachel Pike : The science behind a climate headline',
 'Khoa học đằng sau một tiêu đề về khí hậu')

In [6]:
from tqdm import tqdm

# Run every English training sentence through preprocessing() and store the
# tokens back as one space-joined string. The list is overwritten in place,
# so re-running this cell would preprocess already-processed text.
for position, sentence in enumerate(tqdm(train_english)):
    train_english[position] = ' '.join(preprocessing(sentence))

100%|██████████| 133317/133317 [00:16<00:00, 8024.56it/s]


In [7]:
# Run every Vietnamese training sentence through preprocessing() and store
# the tokens back as one space-joined string. The list is overwritten in
# place, so re-running this cell would preprocess already-processed text.
for position, sentence in enumerate(tqdm(train_vietnam)):
    train_vietnam[position] = ' '.join(preprocessing(sentence))

100%|██████████| 133317/133317 [00:21<00:00, 6136.78it/s]


In [8]:
# Load the tst2012 parallel set, one sentence per line. The encoding is passed
# explicitly because the Vietnamese side is non-ASCII and the platform default
# encoding is not guaranteed to be UTF-8 (files assumed to be UTF-8).
# split('\n')[:-1] drops whatever follows the last newline in the file.
with open('tst2012.en', encoding='utf-8') as fopen:
    test_english_2012 = fopen.read().split('\n')[:-1]

with open('tst2012.vi', encoding='utf-8') as fopen:
    test_vietnam_2012 = fopen.read().split('\n')[:-1]

# The two lists are paired by index further down, so they must line up.
assert len(test_english_2012) == len(test_vietnam_2012), 'tst2012.en / tst2012.vi line counts differ'

len(test_english_2012), len(test_vietnam_2012)

(1553, 1553)

In [9]:
# Preprocess the tst2012 sentences, English list first and then Vietnamese,
# replacing each entry in place with its space-joined token string.
for corpus in (test_english_2012, test_vietnam_2012):
    for position, sentence in enumerate(tqdm(corpus)):
        corpus[position] = ' '.join(preprocessing(sentence))

100%|██████████| 1553/1553 [00:00<00:00, 9039.32it/s]
100%|██████████| 1553/1553 [00:00<00:00, 6949.77it/s]


In [12]:
# Load the tst2013 parallel set, one sentence per line. The encoding is passed
# explicitly because the Vietnamese side is non-ASCII and the platform default
# encoding is not guaranteed to be UTF-8 (files assumed to be UTF-8).
# split('\n')[:-1] drops whatever follows the last newline in the file.
with open('tst2013.en', encoding='utf-8') as fopen:
    test_english_2013 = fopen.read().split('\n')[:-1]

with open('tst2013.vi', encoding='utf-8') as fopen:
    test_vietnam_2013 = fopen.read().split('\n')[:-1]

# The two lists are paired by index further down, so they must line up.
assert len(test_english_2013) == len(test_vietnam_2013), 'tst2013.en / tst2013.vi line counts differ'

len(test_english_2013), len(test_vietnam_2013)

(1268, 1268)

In [13]:
# Preprocess the tst2013 sentences, English list first and then Vietnamese,
# replacing each entry in place with its space-joined token string.
for corpus in (test_english_2013, test_vietnam_2013):
    for position, sentence in enumerate(tqdm(corpus)):
        corpus[position] = ' '.join(preprocessing(sentence))

100%|██████████| 1268/1268 [00:00<00:00, 2778.69it/s]
100%|██████████| 1268/1268 [00:00<00:00, 2050.06it/s]


In [10]:
# Keep only the training pairs whose English side has at most 250
# whitespace-separated tokens; the Vietnamese length is not checked.
kept_positions = [
    i for i in range(len(train_english))
    if len(train_english[i].split()) <= 250
]
train_X = [train_english[i] for i in kept_positions]
train_Y = [train_vietnam[i] for i in kept_positions]

In [14]:
# Build one combined evaluation set: tst2012 pairs first, then tst2013 pairs,
# keeping only those whose English side has at most 250 whitespace-separated
# tokens.
test_X, test_Y = [], []
for source, target in (
    (test_english_2012, test_vietnam_2012),
    (test_english_2013, test_vietnam_2013),
):
    for i in range(len(source)):
        if len(source[i].split()) <= 250:
            test_X.append(source[i])
            test_Y.append(target[i])

In [15]:
import collections
import json

def build_dataset(words, n_words, atleast=1):
    """Build a vocabulary from ``words`` and encode ``words`` with it.

    Ids 0-3 are reserved for the special tokens ``PAD``, ``GO``, ``EOS`` and
    ``UNK``; corpus words receive ids from 4 upwards in order of decreasing
    frequency.

    Args:
        words: list of string tokens (the whole corpus, flattened). It is
            traversed more than once, so a one-shot iterator will not work.
        n_words: maximum number of distinct corpus words to consider, most
            frequent first.
        atleast: minimum frequency a word needs in order to be kept.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is ``words`` encoded as a list of integer ids, ``count`` is
        the four reserved entries followed by ``(word, frequency)`` pairs,
        ``dictionary`` maps word -> id and ``reversed_dictionary`` maps
        id -> word. The second field of the ``UNK`` entry in ``count`` holds
        the number of tokens that were encoded as ``UNK``.
    """
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    reserved = {token for token, _ in count}
    frequent = collections.Counter(words).most_common(n_words)
    # Skip rare words and corpus tokens that spell a reserved name. Re-adding
    # a reserved name would overwrite its id with len(dictionary) without
    # growing the dictionary, so the following word would get the same id.
    # Such tokens therefore resolve to the reserved id instead.
    count.extend(
        pair for pair in frequent
        if pair[1] >= atleast and pair[0] not in reserved
    )
    dictionary = {word: index for index, (word, _) in enumerate(count)}
    unk_id = dictionary['UNK']
    # Out-of-vocabulary words are encoded as UNK, never as id 0 (PAD).
    data = [dictionary.get(word, unk_id) for word in words]
    count[unk_id][1] = sum(1 for index in data if index == unk_id)
    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [16]:
# Flatten the English training sentences (train_X) into one token stream and
# build the 'from' vocabulary over every distinct token.
concat_from = [token for sentence in train_X for token in sentence.split()]
vocabulary_size_from = len(set(concat_from))
(data_from, count_from,
 dictionary_from, rev_dictionary_from) = build_dataset(concat_from, vocabulary_size_from)
print('vocab from size: %d'%(vocabulary_size_from))
# Entries 0-3 of count_from are the reserved tokens, so start at index 4.
print('Most common words', count_from[4:10])
sample_ids_from = data_from[:10]
print('Sample data', sample_ids_from, [rev_dictionary_from[i] for i in sample_ids_from])

vocab from size: 47925
Most common words [(',', 155595), ('.', 134615), ('the', 102861), ('to', 65617), ("'", 63980), ('of', 60178)]
Sample data [6514, 16802, 55, 58, 335, 591, 11, 731, 5458, 132] ['Rachel', 'Pike', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline', 'In']


In [17]:
# Flatten the Vietnamese training sentences (train_Y) into one token stream
# and build the 'to' vocabulary over every distinct token.
concat_to = [token for sentence in train_Y for token in sentence.split()]
vocabulary_size_to = len(set(concat_to))
(data_to, count_to,
 dictionary_to, rev_dictionary_to) = build_dataset(concat_to, vocabulary_size_to)
print('vocab to size: %d'%(vocabulary_size_to))
# Entries 0-3 of count_to are the reserved tokens, so start at index 4.
print('Most common words', count_to[4:10])
sample_ids_to = data_to[:10]
print('Sample data', sample_ids_to, [rev_dictionary_to[i] for i in sample_ids_to])

vocab to size: 22341
Most common words [(',', 128187), ('.', 125091), ('là', 57919), ('tôi', 51677), ('một', 48925), ('có', 48134)]
Sample data [1909, 66, 1135, 128, 8, 371, 111, 38, 411, 723] ['Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu']


In [18]:
# Write the filtered sentence pairs (train and combined test) to train-test.json.
dataset_payload = {
    'train_X': train_X,
    'train_Y': train_Y,
    'test_X': test_X,
    'test_Y': test_Y,
}
with open('train-test.json', 'w') as fopen:
    json.dump(dataset_payload, fopen)

In [19]:
# Write both vocabularies to dictionary.json. JSON object keys are always
# strings, so the integer ids used as keys in the rev_dictionary mappings are
# stored as strings in the file.
vocab_payload = {
    'from': {'dictionary': dictionary_from, 'rev_dictionary': rev_dictionary_from},
    'to': {'dictionary': dictionary_to, 'rev_dictionary': rev_dictionary_to},
}
with open('dictionary.json', 'w') as fopen:
    json.dump(vocab_payload, fopen)