In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to an empty string before malaya is imported below,
# so that no CUDA device is exposed to this process and everything runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
# !wget https://huggingface.co/datasets/mesolitica/ms-en/resolve/main/ms-en-left.train
# !wget https://huggingface.co/datasets/mesolitica/ms-en/resolve/main/ms-en-right.train

In [3]:
# Load the two sides of the parallel corpus, one sentence per line. The files
# are paired line-by-line further down (they are zipped together). The encoding
# is spelled out so decoding does not depend on the platform's default locale.
with open('ms-en-left.train', encoding='utf-8') as fopen:
    left = fopen.read().split('\n')

with open('ms-en-right.train', encoding='utf-8') as fopen:
    right = fopen.read().split('\n')

In [4]:
import malaya

 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons
TensorFlow Addons has compiled its custom ops against TensorFlow 2.4.0, and there are no compatibility guarantees between the two versions. 
This means that you might get segfaults when loading the custom op, or other kind of low-level errors.
 If you do, do not file an issue on Github. This is a known limitation.

It might help you to fallback to pure Python ops with TF_ADDONS_PY_OPS . To do that, see https://github.com/tensorflow/addons#gpucpu-custom-ops 

You can also change the TensorFlow version installed on your system. You would need a TensorFlow 

In [5]:
from malaya.text.rules import rules_normalizer, rules_compound_normalizer
from malaya.text.normalization import _is_number_regex
from malaya.text.function import replace_punct
from collections import defaultdict
import re
import random
import json

In [6]:
# Characters treated as punctuation by strip_punct: the ASCII punctuation set
# without the hyphen '-'. The backslash is written as an escaped '\\' because
# a bare backslash before ']' is an invalid escape sequence, which newer Python
# versions warn about; the resulting string value is unchanged.
PUNCTUATION = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'

def case_of(text):
    """Pick the ``str`` callable that reproduces the casing style of ``text``.

    Returns ``str.upper`` for all-uppercase text, ``str.lower`` for
    all-lowercase text, ``str.title`` for title-cased text, and ``str`` itself
    (which leaves a string unchanged) when none of those apply.
    """
    if text.isupper():
        return str.upper
    if text.islower():
        return str.lower
    if text.istitle():
        return str.title
    return str

def strip_punct(word):
    """Split ``word`` into leading punctuation, trailing punctuation and core.

    Args:
        word: the token to split.

    Returns:
        A ``(left, right, core)`` tuple of strings, where ``left`` is the run
        of ``PUNCTUATION`` characters at the start of ``word``, ``right`` is
        the run at the end, and ``core`` is what remains in between.
        ``left + core + right`` always equals ``word``.
    """
    size = len(word)

    # Consume punctuation from the front.
    start = 0
    while start < size and word[start] in PUNCTUATION:
        start += 1

    # Consume punctuation from the back, but never past `start`: for a token
    # made only of punctuation (e.g. '...') every character already belongs to
    # the leading run and must not be reported a second time as trailing.
    end = size
    while end > start and word[end - 1] in PUNCTUATION:
        end -= 1

    return word[:start], word[end:], word[start:end]

In [32]:
def replace_words_punct(left_word, right_word):
    """Swap the core of ``left_word`` for the core of ``right_word``.

    Both words are split with ``strip_punct``. The result keeps the leading
    and trailing punctuation of ``left_word`` and drops whatever punctuation
    surrounded ``right_word``.
    """
    prefix, suffix, _ = strip_punct(left_word)
    replacement = strip_punct(right_word)[2]
    return prefix + replacement + suffix

def random_replace_alignment(left, right, alignment, min_replace = 2, max_replace = 10):
    """Swap a random subset of ``left``'s tokens for their aligned ``right`` tokens.

    Args:
        left: sentence whose whitespace-separated tokens may be replaced.
        right: sentence whose whitespace-separated tokens supply replacements.
        alignment: iterable of pairs; the first item indexes a token of
            ``left`` and the second item indexes a token of ``right``.
        min_replace: smallest number of tokens to replace.
        max_replace: largest number of tokens to replace.

    Returns:
        ``(sentence, selected)``, where ``sentence`` is the space-joined result
        and ``selected`` is the list of ``(l, r)`` pairs that were applied.
        When fewer than ``min_replace`` pairs are eligible, or the replacement
        step fails, the untouched sentence and an empty list are returned.
    """
    splitted_left = left.split()
    splitted_right = right.split()
    ignored_right = ['the', 'a', 'an', 'it', 'is', 'are']

    selected_alignment = []
    for s in alignment:
        l = s[0]
        r = s[1]
        try:
            left_token = splitted_left[l]
            right_token = splitted_right[r]
            # Numbers (ignoring ',' and '.' separators) are never swapped.
            if _is_number_regex(left_token.replace(',', '').replace('.', '')) or _is_number_regex(right_token.replace(',', '').replace('.', '')):
                continue
            # All-uppercase tokens are left alone.
            if left_token.isupper() or right_token.isupper():
                continue
            # Identical words would make the swap a no-op.
            if left_token.lower() == right_token.lower():
                continue
            if right_token.lower() in ignored_right:
                continue
            if l == r:
                continue
            # A replacement that collapses to an empty token (e.g. the right
            # token is pure punctuation) would leave a double space in the
            # joined sentence and break the one-label-per-token invariant.
            if not replace_words_punct(left_token, right_token):
                continue
            selected_alignment.append((l, r))
        except Exception:
            # Best effort: pairs that point outside either sentence, or that
            # cannot be evaluated, are skipped. Exception (not a bare except)
            # keeps KeyboardInterrupt / SystemExit propagating.
            continue

    try:
        # randint raises ValueError when fewer than `min_replace` candidates
        # exist; that is the normal "nothing to replace" path.
        count_replace = random.randint(min_replace, min(max_replace, len(selected_alignment)))
        selected = random.sample(selected_alignment, count_replace)
        # Work on a copy so that a failure half-way through cannot leak a
        # partially rewritten sentence paired with an empty `selected` list.
        replaced = list(splitted_left)
        for s in selected:
            replaced[s[0]] = replace_words_punct(replaced[s[0]], splitted_right[s[1]])

        return ' '.join(replaced), selected

    except Exception:
        return ' '.join(splitted_left), []

In [8]:
# Build malaya's ms-en eflomal word aligner; `replace_punct` is handed over as
# its preprocessing function.
eflomal = malaya.alignment.ms_en.eflomal(preprocessing_func=replace_punct)

In [9]:
# Load malaya's fastText language-detection model.
fast_text = malaya.language_detection.fasttext()



In [10]:
# Wrap the fastText model in malaya's substring-rules detector, which is used
# below to assign one label per token (e.g. 'MS', 'EN', 'CAPITAL', 'NOT_LANG').
model = malaya.language_detection.substring_rules(model = fast_text)

In [52]:
# Sastrawi stemmer from malaya; tokens are stemmed with it before the second
# "is this Malay?" check further down.
sastrawi = malaya.stem.sastrawi()

In [33]:
# Quick sanity check of the token-level detector on a single word.
model.predict(['lifestyle'])

['EN']

In [12]:
# Tokenizer used to split each sentence into tokens before alignment.
tokenizer = malaya.tokenizer.Tokenizer()

In [59]:
# Tokenise the first sentence pair and re-join the tokens with single spaces,
# so that a plain `.split()` later recovers exactly these tokens.
l = ' '.join(tokenizer.tokenize(left[0]))
r = ' '.join(tokenizer.tokenize(right[0]))

In [60]:
%%time

# Align the single pair and keep the 'forward' alignment of the first (only) input.
alignment = eflomal.align([l], [r])['forward'][0]

CPU times: user 12.8 ms, sys: 36.1 ms, total: 48.9 ms
Wall time: 174 ms


In [61]:
# Try the random replacement on the demo pair; the result is a
# (sentence, selected alignment pairs) tuple.
r_ = random_replace_alignment(l, r, alignment)
r_

('Terminal 1 KKIA equipped with 64 kaunter daftar masuk , 12 aero bridge selain mampu menampung 3,200 penumpang dalam satu masa .',
 [(3, 4), (4, 5)])

In [65]:
# Label every token of the original left-hand sentence with the token-level detector.
splitted = l.split()
predicted = model.predict(splitted)
for no, w in enumerate(splitted):
    if predicted[no] != 'MS':
        # Second chance for tokens not labelled 'MS': stem the token and relabel
        # it 'MS' if the stem passes is_malay or fastText predicts 'malay' / 'ind'.
        w_stem = sastrawi.stem(w)
        if malaya.text.function.is_malay(w_stem) or fast_text.predict([w_stem])[0] in ['malay', 'ind']:
            predicted[no] = 'MS'
            
# Positions whose token was swapped for a right-hand-side token are forced to 'EN'.
for i in r_[1]:
    predicted[i[0]] = 'EN'
    
predicted

['MS',
 'NOT_LANG',
 'CAPITAL',
 'EN',
 'EN',
 'NOT_LANG',
 'MS',
 'MS',
 'MS',
 'NOT_LANG',
 'NOT_LANG',
 'OTHERS',
 'EN',
 'MS',
 'MS',
 'MS',
 'NOT_LANG',
 'MS',
 'MS',
 'MS',
 'MS',
 'NOT_LANG']

In [66]:
# Pair each token of the rewritten sentence with its label for visual inspection.
list(zip(r_[0].split(), predicted))

[('Terminal', 'MS'),
 ('1', 'NOT_LANG'),
 ('KKIA', 'CAPITAL'),
 ('equipped', 'EN'),
 ('with', 'EN'),
 ('64', 'NOT_LANG'),
 ('kaunter', 'MS'),
 ('daftar', 'MS'),
 ('masuk', 'MS'),
 (',', 'NOT_LANG'),
 ('12', 'NOT_LANG'),
 ('aero', 'OTHERS'),
 ('bridge', 'EN'),
 ('selain', 'MS'),
 ('mampu', 'MS'),
 ('menampung', 'MS'),
 ('3,200', 'NOT_LANG'),
 ('penumpang', 'MS'),
 ('dalam', 'MS'),
 ('satu', 'MS'),
 ('masa', 'MS'),
 ('.', 'NOT_LANG')]

In [67]:
from tqdm import tqdm

def loop(rows):
    """Build code-switched sentences with one language label per token.

    Args:
        rows: a 2-tuple whose first item is a sequence of ``(left, right)``
            sentence pairs; the second item is ignored.

    Returns:
        A one-element list ``[[strings, labels]]`` where ``strings[k]`` is a
        generated sentence and ``labels[k]`` is the list of labels for its
        whitespace-separated tokens. Pairs that are too long, that raise, or
        whose token and label counts disagree are left out.
    """
    pairs, _ = rows
    strings, labels = [], []
    for idx in tqdm(range(len(pairs))):
        try:
            left_, right_ = pairs[idx][0], pairs[idx][1]
            tokenized_l = tokenizer.tokenize(left_)
            tokenized_r = tokenizer.tokenize(right_)

            # Skip pairs where either side has more than 60 tokens.
            if len(tokenized_l) > 60 or len(tokenized_r) > 60:
                continue

            l = ' '.join(tokenized_l)
            r = ' '.join(tokenized_r)

            alignment = eflomal.align([l], [r])['forward'][0]
            replaced_sentence, selected = random_replace_alignment(l, r, alignment)

            # Labels are computed on the ORIGINAL left-hand tokens.
            splitted = l.split()
            predicted = model.predict(splitted)
            for no, w in enumerate(splitted):
                if predicted[no] != 'MS':
                    # Second chance: stem the token and relabel it 'MS' if the
                    # stem passes is_malay or fastText predicts 'malay' / 'ind'.
                    w_stem = sastrawi.stem(w)
                    if malaya.text.function.is_malay(w_stem) or fast_text.predict([w_stem])[0] in ['malay', 'ind']:
                        predicted[no] = 'MS'

            # Swapped positions now hold right-hand-side tokens: force 'EN'.
            # (A distinct loop name avoids shadowing the outer row index.)
            for pair in selected:
                predicted[pair[0]] = 'EN'

            # Each label must line up with exactly one token of the emitted
            # sentence; drop the sample if a replacement produced an empty
            # token and broke that correspondence.
            if len(replaced_sentence.split()) != len(predicted):
                continue

            strings.append(replaced_sentence)
            labels.append(predicted)
        except Exception as e:
            print(e)

    return [[strings, labels]]

In [68]:
# Smoke-test `loop` on the first 10 pairs; the second tuple item (0) is a
# placeholder that `loop` discards.
r = loop((list(zip(left[:10], right[:10])),0))

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.35it/s]


In [77]:
import mp

In [None]:
# Run `loop` over the first 200,000 sentence pairs through the local `mp`
# helper with 4 cores.
# NOTE(review): the progress bar below counts 50,000 items, so each worker
# presumably receives a quarter of the pairs — confirm against `mp.multiprocessing`.
r = mp.multiprocessing(list(zip(left[:200000], right[:200000])), loop, cores = 4)

  4%|███▊                                                                                     | 2170/50000 [10:46<3:58:23,  3.34it/s]

In [None]:
# Flatten the per-chunk results into one list of sentences and one list of
# label lists. Each r[i] is read as a [strings, labels] pair, the shape that
# `loop` returns; the print shows how many sentences each chunk contributed.
strings, labels = [], []
for i in range(len(r)):
    print(i, len(r[i][0]))
    strings.extend(r[i][0])
    labels.extend(r[i][1])

In [None]:
# Spot-check the last generated sentence together with its labels.
strings[-1], labels[-1]

In [None]:
# Persist the generated sentences and their per-token labels as one JSON object
# with the keys 'strings' and 'labels'.
with open('ms-en-substrings.json', 'w') as fopen:
    json.dump({'strings': strings, 'labels': labels}, fopen)