In [1]:
# !pip3 install git+https://github.com/huseinzol05/malaya.git@4.6.1 --no-deps

In [2]:
import os
# Set the credentials-file path that Google Cloud client libraries read from
# the environment (relative to the notebook's working directory).
# NOTE(review): no visible cell uses Google Cloud — confirm this is still needed.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-tpu.json'

In [3]:
import malaya
from malaya.preprocessing import Tokenizer
from malaya.text.function import case_of
from malaya.augmentation import (
    replace_similar_consonants, 
    replace_similar_vowels, 
    socialmedia_form,
    vowel_alternate)
from malaya.text import rules
from collections import defaultdict
import random
import re
import tensorflow as tf
from malaya.text.tatabahasa import alphabet, consonants, vowels
from malaya.text.function import augmentation_textcleaning, simple_textcleaning

def cleaning_row(string):
    """
    Flatten a row of text onto a single line.

    Newlines and tabs are turned into spaces, runs of spaces are collapsed to
    one, and surrounding whitespace is stripped.

    Parameters
    ----------
    string: str

    Returns
    -------
    result: str
    """
    flattened = string.translate({ord('\n'): ' ', ord('\t'): ' '})
    collapsed = re.sub(r'[ ]+', ' ', flattened)
    return collapsed.strip()

In [4]:
# Invert the normalizer rules: for every single-token value, collect the keys
# that map to it, so a word can later be swapped for one of those keys.
replace_normalizer = defaultdict(list)
for variant, normalized in rules.rules_normalizer.items():
    if ' ' not in normalized:
        replace_normalizer[normalized].append(variant)

In [5]:
def socialmedia_form(word: str):
    """
    Augment a word into social-media style spellings.

    The word is first passed through ``simple_textcleaning``; a fixed set of
    prefix/suffix rewrite rules is then tried and every rule that matches
    contributes one candidate spelling.

    Note: this definition rebinds the ``socialmedia_form`` name that was
    imported from ``malaya.augmentation`` earlier in the notebook.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: List[str]
        De-duplicated candidates in arbitrary order (they pass through a
        ``set``); empty when no rule matches.

    Raises
    ------
    ValueError
        If the word is empty after cleaning.
    """

    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    results = []

    if len(word) > 1:

        # consonant + final 'a' -> final 'e' (e.g. 'juga' -> 'juge')
        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')

        # leading 'f' with final 'r' -> leading 'p'
        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])

        # consonant + final vowel -> append 'k' (e.g. 'juga' -> 'jugak')
        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')

        # vowel + final 'h' -> drop the 'h'
        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        # consonant + final 'ar' -> final 'o'
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')

        # leading 'h' + vowel + consonant -> drop the 'h'
        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])

        # consonant + final 'ng' -> final 'g'
        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')

        # 'ng' right after the first letter -> drop the 'n'.
        # The original sliced an undefined name `x` here, which raised
        # NameError for every word matching this rule.
        if word[1:3] == 'ng':
            results.append(word[:1] + word[2:])

    return list(set(results))

socialmedia_form('juga')

['juge', 'jugak']

In [6]:
def random_slide(string, min_n = 2):
    """
    Return a random contiguous window of whitespace-separated tokens.

    Parameters
    ----------
    string: str
    min_n: int, optional (default=2)
        Minimum number of tokens in the window.

    Returns
    -------
    result: str
        Between ``min_n`` and all tokens of ``string``, joined by single
        spaces. When ``string`` has fewer than ``min_n`` tokens, all of its
        tokens are returned (an empty string for blank input).
    """
    splitted = string.split()
    # random.randint(a, b) raises ValueError when a > b, which used to happen
    # for inputs shorter than min_n tokens; return those inputs whole instead.
    if len(splitted) < min_n:
        return ' '.join(splitted)
    n = random.randint(min_n, len(splitted))
    i = random.randint(0, len(splitted) - n)
    return ' '.join(splitted[i: i + n])

random_slide('Husein makan ayam di kampung Jawa juga')

'makan ayam'

In [7]:
'word'.split('-')

['word']

In [8]:
# Bound `tokenize` method of a Malaya Tokenizer created with the `duration`
# and `date` options switched off; shared by `augment` and `loop`.
tokenizer = Tokenizer(duration = False, date = False).tokenize

def augment(string):
    """
    Inject synthetic spelling noise into a sentence, one token at a time.

    The sentence is tokenized with the notebook-level ``tokenizer``.

    * Title-case and upper-case tokens are left alone unless
      ``replace_normalizer`` has variants for them, in which case one variant
      is substituted 70% of the time.
    * Every other token gets a ``socialmedia_form`` spelling (or, when none
      exists, a ``replace_normalizer`` variant 70% of the time), followed by
      ``vowel_alternate``, ``replace_similar_consonants`` and
      ``replace_similar_vowels``. For hyphenated tokens only the part before
      the first hyphen is altered; the remainder is re-attached verbatim.

    Any token whose processing raises is emitted unchanged.

    Parameters
    ----------
    string: str

    Returns
    -------
    result: str
        The (possibly altered) tokens joined by single spaces. Output is
        random.
    """

    r = []
    for word in tokenizer(string):
        original_word = word
        word_lower = word.lower()
        try:
            if word.istitle() or word.isupper():
                # Use .get(): replace_normalizer is a defaultdict, so indexing
                # it with an unknown word would insert an empty list and make
                # later membership checks on that word succeed.
                variants = replace_normalizer.get(word_lower)
                if variants and random.random() >= 0.3:
                    word = case_of(word)(random.choice(variants))
            else:
                splitted = word_lower.split('-')
                head = splitted[0]
                hyphenated = len(splitted) > 1
                after = '-'.join(splitted[1:])
                if hyphenated:
                    word = head

                # Only the head is augmented; the tail is re-attached below.
                # socialmedia_form raises ValueError for tokens with no
                # letters left after cleaning, which the except below turns
                # into "keep the token as is".
                s = socialmedia_form(head)
                if len(s):
                    word = case_of(word)(random.choice(s))
                else:
                    variants = replace_normalizer.get(head)
                    if variants and random.random() >= 0.3:
                        word = case_of(word)(random.choice(variants))

                # The numeric second arguments are passed straight to Malaya;
                # NOTE(review): their exact meaning is not visible here.
                word = case_of(word)(vowel_alternate(word, 0.7))
                word = case_of(word)(replace_similar_consonants(word, 0.95))
                word = case_of(word)(replace_similar_vowels(word, 0.8))

                # Re-attach on "had a hyphen", not "tail is non-empty", so a
                # trailing hyphen is not silently dropped.
                if hyphenated:
                    word = f'{word}-{after}'

        except Exception:
            # Best effort: any failure leaves this token untouched.
            word = original_word

        r.append(word)
    return ' '.join(r)

augment('abad ke-14-14-14-14')

'abaf kel-14-14-14-14'

In [9]:
# Demo: split a one-sentence sample into sentences and augment the first one.
string = """
Husein makan ayam di kampung Jawa juga
"""
splitted = malaya.text.function.split_into_sentences(string)
augment(splitted[0])

'Husein makn ayam dik kf Jawa jugak .'

In [10]:
# Demo: same sample without the trailing word; `string` and `splitted` are
# rebound here, so later cells see this sentence.
string = """
Husein makan ayam di kampung Jawa
"""
splitted = malaya.text.function.split_into_sentences(string)
augment(splitted[0])

'Husein makan aym dik kf Jawa .'

In [11]:
splitted[0]

'Husein makan ayam di kampung Jawa.'

In [12]:
# Corpus file(s) to sample from; only files[0] is read by the next cell.
# NOTE(review): hard-coded absolute path, apparently from the author's
# machine — adjust before re-running elsewhere.
files = ['/home/husein/pure-text/dumping-parliament.txt',]

In [18]:
# Read the corpus, keep lines of at least two characters, then draw a random
# sample of 10000 of them.
with open(files[0]) as fopen:
    lines = fopen.read().split('\n')

data = random.sample([line for line in lines if len(line) >= 2], 10000)
len(data)

10000

In [14]:
# Language detector from malaya.language_detection; `loop` uses its
# predict() to skip lines labelled 'other'.
fast_text = malaya.language_detection.fasttext()




In [15]:
fast_text.predict(['តើប្រព័ន្ធប្រតិបត្តិការណាដែលត្រូវគ្នាជាមួយកម្មវិធីធនាគារអេប៊ីអេ។'])

['other']

In [19]:
from tqdm import tqdm

def loop(strings):
    """
    Turn raw lines into (augmented, target) pairs.

    For each line:

    1. skip it when ``fast_text`` labels it ``'other'``;
    2. with probability 0.4 replace it by a random window from
       ``random_slide`` (falling back to the full line if the window is empty);
    3. with probability 0.8 emit ``(augment(s), t)``, otherwise the identity
       pair ``(t, t)``, where ``t`` is the tokenized text joined by single
       spaces.

    Lines whose processing raises are dropped (best effort).

    Parameters
    ----------
    strings: List[str]

    Returns
    -------
    result: List[Tuple[str, str]]
    """
    results = []
    for text in tqdm(strings):
        try:
            if fast_text.predict([text])[0] == 'other':
                continue
            if random.random() > 0.6:
                s = random_slide(text)
                if not len(s):
                    s = text
            else:
                s = text
            t = ' '.join(tokenizer(s))
            if random.random() >= 0.2:
                row = augment(s)
                results.append((row, t))
            else:
                results.append((t, t))
        except Exception:
            # Deliberately best effort: skip rows that fail. Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit still stop a long run.
            continue
    return results

In [20]:
socialmedia_form('serba')

['serbak', 'serbe']

In [21]:
loop(data[:10])

100%|██████████| 10/10 [00:00<00:00, 393.36it/s]


[('KO CHUNG SEN [ KAMPAR ] m ontak MENTERI KEWANGAN ( a ) imbuhan anual dn bunus yang d ibayo kpafak Leader Pegawai Eksekutif 1 MDB sjk ia ditibihkan padak thn 2009 ; dn ( b ) imbihn anual Pengerusi 1 MDB sjk taon 2009 , dan imbuhan anual stp Ahli Lembaga Pengarah 1 MDB sejk 2009 .',
  'KO CHUNG SEN [ KAMPAR ] m inta MENTERI KEWANGAN ( a ) imbuhan tahunan dan bonus yang d ibayar kepada Ketua Pegawai Eksekutif 1 MDB sejak ia ditubuhkan pada tahun 2009 ; dan ( b ) imbuhan tahunan Pengerusi 1 MDB sejak tahun 2009 , dan imbuhan tahunan setiap Ahli Lembaga Pengarah 1 MDB sejak 2009 .'),
 ('Justeru , pade mase akn mao , pasarn mudal , khususnye psaran sukuk Msia dojngkak akn tris menjadik smbt pnting dalam penjanaan damak bgk negatak .',
  'Justeru , pada masa akan datang , pasaran modal , khususnya pasaran sukuk Malaysia dijangka akan terus menjadi sumber penting dalam penjanaan dana bagi negara .'),
 ('SO ALAN PEMBERITAHUAN PERTANYAAN DEWAN RAKYAT , PARLIMEN MALAYSIA .',
  'SO ALAN PEMBERI

In [22]:
import cleaning

# Run `loop` over `data` through the project-local `cleaning` helper (module
# not shown in this notebook).
# NOTE(review): the recorded output shows 16 progress bars of 625 rows each —
# presumably one chunk per worker; confirm in `cleaning.multiprocessing`.
results1 = cleaning.multiprocessing(data, loop)

100%|██████████| 625/625 [00:05<00:00, 113.43it/s]
100%|██████████| 625/625 [00:06<00:00, 93.38it/s] 
100%|██████████| 625/625 [00:08<00:00, 77.22it/s]]
100%|██████████| 625/625 [00:08<00:00, 74.39it/s] 
100%|██████████| 625/625 [00:08<00:00, 70.24it/s]
100%|██████████| 625/625 [00:08<00:00, 70.55it/s]
100%|██████████| 625/625 [00:09<00:00, 66.76it/s]
100%|██████████| 625/625 [00:09<00:00, 64.75it/s]
100%|██████████| 625/625 [00:09<00:00, 64.74it/s]
100%|██████████| 625/625 [00:09<00:00, 64.56it/s]
100%|██████████| 625/625 [00:09<00:00, 66.46it/s]]
100%|██████████| 625/625 [00:09<00:00, 65.29it/s]
100%|██████████| 625/625 [00:09<00:00, 62.59it/s]]
100%|██████████| 625/625 [00:09<00:00, 64.43it/s]]
100%|██████████| 625/625 [00:10<00:00, 61.89it/s] 
100%|██████████| 625/625 [00:10<00:00, 61.43it/s] 


In [23]:
# Fraction of pairs whose first element (augmented text) differs from the
# second (target text).
not_same = 0
for augmented, target in tqdm(results1):
    not_same += int(augmented != target)

not_same / len(results1)

100%|██████████| 9742/9742 [00:00<00:00, 381197.20it/s]


0.707452268528023

In [26]:
import json

# Persist the pairs in `results1` as a single JSON document in the working
# directory; this file is uploaded further down.
with open('testset-spelling-augmentation.json', 'w') as fopen:
    json.dump(results1, fopen)

In [29]:
import os

# Read the B2 credentials from the environment instead of hard-coding them;
# raises KeyError if either variable is unset.
b2_application_key_id = os.environ['b2_application_key_id']
b2_application_key = os.environ['b2_application_key']

In [30]:
# NOTE(review): the star import supplies InMemoryAccountInfo and B2Api used
# below; prefer importing those names explicitly.
from b2sdk.v1 import *
info = InMemoryAccountInfo()
b2_api = B2Api(info)
application_key_id = b2_application_key_id
application_key = b2_application_key
# Authorize against the "production" realm with the keys read from the environment.
b2_api.authorize_account("production", application_key_id, application_key)
# Passed as `file_infos` on every upload below.
file_info = {'how': 'good-file'}
b2_bucket = b2_api.get_bucket_by_name('malay-dataset')

In [31]:
# Upload the JSON test set written earlier in this notebook to the bucket
# under the spelling/ prefix.
b2_bucket.upload_local_file(
    local_file='testset-spelling-augmentation.json',
    file_name='spelling/testset-spelling-augmentation.json',
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7f85857decc0>

In [32]:
# Upload spelling-correction-news.tsv under the spelling/ prefix.
# NOTE(review): no visible cell in this notebook creates this file — it must
# already exist in the working directory.
b2_bucket.upload_local_file(
    local_file='spelling-correction-news.tsv',
    file_name='spelling/spelling-correction-news.tsv',
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7f8584b2c860>

In [33]:
# Upload spelling-correction-wiki.tsv under the spelling/ prefix.
# NOTE(review): no visible cell in this notebook creates this file — it must
# already exist in the working directory.
b2_bucket.upload_local_file(
    local_file='spelling-correction-wiki.tsv',
    file_name='spelling/spelling-correction-wiki.tsv',
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7f8584cba588>