In [1]:
import malaya

In [2]:
# Load the raw dump as a list of newline-separated entries.
# The encoding is pinned so the read does not depend on the platform's locale
# default. NOTE(review): assumes the dump is UTF-8 encoded - confirm.
with open('dumping-wiki.txt', encoding='utf-8') as fopen:
    data = fopen.read().split('\n')

In [3]:
# Number of newline-separated entries read from the dump.
len(data)

2621311

In [4]:
import re
from unidecode import unidecode

# Regex fragments consumed by split_into_sentences. Each fragment carries its
# own capture group so it can be concatenated with others and referenced as
# \1, \2, ... in the replacement.

# A single ASCII letter.
alphabets = '([A-Za-z])'
# Titles / abbreviations immediately followed by a period (e.g. "Dr.").
prefixes = (
    '(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt|Puan|puan|Tuan|tuan|sir|Sir)[.]'
)
# Abbreviations that the splitter matches when followed by a period.
suffixes = '(Inc|Ltd|Jr|Sr|Co|Mo)'
# Words that, when they follow an acronym or a suffix, mark a sentence start.
# Raw string: '\s' is a regex escape, not a Python string escape.
starters = r'(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever|Dia|Mereka|Tetapi|Kita|Itu|Ini|Dan|Kami|Beliau|Seri|Datuk|Dato|Datin|Tuan|Puan)'
# Two or three capital letters, each followed by a period (e.g. "U.S.A.").
acronyms = '([A-Z][.][A-Z][.](?:[A-Z][.])?)'
# A period followed by one of the listed domain endings.
websites = '[.](com|net|org|io|gov|me|edu|my)'
# 'www', 'http' or 'https' followed by a period.
another_websites = '(www|http|https)[.]'
# A single decimal digit.
digits = '([0-9])'
# Tokens after which ". <digit>" is not treated as a sentence boundary.
before_digits = '([Nn]o|[Nn]ombor|[Nn]umber|[Kk]e|=|al)'
# Month names / abbreviations; a period between one of these and a digit is
# not treated as a sentence boundary.
month = '([Jj]an(?:uari)?|[Ff]eb(?:ruari)?|[Mm]a(?:c)?|[Aa]pr(?:il)?|Mei|[Jj]u(?:n)?|[Jj]ula(?:i)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]kt(?:ober)?|[Nn]ov(?:ember)?|[Dd]is(?:ember)?)'


def split_into_sentences(text, minimum_length = 5):
    """
    Split ``text`` into sentences using an ordered chain of regex rewrites.

    Periods that should not end a sentence (after titles, inside domain
    names, acronyms, decimals, ...) are temporarily rewritten to ``<prd>``.
    Every remaining ``.``, ``?`` and ``!`` is then tagged with ``<stop>``,
    ``<prd>`` is turned back into ``.``, and the text is cut at ``<stop>``.
    The order of the rewrites matters; do not reorder them.

    :param text: input text; each non-empty line is treated as ending a
        sentence (lines are joined with ``'. '`` and a final ``'.'`` is added).
    :param minimum_length: chunks whose length, measured before stripping,
        is not greater than this value are discarded.
    :return: list of stripped sentence strings.
    """
    # '\x97' is handled as a line break; non-empty lines are then glued
    # together with '. ' so each line terminates a sentence.
    text = text.replace('\x97', '\n')
    text = '. '.join([s for s in text.split('\n') if len(s)])
    text = text + '.'
    text = unidecode(text)
    # Padding lets the space-anchored patterns match at either end.
    text = ' ' + text + '  '
    text = text.replace('\n', ' ')

    # --- protect periods that are not sentence boundaries ---
    text = re.sub(prefixes, r'\1<prd>', text)
    text = re.sub(websites, r'<prd>\1', text)
    text = re.sub(another_websites, r'\1<prd>', text)
    text = re.sub('[,][.]+', '<prd>', text)
    # str.replace is a no-op when the needle is absent, so no membership
    # pre-checks are needed here or for the quote handling further down.
    text = text.replace('...', '<prd><prd><prd>')
    text = text.replace('Ph.D.', 'Ph<prd>D<prd>')
    text = re.sub(r'[.]\s*[,]', '<prd>,', text)
    text = re.sub(before_digits + r'\s*[.]\s*' + digits, r'\1<prd>\2', text)
    text = re.sub(month + r'[.]\s*' + digits, r'\1<prd>\2', text)
    text = re.sub(r'\s' + alphabets + '[.][ ]+', r' \1<prd> ', text)
    # An acronym followed by a sentence starter does end the sentence.
    text = re.sub(acronyms + ' ' + starters, r'\1<stop> \2', text)
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]' + alphabets + '[.]',
        r'\1<prd>\2<prd>\3<prd>',
        text,
    )
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]', r'\1<prd>\2<prd>', text
    )
    # Same idea for suffixes: boundary only when a starter follows.
    text = re.sub(' ' + suffixes + '[.][ ]+' + starters, r' \1<stop> \2', text)
    text = re.sub(' ' + suffixes + '[.]', r' \1<prd>', text)
    text = re.sub(' ' + alphabets + '[.]', r' \1<prd>', text)
    text = re.sub(digits + '[.]' + digits, r'\1<prd>\2', text)

    # --- move terminal punctuation outside closing quotes ---
    # NOTE(review): the curly-quote rule presumably never fires because
    # unidecode has already transliterated the text - confirm.
    text = text.replace('.”', '”.')
    text = text.replace('."', '".')
    text = text.replace('!"', '"!')
    text = text.replace('?"', '"?')

    # --- mark real boundaries, restore protected periods, and cut ---
    text = text.replace('.', '.<stop>')
    text = text.replace('?', '?<stop>')
    text = text.replace('!', '!<stop>')
    text = text.replace('<prd>', '.')
    sentences = text.split('<stop>')
    # The chunk after the final <stop> is dropped.
    sentences = sentences[:-1]
    return [s.strip() for s in sentences if len(s) > minimum_length]

# Quick check of the splitter: "al . 1975" comes back as "al.1975" because
# 'al' is one of the before_digits tokens, so that period is not a boundary.
split_into_sentences('Pembolehubah yang ketiga adalah niat yang merujuk kepada niat seseorang dalam melakukan pelbagai tingkah laku ( Fishbein et al . 1975 : 12 ), ')

['Pembolehubah yang ketiga adalah niat yang merujuk kepada niat seseorang dalam melakukan pelbagai tingkah laku ( Fishbein et al.1975 : 12 ), .']

In [5]:
import malaya

# Language-detection model from malaya. filter_string calls
# fast_text.predict([...]) and discards text whose label is 'other'.
fast_text = malaya.language_detection.fasttext()




In [6]:
# Lower-case vowel letters used by isword_malay and isword_english.
VOWELS = 'aeiou'
# Letter clusters that isword_english tolerates inside a run of three consonants.
PHONES = ['sh', 'ch', 'ph', 'sz', 'cz', 'sch', 'rz', 'dz']
# NOTE(review): not referenced by any code visible in this notebook.
punctuations = '!@#$%^&*()_+=-'

def isword_malay(word):
    """
    Heuristic check that ``word`` is worth keeping as a token.

    A token made up solely of digits, punctuation and spaces is accepted
    as-is. Any other token is accepted only when it contains at least one
    character from ``VOWELS``. The vowel test is case-sensitive, so callers
    are expected to lower-case the token first (filter_string does).

    :param word: token to test.
    :return: True when the token is accepted, False otherwise.
    """
    # Removing everything that is NOT a digit/punctuation character leaves
    # the token unchanged only when it had no other characters at all.
    # Raw literal so the regex escapes are not read as string escapes.
    if re.sub(r'''[^0-9!@#$%\^&*()-=_\+{}\[\];':",./<>?\|~`\\ ]+''', '', word) == word:
        return True
    return any(c in VOWELS for c in word)


def isword_english(word):
    """
    Heuristic check that ``word`` has a plausible vowel/consonant pattern.

    The word is rejected when it contains three or more consecutive vowels,
    or three consecutive consonants that do not include one of the clusters
    listed in ``PHONES``. A 'y' that follows a non-vowel counts as a vowel.
    Falsy input (e.g. the empty string) is accepted.

    :param word: word to test; case is ignored.
    :return: True when the pattern is accepted, False otherwise.
    """
    if not word:
        return True
    # Lower-case once so the loop, the look-behind and the cluster slice all
    # see the same characters and the same indices.
    word = word.lower()
    vowel_run = 0
    consonant_run = 0
    for idx, letter in enumerate(word):
        vowel = letter in VOWELS
        if idx:
            prev_vowel = word[idx - 1] in VOWELS
            # 'y' acts as a vowel when it follows a non-vowel.
            if not vowel and letter == 'y' and not prev_vowel:
                vowel = True
            # Switching between vowel and consonant starts a new run.
            if prev_vowel != vowel:
                vowel_run = 0
                consonant_run = 0
        if vowel:
            vowel_run += 1
        else:
            consonant_run += 1
        if vowel_run >= 3 or consonant_run > 3:
            return False
        if consonant_run == 3:
            cluster = word[idx - 2 : idx + 1]
            if any(phone in cluster for phone in PHONES):
                # A known cluster is forgiven: count it as two consonants.
                consonant_run -= 1
                continue
            return False
    return True

In [12]:
def filter_string(string, min_len = 10):
    """
    Clean one line of text and drop it when it is not in an accepted language.

    Steps: reject inputs shorter than ``min_len``; replace ``<br>`` and
    ``</br>`` with spaces; remove tokens containing ``#`` or ``@``; remove
    ``http...`` and ``www....`` tokens; keep only tokens accepted by
    ``isword_malay``; finally return ``''`` when the language detector
    labels the result ``'other'``.

    :param string: raw input line.
    :param min_len: minimum length of the raw input; shorter input yields ``''``.
    :return: the cleaned text, or ``''`` when the line is rejected.
    """
    if len(string) < min_len:
        return ''
    text = string.replace('<br>', ' ').replace('</br>', ' ')
    # Drop hashtag / mention style tokens.
    tokens = [tok for tok in text.split() if '#' not in tok and '@' not in tok]
    # Raw pattern with an escaped dot: 'www' must be followed by a literal
    # period, so ordinary tokens that merely contain 'www' are left alone.
    text = re.sub(r'http\S+|www\.\S+', '', ' '.join(tokens))
    text = ' '.join(tok for tok in text.split() if isword_malay(tok.lower()))
    # Results of two characters or fewer are returned without a detector call.
    if len(text) > 2 and fast_text.predict([text])[0] == 'other':
        return ''
    return text

In [13]:
def loop(strings):
    """
    Run ``filter_string`` over a batch of ``(index, text)`` items.

    :param strings: iterable of indexable items; element 0 is carried through
        unchanged and element 1 is passed to ``filter_string``.
    :return: list of ``(index, filtered_text)`` tuples in input order.
    """
    # tqdm only wraps the iterable to show progress.
    return [(item[0], filter_string(item[1])) for item in tqdm(strings)]

In [14]:
import cleaning
from tqdm import tqdm

# Tag every line with its position so the original order can be restored
# after processing. cleaning.multiprocessing is a project helper; presumably
# it splits the work across processes and gathers what loop returns - confirm.
temp = list(enumerate(data))
results = cleaning.multiprocessing(temp, loop)

100%|██████████| 163831/163831 [01:00<00:00, 2693.98it/s]
 91%|█████████▏| 149713/163831 [01:01<00:03, 4160.79it/s]
100%|██████████| 15/15 [00:00<00:00, 3543.68it/s].38it/s]
100%|██████████| 163831/163831 [01:01<00:00, 2649.27it/s]
 99%|█████████▊| 161749/163831 [01:04<00:00, 2692.14it/s]
100%|██████████| 163831/163831 [01:04<00:00, 2535.55it/s]
100%|██████████| 163831/163831 [01:05<00:00, 2519.80it/s]
100%|██████████| 163831/163831 [01:06<00:00, 2452.54it/s]
 99%|█████████▉| 162377/163831 [01:15<00:00, 2708.99it/s]
100%|██████████| 163831/163831 [01:16<00:00, 2143.32it/s]
100%|██████████| 163831/163831 [01:16<00:00, 2128.94it/s]
100%|██████████| 163831/163831 [01:18<00:00, 2075.36it/s]
100%|██████████| 163831/163831 [01:18<00:00, 2081.95it/s]
100%|██████████| 163831/163831 [01:19<00:00, 2049.07it/s]
100%|██████████| 163831/163831 [01:19<00:00, 2048.67it/s]
100%|██████████| 163831/163831 [01:20<00:00, 2041.01it/s]
100%|██████████| 163831/163831 [01:21<00:00, 2021.46it/s]


In [15]:
%%time

# Put the (index, text) pairs back into input order and keep only the text.
# NOTE: this rebinds `results`, so the cell must not be run twice in a row.
results = [pair[1] for pair in sorted(results, key=lambda pair: pair[0])]

CPU times: user 655 ms, sys: 10.2 ms, total: 666 ms
Wall time: 664 ms


In [16]:
# Preview the first 1000 filtered entries; '' marks a line that filter_string
# rejected or that was empty after cleaning.
results[:1000]

['',
 'Dirk Jan Klaas "Klaas-Jan" Huntelaar (lahir 12 Ogos 1983) merupakan pemain bola sepak Belanda yang bermain di posisi penyerang.',
 'Beliau kini bermain untuk kelab Ajax.',
 '',
 'Hypo-Arena.',
 'Hypo-Arena (dahulu dikenali sebagai ) ialah sebuah stadium serba guna di Klagenfurt, Austria.',
 'Ia merupakan stadium pasukan Austria Karnten.',
 'Stadium lama dikenali sebagai Wortherseestadion, dibina pada 1960 dan mempunyai kapasiti sebanyak 10,900.',
 'Ia dirobohkan pada 2005 dan digantikan dengan Hypo-Arena yang baru, juga dikenali sehingga 30 Jun 2007 dengan nama "Wortherseestadion".',
 'Ia adalah salah satu daripada 8 stadium untuk UEFA Euro 2008, dan dibina untuk menampung 32,000 penonton.',
 'Selepas acara tersebut, kapasiti stadium ini sedang dipertimbangkan untuk dikurangkan kepada 12,500.',
 'Stadium ini dibuka secara rasmi pada 7 September 2007 dengan menjadi tuan rumah untuk perlawanan persahabatan di antara Austria dan Jepun di hadapan 26,500 penonton.',
 '',
 '',
 'Marta

In [17]:
# Persist the filtered entries, one per line. The encoding is explicit so the
# output does not depend on the platform's locale default.
with open('filtered-dumping-wiki.txt', 'w', encoding='utf-8') as fopen:
    fopen.write('\n'.join(results))