In [1]:
import malaya

In [2]:
# Load the cleaned Common Crawl dump, one entry per line.
# The encoding is pinned so decoding of the non-ASCII text in the dump does
# not depend on the platform's default locale.
with open('dumping-cleaned-common-crawl.txt', encoding='utf-8') as fopen:
    data = fopen.read().split('\n')

In [3]:
# Number of newline-separated entries read from the dump.
len(data)

65082255

In [4]:
import re
from unidecode import unidecode

# Regex fragments consumed by split_into_sentences. Each fragment is wrapped in
# a capture group so the replacement strings can refer back to it.

# A single ASCII letter.
alphabets = '([A-Za-z])'
# Titles that are followed by a period which does not end a sentence.
prefixes = (
    '(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt|Puan|puan|Tuan|tuan|sir|Sir)[.]'
)
# Company / name suffixes that are commonly written with a trailing period.
suffixes = '(Inc|Ltd|Jr|Sr|Co|Mo)'
# Words that typically open a new sentence (English and Malay).
# Raw string: the pattern contains `\s`, which is an invalid escape sequence
# in a normal string literal.
starters = r'(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever|Dia|Mereka|Tetapi|Kita|Itu|Ini|Dan|Kami|Beliau|Seri|Datuk|Dato|Datin|Tuan|Puan)'
# Two- or three-letter dotted acronyms such as "U.S." or "U.S.A.".
acronyms = '([A-Z][.][A-Z][.](?:[A-Z][.])?)'
# A period followed by a common top-level domain.
websites = '[.](com|net|org|io|gov|me|edu|my)'
# URL-ish prefixes followed by a period.
another_websites = '(www|http|https)[.]'
# A single decimal digit.
digits = '([0-9])'
# Tokens after which "<period> <digit>" is treated as part of a reference
# rather than a sentence end (e.g. "no. 5", "et al. 1975").
before_digits = '([Nn]o|[Nn]ombor|[Nn]umber|[Kk]e|=|al)'
# Month names / abbreviations (Malay and English spellings).
# NOTE(review): '[Jj]ula(?:i)?' and '[Mm]a(?:c)?' look like they may have been
# meant as 'Jul(ai)' / 'Mac' — confirm before changing.
month = '([Jj]an(?:uari)?|[Ff]eb(?:ruari)?|[Mm]a(?:c)?|[Aa]pr(?:il)?|Mei|[Jj]u(?:n)?|[Jj]ula(?:i)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]kt(?:ober)?|[Nn]ov(?:ember)?|[Dd]is(?:ember)?)'


def split_into_sentences(text, minimum_length = 5):
    """
    Split a block of text into a list of sentences.

    The text is rewritten in several passes: every period that should NOT end
    a sentence is replaced with the placeholder `<prd>`, the remaining
    sentence terminators (period, question mark, exclamation mark) get a
    `<stop>` marker appended, placeholders are turned back into periods and
    the text is finally cut on `<stop>`. The order of the passes matters.

    Parameters
    ----------
    text : str
        Input text; each non-empty line is treated as ending a sentence.
    minimum_length : int, default 5
        A fragment is kept only if its length, measured before surrounding
        whitespace is stripped, is greater than this value.

    Returns
    -------
    list of str
        The sentences, each stripped of surrounding whitespace.
    """
    # Treat the \x97 control character as a line break, then make every
    # non-empty line end with a period so it becomes its own sentence.
    text = text.replace('\x97', '\n')
    text = '. '.join([s for s in text.split('\n') if len(s)])
    text = text + '.'
    # Transliterate to ASCII so the ASCII-only patterns below apply.
    text = unidecode(text)
    # Pad with spaces so patterns anchored on surrounding whitespace can match
    # at the very start / end of the text.
    text = ' ' + text + '  '
    text = text.replace('\n', ' ')
    # Protect periods after titles, before TLDs and after www/http(s).
    text = re.sub(prefixes, '\\1<prd>', text)
    text = re.sub(websites, '<prd>\\1', text)
    text = re.sub(another_websites, '\\1<prd>', text)
    # A comma directly followed by period(s) collapses to one protected period.
    text = re.sub('[,][.]+', '<prd>', text)
    if '...' in text:
        text = text.replace('...', '<prd><prd><prd>')
    if 'Ph.D' in text:
        text = text.replace('Ph.D.', 'Ph<prd>D<prd>')
    # A period followed by a comma is not a sentence end.
    text = re.sub(r'[.]\s*[,]', '<prd>,', text)
    # "no . 5", "al . 1975", "Jan. 3": protect the period; whitespace around
    # it is dropped by the replacement.
    text = re.sub(before_digits + r'\s*[.]\s*' + digits, '\\1<prd>\\2', text)
    text = re.sub(month + r'[.]\s*' + digits, '\\1<prd>\\2', text)
    # Single-letter initials such as " J. ".
    text = re.sub(r'\s' + alphabets + '[.][ ]+', ' \\1<prd> ', text)
    # An acronym followed by a sentence starter does end the sentence.
    text = re.sub(acronyms + ' ' + starters, '\\1<stop> \\2', text)
    # Dotted abbreviations: three letters first, then two, so that the longer
    # form is not partially consumed by the shorter pattern.
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]' + alphabets + '[.]',
        '\\1<prd>\\2<prd>\\3<prd>',
        text,
    )
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]', '\\1<prd>\\2<prd>', text
    )
    # "Ltd. Dia ..." ends a sentence; any other "Ltd." does not.
    text = re.sub(' ' + suffixes + '[.][ ]+' + starters, ' \\1<stop> \\2', text)
    text = re.sub(' ' + suffixes + '[.]', ' \\1<prd>', text)
    text = re.sub(' ' + alphabets + '[.]', ' \\1<prd>', text)
    # Decimal numbers.
    text = re.sub(digits + '[.]' + digits, '\\1<prd>\\2', text)
    # Move terminators outside closing quotes so the quote stays attached to
    # the sentence it closes.
    # NOTE(review): the curly-quote branch runs after unidecode, which has
    # presumably already transliterated that character — confirm.
    if '”' in text:
        text = text.replace('.”', '”.')
    if '"' in text:
        text = text.replace('."', '".')
    if '!' in text:
        text = text.replace('!"', '"!')
    if '?' in text:
        text = text.replace('?"', '"?')
    # Every remaining terminator is a real sentence boundary.
    text = text.replace('.', '.<stop>')
    text = text.replace('?', '?<stop>')
    text = text.replace('!', '!<stop>')
    text = text.replace('<prd>', '.')
    sentences = text.split('<stop>')
    # Whatever follows the final <stop> is trailing padding, not a sentence.
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences if len(s) > minimum_length]
    return sentences

# Quick check on a citation-style fragment containing "et al . 1975".
split_into_sentences('Pembolehubah yang ketiga adalah niat yang merujuk kepada niat seseorang dalam melakukan pelbagai tingkah laku ( Fishbein et al . 1975 : 12 ), ')

['Pembolehubah yang ketiga adalah niat yang merujuk kepada niat seseorang dalam melakukan pelbagai tingkah laku ( Fishbein et al.1975 : 12 ), .']

In [5]:
# Peek at a slice of 1000 raw entries from the loaded dump.
data[11000: 12000]

['Pemasaran',
 'Komunikasi',
 'Penjenamaan',
 'Strategi Online & Offline',
 'Pelaburan',
 'Hartanah',
 'Logam',
 'Saham',
 'Motivasi',
 'Kepimpinan',
 'Kisah',
 'Pembangunan Diri',
 'Kewangan',
 'Pengurusan Kewangan',
 'Pinjaman',
 'Perakaunan',
 'Dekorasi Bisnes',
 'MUAT TURUN',
 'Artikel Panjang / Artikel Pilihan / Perniagaan',
 '\ufeff4 Perkara Yang Anda Patut Lakukan Apabila Rasa Gagal Dalam Bisnes',
 'Tweet',
 'PENULIS',
 'Penulis Jemputan',
 'TARIKH',
 'KATEGORI',
 'Artikel Panjang',
 'Artikel Pilihan',
 'ISI KANDUNGAN',
 'Introduction Ruangan Komen',
 'FacebookTwitter',
 'Bila bisnes kita rasa macam ‘slow’, bila kita punya stok jualan yang dah berbulan-bulan simpan tak susut-susut lagi berhabuk, bila dah berbulan-bulan tapi tak pusing modal, kadang-kadang sampai tertunggak bil-bil bulanan, mesti kita rasa nak ‘give up’ kan?',
 'Salah dia kat mana?',
 'Salah produk ke?',
 'Salah harga ke?',
 'Salah customer ke tak reti nak nilai barang?',
 'Atau salah sendiri?',
 'Atau bukan sala

In [6]:
import malaya

# Language-detection model from malaya; filter_string calls
# fast_text.predict([...]) and discards strings labelled 'other'.
fast_text = malaya.language_detection.fasttext()




In [7]:
# Lowercase vowels used by both word heuristics below.
VOWELS = 'aeiou'
# Consonant clusters that isword_english tolerates inside a run of three
# consecutive consonants.
PHONES = ['sh', 'ch', 'ph', 'sz', 'cz', 'sch', 'rz', 'dz']
punctuations = '!@#$%^&*()_+=-'

def isword_malay(word):
    """
    Heuristic check that a token could be a word.

    Returns True when the token consists solely of digits, punctuation and
    spaces, or when it contains at least one character from VOWELS;
    otherwise returns False. VOWELS is lowercase, so callers are expected to
    pass a lowercased token.
    """
    # Strip everything that is NOT a digit / punctuation / space; if nothing
    # was stripped the token is purely numeric or symbolic and is accepted.
    # Raw string: the pattern contains regex escapes that are invalid escape
    # sequences in a normal string literal.
    if re.sub(r'[^0-9!@#$%\^&*()-=_\+{}\[\];\':",./<>?\|~`\\\ ]+', '', word) == word:
        return True
    return any(c in VOWELS for c in word)


def isword_english(word):
    """
    Heuristic check that a token has a pronounceable vowel/consonant shape.

    The token is lowercased and scanned letter by letter while counting runs
    of consecutive vowels and consonants. It is rejected (False) when a run
    reaches three vowels, exceeds three consonants, or reaches three
    consonants whose three-letter window contains none of the PHONES
    clusters. A 'y' that follows a non-vowel is counted as a vowel. Empty
    input is accepted (True).
    """
    if not word:
        return True
    # Lowercase once so the previous-letter lookup and the PHONES window see
    # the same characters as the loop; otherwise uppercase input is judged
    # against lowercase VOWELS / PHONES and is misclassified.
    word = word.lower()
    consecutiveVowels = 0
    consecutiveConsonents = 0
    for idx, letter in enumerate(word):
        vowel = letter in VOWELS
        if idx:
            prevVowel = word[idx - 1] in VOWELS
            if not vowel and letter == 'y' and not prevVowel:
                vowel = True
            # The letter class changed, so start counting a new run.
            if prevVowel != vowel:
                consecutiveVowels = 0
                consecutiveConsonents = 0
        if vowel:
            consecutiveVowels += 1
        else:
            consecutiveConsonents += 1
        if consecutiveVowels >= 3 or consecutiveConsonents > 3:
            return False
        if consecutiveConsonents == 3:
            subStr = word[idx - 2 : idx + 1]
            if any(phone in subStr for phone in PHONES):
                # A known cluster counts as one consonant fewer.
                consecutiveConsonents -= 1
                continue
            return False
    return True

In [8]:
def filter_string(string, min_len = 15):
    """
    Clean one line of text and drop it if it is not in a wanted language.

    Steps:
      1. Lines shorter than `min_len` characters are rejected outright.
      2. Tokens containing '#' or '@' are removed.
      3. URL-like substrings (starting with 'http' or 'www.') are removed.
      4. Tokens that fail isword_malay (checked in lowercase) are removed.
      5. If more than two characters remain, the language detector is
         consulted and the line is rejected when it is labelled 'other'.

    Parameters
    ----------
    string : str
        Raw input line.
    min_len : int, default 15
        Minimum length of the raw line for it to be considered at all.

    Returns
    -------
    str
        The cleaned line, or '' when the line is rejected.
    """
    if len(string) < min_len:
        return ''
    without_tags = ' '.join(
        word for word in string.split() if '#' not in word and '@' not in word
    )
    # Raw string with an escaped dot: 'www.' must match a literal period,
    # not "www" followed by an arbitrary character.
    without_urls = re.sub(r'http\S+|www\.\S+', '', without_tags)
    cleaned = ' '.join(
        w for w in without_urls.split() if isword_malay(w.lower())
    )
    # Very short remainders are returned without running language detection.
    if len(cleaned) > 2 and fast_text.predict([cleaned])[0] == 'other':
        return ''
    return cleaned

In [9]:
def loop(strings):
    """
    Apply filter_string to a batch of (index, text) pairs.

    Each input item is indexed as item[0] (the position tag) and item[1]
    (the text). Returns a list of (index, filtered_text) tuples in the same
    order as the input, with progress reported through tqdm.
    """
    return [(entry[0], filter_string(entry[1])) for entry in tqdm(strings)]

In [10]:
import cleaning
from tqdm import tqdm

# Tag every entry with its position; the tag travels through `loop` and is
# used afterwards to put the filtered entries back in input order.
temp = list(enumerate(data))
# NOTE(review): cleaning.multiprocessing is a project helper; presumably it
# fans `temp` out to worker processes running `loop` — confirm.
results = cleaning.multiprocessing(temp, loop)

100%|██████████| 4067640/4067640 [13:16<00:00, 5107.50it/s] 
100%|██████████| 4067640/4067640 [13:22<00:00, 5068.13it/s]
100%|██████████| 15/15 [00:00<00:00, 6566.60it/s]45.06it/s]
100%|██████████| 4067640/4067640 [13:22<00:00, 5068.85it/s]
100%|██████████| 4067640/4067640 [13:26<00:00, 5042.74it/s]
100%|██████████| 4067640/4067640 [13:30<00:00, 5015.83it/s] 
100%|██████████| 4067640/4067640 [13:30<00:00, 5017.72it/s]
100%|██████████| 4067640/4067640 [13:23<00:00, 5060.59it/s]
 97%|█████████▋| 3938646/4067640 [13:04<00:25, 5133.25it/s]
100%|██████████| 4067640/4067640 [13:18<00:00, 5093.36it/s]]
 99%|█████████▉| 4025794/4067640 [13:23<00:09, 4332.86it/s] 
100%|██████████| 4067640/4067640 [13:28<00:00, 5033.31it/s]
100%|██████████| 4067640/4067640 [13:37<00:00, 4976.66it/s]
100%|██████████| 4067640/4067640 [13:31<00:00, 5014.28it/s]
100%|██████████| 4067640/4067640 [13:24<00:00, 5054.90it/s]
100%|██████████| 4067640/4067640 [13:27<00:00, 5035.59it/s]
100%|██████████| 4067640/4067640 [13

In [11]:
%%time

# Order the (index, text) pairs by their index, then keep only the text.
results = [pair[1] for pair in sorted(results, key=lambda pair: pair[0])]

CPU times: user 20.8 s, sys: 575 ms, total: 21.4 s
Wall time: 21.4 s


In [12]:
# Inspect the first 1000 filtered entries ('' marks a rejected line).
results[:1000]

['',
 '',
 '',
 'Langgan: Catatan (Atom)',
 '',
 '',
 'Lihat profil lengkap saya',
 '',
 'Dikuasakan oleh Blogger.',
 '',
 'Projek - Konsortium Perumahan Rakyat Terengganu',
 '',
 'LIHAT PROJEK YANG DIBUKA JUALAN DISINI',
 'KONSORTIUM PERUMAHAN RAKYAT TERENGGANU',
 '',
 '09-617 3405 Email Koperat FAX : 09-617 7404',
 '',
 'Maklumat Korporat',
 '',
 '',
 'PROJEK DIBUKA JUALAN',
 '',
 '',
 '',
 '',
 'Konsep Perumahan Rumah Kos Rendah oleh KOPERAT adalah mengikut konsep Co-Developer, ‘Design and Built’ dan Pemberian Kontrak.',
 '',
 'Taman Koperat Peradong',
 'Taman Rakyat Bistari',
 'Taman Koperat Putera',
 'Taman Rakyat Harmoni',
 '',
 'TAMAN RAKYAT BISTARI FASA & 3E',
 '',
 'TAMAN RAKYAT BISTARI, KEMAMAN, TERENGGANU',
 'RUMAH BERKEMBAR & BANGLO WAKAF TAPAI',
 '',
 'WAKAF TAPAI,, MARANG,, TERENGGANU',
 '',
 'TAMAN RAKYAT HARMONI, FASA IV',
 'PEKAN JABI,, BESUT,, TERENGGANU',
 '',
 'TAMAN RAKYAT BISTARI, FASA',
 'BUKIT KUANG II, TELUK KALUNG,, KEMAMAN,, TERENGGANU',
 '',
 'TAMAN RAKYAT B

In [15]:
# Persist the filtered entries, one per line. Rejected entries are written as
# empty lines, so line numbers still line up with the input dump.
# The encoding is pinned so the output does not depend on the platform's
# default locale.
with open('filtered-dumping-cleaned-common-crawl.txt', 'w', encoding='utf-8') as fopen:
    fopen.write('\n'.join(results))