In [1]:
import malaya

In [2]:
# Read the whole raw corpus into memory and split it into a list of lines.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm it matches the encoding of the dump.
with open('dumping-academia.txt') as fopen:
    data = fopen.read().split('\n')

In [3]:
import re
from unidecode import unidecode

# Regex fragments consumed by split_into_sentences. They are written as raw
# strings so that regex escapes such as \s reach the regex engine untouched
# instead of relying on Python's deprecated "unknown escape" fallback.
alphabets = r'([A-Za-z])'
# Titles whose trailing period does not end a sentence.
prefixes = (
    r'(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt|Puan|puan|Tuan|tuan|sir|Sir)[.]'
)
# Abbreviations that may either end a sentence or sit in the middle of one.
suffixes = r'(Inc|Ltd|Jr|Sr|Co|Mo)'
# Words that signal the start of a new sentence after an acronym or suffix.
starters = r'(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever|Dia|Mereka|Tetapi|Kita|Itu|Ini|Dan|Kami|Beliau|Seri|Datuk|Dato|Datin|Tuan|Puan)'
acronyms = r'([A-Z][.][A-Z][.](?:[A-Z][.])?)'
websites = r'[.](com|net|org|io|gov|me|edu|my)'
another_websites = r'(www|http|https)[.]'
digits = r'([0-9])'
# Tokens after which "<period> <digit>" is treated as part of a number or citation.
before_digits = r'([Nn]o|[Nn]ombor|[Nn]umber|[Kk]e|=|al)'
# Month names and abbreviations that may be followed by ". <digit>".
month = r'([Jj]an(?:uari)?|[Ff]eb(?:ruari)?|[Mm]a(?:c)?|[Aa]pr(?:il)?|Mei|[Jj]u(?:n)?|[Jj]ula(?:i)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]kt(?:ober)?|[Nn]ov(?:ember)?|[Dd]is(?:ember)?)'


def split_into_sentences(text, minimum_length = 5):
    """
    Split raw text into sentences with a rule-based, regex-driven pass.

    Periods that belong to titles, abbreviations, initials, acronyms, web
    addresses and numbers are temporarily rewritten to ``<prd>`` so that only
    the remaining terminators ('.', '?', '!') receive a ``<stop>`` marker, on
    which the text is finally split.

    Parameters
    ----------
    text : str
        Input text. Line breaks (and the character 0x97) are treated as
        sentence boundaries; the text is transliterated with ``unidecode``
        before any rule is applied.
    minimum_length : int, default 5
        A piece is kept only when its length, measured before surrounding
        whitespace is stripped, is greater than this value.

    Returns
    -------
    list of str
        The stripped sentences, in order of appearance.
    """
    # Every non-empty line becomes its own sentence: join them with '. ' and
    # make sure the text as a whole ends with a period.
    text = text.replace('\x97', '\n')
    text = '. '.join([s for s in text.split('\n') if len(s)])
    text = text + '.'
    text = unidecode(text)
    # Pad with spaces so the space-anchored rules below can match at the edges.
    text = ' ' + text + '  '
    text = text.replace('\n', ' ')

    # --- Protect periods that do not end a sentence -------------------------
    text = re.sub(prefixes, '\\1<prd>', text)
    text = re.sub(websites, '<prd>\\1', text)
    text = re.sub(another_websites, '\\1<prd>', text)
    # A comma directly followed by one or more periods collapses to one period.
    text = re.sub('[,][.]+', '<prd>', text)
    text = text.replace('...', '<prd><prd><prd>')
    text = text.replace('Ph.D.', 'Ph<prd>D<prd>')
    # A period followed (after optional whitespace) by a comma is mid-sentence.
    text = re.sub(r'[.]\s*[,]', '<prd>,', text)
    # "No. 5", "al. 1975", "Jan. 3": the whitespace around the period is dropped.
    text = re.sub(before_digits + r'\s*[.]\s*' + digits, '\\1<prd>\\2', text)
    text = re.sub(month + r'[.]\s*' + digits, '\\1<prd>\\2', text)
    # Single-letter initials such as " A. ".
    text = re.sub(r'\s' + alphabets + '[.][ ]+', ' \\1<prd> ', text)
    # An acronym followed by a sentence starter does end the sentence.
    text = re.sub(acronyms + ' ' + starters, '\\1<stop> \\2', text)
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]' + alphabets + '[.]',
        '\\1<prd>\\2<prd>\\3<prd>',
        text,
    )
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]', '\\1<prd>\\2<prd>', text
    )
    # "Ltd. Dia ..." ends a sentence; any other "Ltd." does not.
    text = re.sub(' ' + suffixes + '[.][ ]+' + starters, ' \\1<stop> \\2', text)
    text = re.sub(' ' + suffixes + '[.]', ' \\1<prd>', text)
    text = re.sub(' ' + alphabets + '[.]', ' \\1<prd>', text)
    # Decimal numbers.
    text = re.sub(digits + '[.]' + digits, '\\1<prd>\\2', text)

    # --- Move terminators outside a closing double quote --------------------
    # Curly quotes need no separate handling: unidecode has already turned the
    # text into ASCII, so only the straight quote can still be present.
    text = text.replace('."', '".')
    text = text.replace('!"', '"!')
    text = text.replace('?"', '"?')

    # --- Mark the real sentence ends, then restore the protected periods ----
    text = text.replace('.', '.<stop>')
    text = text.replace('?', '?<stop>')
    text = text.replace('!', '!<stop>')
    text = text.replace('<prd>', '.')

    # Whatever follows the last <stop> is only the padding added above.
    pieces = text.split('<stop>')[:-1]
    return [piece.strip() for piece in pieces if len(piece) > minimum_length]

# Smoke-test the splitter on a citation-style fragment ("et al . 1975").
# NOTE(review): in the recorded output "al" and "1975" are fused into
# "al.1975" -- the before_digits rule discards the whitespace around the period.
split_into_sentences('Pembolehubah yang ketiga adalah niat yang merujuk kepada niat seseorang dalam melakukan pelbagai tingkah laku ( Fishbein et al . 1975 : 12 ), ')

['Pembolehubah yang ketiga adalah niat yang merujuk kepada niat seseorang dalam melakukan pelbagai tingkah laku ( Fishbein et al.1975 : 12 ), .']

In [4]:
# Eyeball a 1,000-line slice from the middle of the corpus.
data[11000: 12000]

["Setelah itu , semenjak 2006 hingga 2010 , tiada penulisan berkaitan dengan sam'iyyat diter selama empat tahun ini .",
 "Kemudian , selama dua tahun iaitu pada tahun 2015 dan 2016 , tiada tema sam'iyyat yang dikeluarkan .",
 'Penulis mendapati hal ini terjadi kerana majalah Pengasuh mula menitikberatkan isu ideologi-ideologi moden dan isu-isu akidah di dalam masyarakat kini .',
 "Dalam tema sam'iyyat ini , penulis Dr Randus Abdul Ghani Azmi Hj .",
 "Idris masih lagi mengambil bahagian dalam menulis isu-isu yang berkaitan dengan khurafat dan bid'ah dengan lebih memfokuskan kepada pentafsiran ulama - ulama tafsir yang pelbagai seperti Ibnu Kathir , Ar-Razi dan lain-lain lagi .",
 "Penulis kedua yang banyak didapati di dalm penulisan sam'iyyat ialah Muhammad ' Uthman El-Muhammadi .",
 "Beliau ada menulis secara bersiri dalam tajuk ' Di Sekitar Tahdzir Arwah '.",
 "Selain itu , menariknya majalah Pengasuh ialah apabila karya ulama hebat seperti Imam Ghazali juga diterjemahkan dan dimuatka

In [5]:
import malaya

# Language-detection model from Malaya; filter_string calls its predict()
# to drop lines labelled 'other'.
fast_text = malaya.language_detection.fasttext()




In [6]:
# Probe the detector with a nearly vowel-less line (presumably transliterated
# non-Malay text -- TODO confirm). The recorded label is still 'malay', so the
# language model alone does not reject such lines.
fast_text.predict(['wthlthny wHmd llh thlth wthlthny wkbr llh thlth thmn sbH llh fy dbr kl Sl@ thl l llh wHdh l shryk lh lh wthlthny ftlk ts`@ wts`wn wql tmm lmy@ l lh.'])

['malay']

In [7]:
# Scratch check of the "digits / punctuation / space only" character class:
# '(' belongs to the class, so it survives the substitution unchanged.
# The pattern is a raw string so that \^, \+, \[ and \] are regex escapes
# rather than invalid Python string escapes; the pattern text is unchanged.
re.sub(r'''[^0-9!@#$%\^&*()-=_\+{}\[\];':",./<>? ]+''', '', '(')

'('

In [8]:
VOWELS = 'aeiou'
# Consonant clusters that are allowed inside a run of three consonants.
PHONES = ['sh', 'ch', 'ph', 'sz', 'cz', 'sch', 'rz', 'dz']
punctuations = '!@#$%^&*()_+=-'

def isword_malay(word):
    """
    Cheap plausibility check for a single token.

    Returns True when `word` consists only of digits, punctuation and spaces,
    or when it contains at least one of the lowercase letters in VOWELS;
    returns False otherwise. Only lowercase vowels are recognised, so callers
    should lowercase the token first.
    """
    # Raw string: \^, \+, \[ ... are regex escapes, not Python string escapes.
    # Removing everything outside the class leaves the token unchanged only
    # when every character is a digit, punctuation mark or space.
    if re.sub(r'''[^0-9!@#$%\^&*()-=_\+{}\[\];':",./<>?\|~`\\ ]+''', '', word) == word:
        return True
    return any(c in VOWELS for c in word)


def isword_english(word):
    """
    Heuristic check that `word` is pronounceable, case-insensitively.

    Walks the lowercased word and tracks the current run of vowels and of
    consonants. Returns False as soon as there are three vowels in a row, or
    three consonants in a row whose three-letter window contains none of the
    clusters in PHONES. A 'y' that follows a non-vowel counts as a vowel.
    An empty word returns True.
    """
    if not word:
        return True

    # Lowercase once and use the same string for every lookup. Reading the
    # previous letter and the consonant window from the un-lowered input made
    # upper-case words behave differently from their lowercase form.
    lowered = word.lower()
    vowel_run = 0
    consonant_run = 0
    for idx, letter in enumerate(lowered):
        vowel = letter in VOWELS
        if idx:
            prev_vowel = lowered[idx - 1] in VOWELS
            if not vowel and letter == 'y' and not prev_vowel:
                vowel = True
            # Switching between vowel and consonant starts a fresh run.
            if prev_vowel != vowel:
                vowel_run = 0
                consonant_run = 0
        if vowel:
            vowel_run += 1
        else:
            consonant_run += 1
        if vowel_run >= 3 or consonant_run > 3:
            return False
        if consonant_run == 3:
            window = lowered[idx - 2 : idx + 1]
            if any(phone in window for phone in PHONES):
                # A known cluster counts as one consonant fewer.
                consonant_run -= 1
                continue
            return False
    return True

In [9]:
def filter_string(string, min_len = 15):
    """
    Clean one corpus line, or reject it by returning an empty string.

    Steps, in order:
    1. Lines shorter than `min_len` characters are rejected.
    2. Whitespace-separated tokens containing '#' or '@' are dropped.
    3. Anything starting with 'http' or 'www.' up to the next whitespace is
       removed.
    4. Tokens that fail `isword_malay` (checked on the lowercased token) are
       dropped.
    5. If more than two characters remain and `fast_text` labels the result
       'other', the line is rejected.

    Parameters
    ----------
    string : str
        The raw line.
    min_len : int, default 15
        Minimum length of the raw line for it to be considered at all.

    Returns
    -------
    str
        The cleaned line with tokens joined by single spaces, or '' when the
        line is rejected.
    """
    if len(string) < min_len:
        return ''

    kept = [
        word
        for word in string.split()
        if word.find('#') < 0 and word.find('@') < 0
    ]
    # Raw string so \S is a regex escape; the dot after "www" is escaped so it
    # matches a literal period instead of any character.
    string = re.sub(r'http\S+|www\.\S+', '', ' '.join(kept))
    string = ' '.join(w for w in string.split() if isword_malay(w.lower()))

    # Very short remainders are returned as-is without consulting the model.
    if len(string) > 2 and fast_text.predict([string])[0] == 'other':
        return ''
    return string

In [10]:
%%time

# Inputs shorter than min_len (15 characters by default) are rejected, so
# this call returns ''.
string = 'Pengasuh .'
filter_string(string)

CPU times: user 13 µs, sys: 6 µs, total: 19 µs
Wall time: 22.2 µs


''

In [12]:
def loop(strings):
    """
    Run `filter_string` over a batch of indexed lines, with a progress bar.

    Parameters
    ----------
    strings : sequence
        Items whose element 0 is an identifier (the original line index) and
        whose element 1 is the text to filter.

    Returns
    -------
    list of tuple
        `(identifier, filtered_text)` for every input item, in input order.
    """
    # Imported here so the function does not depend on `tqdm` having been
    # brought into the notebook namespace by some earlier cell.
    from tqdm import tqdm

    return [(item[0], filter_string(item[1])) for item in tqdm(strings)]

In [21]:
import cleaning
# Tag every line with its position so the original order can be restored
# after the parallel run, then hand the batch to the multiprocessing helper.
temp = list(enumerate(data))
results = cleaning.multiprocessing(temp, loop)

100%|██████████| 343908/343908 [01:26<00:00, 3984.79it/s] 
100%|██████████| 9/9 [00:00<00:00, 1790.06it/s]49.63it/s]
100%|██████████| 343908/343908 [01:26<00:00, 3959.57it/s] 
100%|██████████| 343908/343908 [01:29<00:00, 3862.36it/s]
100%|██████████| 343908/343908 [01:30<00:00, 3793.91it/s]
100%|██████████| 343908/343908 [01:28<00:00, 3902.32it/s]
100%|██████████| 343908/343908 [01:30<00:00, 3798.63it/s]
100%|██████████| 343908/343908 [01:28<00:00, 3865.37it/s]
100%|██████████| 343908/343908 [01:33<00:00, 3672.51it/s]
100%|██████████| 343908/343908 [01:31<00:00, 3745.43it/s]
100%|██████████| 343908/343908 [01:33<00:00, 3692.10it/s]
100%|██████████| 343908/343908 [01:32<00:00, 3718.36it/s]
100%|██████████| 343908/343908 [01:31<00:00, 3750.65it/s]
100%|██████████| 343908/343908 [01:37<00:00, 3543.65it/s]
100%|██████████| 343908/343908 [01:38<00:00, 3492.54it/s]
100%|██████████| 343908/343908 [01:37<00:00, 3510.80it/s]
100%|██████████| 343908/343908 [01:37<00:00, 3540.37it/s]


In [22]:
%%time

# Put the (index, text) pairs back into original line order.
# presumably the parallel workers return them out of order -- verify against
# cleaning.multiprocessing
results = sorted(results, key=lambda x: x[0])

CPU times: user 646 ms, sys: 20.1 ms, total: 666 ms
Wall time: 664 ms


In [25]:
# Drop the index and keep only the filtered text of each (index, text) pair.
# NOTE(review): this rebinds `results`, so the cell is not idempotent --
# running it a second time would index into the strings themselves.
results = [r[1] for r in results]

In [26]:
# Write the filtered corpus, one entry per line. Entries rejected by
# filter_string are empty strings and therefore show up as blank lines.
with open('filtered-dumping-academia.txt', 'w') as fopen:
    fopen.write('\n'.join(results))

In [27]:
!head -n 100 filtered-dumping-academia.txt

JABATAN USULUDDIN DAN FALSAFAH.

KOLOKIUM SISWAZAH JABATAN USULUDDIN DAN FALSAFAH.

Mazlan Ibrahim Nurulfathonah Mohd Effendy Nur Fathiah A Rozak Muhammad Akmal .
Azmi Zulkeffly Adimun Amir Sharifuddin Ahmad.

JABATAN USULUDDIN DAN FALSAFAH FAKULTI PENGAJIAN ISLAM UNIVERSITI KEBANGSAAN MALAYSIA.

Hak cipta / Copyright.

Hak cipta terpelihara .
Tiada bahagian daripada ter ini boleh diter semula , disimpan untuk pengeluaran atau ditukarkan ke dalam sebarang bentuk atau dengan sebarang alat juga pun , sama ada dengan cara elektronik , gambar serta rakaman dan sebagainya tanpa kebenaran bertulis daripada Jabatan Usuluddin dan Falsafah , FPI , UKM terlebih dahulu .

All rights reserved .
No part of this publication may be reproduced or transmitted in any form or any means , electronic or mechanical , including photocopy , recording , or any information storage and retrieval system , without permission in writing from the Department of Usuluddin and Philosophy , FPI , UKM .
