In [1]:
# !wget https://malaya-dataset.s3-ap-southeast-1.amazonaws.com/crawler/academia/academia-pdf.json

In [2]:
import json
import cleaning
from tqdm import tqdm

In [3]:
with open('../academia/academia-pdf.json') as fopen:
    pdf = json.load(fopen)
    
len(pdf)

1414

In [4]:
import os

os.path.split(pdf[0]['file'])

('academia.edu', 'Prosiding_Kolokium_Siswazah_JUF_2017.pdf')

In [5]:
import malaya

fast_text = malaya.language_detection.fasttext()




In [6]:
fast_text.predict(['Prosiding_Kolokium_Siswazah_JUF_2017.pdf'])

['malay']

In [7]:
from unidecode import unidecode

def clean(string):
    string = [cleaning.cleaning(s) for s in string]
    
    string = [s.strip() for s in string if 'tarikh' not in s.lower() and 'soalan no' not in s.lower()]
    string = [s for s in string if not ''.join(s.split()[:1]).isdigit() and '.soalan' not in s.lower() and 'jum ' not in s.lower()]
    string = [s for s in string if not s[:3].isdigit() and not s[-3:].isdigit()]
    return string

In [8]:
outer = []

for k in tqdm(range(len(pdf))):

    c = clean(pdf[k]['content']['content'].split('\n'))
    t, last = [], 0

    i = 0
    while i < len(c):
        text = c[i]

        if len(text) > 5:
            if len(text.split()) > 1:
                t.append(text)
            last = i
        else:
            if len(t) and (i - last) > 2:
                t.append('')
                outer.extend(t)
                t = []
                last = i
            elif not len(t):
                last = i

        i += 1
    
    if len(t):
        t.append('')
        outer.extend(t)

100%|██████████| 1414/1414 [07:04<00:00,  3.33it/s]


In [9]:
len(outer)

8926305

In [10]:
%%time

temp_vocab = list(set(cleaning.multiprocessing(outer, cleaning.unique_words)))

CPU times: user 3.93 s, sys: 2.22 s, total: 6.15 s
Wall time: 7.15 s


In [11]:
%%time

# important
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.duplicate_dots_marks_exclamations, list_mode = False)
print(len(temp_dict))

7040
CPU times: user 415 ms, sys: 961 ms, total: 1.38 s
Wall time: 3.11 s


In [12]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:32<00:00, 276346.68it/s]


In [13]:
%%time

# important
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.remove_underscore, list_mode = False)
print(len(temp_dict))

536
CPU times: user 591 ms, sys: 972 ms, total: 1.56 s
Wall time: 2.19 s


In [14]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:32<00:00, 274032.82it/s]


In [15]:
%%time

# important
temp_dict = cleaning.multiprocessing(outer, cleaning.isolate_spamchars, list_mode = False)
print(len(temp_dict))

0
CPU times: user 2.54 s, sys: 2.15 s, total: 4.69 s
Wall time: 10.7 s


In [16]:
%%time
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.break_short_words, list_mode = False)
print(len(temp_dict))

19693
CPU times: user 407 ms, sys: 1.01 s, total: 1.42 s
Wall time: 1.55 s


In [17]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:32<00:00, 270720.18it/s]


In [18]:
%%time
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.break_long_words, list_mode = False)
print(len(temp_dict))

4441
CPU times: user 360 ms, sys: 1.04 s, total: 1.4 s
Wall time: 1.64 s


In [19]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:32<00:00, 278113.08it/s]


In [20]:
%%time
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.remove_ending_underscore, list_mode = False)
print(len(temp_dict))

272
CPU times: user 314 ms, sys: 1 s, total: 1.31 s
Wall time: 1.5 s


In [21]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:32<00:00, 270761.52it/s]


In [22]:
%%time
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.remove_starting_underscore, list_mode = False)
print(len(temp_dict))

343
CPU times: user 376 ms, sys: 1 s, total: 1.38 s
Wall time: 1.57 s


In [23]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:32<00:00, 278484.09it/s]


In [24]:
%%time
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.end_punct, list_mode = False)
print(len(temp_dict))

533165
CPU times: user 2.05 s, sys: 1.08 s, total: 3.13 s
Wall time: 3.32 s


In [25]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:35<00:00, 249997.35it/s]


In [26]:
%%time
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.start_punct, list_mode = False)
print(len(temp_dict))

178877
CPU times: user 949 ms, sys: 1.04 s, total: 1.99 s
Wall time: 2.16 s


In [27]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:35<00:00, 249939.42it/s]


In [28]:
%%time
temp_dict = cleaning.multiprocessing(temp_vocab, cleaning.join_dashes, list_mode = False)
print(len(temp_dict))

495
CPU times: user 319 ms, sys: 1 s, total: 1.32 s
Wall time: 1.63 s


In [29]:
outer = cleaning.string_dict_cleaning(outer, temp_dict)

100%|██████████| 8926305/8926305 [00:35<00:00, 253898.14it/s]


In [30]:
results, result = [], []
for i in tqdm(outer):
    if not len(i) and len(result):
        results.append(result)
        result = []
    else:
        result.append(i)
        
if len(result):
    results.append(result)

100%|██████████| 8926305/8926305 [00:07<00:00, 1261203.88it/s]


In [31]:
import re

alphabets = '([A-Za-z])'
prefixes = (
    '(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt|Puan|puan|Tuan|tuan|sir|Sir)[.]'
)
suffixes = '(Inc|Ltd|Jr|Sr|Co|Mo)'
starters = '(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever|Dia|Mereka|Tetapi|Kita|Itu|Ini|Dan|Kami|Beliau|Seri|Datuk|Dato|Datin|Tuan|Puan)'
acronyms = '([A-Z][.][A-Z][.](?:[A-Z][.])?)'
websites = '[.](com|net|org|io|gov|me|edu|my)'
another_websites = '(www|http|https)[.]'
digits = '([0-9])'
before_digits = '([Nn]o|[Nn]ombor|[Nn]umber|[Kk]e|=|al)'
month = '([Jj]an(?:uari)?|[Ff]eb(?:ruari)?|[Mm]a(?:c)?|[Aa]pr(?:il)?|Mei|[Jj]u(?:n)?|[Jj]ula(?:i)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]kt(?:ober)?|[Nn]ov(?:ember)?|[Dd]is(?:ember)?)'


def split_into_sentences(text, minimum_length = 5):
    text = text.replace('\x97', '\n')
    text = '. '.join([s for s in text.split('\n') if len(s)])
    text = text + '.'
    text = unidecode(text)
    text = ' ' + text + '  '
    text = text.replace('\n', ' ')
    text = re.sub(prefixes, '\\1<prd>', text)
    text = re.sub(websites, '<prd>\\1', text)
    text = re.sub(another_websites, '\\1<prd>', text)
    text = re.sub('[,][.]+', '<prd>', text)
    if '...' in text:
        text = text.replace('...', '<prd><prd><prd>')
    if 'Ph.D' in text:
        text = text.replace('Ph.D.', 'Ph<prd>D<prd>')
    text = re.sub('[.]\s*[,]', '<prd>,', text)
    text = re.sub(before_digits + '\s*[.]\s*' + digits, '\\1<prd>\\2', text)
    text = re.sub(month + '[.]\s*' + digits, '\\1<prd>\\2', text)
    text = re.sub('\s' + alphabets + '[.][ ]+', ' \\1<prd> ', text)
    text = re.sub(acronyms + ' ' + starters, '\\1<stop> \\2', text)
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]' + alphabets + '[.]',
        '\\1<prd>\\2<prd>\\3<prd>',
        text,
    )
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]', '\\1<prd>\\2<prd>', text
    )
    text = re.sub(' ' + suffixes + '[.][ ]+' + starters, ' \\1<stop> \\2', text)
    text = re.sub(' ' + suffixes + '[.]', ' \\1<prd>', text)
    text = re.sub(' ' + alphabets + '[.]', ' \\1<prd>', text)
    text = re.sub(digits + '[.]' + digits, '\\1<prd>\\2', text)
    if '”' in text:
        text = text.replace('.”', '”.')
    if '"' in text:
        text = text.replace('."', '".')
    if '!' in text:
        text = text.replace('!"', '"!')
    if '?' in text:
        text = text.replace('?"', '"?')
    text = text.replace('.', '.<stop>')
    text = text.replace('?', '?<stop>')
    text = text.replace('!', '!<stop>')
    text = text.replace('<prd>', '.')
    sentences = text.split('<stop>')
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences if len(s) > minimum_length]
    return sentences

split_into_sentences('733 ke . 633 , berlaku penurunan akibat kesan program PMI .')

['733 ke.633 , berlaku penurunan akibat kesan program PMI .']

In [32]:
import malaya
import re

def strip(string):
    string = ' '.join(string)
    string = re.sub(r'[ ]+', ' ', string.replace('\n', ' ').replace('\t', ' ')).strip()
    return split_into_sentences(string)

In [33]:
output = []

for r in tqdm(results):
    output.extend(strip(r) + [''])

100%|██████████| 678289/678289 [04:47<00:00, 2360.23it/s]


In [34]:
len(output)

5502537

In [35]:
output[10000:11000]

['He then stressed that activities held by the mosque committee is always welcomed by local community and receiving a great number of participants for each of the activities .',
 'After the interview and a observation for what have been happening around Putra Mosque and inside the mosque , researcher is really agreed with Mr .',
 'Raja Aman .',
 'First thing is about the pattern of Islam in this region .',
 'It is clearly seen that there are a lot of prayer room can be found even in the shopping complex , offices and the places is clean and neat .',
 'The cleanliness and the beauty keep in this region is one of the values and teachings of Islam that has been commanded by Allah in the Quran.',
 '',
 'Prosiding Kolokium Siswazah Jabatan Usuluddin dan Falsafah.',
 '',
 'and by the hadith .',
 'It is so fresh and comfortable to be there either inside or outside the mosque .',
 'Attitude shown by the guard of the mosque and volunteers are so good .',
 'They always smiling and welcoming visi

In [37]:
with open('dumping-academia.txt', 'w') as fopen:
    fopen.write('\n'.join(output))