In [1]:
# download hansard from https://www.parlimen.gov.my/hansard-dewan-rakyat.html?uweb=dr
# !wget https://dlcdn.apache.org/tika/2.6.0/tika-server-standard-2.6.0-bin.zip
# !unzip tika-server-standard-2.6.0-bin.zip
# !chmod +x tika-server-standard-2.6.0-bin/bin/tika
# !./tika-server-standard-2.6.0-bin/bin/tika start -p 9998
# !pip3 install tika

In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [3]:
import tika
from tika import parser
import re
from unidecode import unidecode
from tqdm import tqdm
from glob import glob
from datetime import datetime
import json
from bs4 import BeautifulSoup

def cleaning(string):
    string = unidecode(string).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ').replace('_', ' ')
    string = string.replace(' -', '-').replace(' ANYA', 'ANYA').replace('ki ta', 'kita').replace('s aya', 'saya')
    string = string.replace('m enjadi', 'menjadi').replace('meno lak', 'menolak')
    try:
        soup = BeautifulSoup(string, 'lxml')
        string = soup.text
    except:
        pass
    return re.sub(r'[ ]+', ' ', string).strip()

In [4]:
files = sorted(glob('pdf/*.pdf'))
files[0]

'pdf/DR-01032000.pdf'

In [5]:
files[0]

'pdf/DR-01032000.pdf'

In [34]:
raw_xml = parser.from_file('pdf/DR-04112003.pdf', 'http://localhost:9998/tika', xmlContent=True)
body = raw_xml['content'].split('<body>')[1].split('</body>')[0]
body_without_tag = body.replace("<p>", "").replace("</p>", "").replace("<div>", "").replace("</div>","").replace("<p />","")
text_pages = body_without_tag.split("""<div class="page">""")[1:]

In [35]:
text_pages[87]

'\n76 DR 04.11.2003 \n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n\n'

In [20]:
splitted = text_pages[87].split('\n \n')
splitted = [s for s in splitted if len(s.strip()) > 1]
if len(splitted) < 3:
    splitted = t.split('\n\n')
splitted = [cleaning(s) for s in splitted[1:]]
splitted = [s + '.' if s[-1] not in  '.;:,' else s for s in splitted if len(s)]
splitted = ' '.join(splitted)
len(splitted)

3790

In [36]:
with open('hansard.jsonl', 'w') as fopen:
    for f in tqdm(files):
        raw_xml = parser.from_file(f, 'http://localhost:9998/tika', xmlContent=True)
        body = raw_xml['content'].split('<body>')[1].split('</body>')[0]
        body_without_tag = body.replace("<p>", "").replace("</p>", "").replace("<div>", "").replace("</div>","").replace("<p />","")
        text_pages = body_without_tag.split("""<div class="page">""")[1:]

        for i, t in enumerate(text_pages):
            r = re.findall(r'DR[\. ]\s*[0-3]?[0-9].[0-3]?[0-9].(?:[0-9]{2})?[0-9]{2}\s+\d+\b', t)
            r_ = re.findall(r'\d+\s+DR[\. ]\s*[0-3]?[0-9].[0-3]?[0-9].(?:[0-9]{2})?[0-9]{2}\b', t)
            found = True
            if len(r):
                no_page = r[0].split()[-1]
            elif len(r_):
                no_page = r_[0].split()[0]
            else:
                found = False

            if not found:
                continue

            splitted = t.split('\n \n')
            splitted = [s for s in splitted if len(s.strip()) > 1]
            if len(splitted) < 3:
                splitted = t.split('\n\n')
            splitted = [cleaning(s) for s in splitted[1:]]
            splitted = [s + '.' if s[-1] not in  '.;:,' else s for s in splitted if len(s)]
            splitted = ' '.join(splitted)
            date = datetime.strptime(f, 'pdf/DR-%d%m%Y.pdf').strftime('%Y-%m-%d')
            
            if not len(splitted):
                print(f, i, t)

            d = {
                'original': t,
                'cleaned': splitted,
                'no_page': int(no_page),
                'actual_no_page': i + 1,
                'date': date,
                'url': f'https://www.parlimen.gov.my/files/hindex/{f}'
            }
            fopen.write(f'{json.dumps(d)}\n')
#             break
#         break

 11%|█████████▋                                                                            | 175/1560 [00:59<07:22,  3.13it/s]

pdf/DR-04112003.pdf 87 
76 DR 04.11.2003 
 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 





 14%|███████████▋                                                                          | 213/1560 [01:11<05:01,  4.47it/s]

pdf/DR-05102005.pdf 96 
DR 5.10.2005 83 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 





 23%|███████████████████▌                                                                  | 354/1560 [01:57<05:08,  3.91it/s]

pdf/DR-08052008.pdf 95 
84 DR.8.5.2008 

 
 

 

 

 

 

 





 23%|███████████████████▋                                                                  | 356/1560 [01:57<05:26,  3.68it/s]

pdf/DR-08062010.pdf 107 
92 DR.8.6.2010 

 
 

 

 

 





 23%|████████████████████▏                                                                 | 366/1560 [02:02<08:49,  2.26it/s]

pdf/DR-08092003.pdf 144 
DR.8.9.2003 133 
 

 
 
 
 

  





 27%|██████████████████████▉                                                               | 417/1560 [02:20<07:59,  2.38it/s]

pdf/DR-09092002.pdf 84 
DR.9.9.2002 75 
 

 

 





 27%|███████████████████████▏                                                              | 420/1560 [02:20<06:22,  2.98it/s]

pdf/DR-09102002.pdf 1 
 DR.09.10.2002 
 
2 

 





 33%|████████████████████████████▏                                                         | 512/1560 [02:48<05:23,  3.24it/s]

pdf/DR-11072005.pdf 109 
96 DR.11.7.2005 

 

 

 





 34%|█████████████████████████████▏                                                        | 529/1560 [02:53<03:45,  4.57it/s]

pdf/DR-11112002.pdf 1 
2 DR.11.11.2002 
 
 
 
 





 35%|█████████████████████████████▉                                                        | 544/1560 [02:57<04:51,  3.48it/s]

pdf/DR-11122008.pdf 127 
114 DR.11.12.2008 

 
 
 





 37%|███████████████████████████████▉                                                      | 579/1560 [03:07<03:33,  4.60it/s]

pdf/DR-12112002.pdf 98 
DR.12.11.2002 87 
 

 

 

 

 

 

 

 

 





 40%|██████████████████████████████████                                                    | 617/1560 [03:19<04:16,  3.68it/s]

pdf/DR-13102003.pdf 1 
2 DR.13.10.2003 
 
 





 46%|███████████████████████████████████████▎                                              | 714/1560 [03:51<04:41,  3.01it/s]

pdf/DR-15102002.pdf 1 
2 DR.15.10.2002 
 

 
 





 46%|███████████████████████████████████████▉                                              | 724/1560 [03:55<05:12,  2.68it/s]

pdf/DR-15112006.pdf 3 
2 DR.15.11.2006 

 





 49%|█████████████████████████████████████████▉                                            | 761/1560 [04:07<04:04,  3.27it/s]

pdf/DR-16072020.pdf 97 
86 DR 16.7.2020 

 
 

 





 49%|██████████████████████████████████████████                                            | 763/1560 [04:08<04:08,  3.20it/s]

pdf/DR-16092002.pdf 1 
2 DR.16.9.2002 
 
 





 54%|██████████████████████████████████████████████▌                                       | 844/1560 [04:34<02:54,  4.10it/s]

pdf/DR-17122009.pdf 92 
DR 17.12.2009 81 

 
 





 60%|███████████████████████████████████████████████████▍                                  | 933/1560 [05:03<03:40,  2.84it/s]

pdf/DR-19092002.pdf 1 
2 DR.19.9.2002 
 

 



pdf/DR-19092002.pdf 79 
70 DR.19.9.2002 
 

 

 

 





 62%|█████████████████████████████████████████████████████▊                                | 975/1560 [05:18<03:14,  3.01it/s]

pdf/DR-20062005.pdf 96 
DR.20.6.2005 83 

 
 
 

 

 

 

 

 

 

 





 68%|█████████████████████████████████████████████████████████▍                           | 1053/1560 [05:42<02:08,  3.96it/s]

pdf/DR-21112011.pdf 120 
DR 21.11.2011 109 

 
 





 69%|██████████████████████████████████████████████████████████▋                          | 1077/1560 [05:51<03:07,  2.58it/s]

pdf/DR-22062005.pdf 99 
86 DR.22.6.2005 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

 
 
 
 
 
 
 





 73%|██████████████████████████████████████████████████████████████▎                      | 1144/1560 [06:14<02:29,  2.78it/s]

pdf/DR-23102002.pdf 1 
2 DR.23.10.2002 
 

 

 





 83%|██████████████████████████████████████████████████████████████████████▌              | 1296/1560 [07:06<01:40,  2.64it/s]

pdf/DR-26062002.pdf 1 
2 DR.26.6.2002 
 
 
 





 93%|███████████████████████████████████████████████████████████████████████████████▎     | 1456/1560 [07:56<00:35,  2.92it/s]

pdf/DR-29062010.pdf 174 
DR.29.6.2010 161 

 
 

 

 





 94%|████████████████████████████████████████████████████████████████████████████████     | 1469/1560 [08:00<00:29,  3.10it/s]

pdf/DR-29102003.pdf 1 
2 DR.29.10.2003 
 
 
 





 97%|██████████████████████████████████████████████████████████████████████████████████▏  | 1508/1560 [08:13<00:19,  2.62it/s]

pdf/DR-30082006.pdf 180 
DR.30.8.2006 165 

 

 

 

 

 

 

 

 





100%|████████████████████████████████████████████████████████████████████████████████████▊| 1557/1560 [08:30<00:01,  2.96it/s]

pdf/DR-31102016.pdf 148 
DR 31.10.2016                                                                                                                                                   137               

 

 

 

 

 

 

 





100%|█████████████████████████████████████████████████████████████████████████████████████| 1560/1560 [08:32<00:00,  3.05it/s]


In [37]:
!wc -l hansard.jsonl

142609 hansard.jsonl


In [38]:
!split -l 50000 -d --additional-suffix=.splitted hansard.jsonl hansard.jsonl