In [1]:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
import requests
import json
import os
import re
import time
from tqdm import tqdm
from glob import glob
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
def download(url, file_name, retry = 3):
    if os.path.exists(file_name) and os.path.getsize(file_name) > 50000:
        return
    with open(file_name, "wb") as file:
        for k in range(retry):
            try:
                response = requests.get(url, verify = False, timeout = 360)
                file.write(response.content)
                return 
            except Exception as e:
                print(k, e)
                time.sleep(0.5)

In [4]:
files = glob('pages/*.json')
len(files)

18787

In [5]:
pdfs = []
for f in files:
    with open(f) as fopen:
        pdfs.extend(json.load(fopen))
        
pdfs = list(set(pdfs))

In [6]:
filtered_pdfs = []
for p in tqdm(pdfs):
    if len(re.findall( r'[0-9]+(?:\.[0-9]+){3}', p)):
        continue
    if not p.endswith('.pdf'):
        continue
    if 'ade.sagepub.com' in p:
        continue
    filtered_pdfs.append(p)
    
filtered_pdfs = sorted(list(set(filtered_pdfs)))
len(filtered_pdfs)

100%|███████████████████████████████| 235433/235433 [00:00<00:00, 677665.27it/s]


235130

In [7]:
ukm = [f for f in filtered_pdfs if 'ukm.my' in f]
len(ukm)

18101

In [8]:
# !rm -rf pdf
!mkdir ukm

mkdir: cannot create directory ‘ukm’: File exists


In [9]:
max_worker = 1

ukm = sorted(list(ukm))
for i in tqdm(range(0, len(ukm), max_worker)):
    urls = [(url, os.path.join('ukm', f'{i}-{no}.pdf')) for no, url in enumerate(ukm[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(download, url[0], url[1]): url for url in urls}

        for future in as_completed(futures):
            future.result()

 99%|████████████████████████████████████▋| 17933/18101 [01:40<00:46,  3.59it/s]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%206%282%292008/Chap%203.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fed30>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%206%282%292008/Chap%203.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feb20>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%206%282%292008/Chap%203.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fef70>: Failed to establish a new connection: [Errno -2] Name or service not known'))
0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%206%282

 99%|████████████████████████████████████▋| 17933/18101 [01:59<00:46,  3.59it/s]

1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281%292009/Chapter2.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fedc0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281%292009/Chapter2.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feb20>: Failed to establish a new connection: [Errno -2] Name or service not known'))


 99%|████████████████████████████████████▋| 17947/18101 [02:00<01:15,  2.05it/s]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281%292009/Chapter3.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feca0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281%292009/Chapter3.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fefa0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281%292009/Chapter3.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fed00>: Failed to establish a new connection: [Errno -2] Name or service not known'))


 99%|████████████████████████████████████▋| 17948/18101 [02:01<01:17,  1.96it/s]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281%292009/Chapter4.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fec10>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281%292009/Chapter4.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fe790>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281%292009/Chapter4.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fecd0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%281

2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/HOLLY%20KNOX%20THOMPSON%20&%20PENELOPE%20A.%20HASKING.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fec10>: Failed to establish a new connection: [Errno -2] Name or service not known'))
0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/NOR%20AZURA%20AZMI,%20NOR%20AZLIN%20MOHD.%20NORDIN%20&%20NOR%20AZILA%20NOH.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fea30>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/NOR%20AZURA%20AZMI,%20NOR%20AZLIN%20MOHD.%20NORDIN%20&%20NOR%20AZILA%20NOH.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fefa0>: Failed 

 99%|████████████████████████████████████▋| 17948/18101 [02:19<01:17,  1.96it/s]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/NORHANI%20MOHIDIN%20&%20TEE%20LEE%20FUNG.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fec10>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/NORHANI%20MOHIDIN%20&%20TEE%20LEE%20FUNG.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fefd0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/NORHANI%20MOHIDIN%20&%20TEE%20LEE%20FUNG.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feee0>: Failed to establish a new connection: [Errno -2] Name or service not known'))


 99%|████████████████████████████████████▋| 17960/18101 [02:20<01:48,  1.29it/s]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/SUZANA%20SHAHAR%20&%20YOW%20BEE%20CHARN.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feca0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/SUZANA%20SHAHAR%20&%20YOW%20BEE%20CHARN.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fe100>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%207%282%292009/SUZANA%20SHAHAR%20&%20YOW%20BEE%20CHARN.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fef40>: Failed to establish a new connection: [Errno -2] Name or service not known'))


 99%|████████████████████████████████████▋| 17961/18101 [02:22<01:51,  1.26it/s]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%281%29%202010/Bab%201.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fe250>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%281%29%202010/Bab%201.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feca0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%281%29%202010/Bab%201.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fe850>: Failed to establish a new connection: [Errno -2] Name or service not known'))
0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%2

1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%2010.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feeb0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%2010.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fedc0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%202.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feee0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/ji

 99%|████████████████████████████████████▋| 17961/18101 [02:39<01:51,  1.26it/s]

2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%202.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fedf0>: Failed to establish a new connection: [Errno -2] Name or service not known'))


 99%|████████████████████████████████████▋| 17972/18101 [02:40<02:14,  1.04s/it]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%203.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fea60>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%203.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82febe0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%203.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feaf0>: Failed to establish a new connection: [Errno -2] Name or service not known'))


 99%|████████████████████████████████████▋| 17973/18101 [02:41<02:16,  1.06s/it]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%204.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fefd0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%204.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fed30>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%208%282%29%202010/Chap%204.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fe9d0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jili

 99%|████████████████████████████████████▊| 17982/18101 [02:55<02:27,  1.24s/it]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%209%281%29%202011/Chap%205_terbitan%209.1%20%282011%29.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fee20>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%209%281%29%202011/Chap%205_terbitan%209.1%20%282011%29.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fef70>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%209%281%29%202011/Chap%205_terbitan%209.1%20%282011%29.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fedf0>: Failed to establish a new connection: [Errno -2] Name or service not known'))


 99%|████████████████████████████████████▊| 17983/18101 [02:57<02:28,  1.26s/it]

0 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%209%281%29%202011/Chap%206_terbitan%209.1%20%282011%29.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fec40>: Failed to establish a new connection: [Errno -2] Name or service not known'))
1 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%209%281%29%202011/Chap%206_terbitan%209.1%20%282011%29.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82feca0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
2 HTTPConnectionPool(host='www.fsk.ukm.my', port=80): Max retries exceeded with url: /jurnal/jilid%209%281%29%202011/Chap%206_terbitan%209.1%20%282011%29.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fe9b82fec10>: Failed to establish a new connection: [Errno -2] Name or service not known'))
0 HTTPConnect

 99%|████████████████████████████████████▊| 17998/18101 [03:19<01:09,  1.48it/s]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
2 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|████████████████████████████████████▊| 18019/18101 [06:05<07:50,  5.73s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
2 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|████████████████████████████████████▊| 18020/18101 [09:06<17:33, 13.00s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|████████████████████████████████████▊| 18021/18101 [11:07<25:22, 19.04s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
2 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|████████████████████████████████████▊| 18023/18101 [17:11<59:00, 45.40s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
2 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████████████████████████████▊| 18024/18101 [20:12<1:19:13, 61.73s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
2 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████████████████████████████▊| 18025/18101 [23:14<1:40:24, 79.27s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
2 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████████████████████████████▊| 18026/18101 [26:16<2:01:12, 96.97s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|█████████████████████████████████▊| 18027/18101 [28:17<2:05:30, 101.76s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
2 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|████████████████████████████████████▉| 18057/18101 [31:39<04:47,  6.54s/it]

0 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
1 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
2 ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|█████████████████████████████████████| 18101/18101 [35:02<00:00,  8.61it/s]
