In [1]:
import urllib3
# Silence urllib3's InsecureRequestWarning: download() in this notebook calls
# requests.get with verify = False, which would otherwise warn on every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
import requests
import json
import os
import re
import time
from tqdm import tqdm
from glob import glob
from concurrent.futures import ThreadPoolExecutor, as_completed

In [19]:
def download(url, file_name, retry = 3, min_size = 50000, timeout = 360):
    """
    Download `url` into `file_name`, retrying on failure.

    The download is skipped when `file_name` already exists and is larger
    than `min_size` bytes. Otherwise the URL is fetched (TLS verification
    disabled) up to `retry` times. The payload is first written to
    `<file_name>.part` and only moved onto `file_name` once the request and
    the write both succeeded, so a failed attempt never leaves an empty or
    truncated `file_name` behind.

    Parameters
    ----------
    url : str
        Address to fetch.
    file_name : str
        Destination path.
    retry : int
        Maximum number of attempts.
    min_size : int
        An existing file strictly larger than this many bytes is treated as
        already downloaded.
    timeout : int or float
        Timeout forwarded to `requests.get`.

    Returns
    -------
    None
        Errors are printed (attempt index, exception, url) and not raised;
        if every attempt fails the function returns without creating
        `file_name`.
    """
    if os.path.exists(file_name) and os.path.getsize(file_name) > min_size:
        return

    tmp_name = f'{file_name}.part'
    for k in range(retry):
        try:
            response = requests.get(url, verify = False, timeout = timeout)
            # requests does not raise on 4xx/5xx by itself; without this check
            # the body of an error page would be stored as if it were the PDF.
            response.raise_for_status()
            with open(tmp_name, "wb") as file:
                file.write(response.content)
            # Publish the finished file in one step.
            os.replace(tmp_name, file_name)
            return
        except Exception as e:
            print(k, e, url)
            # Best-effort cleanup of a partially written temporary file.
            try:
                os.remove(tmp_name)
            except OSError:
                pass
            time.sleep(0.5)

In [4]:
# Collect every .json file directly under pages/; the trailing expression
# displays how many were found.
files = glob('pages/*.json')
len(files)

18787

In [5]:
# Concatenate the contents of every JSON file listed in `files`, then drop
# duplicate entries (the resulting order is whatever the set yields).
pdfs = []
for json_path in files:
    with open(json_path) as handle:
        pdfs += json.load(handle)

pdfs = list(set(pdfs))

In [6]:
# Four dot-separated digit groups, e.g. 10.0.0.1 (looks like a raw IPv4 host).
dotted_quad = re.compile(r'[0-9]+(?:\.[0-9]+){3}')

# Keep only links that end in .pdf, contain no dotted-quad number and do not
# mention ade.sagepub.com; a set removes duplicates before sorting.
kept_links = set()
for link in tqdm(pdfs):
    has_dotted_quad = dotted_quad.search(link) is not None
    if has_dotted_quad or not link.endswith('.pdf') or 'ade.sagepub.com' in link:
        continue
    kept_links.add(link)

filtered_pdfs = sorted(kept_links)
len(filtered_pdfs)

100%|███████████████████████████████| 235433/235433 [00:00<00:00, 628002.20it/s]


235130

In [9]:
# Subset of the filtered links whose text contains the upm.edu.my domain.
upm = list(filter(lambda link: 'upm.edu.my' in link, filtered_pdfs))
len(upm)

76

In [22]:
# Start from an empty upm/ output directory. NOTE: this deletes everything
# downloaded earlier, so the "already downloaded" size check in download()
# finds no existing files after this cell runs.
!rm -rf upm
!mkdir upm

In [23]:
# Number of links fetched concurrently per batch (also the batch size).
max_worker = 1

upm = sorted(upm)
for start in tqdm(range(0, len(upm), max_worker)):
    batch = upm[start: start + max_worker]
    # Target file name is "<batch start index>-<position inside batch>.pdf".
    urls = [
        (link, os.path.join('upm', f'{start}-{offset}.pdf'))
        for offset, link in enumerate(batch)
    ]

    # A fresh pool per batch; the batch is fully drained before moving on.
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {}
        for link, target in urls:
            futures[executor.submit(download, link, target)] = (link, target)

        for finished in as_completed(futures):
            # Re-raises anything that escaped download().
            finished.result()

100%|███████████████████████████████████████████| 76/76 [00:15<00:00,  4.77it/s]


In [21]:
# Total on-disk size of the upm/ download directory.
!du -hs upm

7.2M	upm


In [14]:
# Spot-check the byte size of one downloaded file. download() only skips files
# larger than 50000 bytes, so a much smaller value here presumably means the
# response was not a real PDF -- TODO confirm by inspecting the file.
os.path.getsize('upm/16-0.pdf')

1854

In [18]:
# List the downloaded files with human-readable sizes and timestamps.
!ls -lha upm

total 7.2M
drwxr-xr-x  2 husein husein 4.0K Mei  27 10:37 .
drwxr-xr-x 12 husein husein 4.0K Mei  27 16:59 ..
-rw-r--r--  1 husein husein  60K Mei  27 10:37 0-0.pdf
-rw-r--r--  1 husein husein 360K Mei  27 10:37 10-0.pdf
-rw-r--r--  1 husein husein  60K Mei  27 10:37 1-0.pdf
-rw-r--r--  1 husein husein 484K Mei  27 10:37 11-0.pdf
-rw-r--r--  1 husein husein 477K Mei  27 10:37 12-0.pdf
-rw-r--r--  1 husein husein 694K Mei  27 10:37 13-0.pdf
-rw-r--r--  1 husein husein 604K Mei  27 10:37 14-0.pdf
-rw-r--r--  1 husein husein 566K Mei  27 10:37 15-0.pdf
-rw-r--r--  1 husein husein 1.9K Mei  27 16:59 16-0.pdf
-rw-r--r--  1 husein husein 1.9K Mei  27 16:59 17-0.pdf
-rw-r--r--  1 husein husein 1.9K Mei  27 16:59 18-0.pdf
-rw-r--r--  1 husein husein 1.9K Mei  27 16:59 19-0.pdf
-rw-r--r--  1 husein husein 1.9K Mei  27 16:59 20-0.pdf
-rw-r--r--  1 husein husein  60K Mei  27 10:37 2-0.pdf
-rw-r--r--  1 husein husein 1.9K Mei  27 16:59 21-0.pdf
-rw-r--r--  1 husein husein 1.9K Me