In [1]:
import requests
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import os
import json

In [2]:
base_url = 'https://forums.hardwarezone.com.sg/'
r = requests.get(base_url)

In [3]:
soup = BeautifulSoup(r.content, "lxml")
a = soup.find_all('a', {'class': 'subNodeLink'})
a.extend(soup.find_all('a', {'data-shortcut': 'node-description'}))
a = [a_.get('href') for a_ in a]
topics = sorted(list(set([a_ for a_ in a if '/link-forums' not in a_])))

In [4]:
!mkdir topics

mkdir: cannot create directory ‘topics’: File exists


In [12]:
topics

['/forums/android-user-group.323/',
 '/forums/apple-clinic.5/',
 '/forums/apple-watch.374/',
 '/forums/australia.317/',
 '/forums/campus-zone.180/',
 '/forums/cars-cars.204/',
 '/forums/certified-systems-it-security-and-network-trainin.68/',
 '/forums/chill-out-hangout-den.234/',
 '/forums/china.316/',
 '/forums/console-gaming.383/',
 '/forums/cpu-ram-mainboard-bazaar.199/',
 '/forums/credit-cards-line-of-credit-facilities.243/',
 '/forums/current-affairs-lounge.17/',
 '/forums/degree-programs-and-courses.70/',
 '/forums/design-visual-art-gallery-sig.123/',
 '/forums/digital-cameras-photography.12/',
 '/forums/diploma-programs-and-courses.67/',
 '/forums/eat-drink-man-woman.16/',
 '/forums/electronics-bazaar.259/',
 '/forums/employment-office.22/',
 '/forums/english-premier-league.179/',
 '/forums/europe.277/',
 '/forums/events-entertainment-celebrity-buzz.242/',
 '/forums/fashion-bazaar.308/',
 '/forums/fashion-grooming.235/',
 '/forums/football-and-sports-arena.20/',
 '/forums/gaming

In [5]:
max_worker = 20

def get_href(url):
    while True:
        try:
            r = requests.get(url, timeout = 10.0)
            break
        except Exception as e:
            time.sleep(1.0)
    soup = BeautifulSoup(r.content, "lxml")
    a = soup.find_all('a')
    a = [a_.get('href') for a_ in a if a_.get('href')]
    try:
        max_page = max([int(a_.split('page-')[-1]) for a_ in a if '/forums/' in a_ and '/page-' in a_])
    except:
        max_page = -1
    a = [a_ for a_ in a if a_.startswith('/threads')]
    filtered = []
    for a_ in a:
        splitted = a_.split('/')
        if len(splitted) == 4 and splitted[-1] == '':
            filtered.append(a_)
    return filtered, max_page

In [6]:
a_ = topics[0]
base_url = f'https://forums.hardwarezone.com.sg{a_}'
    
filtered, max_page = get_href(base_url)

In [7]:
base_url

'https://forums.hardwarezone.com.sg/forums/android-user-group.323/'

In [8]:
filtered, max_page

(['/threads/batteries-charging-and-you-why-no-100-charge-on-your-phone.3070181/',
  '/threads/batteries-charging-and-you-why-no-100-charge-on-your-phone.3070181/',
  '/threads/switching-from-iphone-to-android.4853305/',
  '/threads/switching-from-iphone-to-android.4853305/',
  '/threads/read-before-you-post-rules-in-android-user-group.3069765/',
  '/threads/read-before-you-post-rules-in-android-user-group.3069765/',
  '/threads/noobs-a-dummies-guide-to-android-terminology.3071051/',
  '/threads/noobs-a-dummies-guide-to-android-terminology.3071051/',
  '/threads/google-pixel-7a.6825753/',
  '/threads/google-pixel-7a.6825753/',
  '/threads/google-tensor-g3-and-beyond.6900061/',
  '/threads/google-tensor-g3-and-beyond.6900061/',
  '/threads/exynos-2400-leaps-over-the-adreno-750-of-snapdragon-8-gen-3.6896773/',
  '/threads/exynos-2400-leaps-over-the-adreno-750-of-snapdragon-8-gen-3.6896773/',
  '/threads/htc-u23-pro-incoming-upper-midranger.6901067/',
  '/threads/htc-u23-pro-incoming-upper

In [11]:
for i in tqdm(range(len(topics))):
    a_ = topics[i]
    filename = os.path.join('topics', f'{i}.json')
    if os.path.exists(filename):
        continue
        
    base_url = f'https://forums.hardwarezone.com.sg{a_}'
    
    filtered, max_page = get_href(base_url)
    
    for i in tqdm(range(2, min(5000, max_page + 1), max_worker)):
        aranged = np.arange(i, i + max_worker)
        urls = [f'{base_url}page-{a}' for a in aranged]

        with ThreadPoolExecutor(max_workers=max_worker) as executor:
            futures = {executor.submit(get_href, url): url for url in urls}

            for future in as_completed(futures):
                filtered.extend(future.result()[0])
                
    filtered = list(set(filtered))
    with open(filename, 'w') as fopen:
        json.dump(filtered, fopen)

  0%|                                                    | 0/97 [00:00<?, ?it/s]
  0%|                                                   | 0/250 [00:00<?, ?it/s][A
  0%|▏                                          | 1/250 [00:01<04:43,  1.14s/it][A
  1%|▎                                          | 2/250 [00:02<04:47,  1.16s/it][A
  1%|▌                                          | 3/250 [00:03<04:55,  1.19s/it][A
  2%|▋                                          | 4/250 [00:04<05:03,  1.23s/it][A
  2%|▊                                          | 5/250 [00:06<05:16,  1.29s/it][A
  2%|█                                          | 6/250 [00:07<05:18,  1.31s/it][A
  3%|█▏                                         | 7/250 [00:08<05:21,  1.32s/it][A
  3%|█▍                                         | 8/250 [00:10<05:25,  1.35s/it][A
  4%|█▌                                         | 9/250 [00:11<05:33,  1.38s/it][A
  4%|█▋                                        | 10/250 [00:13<06:03,  1.51s/it

 77%|███████████████████████████████▋         | 193/250 [12:14<06:15,  6.59s/it][A
 78%|███████████████████████████████▊         | 194/250 [12:20<05:57,  6.39s/it][A
 78%|███████████████████████████████▉         | 195/250 [12:26<05:40,  6.19s/it][A
 78%|████████████████████████████████▏        | 196/250 [12:32<05:28,  6.09s/it][A
 79%|████████████████████████████████▎        | 197/250 [12:38<05:29,  6.21s/it][A
 79%|████████████████████████████████▍        | 198/250 [12:45<05:30,  6.35s/it][A
 80%|████████████████████████████████▋        | 199/250 [12:51<05:21,  6.30s/it][A
 80%|████████████████████████████████▊        | 200/250 [12:57<05:15,  6.31s/it][A
 80%|████████████████████████████████▉        | 201/250 [13:03<05:06,  6.25s/it][A
 81%|█████████████████████████████████▏       | 202/250 [13:10<05:01,  6.27s/it][A
 81%|█████████████████████████████████▎       | 203/250 [13:17<05:02,  6.44s/it][A
 82%|█████████████████████████████████▍       | 204/250 [13:23<04:55,  6.43s

 30%|█████████████▏                              | 3/10 [00:03<00:08,  1.18s/it][A
 40%|█████████████████▌                          | 4/10 [00:04<00:07,  1.20s/it][A
 50%|██████████████████████                      | 5/10 [00:06<00:06,  1.22s/it][A
 60%|██████████████████████████▍                 | 6/10 [00:07<00:04,  1.22s/it][A
 70%|██████████████████████████████▊             | 7/10 [00:08<00:03,  1.27s/it][A
 80%|███████████████████████████████████▏        | 8/10 [00:10<00:02,  1.32s/it][A
 90%|███████████████████████████████████████▌    | 9/10 [00:11<00:01,  1.34s/it][A
100%|███████████████████████████████████████████| 10/10 [00:12<00:00,  1.25s/it][A
 41%|█████████████████▋                         | 40/97 [24:04<16:24, 17.27s/it]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
 14%|██████▍                                      | 1/7 [00:01<00:07,  1.19s/it][A
 29%|████████████▊                                | 2/7 [00:02<00:05,  1.20s/it

 62%|██████████████████████████▌                | 60/97 [27:28<04:09,  6.75s/it]
  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.11s/it][A
 63%|███████████████████████████                | 61/97 [27:29<03:04,  5.13s/it]
  0%|                                                    | 0/12 [00:00<?, ?it/s][A
  8%|███▋                                        | 1/12 [00:01<00:13,  1.20s/it][A
 17%|███████▎                                    | 2/12 [00:02<00:12,  1.21s/it][A
 25%|███████████                                 | 3/12 [00:03<00:11,  1.24s/it][A
 33%|██████████████▋                             | 4/12 [00:04<00:09,  1.25s/it][A
 42%|██████████████████▎                         | 5/12 [00:06<00:08,  1.25s/it][A
 50%|██████████████████████                      | 6/12 [00:07<00:07,  1.27s/it][A
 58%|█████████████████████████▋                  | 7/12 [00:08<00:06,  1.29s/it][

 21%|█████████▍                                  | 3/14 [00:03<00:13,  1.25s/it][A
 29%|████████████▌                               | 4/14 [00:05<00:12,  1.25s/it][A
 36%|███████████████▋                            | 5/14 [00:06<00:11,  1.27s/it][A
 43%|██████████████████▊                         | 6/14 [00:07<00:10,  1.29s/it][A
 50%|██████████████████████                      | 7/14 [00:09<00:09,  1.33s/it][A
 57%|█████████████████████████▏                  | 8/14 [00:10<00:07,  1.33s/it][A
 64%|████████████████████████████▎               | 9/14 [00:11<00:06,  1.35s/it][A
 71%|██████████████████████████████▋            | 10/14 [00:13<00:05,  1.38s/it][A
 79%|█████████████████████████████████▊         | 11/14 [00:14<00:04,  1.39s/it][A
 86%|████████████████████████████████████▊      | 12/14 [00:16<00:02,  1.40s/it][A
 93%|███████████████████████████████████████▉   | 13/14 [00:17<00:01,  1.40s/it][A
100%|███████████████████████████████████████████| 14/14 [00:18<00:00,  1.35s

In [13]:
!du -hs topics

20M	topics


In [15]:
# !zip -r topics.zip topics