In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import json
import numpy as np
import time
import random
import json

In [2]:
from glob import glob

# Gather every topic-listing file in the working directory, ordered by name.
files = glob('*-topics.json')
files.sort()

# Cell output: the first listing file and the total number found.
files[0], len(files)

('AnimeShrine-topics.json', 59)

In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_text(url, timeout=30):
    """
    Download one page and return the text of every post found on it.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot block
        the calling worker forever. Defaults to 30 seconds.

    Returns
    -------
    list of str
        Stripped, non-empty text of each ``<div class="post_text">`` in
        document order, with ``<br/>`` tags turned into newlines.
        The function is best-effort: if anything raises, the failure is
        logged and whatever was collected so far (possibly ``[]``) is
        returned instead of propagating the exception.
    """
    import logging

    log = logging.getLogger(__name__)
    texts = []
    try:
        r_ = requests.get(url, timeout=timeout)
        if not r_.ok:
            # Still parse the body (as before), but make the bad status visible.
            log.warning('get_text: HTTP %s for %s', r_.status_code, url)
        soup = BeautifulSoup(r_.content, "lxml")
        for div in soup.find_all('div', {'class': 'post_text'}):
            # Re-parse the serialized div so that line breaks survive as '\n'.
            # The parser is named explicitly to match the outer parse and to
            # avoid bs4's "no parser was explicitly specified" warning.
            html = str(div).replace('<br/>', '\n')
            t = BeautifulSoup(html, "lxml").text.strip()
            if t:
                texts.append(t)
    except Exception as e:
        # Deliberately broad: callers rely on getting a list back no matter
        # what. Log instead of silently discarding the error.
        log.warning('get_text: failed for %s: %r', url, e)
    return texts

In [4]:
# --- Crawl configuration ---------------------------------------------------
limit = 20                        # offset step between consecutive page URLs
worker_size = 1                   # pages fetched concurrently per batch
per_batch = worker_size * limit   # offset span covered by one batch / one output file
max_offset = 100000               # hard upper bound on the offsets tried per topic

# For every topic-listing file, download each topic page by page.
# Layout on disk:
#   <directory>/<base_dir>/<offset>.json  -> list of post texts for that batch
#   done-<directory>/<base_dir>.json      -> marker written once a topic is finished
# Both are checked before any request is made, so the cell can be re-run to resume.
for k, f in enumerate(files):
    print(k, f)
    directory = f.split('-topics')[0]
    done_dir = f'done-{directory}'
    os.makedirs(done_dir, exist_ok = True)
    os.makedirs(directory, exist_ok = True)

    with open(f) as fopen:
        all_a = json.load(fopen)

    for topic in tqdm(sorted(all_a)):

        # Drop the first character and flatten '/' so the topic becomes a single folder name.
        base_dir = topic[1:].replace('/', '-')
        os.makedirs(os.path.join(directory, base_dir), exist_ok = True)

        base_dir_filename = os.path.join(done_dir, f'{base_dir}.json')

        # Topic already completed in an earlier run.
        if os.path.exists(base_dir_filename):
            continue

        need_break = False
        last_texts = []
        for i in range(0, max_offset, per_batch):
            filename = os.path.join(directory, base_dir, f'{i}.json')
            # Batch already saved in an earlier run.
            if os.path.exists(filename):
                continue

            # One URL per page in this batch; offset 0 is the bare topic URL.
            # NOTE(review): base_dir strips the first character of topic, which
            # suggests topic starts with '/'; if so these URLs contain '//' after
            # the host - confirm the server accepts that.
            urls = []
            for a_ in range(i, i + per_batch, limit):
                suffix = f'/+{a_}' if a_ else ''
                urls.append(f'https://forum.lowyat.net/{topic}{suffix}')

            texts = []

            with ThreadPoolExecutor(max_workers=worker_size) as executor:
                futures = {executor.submit(get_text, item): item for item in urls}

                for future in as_completed(futures):
                    r = future.result()
                    texts.extend(r)

                    # An empty page ends the topic. get_text also returns []
                    # when the request fails, so a failure is treated the same.
                    if not len(r):
                        need_break = True

                    # Nothing new compared with what was seen before this
                    # result: the page repeated earlier content, so stop.
                    if set(texts).issubset(last_texts):
                        need_break = True

                    last_texts = texts[:]

            with open(filename, 'w') as fopen:
                json.dump(texts, fopen)

            # Random pause between batches to avoid hammering the server.
            time.sleep(random.uniform(0.5, 1.0))

            if need_break:
                break

        # Mark the topic as finished so later runs skip it.
        with open(base_dir_filename, 'w') as fopen:
            json.dump({'status': True}, fopen)

0 AnimeShrine-topics.json


100%|████████████████████████████████████| 3672/3672 [00:00<00:00, 44358.47it/s]


1 Announcements-topics.json


100%|██████████████████████████████████████| 156/156 [00:00<00:00, 53684.89it/s]


2 AppleByte-topics.json


100%|████████████████████████████████████| 8441/8441 [00:00<00:00, 80532.73it/s]

3 Arts&Designs-topics.json



100%|████████████████████████████████████| 6604/6604 [00:00<00:00, 72051.67it/s]


4 Brides&Grooms-topics.json


100%|██████████████████████████████████████| 821/821 [00:00<00:00, 71070.82it/s]


5 BusinessForSale-topics.json


100%|████████████████████████████████████| 2027/2027 [00:00<00:00, 73022.42it/s]


6 CarsForSale-topics.json


100%|██████████████████████████████████████| 684/684 [00:00<00:00, 76937.00it/s]


7 CasingsandModifications-topics.json


100%|████████████████████████████████████| 1899/1899 [00:00<00:00, 80050.89it/s]


8 Codemasters-topics.json


100%|██████████████████████████████████| 16086/16086 [00:00<00:00, 70947.64it/s]


9 ConsolesCouch-topics.json


100%|████████████████████████████████████| 2791/2791 [00:00<00:00, 57600.97it/s]


10 ContentCreatorsBlogmasters&Webmasters-topics.json


100%|████████████████████████████████████| 8611/8611 [00:00<00:00, 70974.78it/s]


11 CupidsCorner-topics.json


100%|██████████████████████████████████| 16808/16808 [00:00<00:00, 75288.23it/s]


12 DesktopCustomization-topics.json


100%|████████████████████████████████████| 1734/1734 [00:00<00:00, 81712.73it/s]


13 E-hailing&Ride-sharing-topics.json


100%|██████████████████████████████████████| 528/528 [00:00<00:00, 69929.35it/s]


14 EducationEssentials-topics.json


100%|██████████████████████████████████| 33121/33121 [00:00<00:00, 69795.56it/s]


15 EventsandGatherings-topics.json


100%|████████████████████████████████████| 7787/7787 [00:00<00:00, 74853.48it/s]


16 FeedbackandHelpdesk-topics.json


100%|████████████████████████████████████| 8587/8587 [00:00<00:00, 75862.68it/s]


17 FinanceBusinessandInvestmentHouse-topics.json


100%|██████████████████████████████████| 19703/19703 [00:00<00:00, 63110.41it/s]


18 Food&Flavours-topics.json


100%|████████████████████████████████████| 2724/2724 [00:00<00:00, 74038.23it/s]


19 FootballLounge-topics.json


100%|████████████████████████████████████| 1753/1753 [00:00<00:00, 26235.35it/s]


20 GamersHideout-topics.json


100%|██████████████████████████████████| 14686/14686 [00:00<00:00, 66835.51it/s]


21 GarageSales-topics.json


100%|██████████████████████████████████| 44525/44525 [00:00<00:00, 69690.01it/s]


22 GarageSalesArchive-topics.json


100%|██████████████████████████████████| 99941/99941 [00:01<00:00, 68312.44it/s]


23 GarageSalesHelpdesk-topics.json


0it [00:00, ?it/s]


24 GirlsClub-topics.json


100%|████████████████████████████████████| 4689/4689 [00:00<00:00, 73232.74it/s]


25 Hardware-topics.json


100%|██████████████████████████████████| 56978/56978 [00:00<00:00, 65035.14it/s]


26 Health&Fitness-topics.json


100%|██████████████████████████████████| 16469/16469 [00:00<00:00, 76431.62it/s]


27 HobbiesCollectiblesandModelKits-topics.json


100%|████████████████████████████████████| 2508/2508 [00:00<00:00, 80114.81it/s]


28 HomeEntertainment-topics.json


100%|████████████████████████████████████| 7575/7575 [00:00<00:00, 82036.34it/s]


29 IPTVTalk-topics.json


100%|██████████████████████████████████████| 451/451 [00:00<00:00, 72490.17it/s]


30 JobEnlistments-topics.json


100%|██████████████████████████████████| 57693/57693 [00:00<00:00, 69081.10it/s]


31 Jobs&Careers-topics.json


100%|██████████████████████████████████| 37509/37509 [00:00<00:00, 63956.21it/s]


32 LYNCharityFoundation-topics.json


100%|██████████████████████████████████████| 668/668 [00:00<00:00, 65598.91it/s]


33 Linux&OpenSourceSoftware-topics.json


100%|████████████████████████████████████| 4347/4347 [00:00<00:00, 68020.05it/s]


34 MensStyle&Fashion-topics.json


100%|████████████████████████████████████| 3054/3054 [00:00<00:00, 73635.46it/s]


35 MobileComputing-topics.json


100%|██████████████████████████████████| 17783/17783 [00:00<00:00, 73898.28it/s]


36 MobileGamersHangout-topics.json


100%|████████████████████████████████████| 1767/1767 [00:00<00:00, 76690.93it/s]


37 MobilePhonesandTablets-topics.json


100%|██████████████████████████████████| 25079/25079 [00:00<00:00, 67116.89it/s]


38 Movies&Music-topics.json


100%|████████████████████████████████████| 7992/7992 [00:00<00:00, 74026.28it/s]


39 Musicians-topics.json


100%|████████████████████████████████████| 7111/7111 [00:00<00:00, 75190.01it/s]


40 NetworksandBroadband-topics.json


100%|██████████████████████████████████| 34522/34522 [00:00<00:00, 72312.37it/s]


41 OverclockersUnited-topics.json


100%|████████████████████████████████████| 1797/1797 [00:00<00:00, 49519.82it/s]


42 PetsWonderland-topics.json


100%|████████████████████████████████████| 5431/5431 [00:00<00:00, 74448.12it/s]


43 PhotographyDigitalImaging&Video-topics.json


100%|██████████████████████████████████| 11910/11910 [00:00<00:00, 74913.90it/s]


44 Pregnancy&Parenting-topics.json


100%|████████████████████████████████████| 3659/3659 [00:00<00:00, 78094.81it/s]


45 Price&DealersGuide-topics.json


100%|████████████████████████████████████| 9844/9844 [00:00<00:00, 77502.01it/s]


46 PropertyForRent-topics.json


100%|██████████████████████████████████| 12202/12202 [00:00<00:00, 74649.75it/s]


47 PropertyForSale-topics.json


100%|████████████████████████████████| 100007/100007 [00:01<00:00, 65128.53it/s]


48 ReversePhoneDirectory-topics.json


100%|██████████████████████████████████████| 212/212 [00:00<00:00, 53010.16it/s]


49 ReviewsandGuides-topics.json


100%|████████████████████████████████████| 2249/2249 [00:00<00:00, 76491.97it/s]


50 ServicesNoticeboard-topics.json


100%|██████████████████████████████████| 12506/12506 [00:00<00:00, 65325.83it/s]


51 Software-topics.json


100%|██████████████████████████████████| 12506/12506 [00:00<00:00, 67459.88it/s]


52 TechnicalSupport-topics.json


100%|██████████████████████████████████| 67600/67600 [00:01<00:00, 58563.27it/s]


53 TelcoTalk-topics.json


100%|████████████████████████████████████| 2191/2191 [00:00<00:00, 52555.93it/s]


54 TheFast&TheFurious-topics.json


100%|██████████████████████████████████| 44487/44487 [00:00<00:00, 66662.88it/s]


55 TheSportsChannel-topics.json


100%|████████████████████████████████████| 3301/3301 [00:00<00:00, 62315.57it/s]


56 Timepieces&Jewelleries-topics.json


100%|██████████████████████████████████████| 199/199 [00:00<00:00, 50112.06it/s]


57 Travel&Living-topics.json


100%|████████████████████████████████████| 3847/3847 [00:00<00:00, 75271.09it/s]


58 Windows11-topics.json


100%|████████████████████████████████████████| 40/40 [00:00<00:00, 49475.72it/s]
