In [1]:
pip install httpx beautifulsoup4 polars tqdm




In [2]:
import os
import httpx
import asyncio
from bs4 import BeautifulSoup
import polars as pl
from tqdm.asyncio import tqdm
import json

In [3]:
# Listing-page URL template: `level` is the risk level, `i` the page number.
base_url = (
    "https://www.fortiguard.com/encyclopedia"
    "?type=ips&risk={level}&page={i}"
)

# Number of listing pages to request for each risk level, in level order.
max_pages = [10, 20, 5, 15, 25]

# Risk level -> page numbers whose download failed (filled in while fetching).
skipped_pages = dict()

# Output directory for everything this notebook writes.
os.makedirs("datasets", exist_ok=True)


In [4]:

async def fetch_page(client, url, level, page):
    """Download one listing page and return its body text.

    Args:
        client: Object whose awaitable ``get(url, timeout=...)`` returns a
            response exposing ``raise_for_status()`` and ``text``.
        url: Address of the page to download.
        level: Risk level the page belongs to; used as the key under which
            a failure is recorded in ``skipped_pages``.
        page: Page number; appended to ``skipped_pages[level]`` on failure.

    Returns:
        The response text, or ``None`` when the request raised
        ``httpx.RequestError`` or ``httpx.HTTPStatusError``.
    """
    try:
        reply = await client.get(url, timeout=10.0)
        reply.raise_for_status()
        return reply.text
    except (httpx.RequestError, httpx.HTTPStatusError):
        # Best effort: note the page as skipped instead of aborting the run.
        if level not in skipped_pages:
            skipped_pages[level] = []
        skipped_pages[level].append(page)
        return None

In [5]:
async def fetch_level_data(level, max_page):
    """Fetch pages 1..max_page of one risk level and extract the threat links.

    All pages of the level are requested concurrently through a single
    ``httpx.AsyncClient``; a tqdm progress bar tracks completion. Pages that
    failed to download come back from ``fetch_page`` as ``None`` and are
    skipped here.

    Args:
        level: Risk level substituted into ``base_url``.
        max_page: Last page number to request (inclusive, starting at 1).

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts, one per anchor
        matched by ``.threat-detail > a`` that carries an ``href``.
    """
    async with httpx.AsyncClient() as client:
        tasks = [
            fetch_page(client, base_url.format(level=level, i=i), level, i)
            for i in range(1, max_page + 1)
        ]
        responses = await tqdm.gather(*tasks, desc=f"Fetching level {level}", ncols=100)

    # Parsing needs no network access, so it runs after the client is closed.
    data = []
    for html in responses:
        if not html:
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for article in soup.select('.threat-detail > a'):
            # An anchor without href would raise KeyError on article['href']
            # and discard the whole level; skip such anchors instead.
            link = article.get('href')
            if link is None:
                continue
            data.append({"title": article.get_text(strip=True), "link": link})
    return data



In [6]:
async def main():
    """Scrape every configured risk level and write the results to disk.

    Runs ``fetch_level_data`` concurrently for each entry of ``max_pages``
    (levels are numbered from 1), writes one CSV per level to
    ``datasets/forti_lists_{level}.csv`` and dumps the failed pages recorded
    in ``skipped_pages`` to ``datasets/skipped.json``.
    """
    # skipped_pages is module-level state: reset it so that re-running this
    # cell does not append the same page numbers a second time.
    skipped_pages.clear()

    results = await asyncio.gather(
        *(fetch_level_data(level, max_page)
          for level, max_page in enumerate(max_pages, 1))
    )

    # Explicit schema so a level with no rows still produces a CSV with the
    # same header as the others instead of a column-less frame.
    schema = {"title": pl.Utf8, "link": pl.Utf8}
    for level, data in enumerate(results, 1):
        df = pl.DataFrame(data, schema=schema)
        df.write_csv(f'datasets/forti_lists_{level}.csv')

    with open('datasets/skipped.json', 'w') as f:
        json.dump(skipped_pages, f, indent=4)

# Entry point of the notebook: run the whole scrape.
# NOTE: a bare top-level `await` is not valid in a plain Python script; this
# cell relies on the notebook kernel executing it inside a running event loop.
if __name__ == "__main__":
    await main()

Fetching level 1:   0%|                                                      | 0/10 [00:00<?, ?it/s]
Fetching level 2:   0%|                                                      | 0/20 [00:00<?, ?it/s][A

Fetching level 3:   0%|                                                       | 0/5 [00:00<?, ?it/s][A[A


Fetching level 4:   0%|                                                      | 0/15 [00:00<?, ?it/s][A[A[A



Fetching level 1:  50%|███████████████████████                       | 5/10 [00:10<00:07,  1.54s/it]



Fetching level 1:  80%|████████████████████████████████████▊         | 8/10 [00:10<00:01,  1.25it/s]
Fetching level 1: 100%|█████████████████████████████████████████████| 10/10 [00:10<00:00,  1.09s/it]


Fetching level 3: 100%|███████████████████████████████████████████████| 5/5 [00:11<00:00,  2.21s/it]

Fetching level 2: 100%|█████████████████████████████████████████████| 20/20 [00:11<00:00,  1.80it/s]



Fetching level 4: 100%|███████████████████████████████████