In [1]:
# Install the third-party dependencies with the %pip magic so they land in the
# environment of the running kernel rather than whichever `pip` is on PATH.
%pip install httpx beautifulsoup4 polars tqdm


Collecting httpx
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx
Successfully installed h11-0.14.0 httpcore-1.0.5 httpx-0.27.0


In [2]:
import os
import httpx
import asyncio
from bs4 import BeautifulSoup
import polars as pl
from tqdm.asyncio import tqdm
import json

In [3]:
# Highest page number to request for each risk level (first entry is level 1).
max_pages = [10, 20, 5, 15, 25]

# Listing URL template; `level` and `i` (the page number) are filled in per request.
base_url = "https://www.fortiguard.com/encyclopedia?type=ips&risk={level}&page={i}"

# Pages that could not be downloaded, keyed by level; filled in by fetch_page.
skipped_pages = dict()

# Output directory for the CSV/JSON files; no error if it already exists.
os.makedirs('datasets', exist_ok=True)


In [4]:

async def fetch_page(client, url, level, page):
    """Download one listing page and return its body text.

    Args:
        client: Object exposing an awaitable ``get(url, timeout=...)``;
            in this notebook an ``httpx.AsyncClient``.
        url: Address of the page to request.
        level: Key under which a failed page is recorded in ``skipped_pages``.
        page: Page number stored in ``skipped_pages`` when the request fails.

    Returns:
        The response text, or ``None`` when the request raised
        ``httpx.RequestError`` or ``httpx.HTTPStatusError``.
    """
    try:
        reply = await client.get(url, timeout=10.0)
        # Turn 4xx/5xx answers into HTTPStatusError so they are skipped too.
        reply.raise_for_status()
        body = reply.text
    except (httpx.RequestError, httpx.HTTPStatusError):
        # Remember the page that failed instead of aborting the whole run.
        if level not in skipped_pages:
            skipped_pages[level] = []
        skipped_pages[level].append(page)
        body = None
    return body

In [5]:
async def fetch_level_data(level, max_page):
    """Fetch pages 1..max_page of one risk level and collect the listed articles.

    All page requests for the level are issued concurrently through a single
    ``httpx.AsyncClient``. Pages for which ``fetch_page`` returned nothing
    contribute no rows.

    Args:
        level: Value substituted for ``{level}`` in ``base_url``.
        max_page: Highest page number to request (inclusive, starting at 1).

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts, one per anchor matched
        by ``.threat-detail > a`` that carries an ``href`` attribute.
    """
    async with httpx.AsyncClient() as client:
        tasks = [
            fetch_page(client, base_url.format(level=level, i=i), level, i)
            for i in range(1, max_page + 1)
        ]
        responses = await tqdm.gather(*tasks, desc=f"Fetching level {level}", ncols=100)

    # Parsing needs no network access, so it runs after the client is closed.
    data = []
    for html in responses:
        if not html:
            # Failed (None) or empty page: nothing to parse.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for article in soup.select('.threat-detail > a'):
            # `article['href']` raises KeyError on an anchor without href, which
            # would abort the whole gather; skip such anchors instead.
            link = article.get('href')
            if link is None:
                continue
            data.append({"title": article.get_text(strip=True), "link": link})
    return data



In [6]:
async def main():
    """Scrape every configured risk level and write the results under ``datasets/``.

    Runs ``fetch_level_data`` concurrently for each entry of ``max_pages``
    (levels are numbered from 1), writes one
    ``datasets/forti_lists_{level}.csv`` per level and dumps ``skipped_pages``
    to ``datasets/skipped.json``.
    """
    # `skipped_pages` is a module-level dict that `fetch_page` appends to, so
    # re-running this cell would otherwise carry over and duplicate entries
    # from the previous run. Clearing in place keeps the same dict object.
    skipped_pages.clear()

    results = await asyncio.gather(
        *(fetch_level_data(level, max_page) for level, max_page in enumerate(max_pages, 1))
    )

    for level, data in enumerate(results, 1):
        # Explicit schema: a level with no rows (e.g. every page skipped) still
        # yields a frame with `title`/`link` columns, so the CSV keeps its header.
        df = pl.DataFrame(data, schema={"title": pl.Utf8, "link": pl.Utf8})
        df.write_csv(f'datasets/forti_lists_{level}.csv')

    with open('datasets/skipped.json', 'w') as f:
        json.dump(skipped_pages, f, indent=4)

# Notebook entry point. The top-level `await` below is only accepted by front
# ends that support it (IPython/Jupyter autoawait); plain `python` rejects an
# `await` outside an async function.
if __name__ == "__main__":
    await main()

Fetching level 1:   0%|                                                      | 0/10 [00:00<?, ?it/s]
Fetching level 2:   0%|                                                      | 0/20 [00:00<?, ?it/s][A

Fetching level 3:   0%|                                                       | 0/5 [00:00<?, ?it/s][A[A


Fetching level 4:   0%|                                                      | 0/15 [00:00<?, ?it/s][A[A[A



Fetching level 5:   0%|                                                      | 0/25 [00:00<?, ?it/s][A[A[A[A
Fetching level 2:   5%|██▎                                           | 1/20 [00:08<02:49,  8.90s/it][A



Fetching level 5:   4%|█▊                                            | 1/25 [00:09<03:39,  9.16s/it][A[A[A[A


Fetching level 1:  10%|████▌                                         | 1/10 [00:10<01:34, 10.49s/it]
Fetching level 2:  10%|████▌                                         | 2/20 [00:10<01:22,  4.59s/it][A


Fetching level 4:  13%|██████▏