In [1]:
pip install nest_asyncio

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: C:\Users\metranet\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
from bs4 import BeautifulSoup

# Sample markup used to try out the title extraction offline.
html_snippet = '''
<div class="col-lg" style="word-break:break-all">
    <small><img class="me-1" src="https://filestore.fortinet.com/fortiguard/static/images/icons_white/ips.svg?v=24687" width="20" alt="ips-logo"> Intrusion Prevention</small>
    <br>
    <b>Qlik.Sense.HTTP.Tunneling.Vulnerability</b>
</div>
'''

# Build the document tree from the sample markup.
soup = BeautifulSoup(html_snippet, 'html.parser')

# The title is the text of the first <b> tag; fall back to a placeholder
# when the snippet has no such tag.
title_element = soup.find('b')
if title_element:
    title = title_element.text.strip()
else:
    title = 'Title not found'

print('Title:', title)


Title: Qlik.Sense.HTTP.Tunneling.Vulnerability


In [1]:
import asyncio
import csv
import json
import os

import httpx
import nest_asyncio
from bs4 import BeautifulSoup
from tqdm import tqdm

# Apply nest_asyncio so asyncio can be run inside a Jupyter Notebook
nest_asyncio.apply()

# Base URL template; the two placeholders are filled with the risk level
# and the page number, in that order
BASE_URL = "https://www.fortiguard.com/encyclopedia?type=ips&risk={}&page={}"

# Maximum number of pages to scrape for each level (one entry per level,
# levels are numbered starting at 1)
max_pages = [10, 10, 10, 10, 10]

# Output directory
OUTPUT_DIR = "datasets"

async def scrape_data(url):
    """Fetch ``url`` and return the response body, or ``None`` on failure.

    A new ``httpx.AsyncClient`` is opened for the request and closed again
    before returning. HTTP status errors and request errors are reported on
    stdout and turned into a ``None`` result instead of propagating.

    Args:
        url: Address to issue the GET request against.

    Returns:
        ``response.content`` of the successful response, otherwise ``None``.
    """
    async with httpx.AsyncClient() as session:
        try:
            reply = await session.get(url)
            reply.raise_for_status()
        except (httpx.HTTPStatusError, httpx.RequestError) as err:
            print(f"Error fetching URL: {url}")
            print(err)
            return None
        else:
            return reply.content

def parse_html(html):
    """Extract title/link records from one listing page.

    Every ``<section class="table-body">`` is inspected. A record is produced
    only when the section contains a ``<b>`` tag (the title) and a ``<div>``
    carrying an ``onclick`` attribute from which a path can be extracted;
    sections missing any of these are skipped rather than raising.

    Args:
        html: Markup of the page, or ``None`` when the fetch failed.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts (empty for ``None``
        input or when nothing matches).
    """
    if html is None:
        return []

    soup = BeautifulSoup(html, "html.parser")
    data = []
    for item in soup.find_all("section", class_="table-body"):
        title_element = item.find("b")
        # onclick=True already restricts the match to divs having the attribute.
        link_element = item.find("div", onclick=True)
        if title_element is None or link_element is None:
            continue

        # The path is the text between the first pair of single quotes in the
        # handler (presumably something like location.href='/path' -- confirm
        # against the live markup). Without a quoted segment there is no
        # usable link, so the entry is skipped instead of raising IndexError.
        pieces = link_element["onclick"].split("'")
        if len(pieces) < 2:
            continue

        data.append({
            "title": title_element.text.strip(),
            "link": "https://www.fortiguard.com" + pieces[1],
        })
    return data

def save_to_csv(level, data):
    """Write one level's scraped entries to ``forti_lists_<level>.csv``.

    The file is created inside ``OUTPUT_DIR`` and overwritten if it already
    exists. Rows go through ``csv.writer`` so that titles or links containing
    commas, quotes, or newlines are quoted instead of corrupting the columns,
    and the file is always encoded as UTF-8 rather than the platform default.

    Args:
        level: Level identifier used in the filename and the log message.
        data: Iterable of dicts, each with ``"title"`` and ``"link"`` keys.
    """
    filename = os.path.join(OUTPUT_DIR, f"forti_lists_{level}.csv")
    # newline="" hands control of line endings to the csv module.
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Link"])
        writer.writerows((item["title"], item["link"]) for item in data)
    print(f"Data written to CSV for level {level}")

def save_skipped_pages(skipped):
    """Dump the record of skipped pages to ``skipped.json`` in ``OUTPUT_DIR``.

    Args:
        skipped: JSON-serialisable object describing the pages that were
            skipped; it is written with a four-space indent.
    """
    target = os.path.join(OUTPUT_DIR, "skipped.json")
    with open(target, "w") as handle:
        json.dump(skipped, handle, indent=4)
    print("Skipped pages saved to JSON")

async def main():
    """Scrape every configured level page by page and persist the results.

    Levels are numbered from 1, one per entry in ``max_pages``. For each
    level the listing pages are fetched one after another, parsed, and the
    entries accumulated; the level's entries are then written to a CSV file.
    Pages whose fetch returned ``None`` are recorded per level and written
    out as JSON once all levels are done.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    failed_pages = {}

    for level, page_count in enumerate(max_pages, start=1):
        print(f"Scraping Level {level}...")
        entries = []

        for page_no in tqdm(range(1, page_count + 1)):
            html = await scrape_data(BASE_URL.format(level, page_no))

            if html is not None:
                entries.extend(parse_html(html))
            else:
                # Remember the page so it can be reported afterwards.
                failed_pages.setdefault(level, []).append(page_no)

        save_to_csv(level, entries)

    save_skipped_pages(failed_pages)

# Run the main coroutine to completion through asyncio
asyncio.run(main())

Scraping Level 1...


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


Data written to CSV for level 1
Scraping Level 2...


100%|██████████| 10/10 [00:08<00:00,  1.21it/s]


Data written to CSV for level 2
Scraping Level 3...


100%|██████████| 10/10 [00:07<00:00,  1.26it/s]


Data written to CSV for level 3
Scraping Level 4...


100%|██████████| 10/10 [00:08<00:00,  1.17it/s]


Data written to CSV for level 4
Scraping Level 5...


100%|██████████| 10/10 [00:08<00:00,  1.23it/s]

Data written to CSV for level 5
Skipped pages saved to JSON



