In [None]:
!pip install beautifulsoup4 ipywidgets requests rich tqdm

In [None]:
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import json
import os
import requests
from typing import Any
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from rich.console import Console
from tqdm.notebook import tqdm


# Shared rich console, used below to pretty-print scraped data and tracebacks.
console = Console()

def get_root_dir() -> str:
    """Return the path of the enclosing ``BloodOnTheClocktower`` directory.

    Starting at the current working directory, walk up the directory tree
    until a directory whose base name is ``BloodOnTheClocktower`` is found.

    Returns:
        The path of that directory (the working directory itself when it is
        already the project root).

    Raises:
        FileNotFoundError: if neither the working directory nor any of its
            ancestors is named ``BloodOnTheClocktower``.
    """
    start_dir = os.getcwd()
    dirpath = start_dir
    while os.path.basename(dirpath) != 'BloodOnTheClocktower':
        parent = os.path.dirname(dirpath)
        # dirname() of a filesystem root is the root itself; without this
        # check the loop would spin forever once the top is reached.
        if parent == dirpath:
            raise FileNotFoundError(
                f"No 'BloodOnTheClocktower' directory found at or above {start_dir!r}"
            )
        dirpath = parent
    return dirpath


def get_edition_dir() -> str:
    """Return the ``content/editions`` directory located under the project root."""
    project_root = get_root_dir()
    path_parts = (project_root, 'content', 'editions')
    return os.path.join(*path_parts)


# Entry page of the wiki; the edition links are discovered from here.
main_page_url = "https://wiki.bloodontheclocktower.com/Main_Page"


def get_edition_links(timeout: float = 30.0) -> list[str]:
    """Return the absolute URL of every edition page linked from the main page.

    The links are taken from the first ``div.row`` sibling that follows the
    "Characters By Edition" ``h2`` heading. Anchors whose class is
    ``internal`` (case-insensitive) are skipped.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        Absolute URLs, resolved against ``main_page_url``.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-success HTTP status.
        ValueError: if the expected heading or link container is missing
            from the page.
    """
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(main_page_url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    main_page_soup = BeautifulSoup(response.text, "html.parser")

    edition_header = main_page_soup.find("h2", string="Characters By Edition")
    if edition_header is None:
        raise ValueError(
            f"'Characters By Edition' heading not found on {main_page_url}"
        )

    edition_links_div = edition_header.find_next_sibling('div', class_="row")
    if edition_links_div is None:
        raise ValueError(
            f"No 'row' div follows the 'Characters By Edition' heading on {main_page_url}"
        )

    # href=True skips anchors that carry no link target at all.
    edition_hrefs = edition_links_div.find_all(
        "a",
        href=True,
        class_=lambda class_: class_ is None or class_.lower() != 'internal',
    )
    return [urljoin(main_page_url, edition_href['href']) for edition_href in edition_hrefs]


def scrape_edition_page(url: str, timeout: float = 30.0) -> dict[str, Any]:
    """Scrape a single edition page into a JSON-serialisable dictionary.

    Args:
        url: Address of the edition page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with the keys ``name``, ``url``, ``synopsis``, ``characters``
        (lower-cased group name -> list of character names), ``description``,
        ``difficulty`` (``'Not Specified'`` when it cannot be extracted) and
        ``guide`` (``good players`` / ``evil players`` excerpts, empty when
        either marker is missing from the description).

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-success HTTP status.
    """
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # The synopsis paragraphs live in the closest <div> around the "Synopsis" anchor.
    synopsis_title = soup.find(id="Synopsis")
    synopsis_div = next(parent for parent in synopsis_title.parents if parent.name == 'div')
    synopsis = '\n'.join(paragraph.get_text().strip() for paragraph in synopsis_div.find_all('p'))

    # Each <h3> in the following <div> names a character group; the characters
    # themselves are the <h4> entries of the <ul> after the group's anchor.
    main_content_div = synopsis_div.find_next_sibling('div')
    characters: dict[str, list[str]] = {}
    for characters_group_header in main_content_div.find_all("h3"):
        character_groupname = characters_group_header.get_text().strip()
        character_group_ul = soup.find(id=character_groupname).parent.find_next_sibling('ul')
        characters[character_groupname.lower()] = [
            character.get_text().strip() for character in character_group_ul.find_all("h4")
        ]

    # find_previous_siblings() yields nearest-first, so reverse to document order.
    table_of_content = soup.find(id="toc")
    description_paragraphs = list(reversed(table_of_content.find_previous_siblings("p")))
    description = '\n'.join(paragraph.get_text().strip() for paragraph in description_paragraphs).strip()

    # The difficulty is the first sentence of the second intro paragraph.
    try:
        difficulty = description_paragraphs[1].get_text().split('.', maxsplit=1)[0].strip()
    except IndexError:
        # Fewer than two intro paragraphs: report it and fall back.
        console.print_exception(show_locals=True)
        difficulty = 'Not Specified'

    guide: dict[str, str] = {}
    try:
        good_player_guide_index = description.index("Good players")
        evil_player_guide_index = description.index("Evil players")
    except ValueError:
        # One of the markers is absent: report it and leave the guide empty.
        console.print_exception(show_locals=True)
    else:
        guide["good players"] = description[good_player_guide_index:evil_player_guide_index].strip()
        guide["evil players"] = description[evil_player_guide_index:].strip()

    data = {
        # get_text() copes with headings that wrap the title in nested markup,
        # where .string would be None, and yields a plain str.
        "name": soup.h1.get_text().strip(),
        "url": url,
        "synopsis": synopsis,
        "characters": characters,
        "description": description,
        "difficulty": difficulty,
        "guide": guide,
    }
    console.print(data)

    return data


def write_editions(edition_folder: str) -> None:
    """Scrape every edition and write each one to ``<edition_folder>/<name>.json``.

    Args:
        edition_folder: Directory that receives one JSON file per edition.
            It is created (including parents) when it does not exist yet.
    """
    # Create the target up front so the first open() cannot fail on a missing directory.
    os.makedirs(edition_folder, exist_ok=True)

    edition_links = get_edition_links()
    for edition_link in tqdm(edition_links):
        data = scrape_edition_page(edition_link)
        filepath = os.path.join(edition_folder, f'{data["name"]}.json')
        # Fix the encoding so the output does not depend on the platform default.
        with open(filepath, 'w', encoding='utf-8') as edition_filewriter:
            json.dump(data, edition_filewriter, indent=4, sort_keys=True)

In [None]:
# Scrape all editions and write one JSON file per edition into the project's content/editions directory.
write_editions(get_edition_dir())