In [1]:
import os
import requests

from bs4 import BeautifulSoup

In [2]:
def maybe_make_dir(name):
    """Create the directory ``name`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        name: Path of the directory to create.

    Raises:
        FileExistsError: If ``name`` exists but is not a directory.
    """
    # makedirs (instead of mkdir) so that a fresh checkout without the parent
    # data directory works; exist_ok avoids the check-then-create race.
    os.makedirs(name, exist_ok=True)


def get_categories(base_url, timeout=30):
    """Collect the category URLs linked from the site's calendar page.

    Args:
        base_url: Scheme and host of the site, without a trailing slash. It is
            used both to build the calendar URL and as the prefix of every
            returned link.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A set of absolute URLs, one for every anchor whose ``href`` contains
        ``/Calendar/OpenCategory``.
    """
    # Build the calendar URL from base_url instead of a hard-coded host so the
    # argument is actually honoured.
    url = f"{base_url}/Calendar"
    html = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html, "lxml")

    # href=True skips anchors that have no href attribute, which would
    # otherwise raise a KeyError.
    return {
        f"{base_url}{anchor['href']}"
        for anchor in soup.find_all("a", href=True)
        if "/Calendar/OpenCategory" in anchor["href"]
    }


def get_meeting_urls(base_url, url, timeout=30):
    """Collect the meeting URLs linked from an agenda overview page.

    Args:
        base_url: Scheme and host of the site, without a trailing slash; it is
            prefixed to every link found.
        url: Address of the page to download and scan.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A set of absolute URLs, one for every anchor whose ``href`` contains
        ``/Index/``. The set is empty when the page has no such links.
    """
    html = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html, "lxml")

    # href=True skips anchors that have no href attribute, which would
    # otherwise raise a KeyError.
    return {
        f"{base_url}{anchor['href']}"
        for anchor in soup.find_all("a", href=True)
        if "/Index/" in anchor["href"]
    }

In [3]:
# Fetch, per category and year, the list of meeting URLs and store it in
# data/nijmegen/<category id>/<year>/vergaderingen.txt.
BASE_URL = "https://nijmegen.bestuurlijkeinformatie.nl"
years = [
    "2022",
    "2023",
    "2024",
]
categories = get_categories(BASE_URL)

for category in categories:
    # The last path segment of the category URL is passed on as agendatypeId.
    category_id = category.split("/")[-1]
    category_dir = f"data/nijmegen/{category_id}"
    # makedirs so a fresh checkout without data/nijmegen does not crash.
    os.makedirs(category_dir, exist_ok=True)

    for year in years:
        year_dir = f"{category_dir}/{year}"
        meetings_path = f"{year_dir}/vergaderingen.txt"

        # If the file already exists, do not fetch this category/year again.
        if os.path.isfile(meetings_path):
            continue

        hrefs = get_meeting_urls(
            BASE_URL,
            f"{BASE_URL}/Agenda/RetrieveAgendasForYear?agendatypeId={category_id}&year={year}",
        )
        # If there are no meetings, continue to the next year.
        if not hrefs:
            continue

        os.makedirs(year_dir, exist_ok=True)

        # Write the URLs to file, one per line.
        with open(meetings_path, "w") as f:
            for href in hrefs:
                f.write(href + "\n")

        print(
            f"Got category {category_id}, year:  {year}, number of meetings: {len(hrefs)}"
        )

KeyboardInterrupt: 

In [4]:
import time

import concurrent.futures

import sys
sys.path.append("../tools")
from m3u8 import M3u8Downloader, get_default_cache_dir

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [5]:
def click_button(driver):
    """Press the "Start now" button inside the embedded webcast player.

    Waits up to 10 seconds for the player iframe to be present, switches the
    driver into that frame, then waits up to 10 seconds for the button to
    become clickable and clicks it. Any exception raised along the way is
    printed and swallowed, so the caller always continues.

    Note that the driver is left switched into the player iframe.

    Args:
        driver: Selenium WebDriver that has already loaded the meeting page.
    """
    try:
        iframe_locator = (
            By.CSS_SELECTOR,
            "iframe[src*='sdk.companywebcast.com/sdk/player']",
        )
        player_frame = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(iframe_locator)
        )
        driver.switch_to.frame(player_frame)

        button_locator = (By.CSS_SELECTOR, 'button[name="Start now"]')
        start_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(button_locator)
        )
        start_button.click()

    except Exception as error:
        print("Error:", error)

In [4]:
# Due to the way Nijmegen hosts its videos, the videos need to be started by
# clicking a button, after which a request for an m3u8 file is made.
# Because of this, Selenium is used to automate the interaction with the DOM
# in order to obtain the m3u8 link, which does not exist in the DOM and can
# thus not be obtained by BeautifulSoup.

# options = webdriver.ChromeOptions()
# options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

# driver = webdriver.Chrome(options=options)

driver = webdriver.Firefox()

# try/finally so the browser is closed even when the loop raises or the cell
# is interrupted.
try:
    categories = os.listdir("data/nijmegen")
    for category in categories:
        if category == ".DS_Store":
            continue
        for year in os.listdir(f"data/nijmegen/{category}"):
            if year == ".DS_Store":
                continue

            year_dir = f"data/nijmegen/{category}/{year}"
            meetings_path = f"{year_dir}/vergaderingen.txt"
            downloads_path = f"{year_dir}/vergaderingenDownloads.txt"
            failed_path = f"{year_dir}/failed.txt"

            if not os.path.isfile(meetings_path):
                continue
            # A year counts as processed as soon as either result file exists.
            if os.path.isfile(downloads_path) or os.path.isfile(failed_path):
                print(f"{downloads_path} already exists")
                continue

            print(f"Doing {category}, {year}")
            with open(meetings_path, "r") as meetings_file:
                for line in meetings_file:
                    # Drop the trailing newline before handing the URL to the
                    # browser; skip blank lines.
                    url = line.strip()
                    if not url:
                        continue

                    driver.get(url)
                    click_button(driver)
                    # Give the player time to request its playlist.
                    time.sleep(5)
                    # https://stackoverflow.com/questions/53286828/how-to-get-browser-network-logs-using-python-selenium
                    entries = driver.execute_script(
                        "var performance = window.performance || window.mozPerformance || window.msPerformance || window.webkitPerformance || {}; var network = performance.getEntries() || {}; return network;"
                    )

                    # First performance entry whose name mentions m3u8, if any.
                    m3u8_url = None
                    for entry in entries:
                        entry_name = entry.get("name")
                        if entry_name and "m3u8" in entry_name:
                            m3u8_url = entry_name
                            break

                    if m3u8_url is not None:
                        with open(downloads_path, "a+") as downloads_file:
                            downloads_file.write(f"{url} |-| {m3u8_url}\n")
                    else:
                        with open(failed_path, "a+") as failed_file:
                            failed_file.write(url + "\n")
finally:
    driver.quit()

NameError: name 'webdriver' is not defined

In [6]:
# Print one m3u8 download command per meeting video that is not on disk yet.
# Nothing is downloaded here: the printed lines are meant to be run in a
# shell. The in-process download is kept below as a commented-out alternative.
for category in os.listdir("data/nijmegen"):
    if category == ".DS_Store":
        continue
    for year in os.listdir(f"data/nijmegen/{category}"):
        if year not in ("2022", "2023", "2024"):
            continue

        year_dir = f"data/nijmegen/{category}/{year}"
        downloads_path = f"{year_dir}/vergaderingenDownloads.txt"
        if not os.path.isfile(downloads_path):
            # print("No file containing download links has been found.")
            continue

        # The category and year directories exist (they were just listed), so
        # only the videos directory has to be created, once per year.
        videos_dir = f"{year_dir}/videos"
        maybe_make_dir(videos_dir)

        with open(downloads_path, "r") as f:
            for line in f:
                # Each line has the form "<meeting url> |-| <m3u8 url>".
                original, separator, download = line.rstrip("\n").partition(" |-| ")
                if not separator or not download:
                    # Emitted as a shell comment so the output stays runnable.
                    print(f"# skipped malformed line in {downloads_path}: {line!r}")
                    continue

                # The last path segment of the meeting URL names the video.
                code = original.split("/")[-1]
                name = f"{code}.mp4"
                filename = f"{videos_dir}/{name}"

                if os.path.isfile(filename) or os.path.isfile(
                    f"{year_dir}/audio/{name}"
                ):
                    # print(filename, "already exists.")
                    continue

                print(f'python ../tools/m3u8.py -o {filename} "{download}";')
                # try:
                #     downloader = M3u8Downloader(
                #         download,
                #         filename,
                #         tempdir=os.path.join(get_default_cache_dir(), "m3u8downloader"),
                #         poolsize=5,
                #         randomfilenames=True,
                #     )
                #     downloader.start()
                # except Exception as _:
                #     continue

python ../tools/m3u8.py -o data/nijmegen/100000013/2022/videos/e67f9762-ec69-4072-b4b8-4318d953527a.mp4 "https://sdk.companywebcast.com/playlist/1.1/2d3a02b7-7cf9-40fc-a9e4-195bfe96c63f/playlist/od/iv/sdk-ssl.m3u8?Policy=eyJTdGF0ZW1lbnQiOiBbeyJSZXNvdXJjZSI6Imh0dHBzOi8vc2RrLmNvbXBhbnl3ZWJjYXN0LmNvbS9wbGF5bGlzdC8~Lj8vMmQzYTAyYjctN2NmOS00MGZjLWE5ZTQtMTk1YmZlOTZjNjNmLyoiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3MzE3NzcxOTJ9LCJJcEFkZHJlc3MiOnsiQVdTOlNvdXJjZUlwIjoiMC4wLjAuMC8wIn19fV19&Signature=SVAbgwqhey-h983XiuL2iBKM4~H8FHBOZUTdFU3jPbYmU45S1zfIr1TbYHfQRY53cdXRKjTP8yz7uPHCUgUDM4UzSVYIP2SRmeNj-a1jWJ~ghYzJnJ0fhizL68VkXgpScQDzgwHn~eLaxRSMeEcAf9gui8A5ht0P23GX~1qwY6Vh~5GNEv-Pa0Bre4Qpj4Z7fmOPKO0iJM2mKFhQf7b1fNyBbWv8rDw~wBzIEdF7vA4IXIpevRCQ7T-c8NVQpAuOpNZZWD5Leb0DR-aRpwlsvGfCIy~1T~io3SmGagvpBT5gf1SEhFghYUJwBY88fpJ5C93VJFU~uRqWyDqIw3cr3Q__&Key-Pair-Id=APKAIWM6LAZJX3UVVARQ";
python ../tools/m3u8.py -o data/nijmegen/100000013/2022/videos/3ab6f8ae-5abc-45dc-ba1c-0ac4eac58a50.mp4 