In [13]:
import os
import requests

from bs4 import BeautifulSoup

In [19]:
def maybe_make_dir(name):
    """Create the directory ``name`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched, so the call is safe to repeat.

    Args:
        name: Path of the directory to create.

    Raises:
        FileExistsError: If ``name`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # isdir() + mkdir() and also handles nested paths whose parents are missing.
    os.makedirs(name, exist_ok=True)


def get_categories(base_url):
    """Collect the category links found on the site's calendar page.

    Fetches ``{base_url}/Calendar`` and keeps every anchor whose href contains
    ``/Calendar/OpenCategory``.

    Args:
        base_url: Scheme and host of the site, without a trailing slash.

    Returns:
        A set of strings, each being ``base_url`` followed by the href of a
        matching anchor.
    """
    # Build the calendar URL from base_url instead of hard-coding the host, so
    # the argument actually determines which site is queried.
    html = requests.get(f"{base_url}/Calendar").content
    soup = BeautifulSoup(html, "lxml")

    # href=True skips anchors that carry no href attribute, which would
    # otherwise raise KeyError when the attribute is looked up.
    return {
        f"{base_url}{anchor['href']}"
        for anchor in soup.find_all("a", href=True)
        if "/Calendar/OpenCategory" in anchor["href"]
    }


def get_meeting_urls(base_url, url):
    """Collect the meeting links found on the page at ``url``.

    Args:
        base_url: Scheme and host that is prefixed to every href found.
        url: Address of the page to fetch and scan for links.

    Returns:
        A set of strings, each being ``base_url`` followed by the href of an
        anchor whose href contains ``/Index/``. The set is empty when the page
        has no such anchors.
    """
    html = requests.get(url).content
    soup = BeautifulSoup(html, "lxml")

    # href=True skips anchors that carry no href attribute, which would
    # otherwise raise KeyError when the attribute is looked up.
    return {
        f"{base_url}{anchor['href']}"
        for anchor in soup.find_all("a", href=True)
        if "/Index/" in anchor["href"]
    }

In [20]:
BASE_URL = "https://nijmegen.bestuurlijkeinformatie.nl"
# Years to query, kept as strings because they are interpolated into both the
# request URL and the output paths. Covers 2010 up to and including 2024.
years = [str(y) for y in range(2010, 2025)]
categories = get_categories(BASE_URL)

# Ensure the output root exists before creating per-category directories
# inside it; otherwise a run on a fresh checkout fails on the first category.
os.makedirs("data/nijmegen", exist_ok=True)

for category in categories:
    # The category id is the last path segment of the category URL.
    c = category.split("/")[-1]
    category_dir = f"data/nijmegen/{c}"
    maybe_make_dir(category_dir)
    for year in years:
        year_dir = f"{category_dir}/{year}"
        listing_path = f"{year_dir}/vergaderingen.txt"

        # If the listing already exists, do not fetch this year again.
        if os.path.isfile(listing_path):
            continue

        hrefs = get_meeting_urls(
            BASE_URL,
            f"{BASE_URL}/Agenda/RetrieveAgendasForYear?agendatypeId={c}&year={year}",
        )
        # If there are no meetings, continue to the next year without
        # creating a directory for this one.
        if not hrefs:
            continue

        maybe_make_dir(year_dir)

        # Write one URL per line. The URLs come from a set, which has no stable
        # order, so sort them to make the file reproducible between runs.
        with open(listing_path, "w") as f:
            for href in sorted(hrefs):
                f.write(href + "\n")

        print(f"Got category {c}, year:  {year}, number of meetings: {len(hrefs)}")

Got category 100000842, year:  2013, number of meetings: 42
Got category 100000842, year:  2014, number of meetings: 120
Got category 100000842, year:  2015, number of meetings: 115
Got category 100000842, year:  2016, number of meetings: 105
Got category 100000842, year:  2017, number of meetings: 52
Got category 100000842, year:  2018, number of meetings: 9
Got category 100000842, year:  2024, number of meetings: 19
Got category 100000013, year:  2018, number of meetings: 10
Got category 100000013, year:  2019, number of meetings: 12
Got category 100000013, year:  2020, number of meetings: 12
Got category 100000013, year:  2021, number of meetings: 12
Got category 100000013, year:  2022, number of meetings: 12
Got category 100000013, year:  2023, number of meetings: 12
Got category 100000013, year:  2024, number of meetings: 4
Got category 100495945, year:  2022, number of meetings: 1
Got category 100495945, year:  2023, number of meetings: 3
Got category 100495945, year:  2024, numb

In [None]:
import time
import subprocess

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [137]:
def click_button(driver):
    """Switch into the embedded player iframe and click its "Start now" button.

    Each lookup waits up to 10 seconds. Any exception raised along the way is
    caught and printed, so the function never raises and always returns None.
    On success the driver stays switched into the iframe; this function does
    not switch back to the default content.

    Args:
        driver: Selenium WebDriver that has already loaded the meeting page.
    """
    try:
        # The player is matched by a substring of its iframe src attribute.
        iframe_locator = (
            By.CSS_SELECTOR,
            "iframe[src*='sdk.companywebcast.com/sdk/player']",
        )
        player_frame = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(iframe_locator)
        )
        driver.switch_to.frame(player_frame)

        # The start button lives inside the iframe, hence the switch above.
        button_locator = (By.CSS_SELECTOR, 'button[name="Start now"]')
        start_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(button_locator)
        )
        start_button.click()
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print("Error:", e)

In [142]:
# Due to the way Nijmegen hosts its videos, the videos need to be started by
# clicking a button, after which a request for an m3u8 file is made.
# Because of this, Selenium is used to automate the interaction with the DOM in
# order to obtain the m3u8 link, which does not exist in the DOM and can thus
# not be obtained by BeautifulSoup.

# Alternative driver setup using Chrome with performance logging enabled:
# options = webdriver.ChromeOptions()
# options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
# driver = webdriver.Chrome(options=options)

driver = webdriver.Firefox()

# try/finally guarantees the browser is closed even when a page load or a file
# operation raises part-way through the run.
try:
    categories = os.listdir("data/nijmegen")
    for category in categories:
        for year in os.listdir(f"data/nijmegen/{category}"):
            year_dir = f"data/nijmegen/{category}/{year}"
            downloads_path = f"{year_dir}/vergaderingenDownloads.txt"
            failed_path = f"{year_dir}/failed.txt"

            # A year is considered handled as soon as either result file
            # exists; check this before opening the listing at all.
            if os.path.isfile(downloads_path) or os.path.isfile(failed_path):
                print(
                    f"data/nijmegen/{category}/{year}/vergaderingenDownloads.txt already exists"
                )
                continue

            print(f"Doing {category}, {year}")
            with open(f"{year_dir}/vergaderingen.txt", "r") as meetings_file:
                for line in meetings_file:
                    # Each line is one meeting URL followed by a newline.
                    url = line.strip()
                    if not url:
                        continue

                    driver.get(url)
                    click_button(driver)
                    # Fixed pause after the click, before reading the
                    # browser's recorded resource requests.
                    time.sleep(5)
                    # https://stackoverflow.com/questions/53286828/how-to-get-browser-network-logs-using-python-selenium
                    entries = driver.execute_script(
                        "var performance = window.performance || window.mozPerformance || window.msPerformance || window.webkitPerformance || {}; var network = performance.getEntries() || {}; return network;"
                    )

                    # Take the first recorded request whose name mentions
                    # m3u8; entries without a name are ignored instead of
                    # raising TypeError on the membership test.
                    m3u8_url = next(
                        (
                            entry.get("name")
                            for entry in entries or []
                            if "m3u8" in (entry.get("name") or "")
                        ),
                        None,
                    )

                    # Use a distinct handle name so the listing file being
                    # iterated is never shadowed by the output file.
                    if m3u8_url is not None:
                        with open(downloads_path, "a+") as out:
                            out.write(url + " |-| " + m3u8_url + "\n")
                    else:
                        with open(failed_path, "a+") as out:
                            out.write(url + "\n")
finally:
    driver.quit()

Doing 100000013, 2022
{'connectEnd': 2489, 'connectStart': 2489, 'decodedBodySize': 3044, 'domainLookupEnd': 2489, 'domainLookupStart': 2489, 'duration': 68, 'encodedBodySize': 3044, 'entryType': 'resource', 'fetchStart': 2489, 'initiatorType': 'xmlhttprequest', 'name': 'https://sdk.companywebcast.com/playlist/1.1/2d3a02b7-7cf9-40fc-a9e4-195bfe96c63f/playlist/od/iv/sdk-ssl.m3u8?Policy=eyJTdGF0ZW1lbnQiOiBbeyJSZXNvdXJjZSI6Imh0dHBzOi8vc2RrLmNvbXBhbnl3ZWJjYXN0LmNvbS9wbGF5bGlzdC8~Lj8vMmQzYTAyYjctN2NmOS00MGZjLWE5ZTQtMTk1YmZlOTZjNjNmLyoiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3Mjk2OTk4MzZ9LCJJcEFkZHJlc3MiOnsiQVdTOlNvdXJjZUlwIjoiMC4wLjAuMC8wIn19fV19&Signature=XGnDqwHp5ZUpr1wGOrJhC972C7HrApjxwbIHVP~yJyZKSI-OEtdWOKsXlIEFcg3WPleBDVdd8X7DmI7OrlUxvXS0EWar9BboylY4UdYb8Xylrh6Ww5iPnZ0KqoP5Jgjd504nYqlrGrgJ~naOUYfMB8HnBnEBEpuEnXOlS3WX7-HRnT5FBW3k8QDCkvOn8SL0Cdo--JPBtuyoQXKeCDkfr~44PeSUt2MHVSDhWandHP1tyZGtoVArewtBDeAp0sEBt4FDC2dO7gYmqpDuq3W5MUrYGiB4TjLFl27DHZ8o-Tg7wH2fBVGn46L2nlBBY

In [None]:
# Download every video whose m3u8 link is recorded in a year's
# vergaderingenDownloads.txt, by calling the m3u8 helper script once per line.
for category in os.listdir("data/nijmegen"):
    for year in os.listdir(f"data/nijmegen/{category}"):
        year_dir = f"data/nijmegen/{category}/{year}"
        downloads_path = f"{year_dir}/vergaderingenDownloads.txt"

        # Years without recorded stream links have nothing to download.
        # (The path must interpolate {year}; a literal "year" never matches.)
        if not os.path.isfile(downloads_path):
            continue

        # Create the output directory once per year rather than once per line.
        videos_dir = f"{year_dir}/videos"
        maybe_make_dir(videos_dir)

        with open(downloads_path, "r") as f:
            for line in f:
                # Each line has the form "<meeting url> |-| <m3u8 url>".
                # Strip the trailing newline so it does not end up in the
                # stream URL passed to the subprocess.
                original, separator, download = line.strip().partition(" |-| ")
                if not separator or not download:
                    # Skip blank or malformed lines instead of failing.
                    continue

                # The last path segment of the meeting URL names the video.
                code = original.split("/")[-1]
                name = f"{code}.mp4"

                subprocess.run(
                    [
                        "python",
                        "../tools/m3u8.py",
                        "-o",
                        f"{videos_dir}/{name}",
                        download,
                    ]
                )