In [None]:
import os
import requests

from bs4 import BeautifulSoup

In [None]:
def maybe_make_dir(name):
    """Create the directory ``name`` if it does not exist yet.

    Missing parent directories are created as well, so nested paths such as
    ``data/wassenaar/<category>`` work even when ``data/wassenaar`` has not
    been created yet. Calling this for an existing directory is a no-op.

    Args:
        name: Path of the directory to create.

    Raises:
        FileExistsError: If ``name`` exists but is not a directory.
    """
    # exist_ok=True also removes the check-then-create race of isdir() + mkdir().
    os.makedirs(name, exist_ok=True)


def get_categories(base_url):
    """Collect the URLs of all calendar categories linked from the calendar page.

    Fetches ``<base_url>/Calendar`` and keeps every anchor whose ``href``
    contains ``/Calendar/OpenCategory``.

    Args:
        base_url: Scheme and host of the site, without a trailing slash.

    Returns:
        A set of strings, each being ``base_url`` followed by the matching
        link's ``href``.
    """
    # Build the page URL from base_url so the parameter is actually honoured.
    html = requests.get(f"{base_url}/Calendar").content
    soup = BeautifulSoup(html, "lxml")

    # href=True skips anchors without an href attribute, which would otherwise
    # raise a KeyError when the attribute is looked up.
    return {
        f"{base_url}{anchor['href']}"
        for anchor in soup.find_all("a", href=True)
        if "/Calendar/OpenCategory" in anchor["href"]
    }


def get_meeting_urls(base_url, url):
    """Collect the meeting URLs linked from an agenda overview page.

    Fetches ``url`` and keeps every anchor whose ``href`` contains ``/Index/``.

    Args:
        base_url: Scheme and host of the site, without a trailing slash; it is
            prepended to every matching ``href``.
        url: Address of the page to scan for meeting links.

    Returns:
        A set of strings, each being ``base_url`` followed by the matching
        link's ``href``. The set is empty when the page has no such links.
    """
    html = requests.get(url).content
    soup = BeautifulSoup(html, "lxml")

    # href=True skips anchors without an href attribute, which would otherwise
    # raise a KeyError when the attribute is looked up.
    return {
        f"{base_url}{anchor['href']}"
        for anchor in soup.find_all("a", href=True)
        if "/Index/" in anchor["href"]
    }

In [None]:
BASE_URL = "https://wassenaar.bestuurlijkeinformatie.nl"
# Years to scrape; kept as strings because they are interpolated into URLs and paths.
years = [str(y) for y in range(2013, 2025)]
categories = get_categories(BASE_URL)

for category in categories:
    # The last path segment of the category URL doubles as the agenda type id.
    category_id = category.split("/")[-1]
    category_dir = f"data/wassenaar/{category_id}"
    # makedirs (unlike mkdir) also creates data/wassenaar itself on a first run.
    os.makedirs(category_dir, exist_ok=True)

    for year in years:
        year_dir = f"{category_dir}/{year}"
        meetings_path = f"{year_dir}/vergaderingen.txt"

        # Skip years that were already scraped.
        if os.path.isfile(meetings_path):
            continue

        hrefs = get_meeting_urls(
            BASE_URL,
            f"{BASE_URL}/Agenda/RetrieveAgendasForYear?agendatypeId={category_id}&year={year}",
        )
        # No meetings found for this year: move on without creating a directory.
        if not hrefs:
            continue

        os.makedirs(year_dir, exist_ok=True)

        # Write one URL per line; sorted so the file content is reproducible
        # (hrefs is a set and has no stable order).
        with open(meetings_path, "w") as f:
            f.writelines(f"{href}\n" for href in sorted(hrefs))

        print(
            f"Got category {category_id}, year:  {year}, number of meetings: {len(hrefs)}"
        )

In [None]:
import time

import concurrent.futures

import sys

sys.path.append("../tools")
from m3u8 import M3u8Downloader, get_default_cache_dir

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
def click_button(driver):
    """Start the embedded webcast player by pressing its "Start now" button.

    Waits up to 10 seconds for the player iframe to be present, switches the
    driver into that iframe, then waits up to 10 seconds for the start button
    to become clickable and clicks it. The driver is not switched back, so on
    success it is left inside the iframe.

    Any exception raised along the way is caught and printed with an
    ``Error:`` prefix; nothing is returned or re-raised.

    Args:
        driver: Selenium WebDriver with the meeting page already loaded.
    """
    try:
        wait = WebDriverWait(driver, 10)

        player_locator = (
            By.CSS_SELECTOR,
            "iframe[src*='sdk.companywebcast.com/sdk/player']",
        )
        player_iframe = wait.until(EC.presence_of_element_located(player_locator))
        driver.switch_to.frame(player_iframe)

        start_locator = (By.CSS_SELECTOR, 'button[name="Start now"]')
        wait.until(EC.element_to_be_clickable(start_locator)).click()
    except Exception as error:
        print("Error:", error)

In [None]:
# Due to the way Wassenaar hosts its videos, the videos need to be started by
# clicking a button, after which a request for an m3u8 file is made.
# Because of this, Selenium is used for the automation of interacting with the
# DOM in order to obtain the m3u8 link, which does not exist in the DOM and can
# thus not be obtained by BeautifulSoup.

# Alternative driver (Chrome with performance logging enabled):
# options = webdriver.ChromeOptions()
# options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
# driver = webdriver.Chrome(options=options)

driver = webdriver.Firefox()

# try/finally guarantees the browser is closed even when a page load or file
# operation raises halfway through the crawl.
try:
    categories = os.listdir("data/wassenaar")
    for category in categories:
        category_dir = f"data/wassenaar/{category}"
        # Skips .DS_Store and any other stray file next to the category folders.
        if not os.path.isdir(category_dir):
            continue

        for year in os.listdir(category_dir):
            year_dir = f"{category_dir}/{year}"
            if not os.path.isdir(year_dir):
                continue

            meetings_path = f"{year_dir}/vergaderingen.txt"
            downloads_path = f"{year_dir}/vergaderingenDownloads.txt"
            failed_path = f"{year_dir}/failed.txt"

            if not os.path.isfile(meetings_path):
                continue

            # Either output file means this year was already processed.
            if os.path.isfile(downloads_path) or os.path.isfile(failed_path):
                print(f"{downloads_path} already exists")
                continue

            print(f"Doing {category}, {year}")

            # Read all URLs up front (without the trailing newline) so the
            # input file is closed before the output files are opened.
            with open(meetings_path, "r") as meetings_file:
                meeting_urls = [line.strip() for line in meetings_file if line.strip()]

            for url in meeting_urls:
                driver.get(url)
                click_button(driver)
                # Fixed pause to let the player issue its playlist request.
                time.sleep(5)
                # https://stackoverflow.com/questions/53286828/how-to-get-browser-network-logs-using-python-selenium
                # NOTE(review): click_button leaves the driver inside the player
                # iframe on success, so this presumably reads the iframe's
                # resource entries — verify before switching frames here.
                entries = driver.execute_script(
                    "var performance = window.performance || window.mozPerformance || window.msPerformance || window.webkitPerformance || {}; var network = performance.getEntries() || {}; return network;"
                )

                # First recorded resource whose name mentions m3u8, if any.
                m3u8_url = next(
                    (
                        entry.get("name")
                        for entry in entries or []
                        if "m3u8" in (entry.get("name") or "")
                    ),
                    None,
                )

                if m3u8_url is not None:
                    with open(downloads_path, "a+") as downloads_file:
                        downloads_file.write(f"{url} |-| {m3u8_url}\n")
                else:
                    with open(failed_path, "a+") as failed_file:
                        failed_file.write(url + "\n")
finally:
    driver.quit()

In [None]:
# Download three videos simultaneously.
def _download_video(download_url, filename):
    """Download the m3u8 stream at ``download_url`` into ``filename``."""
    downloader = M3u8Downloader(
        download_url,
        filename,
        tempdir=os.path.join(get_default_cache_dir(), "m3u8downloader"),
        poolsize=5,
        randomfilenames=True,
    )
    # NOTE(review): assumes start() blocks until the download has finished and
    # that M3u8Downloader may be used from several threads at once — confirm.
    downloader.start()


with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # Maps each submitted future to its stream URL for error reporting.
    pending = {}

    for category in os.listdir("data/wassenaar"):
        category_dir = f"data/wassenaar/{category}"
        # Skips .DS_Store and any other stray file next to the category folders.
        if not os.path.isdir(category_dir):
            continue

        for year in os.listdir(category_dir):
            year_dir = f"{category_dir}/{year}"
            downloads_path = f"{year_dir}/vergaderingenDownloads.txt"
            if not os.path.isfile(downloads_path):
                continue

            videos_dir = f"{year_dir}/videos"
            maybe_make_dir(videos_dir)

            with open(downloads_path, "r") as f:
                for line in f:
                    # Each line has the form "<meeting url> |-| <m3u8 url>".
                    original, separator, download = line.strip().partition(" |-| ")
                    if not separator or not download:
                        continue

                    # The last path segment of the meeting URL names the video.
                    code = original.split("/")[-1]
                    filename = f"{videos_dir}/{code}.mp4"

                    future = executor.submit(_download_video, download, filename)
                    pending[future] = download

    # Surface failures; exceptions raised inside a worker are otherwise lost.
    for future in concurrent.futures.as_completed(pending):
        try:
            future.result()
        except Exception as error:
            print(f"Download failed for {pending[future]}: {error}")