In [5]:
import os
import re
import urllib
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup

In [6]:
# scrapes all hrefs on a page. Returns a set to prevent duplicate hrefs.
def get_hrefs(url):
    r = requests.get(url)
    bsObj = BeautifulSoup(r.content, "html.parser")

    final_links = {link.get("href") for link in bsObj.find_all("a")}

    return final_links

def decode_href(href):
    decoded = bytes(href, "utf-8").decode("unicode_escape")
    # Replace escaped slashes and remove begin and end quotes.
    decoded = decoded.replace("\\/", "/")[1:-1]
    # Some vergaderingen are duplicated due to this anchor tag,
    # I remove the tag so the duplicate vergadering is filtered.
    decoded = decoded.split("#ai")[0]

    return decoded


def save_hrefs(hrefs, year):
    if not os.path.isdir(f"data/haarlem/{year}"):
        os.mkdir(f"data/haarlem/{year}")

    with open(f"data/haarlem/{year}/vergaderingen.txt", "w") as vergaderingen_f:
        with open(f"data/haarlem/{year}/documenten.txt", "w") as documenten_f:
            with open(f"data/haarlem/{year}/modules.txt", "w") as modules_f:
                with open(f"data/haarlem/{year}/rest.txt", "w") as rest_f:
                    for href in hrefs:
                        if "vergadering" in href:
                            vergaderingen_f.write(href + "\n")
                        elif "document" in href:
                            documenten_f.write(href + "\n")
                        elif "module" in href:
                            modules_f.write(href + "\n")
                        else:
                            rest_f.write(href + "\n")

In [7]:
if not os.path.isdir("data"):
    os.mkdir("data")
if not os.path.isdir("data/haarlem"):
    os.mkdir("data/haarlem")

years = range(2014, 2025)
for year in years:
    all_hrefs_year = set()
    for page in range(1, 30):
        BASE_URL = f"https://gemeentebestuur-haarlem.notubiz.nl/zoeken/result?keywords=vragenuur&limit=25&document_type=&search=send&filter[organisations][]=544&page={page}&filter[date][]={year}"
        all_hrefs_year.update(get_hrefs(BASE_URL))

    # Decode strings and remove duplicates.
    all_hrefs_year = set(list(map(decode_href, all_hrefs_year)))

    print(f"Got data for {year}, total links: {len(all_hrefs_year)}")
    save_hrefs(all_hrefs_year, year)

KeyboardInterrupt: 

In [8]:
def download_vergadering(url, name):
    # r = requests.get(url)
    # For some reason request.get gave me 500, urllib works fine.
    r = urllib.request.urlopen(url)
    # print(r.status_code, r.reason)
    bsObj = BeautifulSoup(r, "html.parser")
    download_url = bsObj.find(href=re.compile("download"))

    if not download_url:
        return False
    download_url = download_url.get("href")

    try:
        r = requests.get(download_url, stream=True)
        print(f"Downloading {download_url}")
        with open(f"data/haarlem/{year}/videos/{name}", "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    except Exception as _:
        return False

    return True

In [10]:
for year in years:
    if not os.path.isdir(f"data/haarlem/{year}/videos"):
        os.mkdir(f"data/haarlem/{year}/videos")
    with open(f"data/haarlem/{year}/vergaderingen.txt", "r") as vergaderingen_f:
        for url in vergaderingen_f:
            code = url.split("/")[-1]
            # extension = download_url.split(".")[-1]
            extension = "mp4"
            name = f"{code}.{extension}"

            if os.path.isfile(f"data/haarlem/{year}/videos/{name}"):
                print(f"{url} already downloaded.")
                continue

            print(f"Trying {url}")
            if not download_vergadering(url, name):
                with open(f"data/haarlem/{year}/failed.txt", "a+") as f:
                    f.write(url+"\n")


https://gemeentebestuur-haarlem.notubiz.nl/vergadering/148629
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101044
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101039
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101057
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101663
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101048
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/125416
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101045
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101041
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101042
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101051
 already downloaded.
https://gemeentebestuur-haarlem.notubiz.nl/vergadering/101037
 already downloaded.
http