In [1]:
import os
import re
import requests
import urllib
import json

from bs4 import BeautifulSoup

In [5]:
# scrapes all hrefs on a page. Returns a set to prevent duplicate hrefs.
def get_hrefs(url):
    r = requests.get(url)
    bsObj = BeautifulSoup(r.content, "html.parser")

    final_links = {link.get("href") for link in bsObj.find_all("a")}

    return final_links

def decode_href(href):
    decoded = bytes(href, "utf-8").decode("unicode_escape")
    # Replace escaped slashes and remove begin and end quotes.
    decoded = decoded.replace("\\/", "/")[1:-1]
    # Some vergaderingen are duplicated due to this anchor tag,
    # I remove the tag so the duplicate vergadering is filtered.
    decoded = decoded.split("#ai")[0]

    return decoded


def save_hrefs(hrefs, year):
    if not os.path.isdir(f"data/barendrecht/vergaderingen/{year}"):
        os.mkdir(f"data/barendrecht/vergaderingen/{year}")

    with open(f"data/barendrecht/vergaderingen/{year}/vergaderingen.txt", "w") as vergaderingen_f:
        with open(f"data/barendrecht/vergaderingen/{year}/documenten.txt", "w") as documenten_f:
            with open(f"data/barendrecht/vergaderingen/{year}/modules.txt", "w") as modules_f:
                with open(f"data/barendrecht/vergaderingen/{year}/rest.txt", "w") as rest_f:
                    for href in hrefs:
                        href = href.split("#")[0]
                        if "vergadering" in href:
                            vergaderingen_f.write(href + "\n")
                        elif "document" in href:
                            documenten_f.write(href + "\n")
                        elif "module" in href:
                            modules_f.write(href + "\n")
                        else:
                            rest_f.write(href + "\n")

In [6]:
if not os.path.isdir("data"):
    os.mkdir("data")
if not os.path.isdir("data/barendrecht/vergaderingen"):
    os.mkdir("data/barendrecht/vergaderingen")

years = range(2014, 2025)
for year in years:
    if (
        os.path.isdir(f"data/barendrecht/vergaderingen/{year}/vergaderingen.txt")
        or os.path.isdir(f"data/barendrecht/vergaderingen/{year}/documenten.txt")
        or os.path.isdir(f"data/barendrecht/vergaderingen/{year}/modules.txt")
        or os.path.isdir(f"data/barendrecht/vergaderingen/{year}/rest.txt")
    ):
        print(f"year {year} already scraped.")
        continue

    all_hrefs_year = set()
    for page in range(1, 30):
        BASE_URL = f"https://barendrecht.raadsinformatie.nl/zoeken/result?keywords=vergadering&limit=100&document_type=&search=send&filter[organisations][]=791&page={page}&filter[date][]={year}"
        all_hrefs_year.update(get_hrefs(BASE_URL))

    # Decode strings and remove duplicates.
    all_hrefs_year = set(list(map(decode_href, all_hrefs_year)))

    print(f"Got data for {year}, total links: {len(all_hrefs_year)}")
    save_hrefs(all_hrefs_year, year)

  decoded = bytes(href, "utf-8").decode("unicode_escape")


Got data for 2014, total links: 389
Got data for 2015, total links: 442
Got data for 2016, total links: 434
Got data for 2017, total links: 302
Got data for 2018, total links: 361
Got data for 2019, total links: 638
Got data for 2020, total links: 770
Got data for 2021, total links: 826
Got data for 2022, total links: 801
Got data for 2023, total links: 808
Got data for 2024, total links: 234


In [11]:
def download_vergadering(url, name):
    # r = requests.get(url)
    # For some reason request.get gave me 500, urllib works fine.
    r = urllib.request.urlopen(url)
    # print(r.status_code, r.reason)
    bsObj = BeautifulSoup(r, "html.parser")
    download_url = bsObj.find(href=re.compile("download"))

    if not download_url:
        return False
    download_url = download_url.get("href")

    if os.path.isfile(f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/videos/{name}"):
        return True

    try:
        r = requests.get(download_url, stream=True)
        print(f"Downloading {download_url}")
        with open(
            f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/videos/{name}",
            "wb",
        ) as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    except Exception as _:
        return False

    return True

In [13]:
for year in years:
    if not os.path.isdir(f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/videos"):
        os.mkdir(f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/videos")
    with open(f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/vergaderingen.txt", "r") as vergaderingen_f:
        for url in vergaderingen_f:
            url = url.replace("\n", "")
            code = url.split("/")[-1]
            # extension = download_url.split(".")[-1]
            extension = "mp4"
            name = f"{code}.{extension}"

            if os.path.isfile(f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/videos/{name}") or os.path.isfile(
                f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/audio/{name}.mp4"
            ) or os.path.isfile(
                f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/audio/{name}"):
                print(f"{url} already downloaded.")
                continue

            print(f"Trying {url}")
            try:
                if not download_vergadering(url, name):
                    with open(f"/Volumes/Samsung_T5/data/barendrecht/vergaderingen/{year}/failed.txt", "a+") as f:
                        f.write(url + "\n")
            except Exception as e:
                print("Error!", e)
                continue

https://barendrecht.raadsinformatie.nl/vergadering/58718 already downloaded.
Trying https://barendrecht.raadsinformatie.nl/vergadering/77935
https://barendrecht.raadsinformatie.nl/vergadering/62148 already downloaded.
https://barendrecht.raadsinformatie.nl/vergadering/77932 already downloaded.
https://barendrecht.raadsinformatie.nl/vergadering/119083 already downloaded.
https://barendrecht.raadsinformatie.nl/vergadering/134839 already downloaded.
https://barendrecht.raadsinformatie.nl/vergadering/71807 already downloaded.
Trying https://barendrecht.raadsinformatie.nl/vergadering/71725
https://barendrecht.raadsinformatie.nl/vergadering/75785 already downloaded.
https://barendrecht.raadsinformatie.nl/vergadering/131438 already downloaded.
Trying https://barendrecht.raadsinformatie.nl/vergadering/134331
Trying https://barendrecht.raadsinformatie.nl/vergadering/74422
https://barendrecht.raadsinformatie.nl/vergadering/130590 already downloaded.
Trying https://barendrecht.raadsinformatie.nl/

In [None]:
for year in years:
    if not os.path.isdir(f"data/barendrecht/vergaderingen/{year}/agendas"):
        os.mkdir(f"data/barendrecht/vergaderingen/{year}/agendas")
    with open(f"data/barendrecht/vergaderingen/{year}/vergaderingen.txt", "r") as vergaderingen_f:
        for url in vergaderingen_f:
            url = url.replace("\n", "")
            code = url.replace("\n", "").split("/")[-1]
            output_path = f"data/barendrecht/vergaderingen/{year}/agendas/{code}.json"
            if os.path.isfile(output_path):
                print(f"{output_path} already exists.")
                continue

            vergadering_json = []
            r = requests.get(url)
            bsObj = BeautifulSoup(r.content, "html.parser")

            agenda_items = [li for li in bsObj.find_all("li", class_="agenda_item")]
            # print(agenda_items)
            for item in agenda_items:
                agenda_point = {}
                btn = item.find("button", class_="item_title")
                if not btn:
                    continue
                span = btn.find("span", class_="item_prefix")

                # Skip sub agenda item (for now, perhaps)
                if not span or not span.text.strip().endswith("."):
                    continue

                agenda = btn.get_text(strip=True)
                agenda_point["agendaPoint"] = agenda
                time_span = item.find("span", class_="item_time")
                if time_span:
                    time = time_span.text.strip().replace("tijdsduur:", "").strip()
                    agenda_point["time"] = time

                vergadering_json.append(agenda_point)

            with open(output_path, "w") as f:
                json.dump(vergadering_json, f)
