In [9]:
import os
import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup

In [10]:
# Scrapes all hrefs on a page. Returns a set to prevent duplicate hrefs.
def get_hrefs(url):
    """Collect the href of every anchor on the page at ``url``.

    Args:
        url: Address of the page to fetch with ``requests.get``.

    Returns:
        A set with the raw ``href`` attribute values of all ``<a>`` tags on
        the page. Anchors without an ``href`` attribute are left out, so the
        set never contains ``None``.
    """
    r = requests.get(url)
    bsObj = BeautifulSoup(r.content, "html.parser")

    # href=True restricts the search to anchors that carry an href attribute;
    # link.get("href") on the others would put None into the set and break
    # the string handling that is applied to every href afterwards.
    final_links = {link["href"] for link in bsObj.find_all("a", href=True)}

    return final_links

def decode_href(href):
    """Turn an escaped, quoted href from the search results into a plain URL.

    Steps, in order:
      1. Resolve backslash escape sequences (``\\"``, ``\\uXXXX``, ...).
      2. Replace escaped slashes (``\\/``) with ``/``.
      3. Remove one pair of surrounding quotes, if present.
      4. Drop the ``#fragment`` part, so links that differ only in their
         anchor collapse into one entry.

    Args:
        href: Raw href string as found in the page.

    Returns:
        The decoded href without surrounding quotes and without fragment.
    """
    # unicode_escape interprets bytes as Latin-1, so encoding as UTF-8 first
    # would garble every non-ASCII character. Encoding as Latin-1 with
    # backslashreplace round-trips them: characters that do not fit in
    # Latin-1 become \uXXXX escapes, which the decoder turns back.
    decoded = href.encode("latin-1", "backslashreplace").decode("unicode_escape")

    # Replace escaped slashes.
    decoded = decoded.replace("\\/", "/")

    # Remove begin and end quotes, but only when they really are quotes;
    # slicing unconditionally would cut characters off unquoted hrefs.
    if len(decoded) >= 2 and decoded[0] == decoded[-1] and decoded[0] in "\"'":
        decoded = decoded[1:-1]

    # Some vergaderingen are duplicated due to this anchor tag,
    # I remove the tag so the duplicate vergadering is filtered.
    decoded = decoded.split("#")[0]

    return decoded


def save_hrefs(hrefs, year):
    """Write the hrefs of one year to four text files, one href per line.

    The files are created in ``data/ridderkerk/{year}`` (existing files are
    overwritten). Each href goes to the first file whose keyword it contains:

      * "vergadering" -> vergaderingen.txt
      * "document"    -> documenten.txt
      * "module"      -> modules.txt
      * anything else -> rest.txt

    Args:
        hrefs: Iterable of href strings.
        year: Year used as the name of the output directory.
    """
    year_dir = f"data/ridderkerk/{year}"
    # makedirs also creates missing parent directories and does not fail when
    # the directory already exists, so this function works on a clean checkout.
    os.makedirs(year_dir, exist_ok=True)

    with open(f"{year_dir}/vergaderingen.txt", "w") as vergaderingen_f, \
            open(f"{year_dir}/documenten.txt", "w") as documenten_f, \
            open(f"{year_dir}/modules.txt", "w") as modules_f, \
            open(f"{year_dir}/rest.txt", "w") as rest_f:
        for href in hrefs:
            # Order matters: an href containing several keywords is stored
            # under the first one that matches.
            if "vergadering" in href:
                vergaderingen_f.write(href + "\n")
            elif "document" in href:
                documenten_f.write(href + "\n")
            elif "module" in href:
                modules_f.write(href + "\n")
            else:
                rest_f.write(href + "\n")

In [11]:
# Make sure the output root exists; makedirs also creates the intermediate
# "data" directory and is a no-op when everything is already there.
os.makedirs("data/ridderkerk", exist_ok=True)

# The four listings that save_hrefs writes for every year.
LISTING_FILES = ("vergaderingen.txt", "documenten.txt", "modules.txt", "rest.txt")

years = range(2010, 2025)
for year in years:
    # The listings are regular files, so they have to be tested with isfile;
    # isdir is always False for them, which caused every year to be scraped
    # again on each run. Delete a year's listings to force a re-scrape.
    if any(
        os.path.isfile(f"data/ridderkerk/{year}/{listing}")
        for listing in LISTING_FILES
    ):
        print(f"year {year} already scraped.")
        continue

    # Walk over the result pages 1..29 of the search for this year and
    # gather every href found on them.
    all_hrefs_year = set()
    for page in range(1, 30):
        BASE_URL = f"https://ridderkerk.notubiz.nl/zoeken/result?keywords=vergadering&limit=25&document_type=&search=send&filter[organisations][]=353&page={page}&filter[date][]={year}"
        all_hrefs_year.update(get_hrefs(BASE_URL))

    # Decode strings and remove duplicates. Anchors without an href show up
    # as None and cannot be decoded, so they are skipped.
    all_hrefs_year = {
        decode_href(href) for href in all_hrefs_year if href is not None
    }

    print(f"Got data for {year}, total links: {len(all_hrefs_year)}")
    save_hrefs(all_hrefs_year, year)

KeyboardInterrupt: 

In [12]:
def download_vergadering(url, name):
    """Download the file linked from a vergadering page.

    Opens ``url``, takes the first element whose ``href`` matches "download"
    and streams that file to ``data/ridderkerk/{year}/videos/{name}``.

    NOTE: ``year`` is not a parameter. It is read from the notebook's global
    scope, so a global ``year`` must be set before calling this function.

    Args:
        url: Address of the vergadering page.
        name: File name to store the download under.

    Returns:
        True when the file was written completely. False when the page could
        not be opened, contains no download link, or the download failed.
    """
    # Built outside the try blocks on purpose: a missing global `year` should
    # surface as a NameError instead of being reported as a failed download.
    target_path = f"data/ridderkerk/{year}/videos/{name}"
    partial_path = f"{target_path}.part"
    # (connect, read) timeouts in seconds, so a stalled server cannot hang
    # the notebook forever.
    timeout = (10, 60)

    # For some reason requests.get gave me 500, urllib works fine.
    try:
        with urllib.request.urlopen(url) as page:
            bsObj = BeautifulSoup(page, "html.parser")
    except (OSError, ValueError) as exc:
        # URLError/HTTPError are OSError subclasses; ValueError is raised for
        # malformed URLs. One unreachable page should not stop a whole batch.
        print(f"Could not open {url}: {exc}")
        return False

    download_url = bsObj.find(href=re.compile("download"))
    if not download_url:
        return False
    download_url = download_url.get("href")

    try:
        with requests.get(download_url, stream=True, timeout=timeout) as r:
            print(f"Downloading {download_url}")
            # Without this an HTTP error page would be stored as the video.
            r.raise_for_status()
            # Stream into a temporary file first, so an aborted download never
            # leaves something behind that looks like a finished file.
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, target_path)
    except Exception as exc:
        print(f"Download of {download_url} failed: {exc}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

    return True

In [18]:
# Download the recording of every vergadering that was listed by the scraper.
# NOTE: the loop variable must stay named `year`; download_vergadering reads
# it from the global scope to build its output path.
for year in years:
    # NOTE(review): presumably there is nothing to download before 2016 --
    # confirm and consider moving this cut-off into a named constant.
    if year < 2016:
        continue

    listing_path = f"data/ridderkerk/{year}/vergaderingen.txt"
    if not os.path.isfile(listing_path):
        # The year was never scraped (or scraping was interrupted); skip it
        # instead of failing with FileNotFoundError.
        print(f"No vergaderingen list for {year}, skipping.")
        continue

    os.makedirs(f"data/ridderkerk/{year}/videos", exist_ok=True)

    with open(listing_path, "r") as vergaderingen_f:
        for url in vergaderingen_f:
            url = url.strip()
            if not url:
                # Ignore blank lines in the list.
                continue

            # The last path segment of the URL is used as the file name.
            code = url.rstrip("/").split("/")[-1]
            # The extension is fixed; the download URL is not inspected.
            extension = "mp4"
            name = f"{code}.{extension}"

            # NOTE(review): `name` already ends in ".mp4", so the first audio
            # check looks for "<code>.mp4.mp4" -- confirm that this is the
            # file name that is meant here.
            if (
                os.path.isfile(
                    f"data/ridderkerk/{year}/videos/{name}"
                )
                or os.path.isfile(
                    f"data/ridderkerk/{year}/audio/{name}.mp4"
                )
                or os.path.isfile(
                    f"data/ridderkerk/{year}/audio/{name}"
                )
            ):
                print(f"{url} already downloaded.")
                continue

            print(f"Trying {url}")
            if not download_vergadering(url, name):
                # Keep a list of the failures so they can be retried later.
                with open(
                    f"data/ridderkerk/{year}/failed.txt", "a+"
                ) as f:
                    f.write(url + "\n")

https://ridderkerk.notubiz.nl/vergadering/248955 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/249009 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/249015 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/248970 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/305610 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/248965 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/249005 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/248967 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/248102 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/294511 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/368995 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/248964 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/248952 already downloaded.
https://ridderkerk.notubiz.nl/vergadering/351851 already downloaded.
https://ridderkerk.notubiz.nl/verg