In [10]:
import os
import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup

In [11]:
# Scrapes all hrefs on a page. Returns a set to prevent duplicate hrefs.
def get_hrefs(url, timeout=30):
    """Collect the href value of every anchor tag on a page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection blocks the notebook
            until the kernel is interrupted.

    Returns:
        A set with the raw (still undecoded) href strings found on the page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # href=True skips anchors that have no href attribute; for those
    # link.get("href") is None, which is not a link and breaks the string
    # handling applied to the result afterwards.
    return {link.get("href") for link in soup.find_all("a", href=True)}


def decode_href(href):
    """Turn an escaped href from the search result into a plain URL.

    Backslash escapes are resolved, escaped slashes (``\\/``) become ``/``,
    one pair of surrounding quotes is removed and any ``#fragment`` is cut off.

    Args:
        href: Raw href string as scraped from the page.

    Returns:
        The decoded href without surrounding quotes or fragment.
    """
    # unicode_escape reads bytes as latin-1, so encoding with utf-8 first would
    # turn every non-ASCII character into mojibake. Encoding as latin-1 keeps
    # those characters intact; anything outside latin-1 is written as a
    # \uXXXX escape that unicode_escape decodes right back.
    decoded = href.encode("latin-1", "backslashreplace").decode("unicode_escape")
    # Replace escaped slashes.
    decoded = decoded.replace("\\/", "/")
    # Remove begin and end quotes, but only when they are really there so an
    # unquoted href does not lose its first and last character.
    if len(decoded) >= 2 and decoded[0] == decoded[-1] and decoded[0] in "\"'":
        decoded = decoded[1:-1]
    # Some vergaderingen are duplicated due to this anchor tag,
    # I remove the tag so the duplicate vergadering is filtered.
    return decoded.split("#")[0]


def save_hrefs(hrefs, year, base_dir="/Volumes/Samsung_T5/data/hoekschewaard"):
    """Write the links of one year to four text files, one link per line.

    Each href lands in exactly one file, decided by the first matching
    substring: "vergadering" -> vergaderingen.txt, "document" ->
    documenten.txt, "module" -> modules.txt, anything else -> rest.txt.
    All four files are (re)written on every call, even when they stay empty.

    Args:
        hrefs: Iterable of href strings.
        year: Year the links belong to; used as the sub folder name.
        base_dir: Folder that holds the per-year sub folders.
    """
    year_dir = os.path.join(base_dir, str(year))
    # makedirs also creates missing parent folders and does not fail when the
    # folder already exists.
    os.makedirs(year_dir, exist_ok=True)

    buckets = {"vergaderingen": [], "documenten": [], "modules": [], "rest": []}
    # Sorted so the files are identical between runs; a set has no stable order.
    for href in sorted(hrefs):
        if "vergadering" in href:
            buckets["vergaderingen"].append(href)
        elif "document" in href:
            buckets["documenten"].append(href)
        elif "module" in href:
            buckets["modules"].append(href)
        else:
            buckets["rest"].append(href)

    for bucket_name, links in buckets.items():
        with open(os.path.join(year_dir, f"{bucket_name}.txt"), "w") as bucket_f:
            bucket_f.writelines(link + "\n" for link in links)

In [12]:
# Make sure the output folder exists; makedirs also creates the intermediate
# "data" folder and is a no-op when everything is already there.
os.makedirs("/Volumes/Samsung_T5/data/hoekschewaard", exist_ok=True)

years = range(2018, 2025)
for year in years:
    # A year counts as scraped as soon as one of its output files exists.
    # These are regular files, so they have to be tested with isfile; the
    # isdir test used before never matched and every year was scraped again.
    # Delete the files of a year to force a fresh scrape.
    if any(
        os.path.isfile(f"/Volumes/Samsung_T5/data/hoekschewaard/{year}/{filename}")
        for filename in (
            "vergaderingen.txt",
            "documenten.txt",
            "modules.txt",
            "rest.txt",
        )
    ):
        print(f"year {year} already scraped.")
        continue

    # Walk through result pages 1..29 of the search and collect every link.
    all_hrefs_year = set()
    for page in range(1, 30):
        BASE_URL = f"https://hoekschewaard.notubiz.nl/zoeken/result?keywords=vergadering&limit=25&document_type=&search=send&filter[organisations][]=3398&page={page}&filter[date][]={year}"
        all_hrefs_year.update(get_hrefs(BASE_URL))

    # Decode strings and remove duplicates.
    all_hrefs_year = {decode_href(href) for href in all_hrefs_year}

    print(f"Got data for {year}, total links: {len(all_hrefs_year)}")
    save_hrefs(all_hrefs_year, year)

Got data for 2018, total links: 0


KeyboardInterrupt: 

In [13]:
def download_vergadering(url, name, target_dir=None):
    """Download the file behind the first "download" link on a vergadering page.

    Args:
        url: Address of the vergadering page.
        name: File name to store the download under.
        target_dir: Folder to store the file in. When omitted, the videos
            folder of the notebook-level ``year`` variable is used, so that
            variable must be set by the calling loop.

    Returns:
        True when the file was downloaded completely. False when the page
        could not be opened, has no download link, or the download failed.
    """
    if target_dir is None:
        # Backward compatible default; reads the global `year` of the notebook.
        target_dir = f"/Volumes/Samsung_T5/data/hoekschewaard/{year}/videos"

    # For some reason requests.get gave me 500 on these pages, urllib works fine.
    try:
        with urllib.request.urlopen(url) as page:
            bsObj = BeautifulSoup(page, "html.parser")
    except (OSError, ValueError) as err:
        # URLError/HTTPError are OSErrors; ValueError covers malformed urls.
        # Report it as a failed download instead of aborting the whole run.
        print(f"Could not open {url}: {err}")
        return False

    download_link = bsObj.find(href=re.compile("download"))
    if not download_link:
        return False
    download_url = download_link.get("href")

    target_path = os.path.join(target_dir, name)
    # Download to a temporary name first. An interrupted download then never
    # leaves a truncated file under the final name, which a later run would
    # mistake for a finished download.
    partial_path = target_path + ".part"

    try:
        with requests.get(download_url, stream=True) as r:
            # Do not store an error page as if it were the video.
            r.raise_for_status()
            print(f"Downloading {download_url}")
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, target_path)
    except Exception as err:
        print(f"Download of {download_url} failed: {err}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

    return True

In [17]:
# NOTE: the loop variable has to be called `year`; download_vergadering reads
# it as a global to decide where the file is stored.
for year in years:
    # Only 2020 and 2021 are downloaded in this run.
    if year not in (2020, 2021):
        continue

    year_dir = f"/Volumes/Samsung_T5/data/hoekschewaard/{year}"

    with open(f"{year_dir}/vergaderingen.txt", "r") as vergaderingen_f:
        os.makedirs(f"{year_dir}/videos", exist_ok=True)

        for line in vergaderingen_f:
            url = line.strip()
            if not url:
                # A blank line would otherwise become a download of ".mp4".
                continue

            # The last path segment of the url is used as the file name.
            code = url.split("/")[-1]
            # extension = download_url.split(".")[-1]
            extension = "mp4"
            name = f"{code}.{extension}"

            # NOTE(review): `name` already ends in ".mp4", so the second path
            # checks "<code>.mp4.mp4" - confirm this matches how the files in
            # the audio folder are named.
            known_locations = (
                f"{year_dir}/videos/{name}",
                f"{year_dir}/audio/{name}.mp4",
                f"{year_dir}/audio/{name}",
            )
            if any(os.path.isfile(location) for location in known_locations):
                print(f"{url} already downloaded.")
                continue

            print(f"Trying {url}")
            if not download_vergadering(url, name):
                # Keep track of the failures so they can be retried later.
                with open(f"{year_dir}/failed.txt", "a") as failed_f:
                    failed_f.write(url + "\n")

https://hoekschewaard.notubiz.nl/vergadering/760192 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/760197 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/743807 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/763769 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/760189 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/760202 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/729315 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/763895 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/760199 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/760183 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/736817 already downloaded.
Trying https://hoekschewaard.notubiz.nl/vergadering/696761
https://hoekschewaard.notubiz.nl/vergadering/730739 already downloaded.
https://hoekschewaard.notubiz.nl/vergadering/760217 already downloaded.
https