In [1]:
import requests
import os
import pandas as pd

In [4]:
##########
# This code will take several minutes to run. 
# It will download 1448 documents which will take up 369 MB of space.
# If you wish to extend the dataset with more years' speeches, modify lines 9, 14, 59 and 63 for each new year.
##########

# Initial setup
base_url = "https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/Document"
query_params = "?$filter=Soort eq 'Stenogram'"  # change the year here to get data for other years.
page_size = 250
skip = 0

# Ensure there is a directory to save the downloaded PDFs
pdf_folder = 'tweede_pdfs'
os.makedirs(pdf_folder, exist_ok=True)

# Function to make HTTP GET requests
def fetch_data(url):
    response = requests.get(url)
    response.raise_for_status()  # Will stop the loop if an HTTP error occurs
    return response.json()

# Function to download a PDF from a URL
def download_pdf(pdf_url, filename):
    print(f"Download {filename}")
    response = requests.get(pdf_url)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)

# Loop to fetch all pages and download PDFs
file_paths = []
file_names = []
while True:
    full_url = f"{base_url}{query_params}&$skip={skip}"
    data = fetch_data(full_url)
   
    if 'value' in data and len(data['value']) > 0:
        for entry in data['value']:
            # Construct the URL for the PDF resource
            doc_id = entry['Id']
            pdf_url = f"{base_url}/{doc_id}/resource"
            file_names.append(doc_id + '.pdf')
           
            # Download the PDF and save it
            pdf_filename = os.path.join(pdf_folder, f"{doc_id}.pdf")
            file_paths.append(pdf_filename)
            download_pdf(pdf_url, pdf_filename)
       
        skip += page_size  # Prepare for the next page
        if len(data['value']) < page_size:
            break
    else:
        break

print(f"Downloaded all PDFs to {pdf_folder}")

# save filenames to text file so they can be reused without re-downloading the files:
# Note that if you are downloading a new year, you should probably rename this file to match.
with open('filepaths.txt', 'w') as file:
    for item in file_paths:
        file.write(item + '\n')

with open('filenames.txt', 'w') as file:
    for item in file_names:
        file.write(item + '\n')

Download tweede_pdfs/dd315b74-4da5-4ce8-9767-00042ac46323.pdf
Download tweede_pdfs/15dc5dc3-53fc-43d8-bc74-000788c06957.pdf
Download tweede_pdfs/9cf6538c-6b2a-431d-8bb2-000b32ea89ab.pdf
Download tweede_pdfs/5f8392cc-32e4-4fb9-8888-000e80fcc1af.pdf
Download tweede_pdfs/d5fc3943-f174-4e62-8959-000f196aa3cd.pdf
Download tweede_pdfs/0db4593d-8fd5-4951-88c2-001425b404b0.pdf
Download tweede_pdfs/33ff8421-bc8c-4e56-9dbe-001735752003.pdf
Download tweede_pdfs/9bf6dd72-a4bf-4603-ac52-0018b4939fae.pdf
Download tweede_pdfs/1a5f2583-1a6c-4d48-851a-0019f1e46e23.pdf
Download tweede_pdfs/77d767e4-4213-4b04-b7a1-001d41dab90d.pdf
Download tweede_pdfs/e9950d7c-2780-46bf-942a-002502dd4d51.pdf
Download tweede_pdfs/02e2392c-6227-4b6f-82d7-0025d585cdf4.pdf
Download tweede_pdfs/f93a1d7a-643b-4dfe-80ef-00289077d44b.pdf
Download tweede_pdfs/de4fa69e-84be-4996-8323-002895d021a4.pdf
Download tweede_pdfs/e344ea2f-97c6-4eb5-b76d-00293f5a730e.pdf
Download tweede_pdfs/b48c6e61-58ae-4895-a75f-002c95ffc491.pdf
Download

In [5]:
""" 
uses the saved text file to recreate the list
"""
with open('filepaths.txt', 'r') as file:
    file_paths = file.read().splitlines()
print(file_paths)

with open('filenames.txt', 'r') as file:
    file_names = file.read().splitlines()
print(file_names)

['tweede_pdfs/dd315b74-4da5-4ce8-9767-00042ac46323.pdf', 'tweede_pdfs/15dc5dc3-53fc-43d8-bc74-000788c06957.pdf', 'tweede_pdfs/9cf6538c-6b2a-431d-8bb2-000b32ea89ab.pdf', 'tweede_pdfs/5f8392cc-32e4-4fb9-8888-000e80fcc1af.pdf', 'tweede_pdfs/d5fc3943-f174-4e62-8959-000f196aa3cd.pdf', 'tweede_pdfs/0db4593d-8fd5-4951-88c2-001425b404b0.pdf', 'tweede_pdfs/33ff8421-bc8c-4e56-9dbe-001735752003.pdf', 'tweede_pdfs/9bf6dd72-a4bf-4603-ac52-0018b4939fae.pdf', 'tweede_pdfs/1a5f2583-1a6c-4d48-851a-0019f1e46e23.pdf', 'tweede_pdfs/77d767e4-4213-4b04-b7a1-001d41dab90d.pdf', 'tweede_pdfs/e9950d7c-2780-46bf-942a-002502dd4d51.pdf', 'tweede_pdfs/02e2392c-6227-4b6f-82d7-0025d585cdf4.pdf', 'tweede_pdfs/f93a1d7a-643b-4dfe-80ef-00289077d44b.pdf', 'tweede_pdfs/de4fa69e-84be-4996-8323-002895d021a4.pdf', 'tweede_pdfs/e344ea2f-97c6-4eb5-b76d-00293f5a730e.pdf', 'tweede_pdfs/b48c6e61-58ae-4895-a75f-002c95ffc491.pdf', 'tweede_pdfs/dcf9a60d-e395-41f0-891b-002d7b1e758d.pdf', 'tweede_pdfs/cc2daefa-5365-42dd-81d0-003549861b