In [3]:
import ast
import os
import time
from math import ceil

import pandas as pd
import regex as re
import requests
import wget
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
def get_wob_links(webpage, number_of_pages, retry_delay=60, max_retries=5):
    """
    Retrieves all links to pages with a Wob-verzoek.

    Parameters
    ----------
    webpage : str
        Search URL to which the 1-based page number is appended directly
        (so it should end in the page parameter, e.g. '...&pagina=').
    number_of_pages : int
        Number of result pages to fetch (pages 1..number_of_pages).
    retry_delay : float, optional
        Seconds to wait before re-requesting a page that did not answer
        with HTTP 200.
    max_retries : int, optional
        Maximum number of extra requests per page. A page that still fails
        afterwards is reported and skipped.

    Returns
    -------
    list of str
        Absolute URLs, in page order, built from the 'href' of every
        element with class "publication".
    """
    source = 'https://www.rijksoverheid.nl'
    wob_links = []

    for i in range(1, number_of_pages + 1):
        page_url = f'{webpage}{i}'
        r = requests.get(page_url)

        # A rate-limited (or otherwise failed) response contains no
        # publications, so parsing it would silently drop a whole page of
        # links. Re-request the same page instead.
        retries = 0
        while r.status_code != 200 and retries < max_retries:
            print(f"Status code is {r.status_code}. Trying again in {retry_delay} seconds...")
            time.sleep(retry_delay)
            r = requests.get(page_url)
            retries += 1

        if r.status_code != 200:
            print(f"Giving up on {page_url} (status code {r.status_code}).")
            continue

        # Find publications
        soup = BeautifulSoup(r.content, 'html.parser')
        for publication in soup.find_all(class_="publication"):
            # Skip elements without an href rather than raising KeyError.
            href = publication.get('href')
            if href:
                wob_links.append(source + href)

    return wob_links

In [4]:
def read_wob(wob_link):
    """
    Reads one Wob page and returns its metadata.

    The page is re-requested once a minute until the server answers with
    HTTP 200, so this call blocks for as long as the page keeps failing.

    Parameters
    ----------
    wob_link : str
        URL of a single Wob-verzoek page.

    Returns
    -------
    dict
        Keys: 'link' (the input URL), 'titel' (text of the first <h1>),
        'intro' (text of the element with class 'intro'),
        'datum' (first 'YYYY/MM/DD' found in the URL),
        'verantwoordelijk' (text of the first <a> inside the element with
        class 'brick belongsTo') and 'pdfs' (absolute URLs of every element
        with class 'download-chunk pdf'). A metadata field that cannot be
        found is None.
    """
    source = 'https://www.rijksoverheid.nl'
    r = requests.get(wob_link)

    # Try again in a minute if status code is not OK. The response must be
    # replaced by the new one: retrying without using the result would leave
    # the failed response to be parsed below.
    while r.status_code != 200:
        print(f"Status code is {r.status_code}. Trying again in a minute...")
        time.sleep(60)
        r = requests.get(wob_link)

    soup = BeautifulSoup(r.content, 'html.parser')

    def _text_or_none(find_element):
        # A missing element (find() -> None) or an element without a single
        # text child (.string -> None) both surface as AttributeError.
        try:
            return find_element().string.strip()
        except AttributeError:
            return None

    # Try to find metadata of Wob-verzoek
    title = _text_or_none(lambda: soup.find('h1'))
    intro = _text_or_none(lambda: soup.find(class_='intro'))
    responsible = _text_or_none(lambda: soup.find(class_='brick belongsTo').find('a'))

    # Extract date from link
    date_match = re.search(r'\d{4}\/\d{2}\/\d{2}', wob_link)
    date = date_match.group() if date_match else None

    # Find all download links for further investigation
    all_pdf_links = [
        source + link['href']
        for link in soup.find_all(class_='download-chunk pdf')
    ]

    return {'link': wob_link, 'titel': title, 'intro': intro, 'datum': date, 'verantwoordelijk': responsible, 'pdfs': all_pdf_links}

# Try read_wob on a single Wob page; the returned metadata dict is shown as the cell output.
read_wob('https://www.rijksoverheid.nl/documenten/wob-verzoeken/2022/02/15/besluit-wob-verzoek-correspondentie-kamerbrief-voortgang-stikstofproblematiek')

{'link': 'https://www.rijksoverheid.nl/documenten/wob-verzoeken/2022/02/15/besluit-wob-verzoek-correspondentie-kamerbrief-voortgang-stikstofproblematiek',
 'titel': 'Besluit Wob-verzoek correspondentie Kamerbrief voortgang stikstofproblematiek',
 'intro': 'Besluit op een verzoek om documenten over de afstemming rondom de Kamerbrief van 12 november 2021 over de voortgang van de stikstofproblematiek. Het gaat om een verzoek op basis van de Wet openbaarheid van bestuur (Wob).',
 'datum': '2022/02/15',
 'verantwoordelijk': 'Ministerie van Landbouw, Natuur en Voedselkwaliteit',
 'pdfs': ['https://www.rijksoverheid.nl/binaries/rijksoverheid/documenten/wob-verzoeken/2022/02/15/besluit-wob-verzoek-correspondentie-kamerbrief-voortgang-stikstofproblematiek/besluit-wob-kamerbrief-stikstofproblematiek.pdf',
  'https://www.rijksoverheid.nl/binaries/rijksoverheid/documenten/wob-verzoeken/2022/02/15/besluit-wob-verzoek-correspondentie-kamerbrief-voortgang-stikstofproblematiek/besluit-wob-kamerbrief-s

In [5]:
def get_ministeries():
    """
    Collect every ministry offered on the documents page together with the
    number of Wob-verzoeken the site reports for it.

    Returns a list of (ministry name, count) tuples, in the order in which
    the ministries appear in the page's <option> elements.
    """
    overview_url = 'https://www.rijksoverheid.nl/documenten'
    overview = BeautifulSoup(requests.get(overview_url).content, 'html.parser')

    # Ministries are the <option> elements whose value matches 'Ministerie'.
    names = [
        option.string.strip()
        for option in overview.find_all('option', value=re.compile('Ministerie'))
    ]

    wob_search = "https://www.rijksoverheid.nl/documenten?type=Wob-verzoek"
    counts = []
    for name in names:
        # Spaces become '+' in the 'onderdeel' query parameter.
        query_value = name.replace(" ", "+")
        response = requests.get(wob_search + f'&onderdeel={query_value}')
        result_page = BeautifulSoup(response.content, 'html.parser')

        # The hit count is the text of the <span> inside the 'summary' element.
        hits = result_page.find(class_='summary').span.string
        counts.append((name, int(hits)))

    return counts

In [6]:
def read_all_wobs():
    """
    Scrape the metadata of every Wob-verzoek, ministry by ministry.

    First gathers the links to all Wob pages from the paginated search
    results of each ministry, then reads every page with read_wob.
    Returns the list of metadata dicts in the order the links were found.
    """
    base_query = "https://www.rijksoverheid.nl/documenten?type=Wob-verzoek"
    ministry_counts = get_ministeries()

    print("Extracting links to Wob-pages...")
    collected_links = []
    for name, total in tqdm(ministry_counts):
        # The page count is derived by dividing the reported total by 10.
        page_count = ceil(total / 10)
        paged_url = base_query + f'&onderdeel={name.replace(" ", "+")}' + '&pagina='
        collected_links.extend(get_wob_links(paged_url, page_count))

    print("Reading Wobs, extracting metadata...")
    return [read_wob(link) for link in tqdm(collected_links)]

# Run the complete scrape: collect every Wob link, then fetch the metadata of each page.
wob_meta = read_all_wobs()

Extracting links to Wob-pages...


  0%|          | 0/12 [00:00<?, ?it/s]

Reading Wobs, extracting metadata...


  0%|          | 0/2703 [00:00<?, ?it/s]

Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...


In [7]:
# Put the scraped metadata records in a DataFrame (one row per Wob page).
df = pd.DataFrame(wob_meta)

# Save to CSV in the working directory.
# NOTE(review): get_documents is later called with '../data/wobs_rijksoverheid.csv' —
# confirm the file is moved or copied there before that cell runs.
df.to_csv('wobs_rijksoverheid.csv')

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
# Manual check: download a single PDF into ../data.
wget.download('https://www.rijksoverheid.nl/binaries/rijksoverheid/documenten/wob-verzoeken/2021/06/29/besluit-op-een-wob-verzoek-over-adviesopdracht-visitatiecommissie-van-het-dolfinarium-harderwijk/Bijlage+3+Dolfinarium.pdf', '../data')

'../data/Bijlage+3+Dolfinarium.pdf'

In [16]:
def get_doc_type(doclink):
    """
    Classify a document link as 'Inventaris', 'Bijlage' or 'Besluit'.

    The link is lower-cased, the common rijksoverheid.nl Wob prefix is
    removed, and the remainder (folders and file name) is searched for
    keywords. Categories are checked in the order Inventaris, Bijlage,
    Besluit; the first category with a matching keyword wins.

    Parameters
    ----------
    doclink : str
        URL (or path) of a document.

    Returns
    -------
    str
        'Inventaris', 'Bijlage', 'Besluit', or 'Niet herkend' when no
        keyword occurs in the link.
    """
    prefix = 'https://www.rijksoverheid.nl/binaries/rijksoverheid/documenten/wob-verzoeken'
    doclink = doclink.lower()

    # Remove the prefix as a prefix. str.strip(prefix) would instead strip
    # every leading and trailing character that occurs anywhere in `prefix`,
    # which can eat the keywords themselves (e.g. 'inventarislijst.pdf' -> 'f').
    if doclink.startswith(prefix):
        doclink = doclink[len(prefix):]

    options = {'Inventaris': ['inventaris'], 'Bijlage': ['bijlage', 'document', 'agenda', 'verslag'], 'Besluit': ['besluit', 'beslissing']}

    for option, keywords in options.items():
        if any(keyword in doclink for keyword in keywords):
            return option

    return 'Niet herkend'

In [37]:
def download_doc(doc_link, export_dir, retry_delay=60, max_retries=None):
    """
    Download one document with wget, retrying after a pause when it fails.

    Parameters
    ----------
    doc_link : str
        URL of the document to download.
    export_dir : str
        Output location passed to wget.download.
    retry_delay : float, optional
        Seconds to wait between attempts.
    max_retries : int or None, optional
        Maximum number of retries after the first attempt. None (default)
        keeps retrying until the download succeeds.

    Returns
    -------
    The value returned by wget.download for the successful attempt.

    Raises
    ------
    Exception
        The last download error, once max_retries is exhausted.
    """
    attempt = 0
    while True:
        try:
            return wget.download(doc_link, export_dir)
        # Catch Exception rather than everything, so KeyboardInterrupt still
        # stops the download loop.
        except Exception:
            if max_retries is not None and attempt >= max_retries:
                raise
            attempt += 1
            time.sleep(retry_delay)

def get_documents(csv_file, export_dir):
    """
    Downloads all Inventaris documents listed in the scraped CSV.

    Parameters
    ----------
    csv_file : str
        Path to the CSV with scraped Wob metadata. Its 'pdfs' column holds,
        per row, the string representation of a list of document URLs.
    export_dir : str
        Directory to download into; it is created when it does not exist.

    Every URL that get_doc_type classifies as 'Inventaris' is passed to
    download_doc; all other documents are skipped.
    """
    df = pd.read_csv(csv_file)

    # The lists were written to CSV as their Python repr. Parse them as
    # literals instead of splitting on ', ', so quoting and empty lists
    # ('[]' -> no links) are handled correctly.
    pdf_lists = df['pdfs'].map(ast.literal_eval)

    # Make sure the target directory exists before the first download.
    os.makedirs(export_dir, exist_ok=True)

    # Pass the total explicitly so the progress bar shows how far along we are.
    for doc_links in tqdm(pdf_lists, total=len(pdf_lists)):
        for doc_link in doc_links:
            if get_doc_type(doc_link) == 'Inventaris':
                download_doc(doc_link, export_dir)
                

# Download all Inventaris documents listed in the scraped CSV.
# NOTE(review): 'D:/School/wobs' is a machine-specific absolute path — adjust before re-running.
get_documents('../data/wobs_rijksoverheid.csv', 'D:/School/wobs')

0it [00:00, ?it/s]

In [24]:
# Scratch check: turn a 'YYYY/MM/DD' date string into 'YYYY-MM-DD'.
'2021/11/29'.replace('/', '-')

'2021-11-29'