In [12]:
import requests
from bs4 import BeautifulSoup
import regex as re
from tqdm.notebook import tqdm
import pandas as pd
import time
from math import ceil

import wget
import os

## Ophalen links naar alle Wob-pagina's

Hiervoor is een webpagina nodig en hoeveel pagina's we willen doorzoeken op deze pagina. Helaas heeft de rijksoverheid een limiet van 50 pagina's, waardoor we een verdeling moeten maken op ministerie om per ministerie de Wob-links op te halen.

In [13]:
def get_ministeries():
    """
    Returns all scraped ministeries with their wob-count (how many wobs?)
    """
    docs_link = 'https://www.rijksoverheid.nl/documenten'
    r = requests.get(docs_link)
    soup = BeautifulSoup(r.content, 'html.parser')

    ministeries_search = soup.find_all('option', value=re.compile('Ministerie'))

    ministeries = []
    for ministerie in ministeries_search:
        ministeries.append(ministerie.string.strip())

    ministeries_count = []
    search_link = "https://www.rijksoverheid.nl/documenten?type=Wob-verzoek"
    for ministerie in ministeries:
        search_link_min = search_link + f'&onderdeel={ministerie.replace(" ", "+")}'
        r = requests.get(search_link_min)
        soup = BeautifulSoup(r.content, 'html.parser')
        N = soup.find(class_='summary').span.string
        
        ministeries_count.append((ministerie, int(N)))

    return ministeries_count

In [14]:
def get_wob_links(webpage, number_of_pages):
    """
    Retrieves all links to pages with a Wob-verzoek.
    Give webpage link and number of pages to search.
    """
    source = 'https://www.rijksoverheid.nl'
    wob_links = []

    for i in range(1, number_of_pages + 1):
        r = requests.get(f'{webpage}{i}')

        # Find publications
        soup = BeautifulSoup(r.content, 'html.parser')
        publications = soup.find_all(class_="publication")

        for publication in publications:
            wob_link = source + publication['href']
            wob_links.append(wob_link)

    return wob_links

## Ophalen informatie uit Wob-links

Nadat er een lijst is gemaakt, kunnen we informatie gaan halen uit deze links. Dit is gedaan door te kijken in de source code van deze pagina's en op basis hiervan de metadata te verzamelen.

In [15]:
def get_doc_type(doclink):
    """
    Based on the link to a doc, try to identify the type of document.
    """
    prefix = 'https://www.rijksoverheid.nl/binaries/rijksoverheid/documenten/wob-verzoeken'
    doclink = doclink.lower().strip(prefix)
    
    options = {'Inventaris': ['inventaris'], 'Bijlage': ['bijlage', 'document', 'agenda', 'verslag'], 'Besluit': ['besluit', 'beslissing']}
    
    for option in options:
        for keyword in options[option]:
            if keyword in doclink:
                return option

    return 'Niet herkend'

In [16]:
import re
re.search_all(r"\| (\d+) [a-z]*", "PDF document | 3 pagina's | 842 kB")

AttributeError: module 're' has no attribute 'search_all'

In [21]:
def read_wob(wob_link, wob_id):
    """
    Reads one wob, returns information in json format.
    Really specific for rijksoverheid.nl, because of the unique html tags and information.
    """
    source = 'https://www.rijksoverheid.nl'
    r = requests.get(wob_link)
    wob_meta = dict()

    # Try again in a minute if status code is not OK
    if r.status_code != 200:
        time.sleep(60)
        read_wob(wob_link)

    soup = BeautifulSoup(r.content, 'html.parser')

    wob_meta['url'] = wob_link
    wob_meta['wob_id'] = wob_id

    # Try to find metadata of Wob-verzoek
    try:
        title = soup.find('h1').string.strip()
        wob_meta['titel'] = title
    except:
        pass
    try:
        intro = soup.find(class_='intro').string.strip()
        wob_meta['beschrijving'] = intro
    except:
        pass
    try:
        responsible = soup.find(class_='brick belongsTo').find('a').string.strip()
        wob_meta['verantwoordelijk'] = responsible
    except:
        pass
    
    # Extract date from link
    try:
        date = re.search(r'\d{4}\/\d{2}\/\d{2}', wob_link).group()
        wob_meta['datum_besluit'] = re.sub(r'/', '-', date)
    except:
        pass
    
    # Find all download links for further investigation
    download_links = soup.find_all(class_='download-chunk pdf')
    documenten = []
    for id, link in enumerate(download_links):
        full_link = source + link['href']
        document = dict()
        document['document_id'] = f"{str(wob_id)}-{str(id)}"
        document['url'] = full_link
        document['bestandsnaam'] = full_link.split('/')[-1]
        document['bestandstype'] = full_link.split('/')[-1].split('.')[-1]
        document['documenttype'] = get_doc_type(full_link)
        try:
            document['titel'] = re.search(r"'(.*)'", link.find('span').string.strip()).group(1)
        except:
            pass
        try:
            document['aantal_paginas'] = int(re.search(r"\| (\d+) .* \|", link.find('p').string.strip()).group(1))
        except:
            pass
        try:
            document['grootte'] = re.search(r"\d+ (?!pagina).*", link.find('p').string.strip()).group()
        except:
            pass
        
        documenten.append(document)

    wob_meta['documenten'] = documenten
    wob_meta['aantal_documenten'] = len(documenten)
    return wob_meta

# Example
read_wob('https://www.rijksoverheid.nl/documenten/wob-verzoeken/2022/01/27/besluit-op-wob-verzoek-inzake-pegasus')

{'url': 'https://www.rijksoverheid.nl/documenten/wob-verzoeken/2022/01/27/besluit-op-wob-verzoek-inzake-pegasus',
 'titel': 'Besluit op Wob-verzoek inzake Pegasus',
 'beschrijving': 'De minister van Justitie en Veiligheid heeft op 27 januari 2022 een besluit genomen op een verzoek in het kader van de Wet openbaarheid van bestuur. Het besluit heeft betrekking op informatie inzake de\xa0 NSO Group en Pegasus software.',
 'verantwoordelijk': 'Ministerie van Justitie en Veiligheid',
 'datum_besluit': '2022-01-27',
 'documenten': [{'document_nr': 0,
   'url': 'https://www.rijksoverheid.nl/binaries/rijksoverheid/documenten/wob-verzoeken/2022/01/27/besluit-op-wob-verzoek-inzake-pegasus/Besluit+op+Wob+-verzoek+inzake+Pegasus.pdf',
   'bestandsnaam': 'Besluit+op+Wob+-verzoek+inzake+Pegasus.pdf',
   'bestandstype': 'pdf',
   'type': 'Besluit',
   'titel': 'Besluit op Wob -verzoek inzake Pegasus',
   'aantal_paginas': 6,
   'grootte': '2 MB'},
  {'document_nr': 1,
   'url': 'https://www.rijksover

## Combineren: in één keer alle Wobs lezen!

Onderstaande functie combineert alles:
- Het ophalen van ministeries en het aantal Wobs
- Per ministerie het aantal pagina's doorzoeken (aantal Wobs / 10, want 10 Wobs per pagina) en vinden van links
- Per link een dict met metadata maken

In [9]:
def read_all_wobs():

    search_link = "https://www.rijksoverheid.nl/documenten?type=Wob-verzoek"
    ministeries = get_ministeries()

    print("Extracting links to Wob-pages...")
    wob_links = []
    for ministerie, number_wobs in tqdm(ministeries):
        pages = ceil(number_wobs / 10)

        search_link_min = search_link + f'&onderdeel={ministerie.replace(" ", "+")}' + '&pagina='
        min_wobs = get_wob_links(search_link_min, pages)
        for wob in min_wobs:
            wob_links.append(wob)

    print("Reading Wobs, extracting metadata...")
    wob_meta = []
    for id, wob in tqdm(enumerate(wob_links)):
        wob_meta.append(read_wob(wob, id))

    return wob_meta

wob_meta = read_all_wobs()

Extracting links to Wob-pages...


  0%|          | 0/12 [00:00<?, ?it/s]

Reading Wobs, extracting metadata...


  0%|          | 0/2751 [00:00<?, ?it/s]

Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...
Status code is 429. Trying again in a minute...


In [10]:
import json

with open('wobs_rijksoverheid.json', 'w') as f:
    json.dump(wob_meta, f)

## Inventarissen

Code om op basis van de opgehaalde informatie alle geïdentificeerde inventarissen op te halen.

In [37]:
def download_doc(doc_link, export_dir):

    try:
        wget.download(doc_link, export_dir)
    except:
        time.sleep(60)
        download_doc(doc_link, export_dir)

def get_documents(csv_file, export_dir):
    """
    Extracts all Inventaris documents
    """
    df = pd.read_csv(csv_file)
    df['pdfs'] = df['pdfs'].map(lambda x: x.strip("][").split(', '))

    for idx, wob in tqdm(df.iterrows()):  

        date_string = str(wob['datum']).replace('/', '-')
        
        for doc_link in wob['pdfs']:
            doc_link = doc_link.strip("'")
            doc_type = get_doc_type(doc_link)

            if doc_type == 'Inventaris':
                # try:
                #     os.makedirs(f"{export_dir}/{wob['titel']} {date_string}/{doc_type}")
                # except FileExistsError:
                #     pass
                    
                download_doc(doc_link, export_dir)
                

get_documents('../data/wobs_rijksoverheid.csv', 'D:/School/wobs')

0it [00:00, ?it/s]