In [3]:
import time
from math import ceil
from urllib.parse import urljoin

import pandas as pd
import regex as re
import requests
import wget
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [16]:
def get_wob_links(first_page=1, last_page=94):
    """
    Collect links to Wob decision pages from the utrecht.nl overview listing.

    Every overview page in the inclusive range ``first_page``..``last_page``
    is downloaded and searched for anchors inside the element with class
    ``limiter``. An anchor is kept when its visible text contains 'besluit'.

    Parameters
    ----------
    first_page : int
        Number of the first overview page to fetch (default 1).
    last_page : int
        Number of the last overview page to fetch, inclusive (default 94,
        the range the notebook was originally run with).

    Returns
    -------
    list of str
        The ``href`` values of the matching anchors, in page order.
    """
    page_url = ('https://www.utrecht.nl/bestuur-en-organisatie/publicaties/'
                'openbaar-gemaakte-informatie-na-wob-verzoeken/page/{}/')

    wob_links = []
    for pagenumber in tqdm(range(first_page, last_page + 1)):
        r = requests.get(page_url.format(pagenumber))
        soup = BeautifulSoup(r.content, 'html.parser')

        # A page without the listing container has nothing to offer; skip it
        # instead of failing with an AttributeError on None.
        container = soup.find(class_='limiter')
        if container is None:
            continue

        # href=True leaves out anchors that carry no href attribute at all.
        for a in container.find_all('a', href=True):
            # get_text() also works for anchors with nested markup, where
            # .string is None and the 'in' test would raise a TypeError.
            if 'besluit' in a.get_text():
                wob_links.append(a['href'])
    return wob_links

# Collect the decision-page links once up front so they can be inspected below.
# Note: read_all_wobs() calls get_wob_links() itself, so this scrape is repeated there.
wob_links = get_wob_links()

  0%|          | 0/94 [00:00<?, ?it/s]

In [17]:
# Sanity check: how many decision-page links were collected.
len(wob_links)

782

In [43]:
def get_doc_type(doclink):
    """
    Based on the link to a doc, try to identify the type of document.

    The link is lower-cased and the Wob upload directory is removed before
    matching, because that directory contains the word 'documenten' and
    would otherwise make every link match the 'document' keyword. The
    directory is removed as a literal path fragment, independent of scheme
    and host, so root-relative and ``http://`` links are handled as well.

    Parameters
    ----------
    doclink : str
        URL (absolute or root-relative) of the document.

    Returns
    -------
    str
        'Inventaris', 'Bijlage', 'Besluit' or 'Verzoek' for the first type
        (checked in that order) that has a keyword occurring in the link,
        or 'Niet herkend' when no keyword matches.
    """
    upload_dir = '/fileadmin/uploads/documenten/7.extern/wob'
    # Plain string replacement: the path is a literal, not a regex pattern.
    searchable = doclink.lower().replace(upload_dir, '')

    # Order matters: the first type with a matching keyword wins.
    keywords_by_type = (
        ('Inventaris', ('inventaris',)),
        ('Bijlage', ('bijlage', 'document', 'agenda', 'verslag')),
        ('Besluit', ('besluit', 'beslissing')),
        ('Verzoek', ('verzoek',)),
    )

    for doc_type, keywords in keywords_by_type:
        if any(keyword in searchable for keyword in keywords):
            return doc_type

    return 'Niet herkend'

In [44]:
def read_wob(wob_link, max_retries=3, retry_delay=60):
    """
    Read one Wob decision page and return its metadata as a dict.

    The parsing is specific to utrecht.nl: the title is taken from the first
    ``h1``, the description from the first element with class
    ``MsoNoSpacing`` and the downloadable documents from the anchors inside
    the element with class ``limiter``.

    Parameters
    ----------
    wob_link : str
        URL of the Wob decision page.
    max_retries : int
        How many times the request is repeated when the status code is not
        200 (default 3).
    retry_delay : int or float
        Seconds to wait before each retry (default 60).

    Returns
    -------
    dict
        Always contains 'verantwoordelijk' and 'documenten'. The keys
        'titel', 'beschrijving' and 'datum_besluit' are only present when
        the corresponding information could be found on the page. Each entry
        of 'documenten' is a dict with 'document_nr', 'link', 'bestandsnaam'
        and 'type'.

    Raises
    ------
    requests.HTTPError
        When the page still answers with a 4xx/5xx status after all retries.
    """
    source = 'https://www.utrecht.nl'

    # Retry in a loop and actually use the new response; a recursive call
    # whose result is thrown away would leave us parsing the failed page.
    r = requests.get(wob_link)
    retries_left = max(0, max_retries)
    while r.status_code != 200 and retries_left > 0:
        time.sleep(retry_delay)
        retries_left -= 1
        r = requests.get(wob_link)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, 'html.parser')
    wob_meta = dict()

    # Metadata is optional: leave a key out when the element is missing or
    # has no single text child (.string is None in that case).
    heading = soup.find('h1')
    if heading is not None and heading.string is not None:
        wob_meta['titel'] = heading.string.strip()

    intro_tag = soup.find(class_='MsoNoSpacing')
    if intro_tag is not None and intro_tag.string is not None:
        intro = intro_tag.string.strip()
        wob_meta['beschrijving'] = intro

        # First "<day> <month name> <year>" occurrence in the description.
        date = re.search(r'\d+ [a-zA-Z]+ \d{4}', intro)
        if date is not None:
            wob_meta['datum_besluit'] = date.group()

    wob_meta['verantwoordelijk'] = 'Gemeente Utrecht'

    # Find all download links for further investigation. Anchors without an
    # href are skipped, and a missing container yields an empty list.
    container = soup.find(class_='limiter')
    anchors = container.find_all('a', href=True) if container is not None else []

    documenten = []
    for document_nr, anchor in enumerate(anchors):
        # urljoin keeps absolute hrefs intact and resolves relative ones
        # against the site root, where plain concatenation would mangle them.
        full_link = urljoin(source, anchor['href'])
        documenten.append({
            'document_nr': document_nr,
            'link': full_link,
            'bestandsnaam': full_link.split('/')[-1],
            'type': get_doc_type(full_link),
        })

    wob_meta['documenten'] = documenten
    return wob_meta

In [45]:
def read_all_wobs():
    """
    Scrape every Wob decision page and return the collected metadata.

    First gathers the decision-page links with get_wob_links(), then calls
    read_wob() on each of them while showing a progress bar.

    Returns
    -------
    list
        One read_wob() result per link, in the order the links were found.
    """
    print("Getting Wob links...")
    links = get_wob_links()

    print("Extracting Wob metadata...")
    return [read_wob(url) for url in tqdm(links)]

# Run the full scrape; the result is a list with one metadata dict per Wob decision page.
wob_meta = read_all_wobs()

Getting Wob links...


  0%|          | 0/94 [00:00<?, ?it/s]

Extracting Wob metadata...


  0%|          | 0/782 [00:00<?, ?it/s]

In [46]:
import json

# Persist the scraped metadata as JSON so it can be reused without re-scraping.
with open('wobs_utrecht.json', mode='w') as out_file:
    json.dump(wob_meta, fp=out_file)