In [1]:
import json
import itertools
import os
import re
from urllib.parse import urlsplit

import requests
from lxml import html as etree

import pdfcutter

In [2]:
# Site root. NOTE(review): not referenced by any code visible in this notebook —
# confirm it is unused before removing.
BASE_URL = 'https://www.landesvertretung.bremen.de'
# Index page that get_pdf_urls() downloads and scans for session links.
INDEX_URL = 'https://www.diebevollmaechtigte.bremen.de/service/bundesratsbeschluesse-17466'
# Template for the per-session PDF URL; `{number}` is filled via str.format
# with the session number (the literal `%20` is left untouched by str.format).
PDF_URL = 'https://www.diebevollmaechtigte.bremen.de/sixcms/media.php/13/{number}.%20BR-Sitzung_Kurzbericht.pdf'

In [3]:
# Matches link captions such as "973. Sitzung" and captures the session number.
LINK_TEXT_RE = re.compile(r'(\d+)\. Sitzung')

def get_pdf_urls():
    """Yield ``(session_number, pdf_url)`` pairs scraped from ``INDEX_URL``.

    Every ``ul/li/a/span`` element of the index page whose text matches
    ``LINK_TEXT_RE`` contributes one pair; the URL is built from ``PDF_URL``
    and is not read from the link itself.

    Raises:
        requests.RequestException: if the index page cannot be fetched
            (network failure, timeout, or a non-success HTTP status).
    """
    # A timeout keeps a stalled server from hanging the notebook forever.
    response = requests.get(INDEX_URL, timeout=60)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently producing an empty mapping.
    response.raise_for_status()
    root = etree.fromstring(response.content)
    for name in root.xpath('.//ul/li/a/span'):
        match = LINK_TEXT_RE.search(name.text_content())
        if match is None:
            # Not a session link.
            continue
        num = int(match.group(1))
        yield num, PDF_URL.format(number=num)

In [4]:
# Mapping of session number (int) -> PDF URL, built by running get_pdf_urls() once.
PDF_URLS = dict(get_pdf_urls())

In [5]:
# Directory that get_filename_url() stores downloaded files in; exist_ok makes
# re-running this cell harmless.
os.makedirs('./_cache', exist_ok=True)

def get_filename_url(url, timeout=60):
    """Return the local cache path for ``url``, downloading it on a cache miss.

    The cache file name is the URL's path component with every ``/`` replaced
    by ``_``, stored under ``./_cache``. The query string is ignored, so URLs
    that differ only in their query share one cache entry.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Path of the cached file.

    Raises:
        Exception: if the server answers with a status other than 200.
        requests.RequestException: on network failure or timeout.
    """
    cache_dir = './_cache'
    splitresult = urlsplit(url)
    filename = os.path.join(cache_dir, splitresult.path.replace('/', '_'))
    if os.path.exists(filename):
        return filename
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception('{} not found'.format(url))
    # Do not rely on another cell having created the directory.
    os.makedirs(cache_dir, exist_ok=True)
    # Write to a side file and rename it into place, so an interrupted write
    # can never leave a truncated file that later counts as a cache hit.
    partial = filename + '.part'
    try:
        with open(partial, 'wb') as f:
            f.write(response.content)
        os.replace(partial, filename)
    finally:
        # Only present when the write or rename failed.
        if os.path.exists(partial):
            os.remove(partial)
    return filename

def get_session_pdf_filename(session):
    """Return the cached PDF path for ``session``.

    Looks up ``session['number']`` in ``PDF_URLS`` (a ``KeyError`` propagates
    when the session has no known PDF) and hands the URL to
    ``get_filename_url``.
    """
    return get_filename_url(PDF_URLS[session['number']])


In [6]:
# Load the session list. JSON is UTF-8, so the encoding is given explicitly
# instead of depending on the platform's default locale encoding.
with open('../bundesrat/sessions.json', encoding='utf-8') as f:
    sessions = json.load(f)
len(sessions)

64

In [7]:
def with_next(iterable):
    """Pair every item of ``iterable`` with its successor.

    Returns an iterator of ``(item, next_item)`` tuples; the final item is
    paired with ``None``.
    """
    leading, trailing = itertools.tee(iterable, 2)
    # Move the second copy one step ahead (a no-op for an empty input).
    try:
        next(trailing)
    except StopIteration:
        pass
    return itertools.zip_longest(leading, trailing, fillvalue=None)

In [8]:
def reformat_top_num(top_num):
    """Convert a TOP number to the zero-padded form searched for in the PDF.

    Purely numeric values are left-padded with zeros to three characters
    (``'5'`` -> ``'005'``). Any other value is treated as a number followed
    by a one-character suffix, which is split off with a space
    (``'12a'`` -> ``'012 a'``).

    Args:
        top_num: TOP number as a string; integers are accepted as well.

    Returns:
        The reformatted TOP number as a string.
    """
    # Accept ints too: they used to pass int() and then fail on .zfill.
    top_num = str(top_num)
    try:
        int(top_num)
    except ValueError:
        return '{} {}'.format(top_num[:-1].zfill(3), top_num[-1])
    return top_num.zfill(3)

def get_reformatted_tops(top_nums):
    """Return a list with ``reformat_top_num`` applied to each TOP number, in order."""
    return list(map(reformat_top_num, top_nums))


def _region_below(cutter, anchor, page_heading, page_number, column_two):
    """Select everything from ``anchor`` downwards inside the page body.

    The page body is bounded by the bottom of ``page_heading``, the bottom of
    ``page_number`` and the horizontal position ``column_two``.
    """
    return cutter.all().filter(
        doc_top__gte=anchor.doc_top,
        top__gte=page_heading.bottom,
        bottom__lt=page_number.bottom,
        right__lt=column_two
    )


def get_beschluesse_text(session, filename):
    """Yield the Senate and Bundesrat texts for every normal TOP of a session.

    Args:
        session: dict with a ``'number'`` and a ``'tops'`` list; each top has
            a ``'number'`` and a ``'top_type'``. Only tops whose type is
            ``'normal'`` are processed.
        filename: Path of the session's PDF.

    Yields:
        ``(top_number, {'senat': text, 'bundesrat': text})`` with the TOP
        number exactly as it appears in ``session``.
    """
    cutter = pdfcutter.PDFCutter(filename=filename)
    top_nums = [t['number'] for t in session['tops'] if t['top_type'] == 'normal']
    reformatted_top_nums = get_reformatted_tops(top_nums)

    session_number = session['number']
    page_heading = cutter.filter(search='Ergebnisse der {}. Sitzung des Bundesrates'.format(session_number))[0]

    # The last "1" found on page 1 is used as the page-number marker; its
    # bottom edge limits the body region on every page.
    page_number = list(cutter.filter(search='1', page=1))[-1]
    # Right-hand limit of the extracted region.
    # NOTE(review): presumably the x position where the next column starts — confirm.
    column_two = 705

    for top_num, (current, next_) in zip(top_nums, with_next(reformatted_top_nums)):
        current_top = cutter.filter(auto_regex='^{}$'.format(current))

        # The following TOP's number, if any, closes the current section.
        next_top = None
        if next_ is not None:
            next_top = cutter.filter(auto_regex='^{}$'.format(next_))

        senats = cutter.filter(auto_regex='^Senats-?') | cutter.filter(auto_regex='^Beschluss$')
        senats = senats.below(current_top)
        if next_top:
            senats = senats.above(next_top)

        ergebnis_br = cutter.filter(auto_regex='^Ergebnis BR$').below(current_top)
        if next_top:
            ergebnis_br = ergebnis_br.above(next_top)

        # The Senate text always ends where "Ergebnis BR" begins. This bound
        # used to be applied only when a following TOP existed, so the last
        # TOP's Senate text also contained the Bundesrat result.
        senats_text = _region_below(cutter, senats, page_heading, page_number, column_two)
        senats_text = senats_text.above(ergebnis_br)

        br_text = _region_below(cutter, ergebnis_br, page_heading, page_number, column_two)
        if next_top:
            br_text = br_text.above(next_top)

        # Keep only what stands to the right of the respective label.
        senats_text = senats_text.right_of(senats)
        br_text = br_text.right_of(ergebnis_br)

        yield top_num, {'senat': senats_text.clean_text(), 'bundesrat': br_text.clean_text()}

In [9]:
def get_session(session):
    """Return ``{top_number: texts}`` for ``session``, or ``None`` without a PDF.

    A ``KeyError`` raised while resolving the session's PDF file is treated
    as "no PDF available" and results in ``None``.
    """
    try:
        pdf_path = get_session_pdf_filename(session)
    except KeyError:
        return None
    else:
        return dict(get_beschluesse_text(session, pdf_path))

In [10]:
# Results are kept in a JSON file so an interrupted run can be resumed.
FILENAME = 'session_tops.json'
session_tops = {}
if os.path.exists(FILENAME):
    with open(FILENAME) as f:
        session_tops = json.load(f)

for session in sessions:
    num = session['number']
    print('Session', num)
    # Sessions already present in the results file are not parsed again.
    if str(num) not in session_tops:
        result = get_session(session)
        # None means no PDF was found for this session; nothing is stored.
        if result is not None:
            session_tops[str(num)] = result
            # Save after every session so finished work survives a later failure.
            with open(FILENAME, 'w') as f:
                json.dump(session_tops, f)

print('Total sessions:', len(session_tops))

Session 973
Session 972
Session 971
Session 970
Session 969
Session 968
Session 967
Session 966
Session 965
Session 964
Session 963
Session 962
Session 961
Session 960
Session 959
Session 958
Session 957
Session 956
Session 955
Session 954
Session 953
Session 952
Session 951
Session 950
Session 949
Session 948
Session 947
Session 946
Session 945
Session 944
Session 943
Session 942
Session 941
Session 940
Session 939
Session 938
Session 937
Session 936
Session 935
Session 934
Session 933
Session 932
Session 931
Session 930
Session 929
Session 928
Session 927
Session 926
Session 925
Session 924
Session 923
Session 922
Session 921
Session 920
Session 919
Session 918
Session 917
Session 916
Session 915
Session 914
Session 913
Session 912
Session 911
Session 910
Total sessions: 40
