In [None]:
import requests
from lxml import html

import pandas as pd

In [None]:
# Years to scrape; get_queries() pairs each of them with every commune.
YEARS = [2016]

In [None]:
# Search endpoint: fetched with GET to read the form's option lists, and
# POSTed to once per (year, commune, amount range) query.
BASE_URL = 'http://saturn.etat.lu/mafea/result.do'
# Charset forced onto each result page before its text is inspected and parsed.
ENCODING = 'ISO-8859-1'

In [None]:
# Fetch the search form once to discover the values offered by its
# "commune" and "value" (amount range) drop-downs.
response = requests.get(BASE_URL)
# Abort on an HTTP error: parsing an error page would silently produce empty
# option lists and turn the whole scrape into a no-op.
response.raise_for_status()
# Decode the form with the same charset that is forced onto the result pages,
# so the option values posted back later are decoded consistently.
response.encoding = ENCODING
root = html.fromstring(response.text)
COMMUNES = root.xpath('//select[@name="commune"]/option/@value')
AMOUNTS = root.xpath('//select[@name="value"]/option/@value')

In [None]:
def get_queries():
    """Lazily enumerate every ``(year, commune)`` pair to scrape.

    Years form the outer loop and communes the inner one, so all communes of
    one year are produced before the next year starts.
    """
    yield from (
        (year, commune)
        for year in YEARS
        for commune in COMMUNES
    )


def download_year(year, commune):
    """Yield every recipient record found for one commune in one year.

    The search form is submitted once per entry of ``AMOUNTS``; each result
    page that contains matches is parsed by ``extract_recipients``.

    Args:
        year: Year to query; sent to the form as ``str(year)``.
        commune: Value posted in the form's ``commune`` field.

    Yields:
        The record dicts produced by ``extract_recipients``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Exception: Any error raised while parsing a page is re-raised after
            the failing (year, commune, amount range) has been printed.
    """
    print('Running with', year, commune)
    # The context manager closes the session (and its pooled connections) as
    # soon as the generator is exhausted or closed.
    with requests.Session() as session:
        for amount_range in AMOUNTS:
            response = session.post(BASE_URL, data={
                'year': str(year),
                'fond': '',
                'commune': commune,
                'name': '',
                'value': amount_range,
                'action': 'Rechercher'
            })
            # Fail loudly instead of trying to parse an HTTP error page.
            response.raise_for_status()
            response.encoding = ENCODING
            # The site reports an empty result set with this message.
            if 'Votre recherche ne retourne aucun résultat' in response.text:
                continue
            try:
                yield from extract_recipients(year, commune, amount_range, html.fromstring(response.text))
            except Exception:
                print('Exception at', year, commune, amount_range)
                # Bare raise keeps the original traceback untouched.
                raise
    print('Done with', year, commune)

def extract_recipients(year, commune, amount_range, dom):
    """Yield one record per payment row found in a parsed result page.

    Each ``div.fieldsetWrapper`` is treated as one recipient. The first
    ``tr.results`` row describes the recipient, the rows between the first
    and the last one are individual payments, and the last row is skipped
    (presumably a totals line -- confirm against the live page).

    Args:
        year: Year that was queried; used to build the identifier of
            recipients that are listed by id rather than by name.
        commune: Queried commune (accepted for context, not used here).
        amount_range: Queried amount range (accepted for context, not used).
        dom: Parsed page; any object exposing an lxml-style ``xpath`` method.

    Yields:
        dict: One record per payment row, with the keys ``recipient_id``,
        ``recipient_name``, ``recipient_postcode``, ``recipient_location``,
        ``scheme``, ``amount``, ``currency``, ``country`` and ``year``.
    """
    for div in dom.xpath('.//div[@class="fieldsetWrapper"]'):
        results = div.xpath('.//table[@class="results"]//tr[@class="results"]')
        if not results:
            # A wrapper without result rows has nothing to extract; skip it
            # rather than failing on results[0].
            continue

        name_tds = results[0].xpath('./td/text()')
        if len(name_tds) == 3:
            # Three cells: two name parts followed by the location.
            name = '%s %s' % (name_tds[1], name_tds[0])
            location = name_tds[2]
            recipient_id = 'LU-%s-%s' % (location, name)
        else:
            # Otherwise the row carries an identifier and the location.
            name = None
            location = name_tds[1]
            # Always built from the *queried* year. The row year below lives
            # in its own variable so it cannot leak into the identifier of
            # the next recipient.
            recipient_id = 'LU-%s-%s' % (year, name_tds[0])

        for value_row in results[1:-1]:
            val = value_row.xpath('./td/text()')
            scheme = val[1]
            amount = float(val[2].replace('€', ''))
            row_year = int(val[0])
            yield {
                'recipient_id': recipient_id,
                'recipient_name': name,
                'recipient_postcode': None,
                'recipient_location': location,
                'scheme': scheme,
                'amount': amount,
                'currency': 'EUR',
                'country': 'LU',
                'year': row_year,
            }

In [None]:
# Scrape every (year, commune) query. Records are collected in plain lists
# and turned into a DataFrame once, instead of growing a frame with
# pd.concat on every iteration (quadratic copying).
all_records = []
# Rows grouped by queried year, so each lu_<year>.csv.gz holds only the rows
# fetched for that year rather than everything scraped so far.
records_by_year = {}
for year, commune in get_queries():
    new_records = list(download_year(year, commune))
    all_records.extend(new_records)
    year_records = records_by_year.setdefault(year, [])
    year_records.extend(new_records)
    # Running total of rows scraped so far.
    print(len(all_records))
    # Rewritten after every commune so an interrupted run still leaves a
    # usable partial file for the year in progress.
    pd.DataFrame(year_records).to_csv('lu_%s.csv.gz' % year, compression='gzip', index=False)

# Combined result of all queries, kept under the original name for later cells.
df = pd.DataFrame(all_records)