In [1]:
import json
import os
import glob

import requests
import pandas as pd

In [2]:
# Budget year to download; must be one of the keys of BUDGET_YEARS.
YEAR = 2021
# Number of records requested per page of the export endpoint.
LIMIT = 1000
# Calendar year -> value of the matching option in the site's budget-year
# select dropdown (the endpoint expects this value, not the year itself).
BUDGET_YEARS = {2018: 54, 2019: 289, 2020: 291, 2021: 292}
# Paged export URL. The budget year is filled in here; `{limit}` and `{offset}`
# stay as str.format placeholders to be filled in per request.
BASE_URL = (
    'https://www.belpa.be/wsExportDataTable'
    '?limit={limit}&offset={offset}&lg=fr'
    '&budget_year=' + str(BUDGET_YEARS[YEAR]) +
    '&sort=none&&sortType=ASC&'
)

In [3]:
# Create the download directory from Python instead of a shell escape, so the
# cell does not depend on a POSIX `mkdir`; exist_ok=True keeps re-runs harmless.
os.makedirs('data', exist_ok=True)

In [4]:
# Download the export for YEAR page by page into data/be_raw_<YEAR>_<offset>.json.
# Pages that already exist on disk are skipped, so an interrupted run can be
# resumed by simply re-running this cell.
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests.get may block forever

offset = 0
while True:
    filename = 'data/be_raw_%s_%s.json' % (YEAR, offset)
    if os.path.exists(filename):
        offset += LIMIT
        continue
    print(offset)
    response = requests.get(
        BASE_URL.format(limit=LIMIT, offset=offset),
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    # The records of the page are stored as the first element of data['data'].
    pages = data['data']
    records = pages[0] if pages else []
    if not records:
        # Nothing at this offset (e.g. the cell was re-run after a complete
        # download and we stepped past the last page): do not write an empty
        # file, just stop.
        break
    with open(filename, 'w') as f:
        json.dump(records, f)
    # 'nextAvalaible' (sic) is the key name used by the API response.
    if not data['pager']['nextAvalaible']:
        break
    offset += LIMIT

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000


In [5]:
def parse_data(data):
    """Yield one flat row per scheme amount found in a single raw record.

    Args:
        data: one record of the raw export. The keys read here are
            ``data['amount']`` (mapping of amount key -> value),
            ``data['organisation']`` (``label``, ``code_postal``, ``state``,
            ``id``) and ``data['year']``.

    Yields:
        dict: a row with the recipient fields, the year, the scheme (the
        amount key with ``'field_mnt_'`` removed), the amount as ``float``,
        and the constant currency ``'EUR'`` and country ``'BE'``.
    """
    organisation = data['organisation']
    year = int(data['year'])
    # Build the id from the record's own year rather than a notebook-level
    # constant, so rows stay consistent with their 'year' column whichever
    # year's files are being parsed.
    recipient_id = 'BE-{}-{}'.format(year, organisation['id'])
    for amount_key, amount in data['amount'].items():
        if amount_key.endswith(('_total', '_feaga', '_feader')):
            # Skip aggregate entries: per the original note, the individual
            # sub-amounts are present under their own keys.
            continue
        yield {
            'recipient_name': organisation['label'],
            'recipient_postcode': organisation['code_postal'],
            'recipient_location': organisation['state'],
            'year': year,
            'scheme': amount_key.replace('field_mnt_', ''),
            'amount': float(amount),
            'currency': 'EUR',
            'country': 'BE',
            'recipient_id': recipient_id,
        }

def get_data(year):
    """Yield the parsed rows of every downloaded raw page for ``year``.

    Reads each file matching ``data/be_raw_<year>_*``, which is expected to
    hold a JSON list of raw records, and passes every record to
    ``parse_data``.

    Args:
        year: the year embedded in the raw file names.

    Yields:
        dict: the rows produced by ``parse_data``.
    """
    # glob.glob returns paths in arbitrary order; sort them so the resulting
    # row order is the same on every run and on every machine.
    for filename in sorted(glob.glob('data/be_raw_{}_*'.format(year))):
        with open(filename) as f:
            for record in json.load(f):
                yield from parse_data(record)
            
# Flatten every downloaded page for YEAR into a single frame, then preview it.
df = pd.DataFrame(data=get_data(YEAR))
df.head(n=5)

Unnamed: 0,recipient_name,recipient_postcode,recipient_location,year,scheme,amount,currency,country,recipient_id
0,BLAISE FERNAND,BE-4800,Verviers,2021,ii1,1880.73,EUR,BE,BE-2021-7996
1,BLAISE FERNAND,BE-4800,Verviers,2021,ii3,3364.43,EUR,BE,BE-2021-7996
2,BLAISE FERNAND,BE-4800,Verviers,2021,ii4,1741.11,EUR,BE,BE-2021-7996
3,BLAISE FERNAND,BE-4800,Verviers,2021,ii7,3079.15,EUR,BE,BE-2021-7996
4,BLAISE FERNAND,BE-4800,Verviers,2021,ii10,121.15,EUR,BE,BE-2021-7996


In [6]:
# Number of rows per scheme, most frequent first.
df.loc[:, 'scheme'].value_counts()

ii4      33446
ii1      33438
ii10     28732
ii3      13186
iva15    11530
ii7       9050
iva18     5366
iva17     2998
iva16     2177
ii6       1820
iva4      1781
iii3      1252
i1        1050
iva6       438
vb1_6      340
iva24      187
iva9       121
vb1_2       91
iva7        87
iva21       82
iva1        30
iva12       25
vb2_4       20
iva25       20
iva2        16
iva10       11
iii4         9
i4           6
iii2         5
iii7         4
iva14        4
vb3_1        4
iii10        3
i7           2
via_1        1
iva8         1
Name: scheme, dtype: int64

In [7]:
# Save the rows as a gzip-compressed UTF-8 CSV named after the year,
# without the frame's index column.
df.to_csv(
    'be_{}.csv.gz'.format(YEAR),
    compression='gzip',
    encoding='utf-8',
    index=False,
)