In [39]:
import os
import glob
import math
from collections import defaultdict

import requests
from lxml import html
from tqdm import tqdm

import pandas as pd
from slugify import slugify

In [2]:
# Years of interest.
# NOTE(review): no cell visible in this notebook reads YEARS (the download
# further down is driven by the single YEAR constant) - confirm whether this
# list is still needed.
YEARS = [2015, 2016]

In [34]:
# Endpoint requested by download_year(); the application page, session
# (instance) id and pagination request are all passed in the `p` query
# parameter.
BASE_URL = 'http://iacs-online.dfz.bg/apex/f'

def _extract_instance_id(url):
    """Return the session (instance) id carried by an ``f?p=...`` URL.

    The ``p`` value is a colon-separated list whose third field is the
    instance id (``p=100:1:<instance>:...``). Reading that field also works
    when the URL has trailing fields such as ``::NO:::``, where "everything
    after the last colon" would be an empty string.
    """
    from urllib.parse import parse_qs, urlsplit

    p_values = parse_qs(urlsplit(url).query).get('p')
    if p_values:
        fields = p_values[0].split(':')
        if len(fields) >= 3 and fields[2]:
            return fields[2]
    # Fall back to the previous heuristic: whatever follows the last colon.
    return url.split(':')[-1]


def _build_search_form(instance_id, year, rows_per_page):
    """Return the form fields that submit the search for ``year``.

    The result maps each field name to the list of values sent for it,
    because ``p_arg_names`` is repeated once per ``p_tNN`` item.
    """
    fields = [
        ('p_flow_id', '100'),
        ('p_flow_step_id', '1'),
        ('p_instance', instance_id),
        ('p_page_submission_id', '440674281878682'),
        ('p_request', 'APPLY_SEARCH'),
        ('p_arg_names', '13020010139748916'),
        ('p_t01', ''),
        ('p_arg_names', '13024311094768143'),
        ('p_t02', ''),
        ('p_arg_names', '13025631311783482'),
        ('p_t03', 'GT'),
        ('p_arg_names', '13027400751793490'),
        ('p_t04', ''),
        ('p_arg_names', '13031001171749357'),
        ('p_t05', 'TOTAL'),
        ('p_arg_names', '13043117504858129'),
        # Original author's note: "10000000 is max count" (presumably the
        # largest page size the form accepts - unverified).
        ('p_t06', str(rows_per_page)),
        ('p_arg_names', '67245119208950917'),
        ('p_t07', str(year)),
        ('p_md5_checksum', ''),
    ]
    form = defaultdict(list)
    for name, value in fields:
        form[name].append(value)
    return form


def download_year(year, rows_per_page=10000):
    """Search the payments report for ``year`` and fetch it page by page.

    This is a generator: nothing is requested until it is iterated.

    Args:
        year: Year submitted in the search form.
        rows_per_page: Number of report rows requested per page.

    Yields:
        For every result page, the generator returned by
        ``get_rows(response, year)``.

    Raises:
        ValueError: If ``rows_per_page`` is not positive, or the result count
            cannot be found in the search response.
        requests.HTTPError: If any request returns an error status.
    """
    if rows_per_page < 1:
        raise ValueError('rows_per_page must be a positive integer')

    print('Initial %d' % year)
    session = requests.Session()
    # The first request only serves to obtain a fresh instance id from the
    # URL the server ends up at.
    response = session.get(BASE_URL, params={'p': '100:1:2519959547410070'})
    response.raise_for_status()
    instance_id = _extract_instance_id(response.url)
    print(instance_id)

    print('Searching...')
    response = session.post(
        'http://iacs-online.dfz.bg/apex/wwv_flow.accept',
        data=_build_search_form(instance_id, year, rows_per_page),
    )
    response.raise_for_status()

    root = html.fromstring(response.content)
    counts = root.xpath('//span[@class="fielddata"]/text()')
    if not counts:
        raise ValueError(
            'Result count not found in the search response for year %s' % year)
    # The total row count is the last whitespace-separated token of the text.
    result_rows = int(counts[0].split()[-1])
    max_pages = math.ceil(result_rows / rows_per_page)
    print(result_rows, max_pages)

    # An Excel export of the same report also appears to exist (request
    # FLOW_EXCEL_OUTPUT_R7009528473046037_bg), but the HTML report is paged
    # through instead.
    print('Downloading...')

    for page in tqdm(range(1, max_pages + 1)):
        page_data = {
            'p': '100:1:{instance_id}:pg_R_7009528473046037:NO'.format(instance_id=instance_id),
            'pg_min_row': str((page - 1) * rows_per_page + 1),
            'pg_max_rows': str(rows_per_page),
            'pg_rows_fetched': str(rows_per_page),
        }
        response = session.get(BASE_URL, params=page_data)
        response.raise_for_status()
        yield get_rows(response, year)


def get_rows(response, year):
    """Yield one payment record per non-zero fund amount on a result page.

    Args:
        response: HTTP response whose ``content`` is the HTML of one report
            page containing a ``<table class="t3standard">``.
        year: Value stored in the ``year`` field of every record.

    Yields:
        dict: ``amount``, ``year``, ``currency`` and ``scheme`` followed by
        the recipient fields (``recipient_id``, ``recipient_name``,
        ``recipient_location``, ``recipient_postcode``,
        ``recipient_address``, ``country``).
    """
    # Column names, in the order the cell texts are paired with them.
    columns = (
        'recipient_name', 'district', 'municipality',
        'EAGF DP', 'EAGF', 'EAFRD - ND',
        'Public storage', 'Total', 'Measure', 'Description',
    )
    # Only these amount columns produce output records.
    fund_columns = ('EAGF DP', 'EAGF', 'EAFRD - ND')

    document = html.fromstring(response.content)
    report = document.xpath('//table[@class="t3standard"]')[0]
    # The first and last <tr> are dropped (presumably a header row and a
    # footer/pagination row - TODO confirm against the page markup).
    for tr in report.xpath('./tr')[1:-1]:
        record = dict(zip(columns, tr.xpath('.//td/text()')))
        name = record['recipient_name']
        municipality = record['municipality']
        district = record['district']
        recipient = {
            'recipient_id': slugify('BG-%s-%s-%s' % (name, municipality, district)),
            'recipient_name': name,
            'recipient_location': '%s, %s' % (municipality, district),
            'recipient_postcode': None,
            'recipient_address': None,
            'country': 'BG',
        }
        for fund in fund_columns:
            # Amounts use ',' as a thousands separator.
            value = float(record[fund].replace(',', ''))
            if value != 0.0:
                payment = {
                    'amount': value,
                    'year': year,
                    'currency': 'BGN',
                    'scheme': '%s - %s - %s' % (
                        fund,
                        record['Measure'].strip(),
                        record['Description'].strip(),
                    ),
                }
                payment.update(recipient)
                yield payment


In [30]:
# Make sure the directory that get_year() writes its per-page CSV files into
# exists. Pure Python rather than a shell escape, so it also works on systems
# without `mkdir -p` and when the notebook is exported to a script.
os.makedirs('data', exist_ok=True)

In [44]:
# Year that get_year(YEAR) downloads and that the later cells load back from
# data/ and write out as a single file.
YEAR = 2017

def get_year(year, output_dir='data'):
    """Download every result page for ``year`` and save each as a gzipped CSV.

    Page ``i`` (0-based, in download order) is written to
    ``<output_dir>/BG_<year>_<i>.csv.gz`` without the DataFrame index.

    Args:
        year: Year passed on to ``download_year``.
        output_dir: Directory for the per-page files; created if missing.
    """
    # Create the target directory here so the function does not depend on a
    # separate cell having been run first.
    os.makedirs(output_dir, exist_ok=True)
    for i, row_generator in enumerate(download_year(year)):
        page_df = pd.DataFrame(row_generator)
        page_path = '{}/BG_{}_{}.csv.gz'.format(output_dir, year, i)
        page_df.to_csv(page_path, index=False, compression='gzip')

In [38]:
# Download and save every result page for YEAR
# (writes data/BG_<YEAR>_<page>.csv.gz, one file per page).
# Slow: in the recorded run below each page took roughly 16-30 s.
get_year(YEAR)

Initial 2017
3391091644796414
Searching...




  0%|          | 0/37 [00:00<?, ?it/s][A[A

365629 37
Downloading...




  3%|▎         | 1/37 [00:16<09:38, 16.06s/it][A[A

  5%|▌         | 2/37 [00:32<09:21, 16.04s/it][A[A

  8%|▊         | 3/37 [00:56<10:44, 18.96s/it][A[A

 11%|█         | 4/37 [01:14<10:16, 18.68s/it][A[A

 14%|█▎        | 5/37 [01:30<09:41, 18.18s/it][A[A

 16%|█▌        | 6/37 [01:46<09:08, 17.68s/it][A[A

 19%|█▉        | 7/37 [02:21<10:05, 20.18s/it][A[A

 22%|██▏       | 8/37 [02:37<09:32, 19.73s/it][A[A

 24%|██▍       | 9/37 [02:53<09:00, 19.31s/it][A[A

 27%|██▋       | 10/37 [03:09<08:31, 18.93s/it][A[A

 30%|██▉       | 11/37 [03:24<08:03, 18.61s/it][A[A

 32%|███▏      | 12/37 [03:40<07:40, 18.41s/it][A[A

 35%|███▌      | 13/37 [03:57<07:17, 18.25s/it][A[A

 38%|███▊      | 14/37 [05:37<09:14, 24.09s/it][A[A

 41%|████      | 15/37 [06:21<09:19, 25.45s/it][A[A

 43%|████▎     | 16/37 [07:06<09:19, 26.64s/it][A[A

 46%|████▌     | 17/37 [08:29<09:59, 30.00s/it][A[A

 49%|████▊     | 18/37 [09:12<09:43, 30.70s/it][A[A

 51%|█████▏    | 

In [45]:
# Load the per-page files written by get_year() for YEAR.
# glob returns paths in arbitrary order, so sort them to make the combined
# frame reproducible. Page numbers are not zero-padded, so sorting by
# (length, name) puts ..._2 before ..._10 and restores download order.
page_files = sorted(
    glob.glob('data/BG_{}_*.csv.gz'.format(YEAR)),
    key=lambda fn: (len(fn), fn),
)
assert page_files, 'no data/BG_%s_*.csv.gz files found - run get_year(YEAR) first' % YEAR

# ignore_index gives the combined frame one continuous index instead of
# repeating each page's 0..n-1 labels.
df = pd.concat(
    [pd.read_csv(fn, compression='gzip') for fn in page_files],
    ignore_index=True,
)
df.head()

Unnamed: 0,amount,country,currency,recipient_address,recipient_id,recipient_location,recipient_name,recipient_postcode,scheme,year
0,356.49,BG,BGN,,bg-mekhmed-ademov-omurtag-trgovishche,"Омуртаг, Търговище",МЕХМЕД АДЕМОВ,,EAGF DP - СПП - Схема за преразпределително пл...,2017
1,507.53,BG,BGN,,bg-mekhmed-ademov-omurtag-trgovishche,"Омуртаг, Търговище",МЕХМЕД АДЕМОВ,,EAGF DP - ДПЖ - Схема за обвързано подпомагане...,2017
2,305.33,BG,BGN,,bg-mekhmed-ademov-omurtag-trgovishche,"Омуртаг, Търговище",МЕХМЕД АДЕМОВ,,EAGF DP - Схема за плащане за селскостопански ...,2017
3,1370.33,BG,BGN,,bg-mekhmed-adil-alfatar-silistra,"Алфатар, Силистра",МЕХМЕД АДИЛ,,EAGF DP - ДПЖ - Схема за обвързано подпомагане...,2017
4,162.0,BG,BGN,,bg-mekhmed-adil-alfatar-silistra,"Алфатар, Силистра",МЕХМЕД АДИЛ,,"EAFRD - ND - Държавна помощ de minimis, съглас...",2017


In [46]:
# Write the combined frame for YEAR to a single gzipped, UTF-8 encoded CSV
# (bg_<YEAR>.csv.gz) in the working directory, without the index column.
df.to_csv('bg_%s.csv.gz' % YEAR, index=False, encoding='utf-8', compression='gzip')