In [1]:
import os
from collections import defaultdict

import requests

import pandas as pd

In [2]:
YEARS = [2015, 2016]

In [6]:
def download_year(year):
#     import pdb; pdb.set_trace()
    print('Initial %d' % year)
    response = requests.get('http://iacs-online.dfz.bg/apex/f?p=100:1:2519959547410070')
    instance_id = response.url.split(':')[-1]

    max_rows = 10000000

    # 10000000 is max count
    data = '''p_flow_id:100
    p_flow_step_id:1
    p_instance:{instance_id}
    p_page_submission_id:440674281878682
    p_request:APPLY_SEARCH
    p_arg_names:13020010139748916
    p_t01:
    p_arg_names:13024311094768143
    p_t02:
    p_arg_names:13025631311783482
    p_t03:GT
    p_arg_names:13027400751793490
    p_t04:
    p_arg_names:13031001171749357
    p_t05:TOTAL
    p_arg_names:13043117504858129
    p_t06:{max_rows}
    p_arg_names:67245119208950917
    p_t07:{year}
    p_md5_checksum:'''.format(instance_id=instance_id, year=year, max_rows=max_rows).splitlines()
    data = [[k.strip() for k in x.split(':')] for x in data]
    post_data = defaultdict(list)
    for k, v in data:
        post_data[k].append(v)
#     import pdb; pdb.set_trace()
    print('Searching...')
    response = requests.post('http://iacs-online.dfz.bg/apex/wwv_flow.accept', data=post_data)

    # http://iacs-online.dfz.bg/apex/f?p=100:1:897845325062580::NO:::
    # http://iacs-online.dfz.bg/apex/f?p=100:1:897845325062580:FLOW_EXCEL_OUTPUT_R7009528473046037_bg

    args = response.url.split('=')[-1]
    args = args.replace('::NO:::', '')
    args += ':FLOW_EXCEL_OUTPUT_R7009528473046037_bg'
    url = 'http://iacs-online.dfz.bg/apex/f?p=' + args
    print('Downloading...')
    response = requests.get(url)
    return response.content


for year in YEARS:
    filename = 'data_%d_raw.csv' % year
    if not os.path.exists(filename):
        content = download_year(year)
        with open(filename, 'wb') as f:
            f.write(content)

Initial 2015
Searching...
Downloading...
Initial 2016
Searching...
Downloading...


In [3]:
def get_year_dfs(years):
    for year in years:
        df = pd.read_csv(f'data_{year}_raw.csv', delimiter=';', encoding='windows-1251')
        df['year'] = year
        yield df

df_raw = pd.concat(get_year_dfs(YEARS))
df_raw.head()

Unnamed: 0,"<!DOCTYPE HTML PUBLIC ""-//IETF//DTD HTML 2.0//EN"">",year
0,<html><head>,2015
1,<title>500 Internal Server Error</title>,2015
2,</head><body>,2015
3,<h1>Internal Server Error</h1>,2015
4,<p>The server encountered an internal error or,2015


In [4]:
original = u'Бенефициент	Област	Община	ЕФГЗ - ДП	ЕФГЗ	ЕЗФРСР - НБ	Публично складиране	Общо'.split('\t')
translation = u'Beneficiary\tDistrict\tMunicipality\tEAGF - DP\tEAGF\tEAFRD - NB\tPublic storage\tTotal'.split('\t')
df_raw = df_raw.rename(columns=dict(zip(original, translation)))
df_raw = df_raw.drop(['Unnamed: 8', 'Public storage'])
df_raw.head()

Unnamed: 0,"<!DOCTYPE HTML PUBLIC ""-//IETF//DTD HTML 2.0//EN"">",year
0,<html><head>,2015
1,<title>500 Internal Server Error</title>,2015
2,</head><body>,2015
3,<h1>Internal Server Error</h1>,2015
4,<p>The server encountered an internal error or,2015


In [5]:
id_vars = 'Beneficiary	District	Municipality'.split('\t')
value_vars = 'EAGF - DP	EAGF	EAFRD - NB'.split('\t')

df = pd.melt(df_raw, id_vars=id_vars, value_vars=value_vars, value_name='amount', var_name='scheme')
df.head()

KeyError: "None of [['Beneficiary', 'District', 'Municipality', 'EAGF - DP', 'EAGF', 'EAFRD - NB']] are in the [columns]"

In [None]:
df['amount'] = pd.to_numeric(df['amount'].str.replace(',', ''), errors='coerce')
df.head()

In [None]:
from slugify import slugify
df = df.rename(columns={
    'Beneficiary': 'recipient_name',
})

df['recipient_location'] = df.apply(lambda row: u'%s, %s' % (row['Municipality'], row['District']), 1)
df['recipient_id'] = df.apply(lambda x: 'BG-%s-%s' % (slugify(x['Municipality']), slugify(x['recipient_name'])), 1)
df['country'] = 'BG'
df['currency'] = 'BGN'
df.head()

In [None]:
df = df.drop(['District', 'Municipality'], 1)
df.to_csv('bg_%s.csv.gz' % '_'.join(YEARS), index=False, encoding='utf-8', compression='gzip')