In [None]:
import requests
from lxml import html

import pandas as pd

from IPython.core.display import display, HTML


In [None]:
# NOTE(review): `r` is not assigned by any cell above this one, so on a fresh
# kernel (Restart & Run All) this line raises NameError. It looks like a
# debugging leftover that inspected the headers of an earlier request --
# confirm and remove, or move it below a cell that actually defines `r`.
r.request.headers

In [None]:
# Search endpoint; every request made by this notebook targets this URL.
BASE_URL = 'https://portal.nma.lt/nma-portal/pages/fas_search'
# Value sent in the `programos_kodas` form field.
PROGRAM = 'KP13'

# Template for the search form body. get_page() copies it and overrides
# `psl_nr` (page number) and `fin_metai` (year) for each request; the values
# below are only the defaults. All remaining filter fields are sent blank.
QUERY = {
    'pa': 'pl',
    'pTipas': 'p',
    'psl_nr': '1',
    'programos_kodas': PROGRAM,
    'fin_metai': '2015',
    'pareiskejas': '',
    'apskritis': '',
    'savivaldybe': '',
    'priemone': '',
    't_suma': '',
    'k_suma': '',
    'v_suma': '',
    'b_suma': '',
    # Lithuanian "Ie\u0161koti" ("Search"), presumably the submit button's
    # value -- TODO confirm. The literal used to contain the UTF-8 bytes of
    # that word mis-decoded as Latin-1; the escape keeps it independent of the
    # encoding the notebook file is saved in.
    'action': 'Ie\u0161koti',
}

def get_page(session, year=2015, page=1):
    """POST the search form for a single results page and return the response.

    Parameters
    ----------
    session
        Object exposing ``post(url, data=..., headers=...)``; ``get_subsidies``
        passes a ``requests.Session``.
    year
        Stringified into the ``fin_metai`` form field.
    page
        Stringified into the ``psl_nr`` form field.

    Returns
    -------
    Whatever ``session.post`` returns. The status code is not checked here.
    """
    # Build the body from a copy of the shared template so QUERY itself is
    # never mutated between calls.
    form = dict(QUERY, psl_nr=str(page), fin_metai=str(year))
    request_headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://portal.nma.lt/nma-portal/pages/fas_search',
    }
    return session.post(BASE_URL, data=form, headers=request_headers)


In [None]:
class EndOfPagination(Exception):
    """Signal that a fetched results page contains no table to parse."""

# Labels for the three amount columns of a subsidy row, keyed by the column's
# offset (0-2) from the first amount cell; extract_subsidies() appends the
# label to the scheme name.
SUBSIDY_TYPES = dict(enumerate(('EAGF', 'EAGF - other', 'EAFRD')))

def extract_subsidies(text, year):
    """Lazily yield one record per positive amount found in a results page.

    The first ``<table>`` of the parsed document is walked row by row:

    * a row carrying an ``id`` attribute starts a new recipient; its first
      three cells supply the name and the two location parts (with the words
      ``apskritis`` / ``rajonas`` stripped out);
    * any other row is treated as a subsidy line of the most recent recipient:
      cell 3 is the scheme name and cells 4-6 are amounts written with a
      decimal comma, one per entry of ``SUBSIDY_TYPES``.

    Parameters
    ----------
    text
        HTML source of the page.
    year
        Copied verbatim into every yielded record.

    Yields
    ------
    dict
        Keys ``year``, ``recipient_id``, ``recipient_name``,
        ``recipient_location``, ``scheme`` and ``amount`` (a positive float).

    Raises
    ------
    EndOfPagination
        If the document has no table. Because this is a generator, the
        exception surfaces on the first iteration, not at call time.
    """
    tables = html.fromstring(text).xpath('//table[1]')
    if not tables:
        raise EndOfPagination

    recipient = None
    for row in tables[0].xpath('.//tr'):
        cells = row.xpath('./td')
        row_id = row.attrib.get('id')

        if row_id:
            # Recipient header row: remember it for the subsidy rows below.
            name = cells[0].text_content()
            district = cells[2].text_content().replace('rajonas', '').strip()
            county = cells[1].text_content().replace('apskritis', '').strip()
            recipient = {
                'year': year,
                'recipient_id': row_id,
                'recipient_name': name,
                'recipient_location': '%s, %s' % (district, county),
            }
            continue

        # Subsidy row: one scheme name followed by three amount columns.
        scheme_name = cells[3].text_content()
        for column in range(3):
            value = float(cells[4 + column].text_content().replace(',', '.'))
            if value > 0:
                record = dict(recipient)
                record['scheme'] = '%s (%s)' % (scheme_name, SUBSIDY_TYPES[column])
                record['amount'] = value
                yield record




In [None]:
def get_subsidies(year):
    """Yield the subsidies of every results page for ``year``, one list per page.

    Pages are requested in order starting at 1 until ``extract_subsidies``
    signals ``EndOfPagination``, at which point the generator simply finishes.

    Parameters
    ----------
    year
        Forwarded to ``get_page`` and copied into every subsidy record.

    Yields
    ------
    list of dict
        All subsidy records parsed from one results page.
    """
    # The context manager closes the session's connections once the generator
    # is exhausted or closed.
    with requests.Session() as session:
        # Two warm-up requests before the first search. Presumably they set up
        # the server-side session (cookie, then program selection) that the
        # search form relies on -- TODO confirm against the portal.
        session.get(BASE_URL)
        session.post(
            BASE_URL,
            data={'pa': 'sf', 'programos_kodas': PROGRAM},
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': 'https://portal.nma.lt/nma-portal/pages/fas_search',
            },
        )

        page = 1
        while True:
            r = get_page(session, year=year, page=page)
            try:
                # extract_subsidies is a generator, so EndOfPagination is only
                # raised while it is being consumed. Materialise the page here
                # so the stop signal is handled in this function instead of
                # leaking to the caller while this loop keeps requesting pages.
                subsidies = list(extract_subsidies(r.text, year))
            except EndOfPagination:
                return
            yield subsidies
            page += 1

In [None]:
YEAR = 2016

# Collect one DataFrame per results page and concatenate once after the loop,
# instead of re-concatenating and re-writing the whole gzip file on every
# page. If the scrape is interrupted, `page_frames` still holds everything
# fetched so far in the kernel.
page_frames = []
row_count = 0
for page_gen in get_subsidies(YEAR):
    try:
        page_frame = pd.DataFrame(page_gen)
    except EndOfPagination:
        # A lazily evaluated page raises the end-of-results signal while it is
        # being consumed here; stop instead of requesting further pages.
        break
    page_frames.append(page_frame)
    row_count += len(page_frame)
    print(row_count)

# pd.concat() rejects an empty list, so fall back to an empty frame.
df = pd.concat(page_frames) if page_frames else pd.DataFrame()
df.to_csv('lt_%s.csv.gz' % YEAR, compression='gzip')

In [None]:
# Preview the first rows of the scraped subsidies as a sanity check.
df.head()

In [None]:
# Print the kernel's working directory; the CSV file name used when saving
# `df` is relative, so this is where the output file ends up.
!pwd