In [12]:
import requests
from lxml import html

import pandas as pd

from slugify import slugify

from IPython.core.display import display, HTML


In [21]:
YEAR = 2020

In [22]:
BASE_URL = 'https://portal.nma.lt/nma-portal/pages/fas_search'
PROGRAM = 'KP13'

QUERY = {
    'pa': 'pl',
    'pTipas': 'p',
    'psl_nr': '1',
    'programos_kodas': PROGRAM,
    'fin_metai': str(YEAR),
    'pareiskejas': '',
    'apskritis': '',
    'savivaldybe': '',
    'priemone': '',
    't_suma': '',
    'k_suma': '',
    'v_suma': '',
    'b_suma': '',
    'action': 'Ieškoti',
}

def get_page(session, year=YEAR, page=1):
    query = dict(QUERY)
    query.update({
        'psl_nr': str(page),
        'fin_metai': str(year)
    })
    response = session.post(BASE_URL, data=query, headers={
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://portal.nma.lt/nma-portal/pages/fas_search',
    })
    return response


In [5]:
class EndOfPagination(Exception):
    pass

SUBSIDY_TYPES = {
    0: 'EAGF',
    1: 'EAGF - other',
    2: 'EAFRD',
}

def extract_subsidies(text, year):
    root = html.fromstring(text)
    table = root.xpath('//table[1]')
    if not table:
        raise EndOfPagination
    table = table[0]
    trs = table.xpath('.//tr')
    base = None
    for tr in trs:
        tds = tr.xpath('./td')
        if tr.attrib.get('id'):
            base = {
                'year': year,
                'country': 'LT',
                'currency': 'EUR',
#                 'recipient_id': tr.attrib.get('id'), # bad not id
                'recipient_name': tds[0].text_content(),
                'recipient_location': '%s, %s' % (
                    tds[2].text_content().replace('rajonas', '').strip(),
                    tds[1].text_content().replace('apskritis', '').strip(),
                )
            }
        else:
            subsidy_name = tds[3].text_content()
            for i in range(3):
                amount = float(tds[4 + i].text_content().replace(',', '.'))
                if amount > 0:
                    subsidy = dict(base)
                    subsidy.update({
                        'scheme': '%s (%s)' % (subsidy_name, SUBSIDY_TYPES[i]),
                        'amount': amount
                    })
                    yield subsidy


In [23]:
def get_subsidies(year):
    session = requests.Session()
    response = session.get(BASE_URL)
    # display(HTML(response.text))
    response = session.post(BASE_URL, data={'pa': 'sf', 'programos_kodas': PROGRAM}, headers={
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://portal.nma.lt/nma-portal/pages/fas_search'
    })
    # print(response.request.headers)
    # display(HTML(response.text))
    page = 1
    while True:
        r = get_page(session, year=year, page=page)
        yield extract_subsidies(r.text, year)
        page += 1

In [None]:
df = pd.DataFrame()
for page_gen in get_subsidies(YEAR):
    try:
        df = pd.concat([df, pd.DataFrame(page_gen)])
    except EndOfPagination:
        pass
    df.to_csv('lt_%s.csv.gz' % YEAR, compression='gzip', index=False)
    print(len(df))

496
970
1466
2027
2528
3090
3633
4193
4698
5200
5716
6205
6741
7283
7829
8321
8799
9286
9788
10247
10731
11300
11834
12342
12846
13390
13934
14445
14970
15510
16055
16587
17109
17630
18162
18751
19340
19907
20453
21004
21550
22138
22661
23187
23687
24224
24747
25328
25912
26498
27055
27618
28174
28682
29300
29919
30540
31126
31781
32282
32736
33241
33710
34276
34794
35324
35805
36357
36875
37415
37959
38535
39087
39656
40160
40693
41249
41745
42271
42859
43466
44071
44639
45273
45864
46478
47080
47634
48209
48775
49335
49880
50471
51027
51326
51861
52372
52945
53492
54070
54637
55138
55699
56246
56837
57425
57969
58527
59036
59541
60003
60517
61026
61556
62078
62539
63065
63564
64054
64559
65070
65595
66121
66616
67166
67718
68302
68904
69425
69986
70557
71112
71660
72177
72740
73329
73871
74432
74994
75512
76050
76576
77116
77610
78129
78642
79122
79592
80119
80666
81248
81843
82461
83031
83635
84256
84818
85407
85988
86488
87036
87683
88275
88883
89393
89941
90470
90984
91567
92136
9

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842
587842

In [25]:
df["recipient_name"].nunique()

132659

In [26]:
df['recipient_id'] = df.apply(lambda x: "LT-{}-{}".format(slugify(x['recipient_name']), slugify(x['recipient_location'])), 1)

In [27]:
df.head()

Unnamed: 0,year,country,currency,recipient_id,recipient_name,recipient_location,scheme,amount
0,2020,LT,EUR,LT-a-vaiciukynienes-individuali-imone-jurbarko...,A. Vaičiukynienės individuali įmonė,"Jurbarko, Tauragės",Žalinimo išmoka (EAGF),1373.08
1,2020,LT,EUR,LT-a-vaiciukynienes-individuali-imone-jurbarko...,A. Vaičiukynienės individuali įmonė,"Jurbarko, Tauragės",Finansų disciplinos kompensavimo išmoka (EAGF),37.54
2,2020,LT,EUR,LT-a-vaiciukynienes-individuali-imone-jurbarko...,A. Vaičiukynienės individuali įmonė,"Jurbarko, Tauragės",Susietoji parama už baltyminius augalus (EAGF),87.1
3,2020,LT,EUR,LT-a-vaiciukynienes-individuali-imone-jurbarko...,A. Vaičiukynienės individuali įmonė,"Jurbarko, Tauragės",Tiesioginės išmokos už pasėlius (EAGF),1690.05
4,2020,LT,EUR,LT-a-vaiciukynienes-individuali-imone-jurbarko...,A. Vaičiukynienės individuali įmonė,"Jurbarko, Tauragės",Išmoka už pirmuosius hektarus (EAGF),1591.48


In [19]:
df.to_csv('lt_%s.csv.gz' % YEAR, compression='gzip', index=False)