In [26]:
import re
import glob
import codecs
import io

import requests

import pandas as pd
import numpy as np
from slugify import slugify

from tqdm import tqdm_notebook

from lxml import html

NUM_RE = re.compile(r'\d+')

YEAR = 2016

In [11]:
BASE_URL = 'https://eps.lad.gov.lv/payment_recipients'
QUERY = {
    'commit': 'Meklēt',
    'eps_payment[display_name]': '',
    'eps_payment[district]': '',
    'eps_payment[fund]': 'elf',
    'eps_payment[schema]': '',
    'eps_payment[sum_from]': '',
    'eps_payment[sum_to]': '',
    'eps_payment[tax_payer_number]': '',
    'eps_payment[year]': str(YEAR),
    'eps_payment[year_type]': 'F',
    'format': 'csv',
    'utf8': '✓',
}

In [8]:
response = requests.get('https://eps.lad.gov.lv/payment_recipients')
root = html.fromstring(response.content)
districts = root.xpath('.//select[@name="eps_payment[district]"]/option/@value')[1:]
districts

['Latvija',
 'Aglonas nov.',
 'Aizkraukles nov.',
 'Aizputes nov.',
 'Aknīstes nov.',
 'Alojas nov.',
 'Alsungas nov.',
 'Alūksnes nov.',
 'Amatas nov.',
 'Apes nov.',
 'Auces nov.',
 'Ādažu nov.',
 'Babītes nov.',
 'Baldones nov.',
 'Baltinavas nov.',
 'Balvu nov.',
 'Bauskas nov.',
 'Beverīnas nov.',
 'Brocēnu nov.',
 'Burtnieku nov.',
 'Carnikavas nov.',
 'Cesvaines nov.',
 'Cēsu nov.',
 'Ciblas nov.',
 'Dagdas nov.',
 'Daugavpils',
 'Daugavpils nov.',
 'Dobeles nov.',
 'Dundagas nov.',
 'Durbes nov.',
 'Engures nov.',
 'Ērgļu nov.',
 'Garkalnes nov.',
 'Grobiņas nov.',
 'Gulbenes nov.',
 'Iecavas nov.',
 'Ikšķiles nov.',
 'Ilūkstes nov.',
 'Inčukalna nov.',
 'Jaunjelgavas nov.',
 'Jaunpiebalgas nov.',
 'Jaunpils nov.',
 'Jelgava',
 'Jelgavas nov.',
 'Jēkabpils',
 'Jēkabpils nov.',
 'Jūrmala',
 'Kandavas nov.',
 'Kārsavas nov.',
 'Kocēnu nov.',
 'Kokneses nov.',
 'Krāslavas nov.',
 'Krimuldas nov.',
 'Krustpils nov.',
 'Kuldīgas nov.',
 'Ķeguma nov.',
 'Ķekavas nov.',
 'Lielvārdes n

In [12]:
for district in tqdm_notebook(districts):
    query = dict(QUERY)
    query['eps_payment[district]'] = district
    print('requesting', district)
    response = requests.get(BASE_URL, params=query)
    with open('lv_%s_%s.csv' % (YEAR, district), 'wb') as f:
        f.write(response.content)
        

requesting Latvija
requesting Aglonas nov.
requesting Aizkraukles nov.
requesting Aizputes nov.
requesting Aknīstes nov.
requesting Alojas nov.
requesting Alsungas nov.
requesting Alūksnes nov.
requesting Amatas nov.
requesting Apes nov.
requesting Auces nov.
requesting Ādažu nov.
requesting Babītes nov.
requesting Baldones nov.
requesting Baltinavas nov.
requesting Balvu nov.
requesting Bauskas nov.
requesting Beverīnas nov.
requesting Brocēnu nov.
requesting Burtnieku nov.
requesting Carnikavas nov.
requesting Cesvaines nov.
requesting Cēsu nov.
requesting Ciblas nov.
requesting Dagdas nov.
requesting Daugavpils
requesting Daugavpils nov.
requesting Dobeles nov.
requesting Dundagas nov.
requesting Durbes nov.
requesting Engures nov.
requesting Ērgļu nov.
requesting Garkalnes nov.
requesting Grobiņas nov.
requesting Gulbenes nov.
requesting Iecavas nov.
requesting Ikšķiles nov.
requesting Ilūkstes nov.
requesting Inčukalna nov.
requesting Jaunjelgavas nov.
requesting Jaunpiebalgas nov

In [21]:
!head "lv_2016_Engures nov..csv"

����
 " M a k s j u m u   s a Fm j i " 
 " S a Fm j s " ; " A t b a l s t a   s a Fm j a   a t r a aa n s   v i e t a " ; " S a Fe m t o   m a k s j u m u   p e r i o d s " ; " E L G F   u n   E L F L A ,   E U R " 
 " J u r i s   a r b a r t s " ; " E n g u r e s   n o v . " ; " 2 0 1 5 - 2 0 1 6 " ; " 3 3 3 8 . 6 " 
 " " ; " " ; " P r i e k al a i c +g   p e n s i o n aa n s " ; " 2 4 9 6 . 3 2 " 
 " " ; " " ; " P r i e k al a i c +g   p e n s i o n aa n s " ; " 8 4 2 . 2 8 " 
 " A u s t r a   T o l k a e v a " ; " E n g u r e s   n o v . " ; " 2 0 1 5 - 2 0 1 6 " ; " 5 7 0 . 0 " 
 " " ; " " ; " M a z o   l a u k s a i m n i e k u   a t b a l s t a   s h m a s   m a k s j u m s " ; " 5 0 0 . 0 " 
 " " ; " " ; " K o m p e n s c i j a s   m a k s j u m s   p a r   c i t i e m   a p g a b a l i e m ,   k u r o s   i r   i e v r o j a m i   d a b a s   i e r o b e ~o j u m i " ; " 7 0 . 0 " 
 " B e n i t a   L c e " ; " E

In [47]:
COLUMNS = ['recipient_name',
           'recipient_location',
           'scheme',
           'amount'
]
FILENAMES = glob.glob('lv_%s_*.csv' % YEAR)
df = pd.concat([pd.read_csv(filename, encoding='utf-16', sep=';', skiprows=3, names=COLUMNS) for filename in FILENAMES])
df['recipient_name'] = df['recipient_name'].fillna(method='ffill')
df['recipient_location'] = df['recipient_location'].fillna(method='ffill')

# Remove totals
df = df[df['scheme'] != ('%s-%s' % (YEAR - 1, YEAR))]

df['recipient_postcode'] = None
df['recipient_address'] = None
df['recipient_country'] = 'LV'
df['currency'] = 'EUR'
df['year'] = YEAR
df.head()

Unnamed: 0,recipient_name,recipient_location,scheme,amount,recipient_postcode,recipient_address,recipient_country,currency,year
1,Juris Čarbarts,Engures nov.,Priekšlaicīgā pensionēšanās,2496.32,,,LV,EUR,2016
2,Juris Čarbarts,Engures nov.,Priekšlaicīgā pensionēšanās,842.28,,,LV,EUR,2016
4,Austra Tolkačeva,Engures nov.,Mazo lauksaimnieku atbalsta shēmas maksājums,500.0,,,LV,EUR,2016
5,Austra Tolkačeva,Engures nov.,"Kompensācijas maksājums par citiem apgabaliem,...",70.0,,,LV,EUR,2016
7,Benita Lāce,Engures nov.,Mazo lauksaimnieku atbalsta shēmas maksājums,500.0,,,LV,EUR,2016


In [48]:
df['recipient_id'] = df['recipient_name'].apply(lambda x: x if pd.notnull(x) and NUM_RE.match(x) else np.nan)
df['recipient_id'] = df.apply(lambda x: 'LV-%s-%s' % (x['year'], x['recipient_id']) if pd.notnull(x['recipient_id']) else
                              'LV-%s-%s' % (slugify(
                                      x['recipient_location'] if pd.notnull(x['recipient_location']) else ''
                                  ), slugify(
                                      x['recipient_name'] if pd.notnull(x['recipient_name']) else ''
                                  )
                                ), 1)
df['recipient_name'] = df['recipient_name'].apply(lambda x: np.nan if pd.isnull(x) or NUM_RE.match(x) else x)

In [49]:
df.head()

Unnamed: 0,recipient_name,recipient_location,scheme,amount,recipient_postcode,recipient_address,recipient_country,currency,year,recipient_id
1,Juris Čarbarts,Engures nov.,Priekšlaicīgā pensionēšanās,2496.32,,,LV,EUR,2016,LV-engures-nov-juris-carbarts
2,Juris Čarbarts,Engures nov.,Priekšlaicīgā pensionēšanās,842.28,,,LV,EUR,2016,LV-engures-nov-juris-carbarts
4,Austra Tolkačeva,Engures nov.,Mazo lauksaimnieku atbalsta shēmas maksājums,500.0,,,LV,EUR,2016,LV-engures-nov-austra-tolkaceva
5,Austra Tolkačeva,Engures nov.,"Kompensācijas maksājums par citiem apgabaliem,...",70.0,,,LV,EUR,2016,LV-engures-nov-austra-tolkaceva
7,Benita Lāce,Engures nov.,Mazo lauksaimnieku atbalsta shēmas maksājums,500.0,,,LV,EUR,2016,LV-engures-nov-benita-lace


In [50]:
df.to_csv('lv_%s.csv.gz' % YEAR, index=False, compression='gzip', encoding='utf-8')