In [4]:
import re
import math
import os

import requests

from multidict import MultiDict
from lxml import html

from tqdm import tqdm

from slugify import slugify

import pandas as pd

from IPython.core.display import display, HTML

In [5]:
# Accounting year requested from the search form; also stamped on every scraped record.
YEAR = 2018

# Name of the form field whose value is copied from a parsed page into later requests.
jvs = 'javax.faces.ViewState'

# Address of the search page; every GET and POST in this notebook targets it.
BASE_URL = 'https://etjanst.sjv.se/asken/faces/jbstod/searchJbstod.jsp'

# Search-form fields forced onto every parsed form before it is submitted again.
# NOTE(review): '-1' presumably means "no filter" for these three selectors -- confirm.
search_config = dict([
    ('searchForm:searchButton', 'Search'),
    ('searchForm:searchLan', '-1'),
    ('searchForm:searchKommun', '-1'),
    ('searchForm:searchPostort', '-1'),
])

In [12]:
def get_form_data_from_form(form):
    """Collect the name/value pairs found in a parsed HTML form.

    Every named ``input``, ``select``, ``textarea`` and ``button`` below
    ``form`` is visited in document order:

    * radio buttons and checkboxes contribute only when they carry a
      ``checked`` attribute (presence counts, whatever its value);
    * every other ``input`` contributes its ``value`` (``''`` if absent);
    * a ``select`` contributes the value of its first ``selected`` option,
      or ``''`` when no option is selected;
    * a ``textarea`` contributes its text (``''`` if empty);
    * a ``button`` contributes only when its ``type`` is ``submit``.

    Args:
        form: element exposing ``xpath()``, ``tag``, ``attrib`` and ``text``
            (an lxml HTML element in this notebook).

    Returns:
        MultiDict with one entry per contributing field; repeated names are
        kept as separate entries.
    """
    form_data = MultiDict()
    fields = form.xpath('.//input[@name]|.//select[@name]|.//textarea[@name]|.//button[@name]')
    for el in fields:
        name = el.attrib['name']
        data = {}
        if el.tag == 'input':
            if el.attrib.get('type') in ('radio', 'checkbox'):
                # Test presence, not truthiness: a bare ``checked`` attribute has
                # an empty value but still means "checked" (same rule as the
                # ``option[@selected]`` lookup below).
                if 'checked' in el.attrib:
                    data[name] = el.attrib.get('value', '')
            else:
                data[name] = el.attrib.get('value', '')
        elif el.tag == 'select':
            options = el.xpath('./option[@selected]')
            if options:
                data[name] = options[0].attrib.get('value', '')
            else:
                data[name] = ''
        elif el.tag == 'textarea':
            # Was ``el.sttrib`` -- an AttributeError for every textarea.
            data[name] = el.text or ''
        elif el.tag == 'button':
            if el.attrib.get('type', None) == 'submit':
                data[name] = el.attrib.get('value', '')
        form_data.extend(data)
    return form_data

In [29]:
# Raw string: in a plain literal ``'\s'`` is an invalid escape sequence, which
# Python reports with a warning at compile time.
WS_RE = re.compile(r'\s')


def remove_ws(s):
    """Return ``s`` with every whitespace character removed.

    Used in this notebook to clean numeric text (result counts, amounts)
    before it is passed to ``int()`` / ``float()``.
    """
    return WS_RE.sub('', s)

def get_initial_form_data(session):
    """Download the search page and return its form data, ready to submit.

    Args:
        session: object with a ``get(url)`` method returning a response that
            has a ``content`` attribute (a ``requests.Session`` here).

    Returns:
        Whatever ``get_form_data`` builds from the parsed page.
    """
    landing_page = html.fromstring(session.get(BASE_URL).content)
    return get_form_data(landing_page)

def get_form_data(root):
    """Build the submission data for the first form found in ``root``.

    The first ``<form>`` is converted with ``get_form_data_from_form``, the
    fixed ``search_config`` values are written over it, and every submit
    input whose name does not contain ``searchButton`` is dropped so that only
    the search button is sent.

    Args:
        root: parsed document exposing ``xpath()``.

    Returns:
        The mapping produced by ``get_form_data_from_form``, updated in place.

    Raises:
        IndexError: if ``root`` contains no ``<form>``.
    """
    form_data = get_form_data_from_form(root.xpath('//form')[0])
    submit_buttons = root.xpath('//form//input[@type="submit"]')
    # Submit inputs are looked up across *all* forms, so some of them may be
    # unnamed or may not belong to the first form; skip / tolerate those
    # instead of failing with KeyError.
    button_names = [el.attrib.get('name') for el in submit_buttons]
    clear_names = [
        name for name in button_names
        if name is not None and 'searchButton' not in name
    ]
    form_data.update(search_config)
    for name in clear_names:
        form_data.pop(name, None)  # remove clear button
    return form_data

def make_ajax_request(session, form_data, data):
    """POST ``form_data`` overlaid with ``data`` and return the parsed reply.

    Args:
        session: object with a ``post(url, data=...)`` method.
        form_data: base fields; copied into a plain ``dict`` first, so the
            caller's object is not modified.
        data: fields written on top of ``form_data``.

    Returns:
        The response body, decoded as UTF-8 and parsed with ``html.fromstring``.
    """
    payload = dict(form_data)
    payload.update(data)
    reply = session.post(BASE_URL, data=payload)
    markup = reply.content.decode('utf-8')
    return html.fromstring(markup)

def get_detail(session, item, state):
    """Request the expanded view of one result item.

    The ``name`` of the first ``<a>`` inside ``item`` is sent as both key and
    value, together with the fixed AJAX marker fields; ``state`` is layered on
    top by ``make_ajax_request``.

    Returns:
        The parsed response returned by ``make_ajax_request``.
    """
    anchor_name = item.xpath('.//a/@name')[0]
    payload = {
        'AJAXREQUEST': 'j_id0',
        'searchResultsForm_SUBMIT': '1',
        anchor_name: anchor_name,
    }
    return make_ajax_request(session, payload, state)

def extract_detail(item):
    """Yield one payment record per ``stodRow`` found inside ``item``.

    Recipient fields (name, county, municipality, postal address) are read
    once from ``item``. The postal address is split at its first space into
    postcode and city; when it contains no space the postcode is ``None`` and
    the whole address goes into the location string instead.

    Yields:
        dict with ``amount``, ``year``, ``currency``, ``scheme`` followed by
        the shared recipient fields.

    Raises:
        IndexError: if any expected ``div`` has no text node.
    """
    def first(node, query):
        # First match of an XPath query on ``node``.
        return node.xpath(query)[0]

    name = first(item, './/div[@class="stodmottagareColumn namn"]/text()')
    lan = first(item, './/div[@class="stodmottagareColumn lan"]/text()')
    kommun = first(item, './/div[@class="stodmottagareColumn kommun"]/text()')
    postadress = first(item, './/div[@class="stodmottagareColumn postadress"]/text()')

    address_parts = postadress.split(' ', 1)
    if len(address_parts) == 2:
        postcode, city = address_parts
        location = '%s, %s, %s' % (city, kommun, lan)
    else:
        postcode = None
        location = '%s, %s, %s' % (kommun, lan, postadress)

    recipient = {
        'recipient_name': name,
        'recipient_location': location,
        'recipient_postcode': postcode,
        'recipient_id': slugify('SE-%s-%s' % (name, postadress)),
        'country': 'SE'
    }

    for row in item.xpath('.//div[@class="stodRow"]'):
        fund_type = first(row, './/div[@class="stodItem fondtyp"]/text()')
        kategori = first(row, './/div[@class="stodItem kategori"]/text()')
        amount_text = first(row, './/div[@class="stodItem belopp"]/text()')
        yield {
            'amount': float(remove_ws(amount_text)),
            'year': YEAR,
            'currency': 'SEK',
            'scheme': '%s: %s' % (fund_type, kategori),
            **recipient,
        }
        
def load_details(session, root, state):
    """Yield the payment records for every item listed in ``root``.

    For each listed item a detail request is made; the item at the same
    position in that response is then handed to ``extract_detail``.

    Raises:
        IndexError: if a detail response lists fewer items than the position
            being looked up.
    """
    listed = get_items(root)
    for position, entry in enumerate(listed):
        detail_page = get_detail(session, entry, state)
        expanded = get_items(detail_page)[position]
        yield from extract_detail(expanded)
    
def get_page(session, page, state):
    """Request result page number ``page`` and return the parsed response.

    The page number is sent as a string in the
    ``searchResultsForm:j_id_jsp_121545192_117`` field; ``state`` is layered
    on top of the payload by ``make_ajax_request``.
    """
    payload = {'AJAXREQUEST': 'j_id0', 'searchResultsForm_SUBMIT': '1'}
    payload['searchResultsForm:j_id_jsp_121545192_117'] = str(page)
    payload['AJAX:EVENTS_COUNT'] = '1'
    return make_ajax_request(session, payload, state)

def get_items(root):
    """Return the result-item ``div`` elements below ``root``.

    Items are recognised by their exact ``onclick`` handler, which forwards
    the click to the first ``<a>`` they contain.
    """
    clickable_div = '''.//div[@onclick="this.getElementsByTagName('a')[0].click();"]'''
    return root.xpath(clickable_div)

In [32]:
def init_search(year=YEAR):
    """Open a session, run the search for ``year`` and enlarge the page size.

    Performs five requests against ``BASE_URL`` in this order:

    1. GET the search page and parse its form.
    2. POST the form with the language link id to switch to English.
    3. POST an AJAX request that selects the accounting year (the response
       is discarded).
    4. POST the search itself and read the total number of results.
    5. POST an AJAX request that raises the number of results per page.

    Args:
        year: value sent as ``searchForm:searchRakenskapsar``. The default is
            the value ``YEAR`` had when this function was defined.

    Returns:
        tuple ``(session, root, state, max_pages)`` where ``session`` is the
        ``requests.Session`` used, ``root`` is the parsed response of the
        page-size request, ``state`` is a dict holding the view-state field
        taken from the search response, and ``max_pages`` is the result count
        divided by the page size, rounded up.

    Raises:
        IndexError: if the search response has no ``totalResults value`` div.
    """
    # Single source of truth for the page size; it drives both the page-count
    # calculation and the value sent to the server below.
    page_size = 100

    session = requests.Session()
    print('Initializing...')
    form_data = get_initial_form_data(session)
    form_data.update({
        'searchForm:_idcl': 'searchForm:j_id_jsp_121545192_6', # indicates english, _4 is swedish
    })
    # switch to english
    print('Switch to english...')
    response = session.post(BASE_URL, data=form_data)
    root = html.fromstring(response.content)
    form_data = get_form_data(root)
    # Set year
    print('Setting year...')
    make_ajax_request(session, form_data, {
        'searchForm:searchRakenskapsar': year,
        'AJAXREQUEST': 'j_id0',
        'searchForm:j_id_jsp_121545192_11': 'searchForm:j_id_jsp_121545192_11' # no idea what that is
    })
    search_data = {
        'searchForm:searchRakenskapsar': year,
        'searchForm:searchLan': '-1',
        'searchForm:searchPostnummer': '',
        'searchForm:searchStodmottagare': '',
        'searchForm:searchKommun': '-1',
        'searchForm:searchPostort': '-1',
        'searchForm:garantifondenDirektstod': 'true',
        'searchForm:garantifondenOvrigt': 'true',
        'searchForm:landsbygdsfonden': 'true',
        'searchForm:minAmount': '',
        'searchForm:maxAmount': '',
        'searchForm:searchButton': 'Search',
        'searchForm_SUBMIT': '1',
    }
    # The view state still comes from the form parsed after the language
    # switch; the year-selection response above is not parsed.
    search_data.update({
        jvs: form_data[jvs]
    })
    # Search
    print('Searching...')
    response = session.post(BASE_URL, data=search_data)
    print(search_data)
    root = html.fromstring(response.content)
    total_results = int(remove_ws(root.xpath('//div[@class="totalResults value"]/text()')[0]))
    print('Result count', total_results)
    max_pages = math.ceil(total_results / page_size)
    form_data = get_form_data(root)

    # increase pagination
    pagination_data = {
        'AJAXREQUEST': 'searchResultsFilterForm:j_id_jsp_121545192_63',
        'searchResultsFilterForm:stodmottagareNamnFilter': '',
        'searchResultsFilterForm:j_id_jsp_121545192_75': str(page_size), # page size
        'searchResultsFilterForm_SUBMIT': '1',
        'searchResultsFilterForm:j_id_jsp_121545192_81': 'searchResultsFilterForm:j_id_jsp_121545192_81',
        'AJAX:EVENTS_COUNT': '1'
    }
    # View state from the search response; handed back to the caller so that
    # later page and detail requests can send it along.
    state = {
        jvs: form_data[jvs]
    }
    pagination_data.update(state)
    print('Increase pagination...')
    print(pagination_data)
    root = make_ajax_request(session, pagination_data, {})
    return session, root, state, max_pages

# current_page = root.xpath('//td[@class="rich-datascr-act"]/text()')[0]

In [40]:
# Pages whose CSV has been written in this kernel session, keyed by page number.
# Kept in its own cell so the download loop can be re-run without starting over.
already = dict()

In [41]:
# Run the search set-up once. ``root`` is the parsed response of the last
# set-up request and is consumed by the first iteration of the download loop;
# ``state`` carries the view-state field sent with every later request.
session, root, state, max_pages = init_search(YEAR)
# Number of result pages the download loop will iterate over.
print(max_pages)

Initializing...
Switch to english...
Setting year...
Searching...
{'searchForm:searchRakenskapsar': 2018, 'searchForm:searchLan': '-1', 'searchForm:searchPostnummer': '', 'searchForm:searchStodmottagare': '', 'searchForm:searchKommun': '-1', 'searchForm:searchPostort': '-1', 'searchForm:garantifondenDirektstod': 'true', 'searchForm:garantifondenOvrigt': 'true', 'searchForm:landsbygdsfonden': 'true', 'searchForm:minAmount': '', 'searchForm:maxAmount': '', 'searchForm:searchButton': 'Search', 'searchForm_SUBMIT': '1', 'javax.faces.ViewState': 'NW+pZjjzadkqa+5tkOtqrXhN9yplrUBzEo4Q23XgxDAZ0fpJNM2JYxaxh3ERRXEtWd/V03x0vLLgDV6/Qd0O++QtfkskrJHLE7ZdLidJe7vzOSoJn8sCMalX5vk1MTotvM1yd+LMmLpXaiFd'}
Result count 63860
Increase pagination...
{'AJAXREQUEST': 'searchResultsFilterForm:j_id_jsp_121545192_63', 'searchResultsFilterForm:stodmottagareNamnFilter': '', 'searchResultsFilterForm:j_id_jsp_121545192_75': '100', 'searchResultsFilterForm_SUBMIT': '1', 'searchResultsFilterForm:j_id_jsp_121545192_81':

In [25]:
# Create the output directory for the per-page CSV files. Pure Python instead
# of a shell escape, so the cell also works where ``mkdir -p`` is unavailable;
# ``exist_ok=True`` keeps it safe to re-run.
os.makedirs('data', exist_ok=True)

In [42]:
# Output path template: ``%`` fills in the year now, ``{}`` is left for the
# page number and filled in with ``.format(page)`` inside the loop.
filename = 'data/SE_%s_{}.csv.gz' % YEAR

# Walk every result page, writing one gzip-compressed CSV per page.
# On entry ``root`` is whatever the set-up cell left behind and is used as
# page 1 without being re-fetched; it is reset to None after every iteration
# so each later page is requested explicitly.
# NOTE(review): if this cell is re-run after ``already`` was cleared but
# without re-running the set-up cell, a leftover non-None ``root`` from an
# interrupted run would be written out as page 1 -- confirm the intended
# re-run procedure.
for page in tqdm(range(1, max_pages + 1), desc='pagination'):
    if root is None:
        # NOTE(review): this request is made even for pages that are skipped
        # below because they are already recorded in ``already``.
        root = get_page(session, page, state)

    if page not in already:
        # ``load_details`` is a generator; the detail requests happen while
        # the DataFrame constructor consumes it.
        generator = load_details(session, root, state)
        temp_df = pd.DataFrame(generator)
        temp_df.to_csv(filename.format(page), compression='gzip', index=False)
        # Only recorded after the file was written, so a page that failed
        # part-way is retried on the next run. The record lives in memory
        # only; files already on disk are not consulted.
        already[page] = True
    root = None


pagination: 100%|██████████| 639/639 [3:30:23<00:00, 16.29s/it]
