In [1]:
import base64
from collections import OrderedDict
import time
import json
import os
import string

import requests
from lxml import html

from IPython.core.display import display, HTML
from IPython.display import clear_output

from tqdm import tqdm_notebook

import pandas as pd

from slugify import slugify

from captcha.agea_captcha import captcha

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
cs = captcha.CaptchaSolver(model_path='captcha/agea_11.h5')

In [3]:
YEAR = 2017

In [4]:
START_URL = 'https://www.sian.it/pubbAimu/start.do'
START2_URL = 'https://www.sian.it/pubbAimu/beneficiari/start.do'
YEAR_SELECTION = 'https://www.sian.it/pubbAimu/beneficiari/ricerca/switch.do'
CAPTCHA_URL = 'https://www.sian.it/pubbAimu/Captcha.jpg'
SEARCH_URL = 'https://www.sian.it/pubbAimu/beneficiari/ricerca/switch.do'
BAD_CAPTCHA = "I caratteri digitati non corrispondono a quelli presenti"
SESSION_ERROR = 'ERRORE DI SISTEMA'
NO_RECIPIENTS = 'beneficiario trovato per i criteri'
TOKEN_NAME = 'org.apache.struts.taglib.html.TOKEN'

In [5]:
def to_curl(response):
    req = response.request

    command = "curl -X {method} -H {headers} -d '{data}' '{uri}'"
    method = req.method
    uri = req.url
    data = req.body
    headers = ['"{0}: {1}"'.format(k, v) for k, v in req.headers.items()]
    headers = " -H ".join(headers)
    return command.format(method=method, headers=headers, data=data, uri=uri)

def extract_select(options):
    for macro in options:
        key = macro.attrib['value']
        label = macro.text_content()
        yield key, label

def get_session():
    session = requests.Session()
    session.get(START_URL)
    response = session.get(START2_URL)
    
    root = html.fromstring(response.text)
    amounts = root.xpath('//select[@name="ricercaImportoPagamenti"]/option/@value')

    macro_measure = dict(extract_select(root.xpath('//select[@name="ricercaMacroMisura"]/option')[1:]))
    return session, {'amounts': amounts[1:], 'measures': macro_measure}

In [6]:
def collect_measure(session, year, macro):
    post_data = {
        'ricercaEsercizioFinanziario': str(year),
        'ricercaDenominazione': '',
        'ricercaComuneResidenza': '',
        'ricercaImportoPagamenti': '',
        'ricercaMacroMisura': str(macro),
        'caratteriImmagine': '',
        'desiredTarget': '',
    }
    response = session.post(SEARCH_URL, data=post_data, allow_redirects=False, headers={
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://www.sian.it/pubbAimu/beneficiari/elenco/load.do',
        'Host': 'www.sian.it',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    })
    root = html.fromstring(response.text)
    return dict(extract_select(root.xpath('//select[@name="ricercaMisura"]/option')[1:]))
        
def collect_measures(year):
    session, params = get_session()
    for m in tqdm_notebook(params['measures']):
        yield m, collect_measure(session, year, m)
  

In [7]:
MEASURES = {
    YEAR: dict(collect_measures(YEAR))
}
MEASURES

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




{2017: {'1': {'1': 'Regime di pagamento unico (RPU)',
   '102': 'Altro (aiuti diretti disaccoppiati)',
   '103': 'Premi per le vacche nutrici',
   '104': 'Premio supplementare per vacca nutrice',
   '105': 'Premio per ovini e caprini',
   '106': 'Premio supplementare per ovini e caprini',
   '107': 'Aiuto per i bachi da seta',
   '108': 'Pagamenti per specifici tipi di colture e di produzione di qualità',
   '109': 'Supplemento per i coltivatori di barbabietole da zucchero e canna da zucchero',
   '111': 'Pagamento transitorio per i prodotti ortofrutticoli - Prodotti diversi dai pomodori',
   '112': 'Sostegno specifico (articolo 68) - Aiuti diretti accoppiati',
   '115': 'Altro (aiuti diretti)',
   '116': 'Aiuti aggiuntivi',
   '2': 'Regime di pagamento unico per superficie (RPUS)',
   '238': 'Rimborso degli aiuti diretti con i meccanismi di disciplina finanziaria',
   '3': 'Pagamento distinto per lo zucchero',
   '4': 'Pagamento separato per i prodotti ortofrutticoli',
   '5': 'Sosteg

In [None]:
class SessionError(Exception):
    pass


class TooManyError(Exception):
    def __init__(self, query):
        self.query = query


def has_persistent(key, filename='already.json'):
    if os.path.exists(filename):
        with open(filename) as f:
            data = json.load(f)
        return key in data
    else:
        return False

    
def has_failed(key):
    return has_persistent(key, filename='failed.json')

    
def add_failed(query, data=None):
    add_persistent(query, data=data, filename='failed.json')

def add_persistent(key, data=None, filename='already.json'):
    if data is None:
        if os.path.exists(filename):
            with open(filename) as f:
                data = json.load(f)
        else:
            data = {}
    data[key] = True
    with open(filename, 'w') as f:
        json.dump(data, f)


def get_sub_queries(query):
    if len(query) == 3:
        yield from get_macro_sub_queries(query)
    elif len(query) == 4:
        yield from get_name_sub_queries(query)
        
        
def get_name_sub_queries(query):
    letters = string.ascii_uppercase
    for l1 in letters:
        for l2 in letters:
            q = OrderedDict(query)
            q.update({
                'ricercaDenominazione': l1 + l2
            })
            yield q
        
    
def get_macro_sub_queries(query):
    year, macro = query['ricercaEsercizioFinanziario'], query['ricercaMacroMisura']
    for micro in MEASURES[year][macro]:
        q = OrderedDict(query)
        q.update({
            'ricercaMisura': micro
        })
        yield q

def get_queries(params, year):
    for amount in reversed(params['amounts']):
        for macro in params['measures']:
            yield OrderedDict([
                ('ricercaImportoPagamenti', amount),
                ('ricercaMacroMisura', macro),
                ('ricercaEsercizioFinanziario', year),
            ])

def start_search(year):
    session, params = get_session()
    queries = get_queries(params, year)
    for result_list in start_queries(queries):
        if result_list is not None:
            yield result_list

        
def start_queries(queries):
    for query in queries:
        yield from start_query(query)

def start_query(query):
    key = str(query)
    if has_failed(key):
        yield from run_subqueries(query)
        return
    try:
        result_list = run_with_query(query)
        yield result_list
    except TooManyError as e:
        add_failed(key)
        yield from run_subqueries(query)

def run_subqueries(query):
    for subquery in get_sub_queries(query):
        yield from start_query(subquery)
        

def run_with_query(query):
    key = str(query)
    if has_persistent(key):
        return None

    session, _ = get_session()
    count = 0
    while True:
        try:
            print('Searching', query)
            result_list = perform_search(session, query, count=count)
            add_persistent(key)
            return result_list
        except SessionError:
            count += 1
            print('Session Error!', count)
            session, _ = get_session()
                
                
def get_detail(session, token, query, idList):
    print('Getting detail for', idList, 'sleeping...')
    time.sleep(5)
    print('Getting detail for', idList)
    post_data = OrderedDict([
        ('org.apache.struts.taglib.html.TOKEN', token,),
        ('ricercaDenominazione', '',),
        ('ricercaComuneResidenza', '',),
        ('ricercaImportoPagamenti', '1',),
        ('ricercaEsercizioFinanziario', str(YEAR),),
        ('ricercaMacroMisura', '1',),
        ('ricercaMisura', '',),
        ('idLista', idList,),
        ('desiredTarget', 'Detail',),
    ])
    post_data.update(query)
    response = session.post(SEARCH_URL, data=post_data, headers={
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://www.sian.it/pubbAimu/beneficiari/elenco/load.do',
        'Host': 'www.sian.it',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    })
    print(to_curl(response))
    %debug
    root = html.fromstring(response.text)
    table = root.xpath('//table[contains(@class, "regpub_dati")]/following-sibling::table')[0]
    trs = table.xpath('.//tr[position() > 1]')
    for tr in trs:
        tds = tr.xpath('./td')
        measure_1 = tds[0].text_content().replace(' - ', '')
        measure_2 = tds[1].text_content().replace(' - ', '')
        amount = float(tds[2].text_content().replace('.', '').replace(',', '.'))
        yield {
            'scheme': '%s - %s' % (measure_1, measure_2),
            'amount': amount,
            'currency': 'EUR',
            'year': YEAR
        }


def extract_recipients(session, token, query, table):
    trs = table.xpath('.//tr[position() > 1]')
    num_results = len(trs)
    for num, tr in enumerate(trs):
        print('%.2f%%\r' % (num / num_results * 100), end='')
        tds = tr.xpath('./td')
        amount = float(tds[4].text_content().replace('.', '').replace(',', '.'))
        idList = tds[5].xpath('./input/@value')[0]
        recipient_name = tds[0].text_content()
        recipient_location = '%s, %s' % (tds[1].text_content(), tds[3].text_content())
        recipient_postcode = tds[2].text_content()
        base = {
            'recipient_name': recipient_name,
            'recipient_location': recipient_location,
            'recipient_postcode': recipient_postcode,
            'total_amount': amount,
            'country': 'IT',
        }
        base['recipient_id'] = '%s-%s-%s' % (slugify(recipient_name),
                                             recipient_postcode,
                                             slugify(recipient_location))
        yield base
#         subsidies = get_detail(session, token, query, idList)
#         for sub in subsidies:
#             d = dict(base)
#             d.update(sub)
#             yield d
    
    
def enter_captcha(session):
    response = session.get(CAPTCHA_URL)
    uri = ("data:" + 
           response.headers['Content-Type'] + ";" +
           "base64," + base64.b64encode(response.content).decode('utf-8'))
    display(HTML('<img src="{}">'.format(uri)))
    return input('Captcha: '), uri

def solve_captcha(session):
    response = session.get(CAPTCHA_URL)
    solution = cs.predict_from_bytes(response.content)
    uri = ("data:" + 
           response.headers['Content-Type'] + ";" +
           "base64," + base64.b64encode(response.content).decode('utf-8'))
#     display(HTML('<img src="{}">'.format(uri)))
#     print('Auto-Captcha:', solution)
    return solution, uri

def store_captcha(solution, uri):
    with open('captchas.txt', 'a') as f:
        f.write('%s|%s\n' % (solution, uri))
    
def perform_search(session, query, count=0):
    minor_measures = None
    while True:
#         print('Trying captcha for', query)
        solution, uri = solve_captcha(session)

        year_post = {
            'ricercaEsercizioFinanziario': str(YEAR),
            'ricercaDenominazione':'',
            'ricercaComuneResidenza':'',
            'ricercaImportoPagamenti': '1',
            'ricercaMacroMisura': '1',
            'caratteriImmagine': solution,
            'desiredTarget':'Find',
        }
        year_post.update(query)
        try:
            response = session.post(SEARCH_URL, data=year_post, headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': 'https://www.sian.it/pubbAimu/beneficiari/ricerca/switch.do',
            })
        except requests.exceptions.RequestException as e:
            # connection error
            raise TooManyError(query)

        if not response.text:
            raise SessionError

        if BAD_CAPTCHA in response.text:
            print('Bad captcha!')
            continue

        #  Store correctly solved captcha for training data
        store_captcha(solution, uri)

        if NO_RECIPIENTS in response.text:
            return []
        
        if SESSION_ERROR in response.text:
            raise SessionError
        root = html.fromstring(response.text)
        table = root.xpath('//table[@id="results"]')
        if not table:
            print('no table')
            display(HTML(response.text))
            if count > 1:
                break
            raise SessionError
        token = root.xpath('.//input[@name="%s"]/@value' % TOKEN_NAME)[0]
        return list(extract_recipients(session, token, query, table[0]))


In [9]:
filename = 'it_%s_raw.csv.gz' % YEAR
if os.path.exists(filename):
    df = pd.read_csv(filename, compression='gzip', converters={'recipient_postcode': str})
else:
    df = pd.DataFrame()
for data in tqdm_notebook(start_search(YEAR)):
    df = pd.concat([df, pd.DataFrame(data)])
    print(len(df))
    df.to_csv(filename, index=False, compression='gzip')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Searching OrderedDict([('ricercaImportoPagamenti', '6'), ('ricercaMacroMisura', '7'), ('ricercaEsercizioFinanziario', 2017)])
Bad captcha!
Bad captcha!
Bad captcha!
2
Searching OrderedDict([('ricercaImportoPagamenti', '6'), ('ricercaMacroMisura', '6'), ('ricercaEsercizioFinanziario', 2017)])
Bad captcha!
Bad captcha!
2
Searching OrderedDict([('ricercaImportoPagamenti', '6'), ('ricercaMacroMisura', '2'), ('ricercaEsercizioFinanziario', 2017)])
4
Searching OrderedDict([('ricercaImportoPagamenti', '6'), ('ricercaMacroMisura', '4'), ('ricercaEsercizioFinanziario', 2017)])
Bad captcha!
9
Searching OrderedDict([('ricercaImportoPagamenti', '6'), ('ricercaMacroMisura', '9'), ('ricercaEsercizioFinanziario', 2017)])
Bad captcha!
9
Searching OrderedDict([('ricercaImportoPagamenti', '6'), ('ricercaMacroMisura', '8'), ('ricercaEsercizioFinanziario', 2017)])
9
Searching OrderedDict([('ricercaImportoPagamenti', '6'), ('ricercaMacroMisura', '5'), ('ricercaEsercizioFinanziario', 2017)])
9
Searching Ord

In [10]:
len(df)

2275452

In [13]:
df.head()

Unnamed: 0,recipient_id,recipient_location,recipient_name,recipient_postcode,total_amount,country,year,currency,scheme
0,unaprol-consorzio-olivicolo-italiano-societa-c...,"ROMA, ROMA",UNAPROL - CONSORZIO OLIVICOLO ITALIANO SOCIETA...,187,11607652.26,IT,2017,EUR,
1,consorzio-nazionale-degli-olivicoltori-societa...,"ROMA, ROMA",CONSORZIO NAZIONALE DEGLI OLIVICOLTORI SOCIETA...,187,8045116.55,IT,2017,EUR,
0,genagricola-spa-34132-trieste-trieste,"TRIESTE, TRIESTE",GENAGRICOLA SPA,34132,4686814.5,IT,2017,EUR,
1,consiglio-per-la-ricerca-in-agricoltura-e-l-an...,"ROMA, ROMA",CONSIGLIO PER LA RICERCA IN AGRICOLTURA E L'AN...,198,3537866.93,IT,2017,EUR,
0,genagricola-spa-34132-trieste-trieste,"TRIESTE, TRIESTE",GENAGRICOLA SPA,34132,4686814.5,IT,2017,EUR,
