In [6]:
import re
import ujson
import dask.bag as db
from dask.diagnostics import ProgressBar
from bs4 import BeautifulSoup
import pandas as pd

# Register a dask progress bar so that later computations in this notebook
# report their progress.
pbar = ProgressBar()
pbar.register()


def extract(blob):
    """Return the value stored under the ``'dataList'`` key of ``blob``.

    A record without that key propagates the lookup error (``KeyError`` for a
    plain dict).
    """
    payload = blob['dataList']
    return payload
        

def remove_empty(data_list):
    """Predicate for ``filter``: hand the value back unchanged.

    The caller decides on the truthiness of the returned object, so empty
    (falsy) values are dropped and everything else is kept.
    """
    candidate = data_list
    return candidate


def valid_columns(data):
    """Project a record down to its ``detalleNorma`` and ``idTramite`` fields.

    Any other keys are discarded; a record missing either field raises the
    lookup error of ``data`` (``KeyError`` for a plain dict).
    """
    wanted = ('detalleNorma', 'idTramite')
    return {field: data[field] for field in wanted}


def extract_text(data):
    """Strip the markup from a record's ``detalleNorma`` HTML.

    Args:
        data: mapping with ``'idTramite'`` and ``'detalleNorma'`` entries.

    Returns:
        A dict ``{'idTramite': ..., 'parsedText': ...}`` where ``parsedText``
        is the whitespace-stripped text content of the HTML, or ``None`` when
        the record cannot be processed (for example a missing key or a value
        BeautifulSoup cannot parse).
    """
    try:
        return {
            'idTramite': data['idTramite'],
            'parsedText': BeautifulSoup(data['detalleNorma'], "html.parser").getText().strip()
        }
    except Exception:
        # Best-effort: a bad record is skipped (the pipeline filters out None).
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop a long-running computation.
        return None


# Read the raw file "xac" as text, split into blocks of 5000000 bytes.
bag = db.read_text("xac", blocksize=5000000)

# Per line: decode JSON -> take 'dataList' -> drop falsy values -> keep the two
# relevant fields -> strip HTML -> drop records that failed to parse.
# NOTE(review): if 'dataList' holds a *list* of records, a ``.flatten()`` would
# be needed before ``valid_columns`` -- confirm against the input data.
df = (
    bag.map(ujson.loads)
    .map(extract)
    .filter(remove_empty)
    .map(valid_columns)
    .map(extract_text)
    .filter(lambda record: record is not None)
)

In [7]:
from subprocess import check_output, CalledProcessError, Popen, PIPE
from lxml import etree

# Three-part version tuple of the PyFreeling wrapper code in this cell, and its
# human-readable rendering.
version = (0, 1, 0)
version_string = "PyFreeling version %d.%d.%d" % version
__copyright__ = 'Copyright (c) 2016 Marcos Vanetta'


# NOTE(review): this module-level name is not read by any code visible in this
# notebook; Analyzer keeps its own ``self.binary``.
binary = None


def find_binary():
    """Locate the ``analyze`` executable by asking ``which``.

    Returns:
        The first whitespace-separated token printed by ``which analyze``
        (exactly as returned by ``check_output``), or ``None`` when the
        executable is not found, ``which`` itself cannot be run, or it prints
        nothing.
    """
    try:
        found = check_output(['which', 'analyze']).split()
    except (CalledProcessError, OSError):
        # CalledProcessError: ``which`` exited non-zero (nothing on PATH).
        # OSError: ``which`` itself is unavailable on this system.
        return None
    # Empty output with a zero exit status used to escape as IndexError; the
    # previous ``KeyError`` clause could never match a list index failure.
    return found[0] if found else None


class Analyzer(object):
    """Thin wrapper that pipes text through the ``analyze`` command line tool.

    Keyword Args:
        config: path handed to the binary via ``-f`` (default
            ``'analyzer.cfg'``).
        lang: stored on the instance (default ``'en'``); it is not used when
            the command line is built.
        timeout: stored on the instance (default ``30``); ``run`` does not
            currently enforce it.
    """

    def __init__(self, *args, **kwargs):
        self.config = kwargs.get('config', 'analyzer.cfg')
        self.lang = kwargs.get('lang', 'en')
        self.timeout = kwargs.get('timeout', 30)
        # May be None when the executable cannot be located; run() reports it.
        self.binary = find_binary()

    def run(self, text, *args, **kwargs):
        """Analyze ``text`` and return the parsed XML output.

        Args:
            text: input for the analyzer; bytes are sent as-is, text strings
                are encoded as UTF-8 first.
            *args: flag names, each emitted as ``--<name>``.
            **kwargs: options, each emitted as ``--<key> <value>``.

        Returns:
            The root ``<sentences>`` element wrapping the analyzer's output.

        Raises:
            RuntimeError: if no binary was found, or the process exits with a
                non-zero status (the message carries its stderr).
        """
        if self.binary is None:
            # Fail with a clear message instead of the TypeError Popen would
            # raise for a None entry in the argument list.
            raise RuntimeError("'analyze' binary not found on PATH")

        if not isinstance(text, bytes):
            text = text.encode('utf-8')

        cmd = self._build_cmd(*args, **kwargs)
        # stderr must be piped explicitly; otherwise communicate() always
        # returns None for it and failures go unnoticed.
        proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        outs, errs = proc.communicate(text)

        if proc.returncode != 0:
            message = errs.decode('utf-8', 'replace').strip()
            raise RuntimeError(
                message or 'analyze exited with status %d' % proc.returncode)

        # Concatenate as bytes: str.format() on bytes output would embed the
        # "b'...'" repr on Python 3 and corrupt the document.
        return etree.XML(b"<sentences>" + outs + b"</sentences>")

    def _build_param(self, key, val):
        """Return the ``('--<key>', val)`` pair for one option."""
        return '--{}'.format(key), val

    def _build_flag(self, a):
        """Return the ``'--<a>'`` switch for one flag name."""
        return '--{}'.format(a)

    def _build_cmd(self, *flags, **kwargs):
        """Assemble the argument list: binary, config, flags, options, output.

        XML output is always requested last because ``run`` parses the result
        as XML.
        """
        cmd = [self.binary, '-f', self.config]
        cmd.extend(self._build_flag(f) for f in flags)

        for key, val in kwargs.items():
            cmd.extend(self._build_param(key, val))

        cmd += ['--output', 'xml']
        return cmd

In [8]:
from lxml import etree

def tokenize(data, config='/usr/local/Cellar/freeling/4.0/share/freeling/config/es-ar.cfg'):
    """Run a record's ``parsedText`` through the analyzer and collect tokens.

    Args:
        data: mapping with ``'idTramite'`` and ``'parsedText'`` entries.
        config: analyzer configuration file; defaults to the path this
            notebook was written against, override it on other machines.

    Returns:
        ``{'idTramite': ..., 'tokens': [...]}`` where each token is the
        attribute dict of one ``<token>`` element. On failure the record id
        and the error are printed and whatever tokens were gathered so far
        (possibly none) are returned.
    """
    # Bound before the try block so the return below is always valid, even
    # when constructing or running the analyzer fails.
    tokens = []
    try:
        analyzer = Analyzer(config=config)
        xml_root = analyzer.run(data['parsedText'].encode('utf-8'), 'nec')
        for element in xml_root.iter('token'):
            tokens.append(dict(element.attrib))
    except Exception as e:
        print(data['idTramite'])
        print(e)

    return {'idTramite': data['idTramite'], 'tokens': tokens}
# Schedule tokenization for every record of the pipeline; the work itself
# happens when compute() is called on the result.
tokens = df.map(tokenize)

In [None]:
# Run the whole pipeline and keep the resulting per-record token dicts.
a = tokens.compute()

[                                        ] | 0% Completed |  1min 54.4s

In [None]:
import json
# Persist the computed result (`a`) as JSON in the working directory.
with open('xac.json', 'w') as outfile:
    json.dump(a, outfile)