In [1]:
"""Modulo para extraer texto de archivos binarios."""
from pathlib import Path

from tika import parser
import json

In [2]:
def create_sibling_dir(dirpath, newdir):
    """
    Create directory `newdir` at the same level as `dirpath`.

    Parameters
    ----------
    dirpath: str
        Reference path; the new directory is placed under its parent.
    newdir: str
        Name of the directory to create.

    Returns
    -------
    Path
       The sibling directory.
    """
    sibling = Path(dirpath).parent / newdir
    # Missing intermediate directories are created; an already existing
    # target is not an error.
    sibling.mkdir(parents=True, exist_ok=True)
    return sibling

In [3]:
def extract(filepath):
    """
    Extract text and metadata from the file at `filepath`.

    Parsing is delegated to `parser.from_file`. If that call raises, the
    error is printed and both returned values are None.

    Parameters
    ----------
    filepath: str|Path

    Returns
    -------
    tuple
        `(text, meta)`: the values under the 'content' and 'metadata' keys
        of the parse result, each None when the key is absent.
    """
    try:
        result = parser.from_file(str(filepath))
    except Exception as err:
        # Best effort: report the failing file and carry on with no data.
        print(f'Error Tika en {Path(filepath).name} : {err}')
        return None, None

    return result.get('content'), result.get('metadata')

In [4]:
def save_extracted(obj, filepath, content=None):
    """
    Store `obj` at `filepath`.

    Parameters
    ----------
    obj: str|dict
        Text to write (when `content` is 'text') or a JSON-serializable
        object (when `content` is 'meta').
    filepath: str|Path
    content: str|None
        'text' or 'meta'. Any other value only prints a message and
        writes nothing.
    """
    if content == 'meta':
        with open(filepath, mode='w', encoding='utf-8') as sink:
            # ensure_ascii=False keeps non-ASCII characters unescaped.
            json.dump(obj, sink, ensure_ascii=False)
        return

    if content != 'text':
        print(f'Parámetro content debe ser text o meta para almacenar: {filepath}')
        return

    # newline='\n' disables platform newline translation, so every line is
    # terminated by a single '\n' whatever separator the input used.
    with open(filepath, "w", newline='\n', encoding='utf-8') as sink:
        sink.writelines(f'{line}\n' for line in obj.splitlines())

In [5]:
def extract_all(docspath, textspath, metapath):
    """
    Extract text and metadata from every file in `docspath`.

    Text and metadata are stored in the directories `textspath` and
    `metapath`, created as siblings of `docspath`. A file is skipped when
    its `.txt` output already exists, and nothing is stored for a file
    unless both its text and its metadata are non-empty. The number of
    newly stored files is printed at the end.

    Parameters
    ----------
    docspath: str
    textspath: str
    metapath: str
    """
    source_dir = Path(docspath)
    text_dir = create_sibling_dir(docspath, textspath)
    meta_dir = create_sibling_dir(docspath, metapath)

    processed = 0

    for entry in source_dir.iterdir():
        if not entry.is_file():
            continue

        text_target = text_dir / f'{entry.stem}.txt'
        if text_target.exists():
            # Already extracted on a previous run.
            continue

        text, meta = extract(entry)
        if not (text and meta):
            continue

        save_extracted(text, text_target, content='text')
        save_extracted(meta, meta_dir / f'{entry.stem}.json', content='meta')
        processed += 1

    print(f'{processed} nuevos archivos procesados')

In [6]:
# Directory holding the original documents to process.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dirin = '/Users/tombito/Dropbox/datasets/banrep/inflacion/original/'
# Extracted text and metadata are stored in the directories `text_raw` and
# `text_meta`, created as siblings of `dirin`.
extract_all(dirin, 'text_raw', 'text_meta')

2019-05-03 17:24:32,144 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to /var/folders/hq/0wl64_9908373dlcxm2rl4fh0000gn/T/tika-server.jar.
2019-05-03 17:27:32,485 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar.md5 to /var/folders/hq/0wl64_9908373dlcxm2rl4fh0000gn/T/tika-server.jar.md5.
2019-05-03 17:27:33,056 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


81 nuevos archivos procesados
