In [1]:
import re
import tarfile
import xml.etree.ElementTree as ET
from itertools import chain
from typing import Iterator
from cytoolz import compose

In [2]:
# String Constants (paths are relative to the notebook's working directory)
# Input: tar.gz archive of xml documents, passed to cleantext() as `infile`.
ARCHIVEFILE = '../intermediate/utility-patent.tar.gz'
# Output: cleaned plain-text file, passed to cleantext() as `outfile`.
PLAINTEXTFILE = '../intermediate/titleabstract.txt'

In [3]:
def get_archivexml(filepath: str) -> Iterator[str]:
    '''Extract xml files from tar.gz archive one at a time

    The first archive member is always skipped (presumably the top-level
    directory entry -- TODO confirm against how the archive is built).
    Later members that carry no file data (e.g. nested directories) are
    skipped as well instead of raising.

    Parameter
        filepath: full filepath to xml tar.gz archive
    Returns
        Iterator of xml documents decoded from utf-8, one string per
        regular-file member, in archive order
    Raises
        tarfile.TarError or OSError if the archive cannot be opened or read
        UnicodeDecodeError if a member is not valid utf-8
    '''
    # The context managers guarantee that the archive and every member handle
    # are closed even when the consumer stops iterating early or a member
    # fails to decode.
    with tarfile.open(filepath) as tar:
        for member in tar.getmembers()[1:]:
            extracted = tar.extractfile(member)
            if extracted is None:
                # extractfile() returns None for directories and other
                # members that are neither regular files nor links.
                continue
            with extracted:
                yield extracted.read().decode(encoding='utf-8')

In [4]:
def xml2plaintext(doc: str) -> str:
    '''Convert one xml document into plain text from selected tags

    For each selected tag, the first matching descendant of the root is
    located and all text nested inside it is collected. A tag that does not
    occur in the document is skipped. The collected fragments are concatenated
    as-is (no separator is inserted between them), and the line breaks in the
    result are then replaced by single spaces.

    Parameters
        doc: full xml document as a string
    Returns
        plain text string containing text from select tags; empty string
        when none of the tags are present
    Raises
        xml.etree.ElementTree.ParseError if doc is not well-formed xml
    '''
    tagstoget = ['invention-title', 'abstract']
    root = ET.fromstring(doc)
    fragments = []
    for tagtoget in tagstoget:
        element = root.find('.//' + tagtoget)
        if element is None:
            # find() returns None for a missing tag; skip it rather than
            # failing the whole document.
            continue
        fragments.extend(element.itertext())
    return ' '.join(''.join(fragments).splitlines())

In [5]:
def remove_numbers(docstr: str) -> str:
    '''Replace every run of the digits 0-9 with a single space
    Parameters
        docstr: text to process
    Returns
        text in which each maximal run of digits is replaced by one space
    '''
    digit_run = re.compile('[0-9]+')
    return digit_run.sub(' ', docstr)

def remove_specialchar(docstr: str) -> str:
    '''Replace each special character with a single space

    The characters replaced are: # $ % & ( ) * + _ / : < = > @ ^ { } | [ ] -

    Parameters
        docstr: text to process
    Returns
        text in which every listed character is replaced by one space
    '''
    # A single character class is used on purpose: the pattern is compiled
    # without re.VERBOSE, so any whitespace written between alternatives would
    # be matched literally and characters not adjacent to a space would be
    # left in place. Inside the class, '[' and ']' are escaped and '-' is
    # placed last so that it is taken literally.
    pattern = re.compile(r'[#$%&()*+_/:<=>@^{}|\[\]-]')
    return pattern.sub(" ", docstr)

def remove_nonascii(s: str) -> str:
    '''Drop every character whose code point is 128 or above
    Parameters
        s: text to process
    Returns
        text containing only the ASCII characters of s, in original order
    '''
    # Encoding to ASCII with errors='ignore' discards exactly the characters
    # that are outside the 0-127 range.
    ascii_bytes = s.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

def remove_extrawhitespace(docstr: str) -> str:
    '''Collapse runs of two or more space characters into one space

    Only the space character is affected; tabs and newlines are left as-is.

    Parameters
        docstr: text to process
    Returns
        text with every run of consecutive spaces reduced to a single space
    '''
    space_run = re.compile(' {2,}')
    return space_run.sub(' ', docstr)

In [6]:
def cleantext(infile: str, outfile: str) -> None:
    '''Write cleaned text for every xml document in an archive

    Each document yielded by get_archivexml is passed through xml2plaintext,
    remove_numbers, remove_specialchar, remove_nonascii and
    remove_extrawhitespace, in that order, and the result is written to the
    output file as one line per document.

    Parameters
        infile: full path to tar.gz xml archive
        outfile: full path to the output clean text file (overwritten if it
                 already exists)
    '''
    # compose applies its arguments right-to-left, so the last entry of this
    # tuple (xml2plaintext) runs first.
    cleaning_steps = (
        remove_extrawhitespace,
        remove_nonascii,
        remove_specialchar,
        remove_numbers,
        xml2plaintext,
    )
    to_clean_line = compose(*cleaning_steps)
    with open(outfile, 'w', encoding='utf-8') as out:
        out.writelines(to_clean_line(xml) + '\n'
                       for xml in get_archivexml(infile))

In [7]:
# Run the pipeline: read the xml archive and write the cleaned text file.
cleantext(infile=ARCHIVEFILE, outfile=PLAINTEXTFILE)