### Plain Text from XML ###

In [1]:
import re
import tarfile
import xml.etree.ElementTree as ET
from itertools import chain
from typing import Iterator
from cytoolz import compose
from common import file_find, get_archivexml

In [2]:
# File String Constants
# TOP is the root location handed to file_find when looking for archives.
TOP = '/Volumes/Lexar'
# Output text file; built from TOP so the volume root is defined only once.
PLAINTEXTFILE = TOP + '/plaintext/01titleabstract.txt'

#### Getting Plain Text ####

We get plain text from the xml tags of interest. The itertext() method is used to get text from a tag that has subtags in it (such as the abstract). The text is concatenated into a string.

In [3]:
def xml2plaintext(doc: Iterator[str]) -> str:
    '''Extract the text of selected tags from one xml document as a single line.

    The first occurrence of each tag of interest is looked up among the
    descendants of the root and all of its text, including the text of nested
    sub-tags, is collected with itertext(). The fragments are concatenated in
    tag order and line breaks are replaced by single spaces.

    Parameters
        doc: full xml document. NOTE(review): annotated as Iterator[str], but
             the value is handed directly to ET.fromstring, which parses one
             complete document -- confirm what get_archivexml yields.
    Returns
        plain text string containing text from select tags; a tag that is
        absent from the document contributes nothing, and an empty string is
        returned when none of the tags is present
    Raises
        xml.etree.ElementTree.ParseError: if doc is not well-formed xml
    '''
    tagstoget = ['invention-title', 'abstract']
    root = ET.fromstring(doc)
    # find() returns None for a missing tag; skip those rather than failing on
    # None.itertext(), so one incomplete record cannot abort a whole run.
    # Compare against None explicitly: an Element without children is falsy.
    elements = (root.find('.//' + tagtoget) for tagtoget in tagstoget)
    fragments = chain.from_iterable(element.itertext()
                                    for element in elements
                                    if element is not None)
    # Fragments are concatenated as-is; only line breaks are normalised here.
    return ' '.join(''.join(fragments).splitlines())

#### Cleaning Plain Text ####

Patent text contains tokens such as numbers (usually referring to figures) and special characters. These routines can be composed together to remove them. We keep periods and commas in this step so that the spaCy library can split the documents into sentences, which are required for n-gram models and word2vec.

In [4]:
def remove_numbers(docstr: str) -> str:
    '''Replace every run of the digits 0-9 in docstr with a single space.'''
    digit_run = re.compile('[0-9]+')
    return digit_run.sub(' ', docstr)

def remove_specialchar(docstr: str) -> str:
    '''Replace each special character in docstr with a single space.

    The characters removed are # $ % & ( ) * + _ / : < = > @ ^ { } | [ ] and -.
    Periods and commas are left in place.

    Parameters
        docstr: text to clean
    Returns
        docstr with every listed character replaced by one space
    '''
    # One character class, with no whitespace in the pattern: without
    # re.VERBOSE any space in the pattern is matched literally, which would
    # limit matches to special characters that sit next to a space.
    pattern = re.compile(r'[#$%&()*+_/:<=>@^{}|\[\]-]')
    return pattern.sub(" ", docstr)

def remove_nonascii(s: str) -> str:
    '''Drop every character of s whose code point is 128 or above.'''
    kept = [ch for ch in s if ord(ch) <= 127]
    return "".join(kept)

def remove_extrawhitespace(docstr: str) -> str:
    '''Collapse every run of two or more space characters into one space.'''
    space_run = re.compile(' {2,}')
    return space_run.sub(' ', docstr)

In [5]:
def cleantext(infile: str, outfile: str) -> None:
    '''Extract, clean and append the text of every document in an xml archive.

    Each document obtained from the archive is converted to plain text from
    the selected fields, passed through the number, special character,
    non-ascii and extra whitespace removal steps, and appended to the output
    file on a line of its own.
    Parameters
        infile: full path to tar.gz xml archive
        outfile: full path to the clean text file that is appended to
    '''
    # Steps are listed in the order they run; compose applies its arguments
    # right to left, so the sequence is reversed before composing.
    steps = (xml2plaintext,
             remove_numbers,
             remove_specialchar,
             remove_nonascii,
             remove_extrawhitespace)
    plaintext = compose(*reversed(steps))
    with open(outfile, 'a', encoding='utf-8') as f:
        f.writelines(plaintext(doc) + '\n' for doc in get_archivexml(infile))

In [6]:
# Clean every archive that file_find returns for the 'utility*.tar.gz' pattern
# (presumably a search under TOP -- confirm against file_find in common).
# cleantext opens the output in append mode, so all archives accumulate in
# PLAINTEXTFILE, and re-running this cell appends the same documents again.
for archivefile in file_find('utility*.tar.gz', TOP):
    cleantext(archivefile, PLAINTEXTFILE)