In [1]:
import tarfile
import xml.etree.ElementTree as ET
from typing import Iterator

In [2]:
# String Constants
# All paths are relative, so they resolve against the notebook's working directory.
# Input: tar.gz archive of patent xml files, read by every extraction step below.
ARCHIVEFILE = '../intermediate/utility-patent.tar.gz'
# Output of patentnumber2file: one patent number per line.
PATENTNUMFILE = '../intermediate/patentnumbers.txt'
# Output of classification2file: one line of space separated CPC codes per patent.
CLASSCODEFILE = '../intermediate/classifications.txt'
# Output of references2file: one line of space separated cited references per patent.
REFERENCEFILE = '../intermediate/references.txt'

In [3]:
def get_archivexml(filepath: str) -> Iterator[str]:
    '''Extract xml files from tar.gz archive one at a time

    Members without readable content (for example the directory entry that
    usually comes first in an archive made from a folder) are skipped, so
    every file stored in the archive is yielded exactly once regardless of
    where directory entries appear. The archive is read lazily and is closed
    when the iterator is exhausted, closed early, or fails part way through.

    Parameter
        filepath: full filepath to xml tar.gz archive
    Returns
        Iterator of xml documents as strings decoded from utf-8, in archive order
    Raises
        UnicodeDecodeError if a member is not valid utf-8
    '''
    with tarfile.open(filepath) as tar:
        # Iterating the TarFile reads member headers on demand instead of
        # scanning the whole archive up front like getmembers() does.
        for member in tar:
            f = tar.extractfile(member)
            if f is None:
                # extractfile() gives None for directories and other members
                # that are neither regular files nor links.
                continue
            with f:
                yield f.read().decode(encoding='utf-8')

In [4]:
def get_patentnumbers(filepath: str) -> Iterator[str]:
    '''Get the patent number of each patent xml document in the archive

    The text of the first doc-number element found in a document is parsed as
    an integer and converted back to a string for writing to a file, so any
    leading zeros are dropped.
    Parameter
        filepath: full filepath to xml tar.gz archive
    Returns
        Iterator of patent numbers as strings, one per xml document
    '''
    for xml_text in get_archivexml(filepath):
        tree = ET.fromstring(xml_text)
        number_elements = tree.findall('.//doc-number')
        first_number = number_elements[0].text
        yield str(int(first_number))

In [5]:
def patentnumber2file(infile: str, outfile: str) -> None:
    '''Write the patent number of every document in the archive to a text file
    Parameter
        infile: full filepath to xml tar.gz archive
        outfile: path of the utf-8 text file to create or overwrite, one patent number per line
    '''
    with open(outfile, 'w', encoding='utf-8') as sink:
        sink.writelines(number + '\n' for number in get_patentnumbers(infile))

In [6]:
# Read the archive and write one patent number per line to PATENTNUMFILE.
patentnumber2file(ARCHIVEFILE, PATENTNUMFILE)

In [7]:
def get_classifications(filepath: str) -> Iterator[str]:
    '''Get the CPC classifications of each patent xml document in the archive

    Each classification-cpc element becomes one code built by concatenating
    its section, class, subclass, main-group and subgroup texts. A component
    that is absent from the xml contributes an empty string instead of
    raising, mirroring how get_references treats missing fields.
    Parameter
        filepath: full filepath to xml tar.gz archive
    Returns
        iterator of patent classification strings, one per xml document, each
        holding that document's codes as a space separated string (empty when
        the document has no classification-cpc element)
    '''
    parts = ('section', 'class', 'subclass', 'main-group', 'subgroup')
    for doc in get_archivexml(filepath):
        root = ET.fromstring(doc)
        codes = []
        for cpc in root.findall('.//classification-cpc'):
            # default='' keeps a missing component from handing None to join(),
            # which would raise TypeError and abort the whole extraction.
            codes.append(''.join(cpc.findtext('.//' + part, default='')
                                 for part in parts))
        yield ' '.join(codes).strip(' ')

In [8]:
def classification2file(infile: str, outfile: str) -> None:
    '''Write the CPC classifications of every document in the archive to a text file
    Parameter
        infile: full filepath to xml tar.gz archive
        outfile: path of the utf-8 text file to create or overwrite, one line of codes per document
    '''
    with open(outfile, 'w', encoding='utf-8') as sink:
        sink.writelines(codes + '\n' for codes in get_classifications(infile))

In [9]:
# Read the archive again and write one line of CPC codes per document to CLASSCODEFILE.
classification2file(ARCHIVEFILE, CLASSCODEFILE)

In [10]:
def get_references(filepath: str) -> Iterator[str]:
    '''Get the cited references of each patent xml document in the archive

    Every us-citation under us-references-cited becomes one token made of its
    country, doc-number and kind texts with spaces removed; a missing field
    contributes an empty string.
    Parameter
        filepath: full path to file containing xml
    Returns
        iterator of references, one space separated string per xml document
    '''
    fields = ('country', 'doc-number', 'kind')
    for xml_text in get_archivexml(filepath):
        tree = ET.fromstring(xml_text)
        tokens = []
        for citation in tree.findall('.//us-references-cited/us-citation'):
            pieces = [citation.findtext('.//' + field, default='').replace(' ', '')
                      for field in fields]
            tokens.append(''.join(pieces))
        yield ' '.join(tokens).strip(' ')

In [11]:
def references2file(infile: str, outfile: str) -> None:
    '''Write the cited references of every document in the archive to a text file
    Parameter
        infile: full filepath to xml tar.gz archive
        outfile: path of the utf-8 text file to create or overwrite, one line of references per document
    '''
    with open(outfile, 'w', encoding='utf-8') as sink:
        sink.writelines(refs + '\n' for refs in get_references(infile))

In [12]:
# Read the archive again and write one line of cited references per document to REFERENCEFILE.
references2file(ARCHIVEFILE, REFERENCEFILE)