In [1]:
import tarfile
import xml.etree.ElementTree as ET
from typing import Iterator

In [9]:
# String Constants
# All three are relative paths, so they resolve against the kernel's current
# working directory.
# Input: tar.gz archive of patent xml documents, read by get_archivexml.
ARCHIVEFILE = '../intermediate/utility-patent.tar.gz'
# Output: text file written by patentnumber2file, one patent number per line.
PATENTNUMFILE = '../intermediate/patentnumbers.txt'
# Output: text file written by classification2file, one code per line.
CLASSCODEFILE = '../intermediate/classifications.txt'

In [3]:
def get_archivexml(filepath: str) -> Iterator[str]:
    '''Extract xml files from tar.gz archive one at a time
    Parameter
        filepath: full filepath to xml tar.gz archive
    Returns
        Iterator of xml documents as str (member bytes decoded as utf-8),
        in archive order. Members that carry no file data, such as the
        directory entry that typically heads the archive, are skipped.
    Raises
        tarfile.ReadError or OSError if the archive cannot be opened
        UnicodeDecodeError if a member is not valid utf-8
    '''
    # The context managers close the archive and every member handle even when
    # the consumer stops iterating early or a member fails to decode.
    with tarfile.open(filepath) as tar:
        # Iterating the TarFile reads member headers lazily, so the archive is
        # not scanned in full before the first document is produced.
        for member in tar:
            f = tar.extractfile(member)
            # extractfile() returns None for members that are not a regular
            # file or a link (e.g. directories). Skipping on None replaces the
            # old "drop the first member" rule, which lost a real document when
            # the archive had no leading directory entry and crashed on any
            # later directory entry.
            if f is None:
                continue
            with f:
                yield f.read().decode(encoding='utf-8')

In [4]:
def get_patentnumbers(filepath: str) -> Iterator[str]:
    '''Get the patent number of every xml document in the archive
    Parameter
        filepath: path passed unchanged to get_archivexml
    Returns
        Iterator of patent number strings, one per document. Each is the text
        of the first 'doc-number' element found in the document, passed
        through int() and back to str, so leading zeros are dropped.
    '''
    for xml_text in get_archivexml(filepath):
        root = ET.fromstring(xml_text)
        # Only the first match is used; indexing raises IndexError when the
        # document contains no 'doc-number' element at all.
        first_docnum = root.findall('.//doc-number')[0]
        number = int(first_docnum.text)
        yield str(number)

In [5]:
def patentnumber2file(infile: str, outfile: str) -> None:
    '''Write the patent number of every document in the archive to a text file
    Parameter
        infile: path to the xml tar.gz archive, passed to get_patentnumbers
        outfile: path of the utf-8 text file to create; opened in 'w' mode,
                 so an existing file is overwritten
    Returns
        None. The file receives one patent number per line.
    '''
    with open(outfile, 'w', encoding='utf-8') as sink:
        # The generator is consumed lazily, so numbers are written as they are
        # produced rather than collected in memory first.
        sink.writelines(number + '\n' for number in get_patentnumbers(infile))

In [6]:
# Read the archive once and write one patent number per line to PATENTNUMFILE
# (the output file is opened in 'w' mode, so any existing file is overwritten).
patentnumber2file(ARCHIVEFILE, PATENTNUMFILE)

In [11]:
def get_classifications(filepath: str) -> Iterator[str]:
    '''Get the CPC classification
    Parameter
        filepath: full path to the tar.gz archive containing the xml files
    Returns
        iterator of patent classification strings, one per patent, each the
        concatenation of the first 'section', 'class' and 'subclass' element
        texts found in the document
    '''
    # NOTE(review): each path is searched across the whole document
    # independently, so the three parts are not guaranteed to come from the
    # same parent element - confirm against the xml schema.
    part_paths = ('.//section', './/class', './/subclass')
    for xml_text in get_archivexml(filepath):
        root = ET.fromstring(xml_text)
        yield ''.join(root.findall(path)[0].text for path in part_paths)

In [12]:
def classification2file(infile: str, outfile: str) -> None:
    '''Write the classification string of every document in the archive to a text file
    Parameter
        infile: path to the xml tar.gz archive, passed to get_classifications
        outfile: path of the utf-8 text file to create; opened in 'w' mode,
                 so an existing file is overwritten
    Returns
        None. The file receives one classification string per line.
    '''
    with open(outfile, 'w', encoding='utf-8') as sink:
        # The generator is consumed lazily, so codes are written as they are
        # produced rather than collected in memory first.
        sink.writelines(code + '\n' for code in get_classifications(infile))

In [13]:
# Read the archive again and write one classification string per line to
# CLASSCODEFILE (the output file is opened in 'w' mode, so any existing file is
# overwritten).
classification2file(ARCHIVEFILE, CLASSCODEFILE)