In [1]:
import xml.etree.ElementTree as ET
import tempfile
import shutil
from typing import Iterator

In [2]:
# File path strings
# XMLFILE: relative path to the composite patent XML file (several XML
# documents stored one after another in a single file). It is the argument
# passed to patentxml2utilityxmlarch in the final cell.
XMLFILE = '../intermediate/patent.xml'

In [3]:
def get_indivdocs(filepath: str) -> Iterator[str]:
    '''Split a file containing multiple xml docs into its individual xml docs.

    A new document starts at every line that is exactly the declaration
    `<?xml version="1.0" encoding="UTF-8"?>`. Any text before the first
    declaration is kept as part of the first yielded document.

    Parameter
        filepath: full path string to file containing multiple xml docs
    Returns
        iterator of strs with each string being an individual xml document;
        nothing is yielded for an empty file
    '''
    xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>\n'
    doc_lines = []
    # The documents declare UTF-8, so decode explicitly instead of relying on
    # the platform default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # A declaration line closes the document collected so far.
            if line == xml_declaration and doc_lines:
                yield ''.join(doc_lines)
                doc_lines = []
            doc_lines.append(line)
    # Emit the trailing document, but never an empty string (empty file).
    if doc_lines:
        yield ''.join(doc_lines)

In [4]:
def patent_type(doc: str, patenttype_tocheck: str) -> bool:
    '''Check whether the patent xml document is the type we want to collect

    The type is read from the `appl-type` attribute of the
    `application-reference` element that is a direct child of the root's
    `us-bibliographic-data-grant` element.

    Parameters
        doc: str containing xml document
        patenttype_tocheck: utility, design, plant
    Returns
        True if patent type is the type we want
        False if the type field does not exist or if it is a different type
    Raises
        xml.etree.ElementTree.ParseError if doc is non-empty but not
        well-formed xml
    '''
    # An empty document cannot carry a type; avoid a ParseError for it.
    if not doc.strip():
        return False
    root = ET.fromstring(doc)
    # Look the element up by name rather than by child position so a missing
    # or reordered element yields False instead of IndexError/KeyError.
    app_ref = root.find('us-bibliographic-data-grant/application-reference')
    if app_ref is None:
        return False
    # .get returns None when the attribute is absent, which compares unequal.
    return app_ref.get('appl-type') == patenttype_tocheck

In [5]:
def patentxml2utilityxmlarch(filepath: str,
                             archive_base: str = '../intermediate/utility-patent'
                             ) -> None:
    '''Write the utility patents of a composite xml patent file to a tar.gz archive.

    The composite file is split into individual xml documents; each document
    whose type is 'utility' is written to a temporary directory as
    <patent number>.xml, and the directory is then packed into a gzip
    compressed tar archive.

    Parameters
        filepath: full filepath string to the composite xml document
        archive_base: path of the archive to create, without the .tar.gz
            extension (added by shutil.make_archive)
    Raises
        IndexError if a utility document contains no doc-number element
        ValueError if the first doc-number is not an integer
    '''
    # Read from the file the caller asked for, not the module-level constant.
    utility_docs = (doc for doc in get_indivdocs(filepath)
                    if patent_type(doc, 'utility'))
    with tempfile.TemporaryDirectory() as tmpdirname:
        for doc in utility_docs:
            # The first doc-number in the document is used as the patent
            # number; int() drops any leading zeros from the file name.
            first_doc_number = ET.fromstring(doc).findall('.//doc-number')[0]
            pn = str(int(first_doc_number.text))
            with open(f'{tmpdirname}/{pn}.xml', 'w', encoding='utf-8') as f:
                f.write(doc)
        # Archive while the temporary directory still exists.
        shutil.make_archive(archive_base, 'gztar', tmpdirname + '/')
            

In [6]:
# Run the conversion on the composite patent XML file named by XMLFILE; this
# writes the utility-patent gzip tar archive under ../intermediate.
patentxml2utilityxmlarch(XMLFILE)