### US Patent XML Files ###

In [9]:
import re
import os
import xml.etree.ElementTree as ET
import tempfile
from zipfile import ZipFile
import shutil
from typing import Iterator
from common import file_find

In [2]:
# Locations on the external drive; the sub-directories are derived from ROOTDIR.
# ROOTDIR    - top directory searched for the downloaded zip / extracted xml files
# XMLDIR     - directory the zip archives are extracted into
# ARCHIVEDIR - directory for the utility tar.gz archives; the trailing slash is
#              needed because archive names are appended by plain concatenation
ROOTDIR = '/Volumes/Lexar'
XMLDIR = ROOTDIR + '/patent-xmlfiles'
ARCHIVEDIR = ROOTDIR + '/utility-targz/'

#### Getting Individual Files ####

Zipped versions of US patent XML files from 1976 onward are available at http://patents.reedtech.com/pgrbft.php . These files contain issued Utility patents, Plant Patents, and Design Patents.

The zip file names are of the form ipgxxyyzz.zip where xx=2 digit year (e.g., 17 for 2017), yy=2 digit month and zz=2 digit day. Each zip file contains one xml file with the same prefix.

To download a zip file, click on the file name as shown in the picture.

![Website](./patent-website.png)

We begin by extracting the xml files from the downloaded zip files.

In [3]:
def zip2xml(topdir: str, xmldir: str) -> None:
    '''Extract the contents of every patent zip archive into one directory.
    Parameters
        topdir: directory passed to file_find to locate archives matching 'i*.zip'
        xmldir: directory that receives the extracted files
    '''
    for archive_path in file_find('i*.zip', topdir):
        archive = ZipFile(archive_path)
        # The context manager closes the archive even if extraction fails.
        with archive:
            archive.extractall(xmldir)

In [4]:
# Unpack the downloaded zip archives located from ROOTDIR into XMLDIR.
zip2xml(topdir=ROOTDIR, xmldir=XMLDIR)

#### XML Files ####
Each XML file contains multiple patent XML documents:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE us-patent-grant SYSTEM "us-patent-grant-v45-2014-04-03.dtd" [ ]>
<us-patent-grant lang="EN" dtd-version="v4.5 2014-04-03" file="US08925112-20150106.XML" status="PRODUCTION" id="us-patent-grant" country="US" date-produced="20141219" date-publ="20150106">

...Patent information

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE us-patent-grant SYSTEM "us-patent-grant-v45-2014-04-03.dtd" [ ]>

...Patent information

These concatenated files are separated into individual strings using get_indivdocs.

In [5]:
def get_indivdocs(filepath: str) -> Iterator[str]:
    '''Split a file containing multiple concatenated xml docs into the
        individual xml docs.

        A new document starts at every line that begins with an XML
        declaration ('<?xml '). Any text before the first declaration is
        yielded as its own chunk.
        Parameter
            filepath: full path string to file containing multiple xml docs
        Returns
            iterator of strs with each string being an individual xml document;
            an empty file yields nothing
    '''
    doc_lines = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # A declaration line closes the document collected so far.
            if line.startswith('<?xml ') and doc_lines:
                yield ''.join(doc_lines)
                doc_lines = []
            doc_lines.append(line)
    # Emit the last document, but never an empty string (e.g. for an empty file).
    if doc_lines:
        yield ''.join(doc_lines)

#### Patent Types ####

Patent types are Utility, Plant, and Design. We are only interested in Utility patents. The type of patent is found in the 'appl-type' attribute of the 'application-reference' tag.

patent_type is used to filter out only utility patents. Some patents are missing the attribute so we do return patents with the missing attribute tag.

In [6]:
def patent_type(doc: str, patenttype_tocheck: str) -> bool:
    '''Check whether the patent xml document is the type we want to collect
    Parameters
        doc: str containing xml document
        patenttype_tocheck: utility, design, plant
    Returns
        True if patent type is the type we want
        False if the type field does not exist or if it is a different type
    Raises
        xml.etree.ElementTree.ParseError if doc is not well-formed xml
    '''
    root = ET.fromstring(doc)
    # Look the tag up by name instead of by child position so that a missing
    # or re-ordered element gives False rather than an IndexError/KeyError.
    app_ref = root.find('us-bibliographic-data-grant/application-reference')
    if app_ref is None:
        return False
    # .get returns None when the attribute is absent, which never matches.
    return app_ref.get('appl-type') == patenttype_tocheck

#### Archive File ####

Each utility patent is written to a separate file with a file name: patentnumber.xml. The files are compressed into a tar.gz archive. The tar.gz archive has a name of the form: utilityxxyyzz.tar.gz where xxyyzz is the same form of the date code for zip and xml files.

In [7]:
def patentxml2utilityxmlarch(filepath: str) -> None:
    '''Write the utility patents of a composite xml patent file to a tar.gz archive.

       Each utility patent document is saved as <patent number>.xml in a
       temporary directory, which is then compressed into
       ARCHIVEDIR + 'utility' + <6 digit date code>.
       Parameters
           filepath: full filepath string to the composite xml document; its
                     base name must contain a 6 digit date code
    '''
    stem = os.path.splitext(os.path.basename(filepath))[0]
    # The first run of six digits in the file name is the date code.
    datecode = re.findall(r'\d\d\d\d\d\d', stem)[0]
    archive_base = ARCHIVEDIR + 'utility' + datecode
    with tempfile.TemporaryDirectory() as workdir:
        for xmldoc in get_indivdocs(filepath):
            if not patent_type(xmldoc, 'utility'):
                continue
            # The first doc-number element in the document is used as the
            # patent number; int() drops any leading zeros.
            first_docnum = ET.fromstring(xmldoc).findall('.//doc-number')[0]
            patent_number = str(int(first_docnum.text))
            with open(workdir + '/' + patent_number + '.xml', 'w', encoding='utf-8') as out:
                out.write(xmldoc)
        # Archive while the temporary directory still exists.
        shutil.make_archive(archive_base, 'gztar', workdir + '/')
            

In [10]:
# Build one utility-patent archive per extracted xml file. The 'i*.xml' pattern
# mirrors the 'i*.zip' pattern used when the downloads were unpacked.
# NOTE(review): file_find comes from the project's `common` module; presumably it
# searches ROOTDIR recursively, which would include XMLDIR - confirm.
for xmlfile in file_find('i*.xml', ROOTDIR):
    patentxml2utilityxmlarch(xmlfile)