In [1]:
import xml.etree.ElementTree as ET
from typing import List
from common import file_find, get_archivexml

In [2]:
# File String Constants
# TOP is the directory handed to file_find() when looking for 'utility*.tar.gz' archives
# (presumably searched recursively -- verify against common.file_find).
TOP = '/Volumes/Lexar'
# Output text files. metadata2files() opens each in append mode and writes one line per document.
PATENTNUMFILE = '/Volumes/Lexar/plaintext/01patentnumbers.txt'  # patent numbers
CLASSCODEFILE = '/Volumes/Lexar/plaintext/01classifications_ipc.txt'  # IPC classification codes
REFERENCEFILE = '/Volumes/Lexar/plaintext/01references.txt'  # cited references

In [3]:
def get_patentnumbers(doc: str) -> str:
    '''Extract the patent number of a patent xml document as a string for writing to a file.

    The text of the first doc-number element (in document order) is parsed as an
    integer, which drops leading zeros, and converted back to a string.
    Parameter
        doc: xml document string
    Returns
        Patent number as string
    '''
    root = ET.fromstring(doc)
    first_number = root.findall('.//doc-number')[0]
    number = int(first_number.text)
    return str(number)

In [4]:
def get_references(doc: str) -> str:
    '''Collect the cited references of a patent xml document.

    Every us-citation under us-references-cited yields one token made of its
    country, doc-number and kind texts with spaces removed; a field that is
    absent contributes nothing. The document's own patent number (first
    doc-number, as an integer) is appended as the last token.
    Parameter
        doc: xml document string
    Returns
        References as a space separated string
    '''
    root = ET.fromstring(doc)
    field_paths = ('.//country', './/doc-number', './/kind')
    tokens = [
        ''.join(citation.findtext(path, default='').replace(' ', '') for path in field_paths)
        for citation in root.findall('.//us-references-cited/us-citation')
    ]
    # Add current patent number for forward references
    tokens.append(str(int(root.findall('.//doc-number')[0].text)))
    return ' '.join(tokens).strip(' ')

In [5]:
def get_classifications_ipc(doc: str) -> str:
    '''Get the IPC classifications of a patent xml document.

    Each classification-ipcr element yields one code: the concatenation of its
    section, class, subclass, main-group and subgroup texts. A field that is
    absent contributes an empty string (the same convention get_references
    uses) instead of making the join fail on None.
    Parameter
        doc: xml document string
    Returns
        Patent classification codes, as a space separated string
    '''
    field_paths = ('.//section', './/class', './/subclass', './/main-group', './/subgroup')
    root = ET.fromstring(doc)
    codes = [
        # default='' keeps findtext from returning None for a missing field
        ''.join(ipc.findtext(path, default='') for path in field_paths)
        for ipc in root.findall('.//classification-ipcr')
    ]
    return ' '.join(codes).strip(' ')

In [6]:
def metadata2files(infile: str, outfile: List[str]) -> None:
    '''Append the metadata of every document in an archive to three text files.

    One line per document is written to each file, so line N of every file
    refers to the same document. The files are opened in append mode: calling
    this again for the same archive adds the same lines a second time.
    Parameter
        infile: archive path handed to get_archivexml, which yields xml document strings
        outfile: output paths; [0] receives patent numbers, [1] references,
                 [2] IPC classifications
    Returns
        None
    '''
    # Resolve the paths first so a too-short list fails before any file is opened.
    numbers_path, references_path, classes_path = outfile[0], outfile[1], outfile[2]
    with open(numbers_path, 'a', encoding='utf-8') as of0, \
            open(references_path, 'a', encoding='utf-8') as of1, \
            open(classes_path, 'a', encoding='utf-8') as of2:
        for doc in get_archivexml(infile):
            # Extract everything before writing anything: if an extractor raises,
            # no file ends up one line ahead of the others.
            number = get_patentnumbers(doc)
            references = get_references(doc)
            classes = get_classifications_ipc(doc)
            of0.write(number + '\n')
            of1.write(references + '\n')
            of2.write(classes + '\n')

In [7]:
# Process every archive that file_find reports for the 'utility*.tar.gz' pattern under TOP.
# Order matters: metadata2files expects patent numbers, references, classifications.
output_paths = [PATENTNUMFILE, REFERENCEFILE, CLASSCODEFILE]
for archive_path in file_find('utility*.tar.gz', TOP):
    metadata2files(archive_path, output_paths)