In [2]:
import os

from lxml import etree

from parsers.pdf_parser import parse_pdf
#from indexer.indexer import index_patent
#from analyzer.analyzer import analyze_patent

In [None]:
from split_xml import process_xml_file
# Split the bulk XML file holding many patents into individual XML files.
# NOTE(review): the output location is decided inside split_xml.process_xml_file;
# presumably it writes the xml_patents/split_<n>.xml files read further down — confirm.
process_xml_file('patents/ipg071120.xml')

In [67]:
def extract_text(element):
    """Collect the text directly attached to ``element`` as one string.

    The element's own leading text comes first. For every direct child, the
    child's text is kept only when the child is a ``claim-ref``; the child's
    tail (text following its closing tag) is kept whenever it is non-empty.
    Each fragment is stripped, empty fragments are dropped, and the rest are
    joined with single spaces.
    """
    fragments = [(element.text or "").strip()]
    for node in element:
        if node.tag == 'claim-ref':
            fragments.append((node.text or "").strip())
        tail = node.tail
        if tail:
            fragments.append(tail.strip())
    return ' '.join(fragment for fragment in fragments if fragment)

def print_claim_texts(element, level=0):
    """Print every ``claim-text`` found at or below ``element``, indented by depth.

    Elements are visited in document order (parent before children). Each
    nesting step adds two spaces of indentation, counted from ``level`` for
    ``element`` itself; elements that are not ``claim-text`` print nothing but
    still add a level for their descendants.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped (and printed) in their original order.
    pending = [(element, level)]
    while pending:
        node, depth = pending.pop()
        if node.tag == 'claim-text':
            print("  " * depth + extract_text(node))
        pending.extend((child, depth + 1) for child in reversed(list(node)))

def print_tag_text(tag_name, root):
    """Print ``"<tag_name>: <text>"`` for each descendant of ``root`` with that tag.

    Descendants whose text is missing or empty are skipped; printed text is
    stripped of surrounding whitespace.
    """
    matches = root.findall(f'.//{tag_name}')
    for text in (node.text for node in matches):
        if not text:
            continue
        print(f"{tag_name}: {text.strip()}")

def print_citation_doc_numbers(root):
    """Print the document number of every patent citation under ``root``.

    Citations are numbered from 1 in the order the XPath query returns them.
    A ``doc-number`` element without text is printed with an empty number
    instead of raising, so one malformed citation no longer hides the rest.
    """
    doc_numbers = root.xpath('.//citation//patcit//document-id//doc-number')
    for idx, doc_number in enumerate(doc_numbers, 1):
        # .text is None for an empty element; fall back to "" before stripping.
        number = (doc_number.text or "").strip()
        print(f"Citation {idx}: Document Number {number}")

def main(xml_path='xml_patents/split_1291.xml'):
    """Print a plain-text summary of a single patent XML file.

    Prints, in order: title, number, abstract and US main classification;
    the international classification fields; every claim with nested
    ``claim-text`` indentation; and the cited document numbers.

    Args:
        xml_path: Path of the patent XML file to read. Defaults to the file
            this cell was originally written against.

    Returns:
        None. Problems are reported with ``print`` rather than raised.
    """
    try:
        with open(xml_path, 'rb') as file:
            tree = etree.parse(file, etree.XMLParser(recover=True))
        root = tree.getroot()

    except Exception as e:
        print(f"Error parsing XML: {e}")
        return

    # NOTE(review): a recovering parser can apparently hand back a tree with no
    # root for unusable input; guard so the lookups below do not fail on None.
    if root is None:
        print("Error parsing XML: no root element found.")
        return

    patent_details = {
        "Patent Name": './/invention-title',
        "Patent Number": './/doc-number',
        "Patent Abstract": './/abstract/p',
        "Patent U.S. CI. Main classification": './/main-classification'
    }

    for detail, xpath in patent_details.items():
        # Only the first match of each path is reported.
        element = root.find(xpath)
        if element is not None and element.text:
            print(f"{detail}: {element.text.strip()}")
        else:
            print(f"No {detail.lower()} found.")

    print("Patent International Classification:")
    for tag in [
        'classification-level', 'section', 'class', 'subclass',
        'main-group', 'subgroup', 'further-classification'
    ]:
        print_tag_text(tag, root)

    for claim in root.findall('.//claim'):
        print(f"Claim {claim.get('num')}:")
        print_claim_texts(claim)
        print()  # New line between claims

    try:
        print_citation_doc_numbers(root)

    except Exception as e:
        # Parsing already succeeded at this point, so name the real stage.
        print(f"Error reading citations: {e}")

if __name__ == "__main__":
    main()

Patent Name: Spring lock interface engagement system
Patent Number: 07297014
Patent Abstract: An engagement system having a spring lock design to initially attach the two halves of the system together. After which the use of a multi start Acme lead screw provides a, consistent, low torque means of engagement. Although the engager requires the use of lubrication, to be applied initially, the lube is contained in a sealed environment. This eliminates the risk of contamination to electrical components and eliminates the need to apply additional lube during the products life. Because the present invention allows the ACME threads to be “meshed” at all times, it also eliminates the possibly of cross-threading.
Patent U.S. CI. Main classification: 439372
Patent International Classification:
classification-level: A
section: H
class: 01
subclass: R
main-group: 13
subgroup: 62
further-classification: 439953
Claim 00001:
  1. An interface device comprising:
    a test adapter comprising:
      a 

In [81]:
from preprocess import extract_and_convert_to_json, json_to_patent
import json

patent_number = '07297014'

# Convert one patent XML file; the helper returns the JSON text together with
# an object representing the same patent.
json_data, patent_as_class = extract_and_convert_to_json(f'xml_patents_wn/{patent_number}.xml')
data_dict = json.loads(json_data)

# Persist the converted patent, then read it back to confirm the file is valid JSON.
with open(f'json_patents/{patent_number}.json', 'w') as file:
    json.dump(data_dict, file, indent=4)

with open(f'json_patents/{patent_number}.json', 'r') as file:
    data = json.load(file)

# `patent` is used by this and later cells but was never assigned, so a fresh
# kernel failed with NameError. Bind it to the object returned above.
# NOTE(review): json_to_patent(data) may have been the intended source — confirm
# that both expose .claims/.abstract/.title/.document_number.
patent = patent_as_class

# Preview only the first claim to keep the cell output short.
claims_subset = patent.claims[:1]
print(json.dumps(claims_subset, indent=4))

[
    {
        "claim_number": "CLM-00001",
        "preamble": "1. An interface device comprising:",
        "components": [
            {
                "component": "a test adapter comprising:",
                "subcomponents": [
                    "a frame; and",
                    "an engagement mechanism mounted to said frame, said engagement mechanism comprising:",
                    "a spring lock nut;",
                    "a lead screw drive comprising a housing having interior threads, an Acme lead screw in said housing, and a spring lock pin having at least one retractable locking tab; and",
                    "a handle connected to said spring lock nut; and"
                ]
            },
            {
                "component": "a frame; and",
                "subcomponents": []
            },
            {
                "component": "an engagement mechanism mounted to said frame, said engagement mechanism comprising:",
                "subcomponents": [
     

In [84]:
def convert_all_xml_to_json(xml_directory, json_directory):
    """Convert every ``*.xml`` file in ``xml_directory`` to a JSON file.

    Each output is written to ``json_directory`` (created if missing) under
    the input's file name with the ``.xml`` extension replaced by ``.json``.
    Files for which conversion raises ``ValueError`` are reported and skipped.

    Args:
        xml_directory: Directory scanned (non-recursively) for ``.xml`` files.
        json_directory: Directory receiving the ``.json`` files.
    """
    os.makedirs(json_directory, exist_ok=True)

    for file_name in os.listdir(xml_directory):
        if not file_name.endswith('.xml'):
            continue

        # splitext drops only the final extension, so a name such as
        # "US.123.xml" keeps its full stem instead of collapsing to "US".
        patent_number = os.path.splitext(file_name)[0]
        try:
            # Two-value unpacking is kept on purpose: a malformed return
            # value surfaces here as ValueError and the file is skipped.
            json_data, _ = extract_and_convert_to_json(
                os.path.join(xml_directory, file_name)
            )
            data_dict = json.loads(json_data)
        except ValueError:
            print(f"Skipping {file_name} due to unpacking error.")
            continue

        with open(os.path.join(json_directory, f'{patent_number}.json'), 'w') as file:
            json.dump(data_dict, file, indent=4)

# Input folder of per-patent XML files and output folder for the JSON files.
# Both are relative to the notebook's working directory.
xml_directory = 'xml_patents_wn'
json_directory = 'json_patents'

# Batch-convert every XML file; files that fail are reported and skipped.
convert_all_xml_to_json(xml_directory, json_directory)

Error processing XML: 'NoneType' object has no attribute 'strip'
Skipping RE039923.xml due to unpacking error.


In [79]:
import re
import json
from itertools import islice

def clean_text(text, patterns):
    """Remove every regex in ``patterns`` from ``text`` and trim the result.

    Patterns are applied in order, each to the output of the previous one.
    Surrounding whitespace is stripped before the first and after every
    substitution: anchored patterns (``^...`` / ``...$``) then match
    reliably, and removing a trailing phrase no longer leaves a dangling
    space in the generated questions.

    Args:
        text: The string to clean.
        patterns: Iterable of regular expressions to delete.

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    text = text.strip()
    for pattern in patterns:
        text = re.sub(pattern, '', text).strip()
    return text

# Regex patterns stripped from claim text before it is placed in a question.
# The article alternatives all end in a space: a bare "A" would also eat the
# first letter of any term starting with a capital A (e.g. "Acme" -> "cme").
patterns_preamble = [r'^\d+\.\s*', r'comprising:$', r'^(a |an |An |A )']
patterns_component = [r'comprising:$', r'; and$', r'^(a |an |An |A )']
patterns_subcomponent = [r'^(comprising\s+|a\s+|an\s+|\d+\s+)*', r'comprising:$', r'; and$', r'^(a |an |An |A )']

def _build_questions(claims):
    """Build the nested question dictionary for a list of claim dicts.

    The result is keyed by the RAW preamble text, then the raw component
    text, then the raw subcomponent text; each leaf is a list of four
    question strings built from the cleaned texts. An empty-string key
    stands in for a missing level (a claim without components, or a
    component without subcomponents).

    Existing entries are looked up by the same raw keys they are stored
    under, so a repeated preamble or component extends the existing entry
    instead of wiping the questions collected so far.
    """
    questions = {}
    for claim in claims:
        preamble_key = claim["preamble"]
        preamble = clean_text(preamble_key, patterns_preamble)
        by_component = questions.setdefault(preamble_key, {})

        components = claim.get("components", [])
        if not components:
            # If only a preamble exists
            by_component.setdefault('', {
                '': [f"What is the role of {preamble}?",
                     f"How is {preamble} utilized?",
                     f"What outcomes are produced by {preamble}?",
                     f"What alternative components can substitute {preamble}?"]
            })
            continue

        for component_data in components:
            component_key = component_data["component"]
            component = clean_text(component_key, patterns_component)
            by_subcomponent = by_component.setdefault(component_key, {})

            subcomponents = component_data.get("subcomponents", [])
            if not subcomponents:
                by_subcomponent.setdefault('', [
                    f"What is the role of the {component} of the {preamble}?",
                    f"How is the {component} of the {preamble} utilized?",
                    f"What outcomes are produced by the {component} of the {preamble}?",
                    f"What alternative components can substitute the {component} of the {preamble}?"
                ])
                continue

            for subcomponent_key in subcomponents:
                # Add questions only if none exist yet for this subcomponent
                if by_subcomponent.get(subcomponent_key):
                    continue
                subcomponent = clean_text(subcomponent_key, patterns_subcomponent)
                by_subcomponent[subcomponent_key] = [
                    f"What is the role of the {subcomponent} in the {component} of the {preamble}?",
                    f"How is the {subcomponent} utilized within the {component} of the {preamble}?",
                    f"What outcomes are produced by {subcomponent} in the {component} of the {preamble}?",
                    f"What alternative components can substitute {subcomponent} in the {component} of the {preamble}?"
                ]
    return questions


# Build the questions dictionary from the claims of the current patent
claims = patent.claims
questions_dict = _build_questions(claims)

# Preview: despite the variable name, only the first entry is printed.
first_three_items = dict(islice(questions_dict.items(), 1))
print(json.dumps(first_three_items, indent=4))

{
    "1. An interface device comprising:": {
        "a test adapter comprising:": {
            "a frame; and": [
                "What is the role of the frame in the test adapter  of the interface device ?",
                "How is the frame utilized within the test adapter  of the interface device ?",
                "What outcomes are produced by frame in the test adapter  of the interface device ?",
                "What alternative components can substitute frame in the test adapter  of the interface device ?"
            ],
            "an engagement mechanism mounted to said frame, said engagement mechanism comprising:": [
                "What is the role of the engagement mechanism mounted to said frame, said engagement mechanism  in the test adapter  of the interface device ?",
                "How is the engagement mechanism mounted to said frame, said engagement mechanism  utilized within the test adapter  of the interface device ?",
                "What outcomes are pr

In [68]:
import os
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential

# Read the API key from the environment. The variable must be set outside the
# notebook (shell, .env loader, secrets manager): assigning it here would both
# overwrite a real key with a blank one and invite committing a secret.
api_key = os.getenv("AZURE_INFERENCE_CREDENTIAL", '')
if not api_key:
    raise Exception("A key should be provided to invoke the endpoint")

# Chat-completions client shared by the query functions in this notebook.
client = ChatCompletionsClient(
    endpoint='https://Phi-3-5-MoE-instruct-gaqxj.eastus2.models.ai.azure.com',
    credential=AzureKeyCredential(api_key)
)

def structure_to_string(data):
    """Render a list of claim dicts as indented plain text.

    Each claim becomes its claim number and preamble on separate lines,
    followed by its components (two-space indent) and their subcomponents
    (four-space indent). Every claim ends with a newline and claims are
    separated by one blank line.

    ``components`` and ``subcomponents`` are optional: a missing or empty
    value contributes no lines instead of raising ``KeyError``.

    Args:
        data: Iterable of dicts with ``claim_number`` and ``preamble`` keys.

    Returns:
        The rendered text, or ``""`` when ``data`` is empty.
    """
    claim_strings = []
    for claim in data:
        lines = [f"{claim['claim_number']}", f"{claim['preamble']}"]
        for component in claim.get('components') or []:
            lines.append(f"  {component['component']}")
            lines.extend(
                f"    {subcomponent}"
                for subcomponent in component.get('subcomponents') or []
            )
        claim_strings.append("\n".join(lines) + "\n")
    return "\n".join(claim_strings)

def generate_answers(abstract, patent_title, claims, questions):
    """Ask the chat model each question about a patent and collect the replies.

    Every question is sent in its own request, preceded by the same system
    instruction and two worked question/answer examples (few-shot prompting).
    The module-level ``client`` is used to send the requests.

    Args:
        abstract: Patent abstract, inserted into each user message.
        patent_title: Patent title, inserted into each user message.
        claims: Accepted for interface compatibility; it is currently not
            included in the prompt.
        questions: A list of question strings. A single string is treated as
            one question rather than being iterated character by character.

    Returns:
        A list with one answer string per question, in the same order.
    """
    # Without this guard a bare string would trigger one request per letter.
    if isinstance(questions, str):
        questions = [questions]

    question_example = "What is the role of a guide plate selectively securable in a position adjacent to a first electronic connector, the guide plate including at least one linear slot of the apparatus?"
    answer_example = "The goal is to select and secure the electric circuit board in a designated location."
    question_example1 = "What alternative components can substitute the guide plate selectively securable in a position adjacent to a first electronic connector, the guide plate including at least one linear slot of the apparatus?"
    answer_example1 = "Alternative componets may include adjustable brackets, spring clips, guide rails, magnetic strips, or standarized slots or holes."

    # System instruction plus the two worked examples, shared by every request.
    initial_messages = [
        {
            "role": "system",
            "content": "You are a patent expert and answer questions according to the doctrine of equivalents in a brief and direct manner in less than one hundred words."
        },
        {"role": "user", "content": question_example},
        {"role": "assistant", "content": answer_example},
        {"role": "user", "content": question_example1},
        {"role": "assistant", "content": answer_example1},
    ]

    # Generation settings are identical for every question, so build them once.
    generation_settings = {
        "max_tokens": 100,
        "temperature": 0.5,
        "top_p": 0.1,
        "presence_penalty": 0.5,
        "frequency_penalty": 0.5
    }

    answers = []
    for question in questions:
        messages = initial_messages + [
            {
                "role": "user",
                "content": f"The patent title is: {patent_title}. The patent abstract is: {abstract}. {question}"
            }
        ]
        payload = {"messages": messages, **generation_settings}
        response = client.complete(payload)
        answers.append(response.choices[0].message.content)
    return answers


def query_patent(patent, questions):
    """Answer every question in a nested question dictionary for one patent.

    ``questions`` maps preamble -> component -> subcomponent -> list of
    question strings. The returned dictionary mirrors that nesting with each
    question list replaced by the list of answers. When a level is empty the
    key text itself is asked as a single question and its answers are stored
    directly at that level.

    Args:
        patent: Object exposing ``claims``, ``abstract`` and ``title``.
        questions: Nested dict as described above.

    Returns:
        Nested dict of answer lists with the same keys as ``questions``.
    """
    abstract = patent.abstract
    patent_title = patent.title
    # The rendered claims are the same for every request; build them once.
    claims_text = structure_to_string(patent.claims)

    def ask(question_list):
        return generate_answers(abstract, patent_title, claims_text, question_list)

    answers = {}
    for preamble, components in questions.items():
        if not components:
            # No components: ask the preamble itself. It is wrapped in a list
            # so it is sent as ONE question, not split into characters.
            answers[preamble] = ask([preamble])
            continue

        answers[preamble] = {}
        for component, subcomponents in components.items():
            if not subcomponents:
                # No subcomponents: ask the component itself, as one question.
                answers[preamble][component] = ask([component])
                continue

            answers[preamble][component] = {
                subcomponent: ask(subcomponent_questions)
                for subcomponent, subcomponent_questions in subcomponents.items()
            }

    return answers

# Answer every generated claim/component/subcomponent question for the current
# patent and save the nested answers, one file per patent document number.
# NOTE(review): the question_feature/ directory is not created here; it must
# already exist or open() will fail.
answers = query_patent(patent, questions_dict)
with open(f'question_feature/{patent.document_number}.json', 'w') as file:
    json.dump(answers, file, indent=4)

In [71]:
from general_patent_query import query_patent_general

# Patent-level questions (function / way / result / alternatives) asked about
# the patent as a whole rather than about individual claim elements.
patent_questions = ["What is the essential function of the product or process?",
                    "How does the product or process operate?",
                    "What results does this product or process achieve?",
                    "What ingredients, materials, or processes are alternatives that can work in the same way?"
                   ]

# NOTE(review): this rebinds `answers`, which the previous cell used for the
# nested per-claim answers; here the printed value is a flat list of strings.
answers = query_patent_general(client, patent, patent_questions)

# Bundle the patent's text together with the answers so the saved file is
# self-contained.
combined_data = {
    "title": patent.title,
    "abstract": patent.abstract,
    "claims": patent.claims,
    "answers": answers
}

# Write the combined data to a JSON file
# NOTE(review): the question_patent/ directory is not created here; it must
# already exist or open() will fail.
with open(f'question_patent/{patent.document_number}.json', 'w') as file:
    json.dump(combined_data, file, indent=4)

print(answers)

[' The essential function of the patented spring lock interface engagement system is to securely attach two components, typically a test adapter and a receiver, using a spring lock nut and an Acme lead screw mechanism. This system ensures precise alignment, consistent low torque engagement, and protection against contamination for electrical components.', ' The patented spring lock interface engagement system operates by initially attaching a test adapter and receiver using a spring lock nut and an Acme lead screw mechanism. The handle connected to the spring lock nut is turned, causing the lead screw to draw the adapter and receiver together. This ensures precise alignment, consistent low torque engagement, and protection against contamination for electrical components. Alternative materials or processes could include adjustable brackets, magnetic strips, or standardized', ' The patented spring lock interface engagement system securely attaches two components, such as a test adapter a