In [47]:
from dotenv import load_dotenv

# Load variables from the .env file three directories up (path is relative to the
# current working directory). Presumably this is what supplies ANTHROPIC_API_KEY,
# which GPatentEngine later reads from os.environ -- confirm against the .env file.
load_dotenv("../../../.env")

True

In [63]:
from anthropic import Anthropic
from bs4 import BeautifulSoup
from functools import partial
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from typing import Any

import os
import re
import requests

# Logging hook: every `log(...)` call in this file currently goes straight to print.
log = print


class GPatentEngine:
    """Search for patents related to an idea and keep those an LLM flags as possible prior art.

    Candidates are collected with a headless Chrome session from two sources
    (the Google Patents search page and a DuckDuckGo ``site:patents.google.com``
    query), de-duplicated, and then screened one by one with Claude.

    The instance owns a Chrome WebDriver. Release it with ``close()`` or by
    using the engine as a context manager; calling ``engine.driver.quit()``
    directly keeps working as before.
    """

    # Captures the path segment that follows "/patent/" in a Google Patents URL.
    # It stops at "/", "?" or "#", so a missing language suffix, a trailing
    # slash or a query string cannot change what is captured.
    _PATENT_ID_RE = re.compile(r"/patent/([^/?#]+)")

    # Upper bound, in seconds, for the HTTP request that downloads a patent page.
    _REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self):
        """Create the Anthropic client and start a headless Chrome session."""
        # Build the API client first: if its construction fails, no browser
        # process has been started yet that would need cleaning up.
        self.client = Anthropic(
            api_key=os.environ.get("ANTHROPIC_API_KEY"),  # This is the default and can be omitted
        )

        # Set up the Chrome WebDriver
        options = Options()
        options.add_argument("--headless=new")
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(driver=self.driver, timeout=10)

    def close(self) -> None:
        """Quit the Chrome WebDriver session owned by this engine."""
        self.driver.quit()

    def __enter__(self) -> "GPatentEngine":
        """Return the engine itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Quit the browser when the ``with`` block ends, whether or not it raised."""
        self.close()

    def _selenium_patent_search(self,
                                destination,
                                wait_fn,
                                fetch_fn,
                                process_fn):
        """Drive one search page and feed every result element to ``process_fn``.

        Args:
            destination: URL to open in the browser.
            wait_fn: Zero-argument callable invoked once after the page is
                opened; the callers use it to submit the query and wait for
                the first results.
            fetch_fn: Zero-argument callable returning the list of result
                elements currently present on the page.
            process_fn: Callable invoked with each newly seen result element.
                Exceptions it raises are logged and do not stop the loop.
        """
        self.driver.get(destination)
        wait_fn()

        # Parse through search results as they load
        previous_count = 0
        while True:
            # Get all currently loaded search result elements
            results = fetch_fn()

            # No new elements since the last pass: we're done
            if len(results) == previous_count:
                break

            # Process newly loaded elements only
            for result in results[previous_count:]:
                try:
                    process_fn(result)
                except Exception as e:
                    # Best effort: one unparsable result must not abort the search
                    log(f"Encountered error when parsing patent results: {e}")

            previous_count = len(results)

            # Scroll to bottom to trigger more results [optional]
            # NOTE(review): the next pass re-fetches immediately, without waiting
            # for anything the scroll may load, so in practice the loop probably
            # ends after the first batch -- confirm whether that is intended.
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    def _patent_direct_search(self, query: str) -> list[str]:
        """Search patents.google.com for ``query`` and return the patent IDs found.

        IDs are taken from the second "/"-separated part of each result's
        ``data-result`` attribute. The list may contain duplicates.
        """
        target = "https://patents.google.com/"

        def _wait_for_search_box(driver, wait):
            wait.until(EC.presence_of_element_located((By.NAME, "q")))
            search_box = driver.find_element(By.NAME, "q")  # the input box uses name="q"
            # Execute the search
            search_box.send_keys(query)
            search_box.send_keys(Keys.RETURN)
            # Wait for the first batch to load
            wait.until(EC.presence_of_element_located((By.XPATH, "//article[contains(@class, 'search-result-item')]/following::a[1]")))

        def _fetch_results(driver) -> list[Any]:
            return driver.find_elements(By.XPATH, "//state-modifier[contains(@class, 'search-result-item')]")

        patents = []

        def _process_fn(result) -> None:
            patents.append(result.get_attribute('data-result').split("/")[1])

        self._selenium_patent_search(destination=target,
                                     wait_fn=partial(_wait_for_search_box, driver=self.driver, wait=self.wait),
                                     fetch_fn=partial(_fetch_results, driver=self.driver),
                                     process_fn=_process_fn)

        return patents

    @classmethod
    def _extract_patent_id(cls, url: str) -> "str | None":
        """Return the patent ID from a Google Patents URL, or ``None`` if absent.

        Works for ``.../patent/<id>``, ``.../patent/<id>/en`` and URLs that
        carry a query string or fragment.
        """
        match = cls._PATENT_ID_RE.search(url)
        return match.group(1) if match else None

    def _patent_internet_search(self, query: str) -> list[str]:
        """Search DuckDuckGo for ``query`` restricted to patents.google.com.

        Returns the patent IDs extracted from result links that point at
        ``https://patents.google.com``. The list may contain duplicates.
        """
        target = "https://www.duckduckgo.com/"

        def _wait_for_search_box(driver, wait):
            wait.until(EC.presence_of_element_located((By.NAME, "q")))
            search_box = driver.find_element(By.NAME, "q")  # the input box uses name="q"
            # Execute the search
            search_box.send_keys(f"{query} site:patents.google.com")
            search_box.send_keys(Keys.RETURN)
            # Wait for the first batch to load
            wait.until(EC.presence_of_element_located((By.XPATH, "//article[@data-nrn='result']")))

        def _fetch_results(driver) -> list[Any]:
            return driver.find_elements(By.XPATH, "//article[@data-nrn='result']//a")

        patents = []

        def _process_fn(result) -> None:
            link_value = result.get_attribute("href")
            if link_value and link_value.startswith("https://patents.google.com"):
                patent = self._extract_patent_id(link_value)
                if patent is not None:
                    patents.append(patent)

        self._selenium_patent_search(destination=target,
                                     wait_fn=partial(_wait_for_search_box, driver=self.driver, wait=self.wait),
                                     fetch_fn=partial(_fetch_results, driver=self.driver),
                                     process_fn=_process_fn)
        return patents

    def search(self, query: str) -> list[str]:
        """Return IDs of patents that could be prior art for ``query``.

        Runs the direct search, then the internet search, removes duplicates
        and keeps every candidate for which ``is_prior_art`` returns True.
        Results are in first-seen order (direct-search hits first), so the
        output ordering is stable from run to run.
        """
        candidates = self._patent_direct_search(query) + self._patent_internet_search(query)
        # dict.fromkeys de-duplicates while preserving insertion order
        unique_candidates = dict.fromkeys(candidates)

        return [patent_id for patent_id in unique_candidates if self.is_prior_art(query, patent_id)]

    @staticmethod
    def _parse_verdict(raw_output: str) -> bool:
        """Interpret the model's reply; only a clear "false" rejects a patent.

        Surrounding whitespace, quotes, markdown emphasis and trailing
        punctuation are ignored, so ``"False."`` counts as a rejection.
        Anything else, including empty or malformed output, returns True so
        that a candidate is never dropped because of an unreadable reply.
        """
        normalized = raw_output.strip().strip("\"'`*.!").strip().lower()
        return normalized != 'false'

    def is_prior_art(self, idea, patent_id):
        """Ask Claude whether patent ``patent_id`` could be prior art for ``idea``.

        Downloads the patent's abstract and claims, sends them with the idea
        to the model, and returns False only when the model clearly answers
        "false" (see ``_parse_verdict``).
        """
        patent_claim_dict = self.get_patent_claims(patent_id)
        claude_output = self.client.messages.create(
            system="You are an assistant for a patent law firm helping a client do prior art discovery for a patent they are interested in pursuing. Given their proposed idea and a description of another patent someone in the firm found during discovery, determine whether the patent *could* constitute prior art for the idea. Return either true or false only.",
            messages=[
                {
                    "role": "user",
                    "content": f"IDEA: {idea}\nPATENT: {patent_claim_dict}",
                }
            ],
            model="claude-3-7-sonnet-latest",
            max_tokens=8192,
        )

        raw_output = claude_output.content[0].text
        # Don't omit if output is malformed, so != false rather than == true
        return self._parse_verdict(raw_output)

    @staticmethod
    def get_patent_claims(patent_id) -> dict[str, Any]:
        """Download a patent page from Google Patents and extract its text.

        Returns:
            A dict with ``"abstract"`` (str, empty when no abstract element is
            found) and ``"claims"`` (list of str, one per ``claim-text``
            element, possibly empty).

        The HTTP status is not checked, so an error page simply yields empty
        fields. The request itself is bounded by ``_REQUEST_TIMEOUT_SECONDS``
        and raises a ``requests`` exception instead of hanging indefinitely.
        """
        url = f"https://patents.google.com/patent/{patent_id}/en"
        resp = requests.get(url, timeout=GPatentEngine._REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(resp.text, "html.parser")

        return {
            "abstract": getattr(soup.find('div', class_='abstract'), "text", ""),
            "claims": [claim.text for claim in soup.find_all(class_='claim-text')]
        }

In [64]:
# Run one end-to-end search. The finally clause shuts the browser down even if
# the search raises.
engine = GPatentEngine()
try:
    patents = engine.search("diaper for birds")
finally:
    # Ends the WebDriver session created in GPatentEngine.__init__
    engine.driver.quit()

# Patent IDs for which is_prior_art() returned True for the query above
print(patents)

['US5934226A', 'US9468565B2', 'US20120037094A1', 'CN201472694U', 'JPH01215229A', 'JP2015112093A', 'CN107047454B', 'ES2592324A1']


In [62]:
# import requests
# from bs4 import BeautifulSoup

# query = "diaper for birds"
# patent = "US5934226A"

# url = "https://patents.google.com/patent/US5934226A/en"
# resp = requests.get(url)

# def _get_patent_text(patent_id) -> dict[str, str]:
#     url = f"https://patents.google.com/patent/{patent_id}/en"
#     resp = requests.get(url)
#     soup = BeautifulSoup(resp.text, "html.parser")

#     return {
#         "abstract": soup.find('div', class_='abstract').text,
#         "claims": [claim.text for claim in soup.find_all(class_='claim-text')]
#     }


# def is_prior_art(query, patent_text):
#     claude_output = client.messages.create(
#         system="You are an assistant for a patent law firm helping a client do prior art discovery for a patent they are interested in pursuing. Given their proposed idea and a description of another patent someone in the firm found during discovery, determine whether the patent *could* constitute prior art for the idea. Return either true or false only.",
#         messages=[
#             # {
#             #     "role": "system",
#             #     "content": "You are an assistant for a patent law firm helping a client do prior art discovery for a patent they are interested in pursuing. Given their proposed idea and a description of another patent someone in the firm found during discovery, determine whether the patent *could* constitute prior art for the idea. Return either true or false only.",
#             # },
#             {
#                 "role": "user",
#                 "content": f"IDEA: {query}\nPATENT: {patent_text}",
#             }
#         ],
#         model="claude-3-7-sonnet-latest",
#         max_tokens=8192,
#     )

#     return claude_output.content[0].text

# is_prior_art("automatic water bottle opener", _get_patent_text(patent))