In [68]:
import requests

res = requests.get('https://patents.google.com/patent/US12178887B2/en')


In [None]:
with open('test.html', 'w', encoding='utf-8') as f:
    f.write(res.text)


In [None]:

@dataclass
class PatentClaim:
    """Class for storing patent claim information"""
    number: int
    text: str
    dependent_on: Optional[int] = None

@dataclass
class PatentData:
    """Class for storing comprehensive patent information"""
    patent_number: str = ""
    title: str = ""
    assignees: List[str] = field(default_factory=list)
    inventors: List[str] = field(default_factory=list)
    priority_date: str = ""
    filing_date: str = ""
    publication_date: str = ""
    grant_date: str = ""
    abstract: str = ""
    description: str = ""
    claims: List[PatentClaim] = field(default_factory=list)
    
    def __str__(self) -> str:
        """String representation for printing"""
        return (
            f"Patent Number: {self.patent_number}\n"
            f"Title: {self.title}\n"
            f"Assignees: {', '.join(self.assignees)}\n"
            f"Inventors: {', '.join(self.inventors)}\n"
            f"Priority Date: {self.priority_date}\n"
            f"Filing Date: {self.filing_date}\n"
            f"Publication Date: {self.publication_date}\n"
            f"Grant Date: {self.grant_date}\n"
            f"Abstract: {self.abstract[:100]}...\n"
            f"Description: {self.description[:100]}...\n"
            f"Claims: {len(self.claims)} claims found"
        )

def extract_patent_data(html_file: str) -> PatentData:
    """
    Extract patent information from HTML file using BeautifulSoup
    
    :param html_file: Path to the HTML file
    :return: PatentData object with extracted information
    """
    # Read the HTML file
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()
    
    # Parse with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Initialize patent data
    patent_data = PatentData()
    
    # Extract patent number/ID
    patent_number_elem = soup.find('span', {'itemprop': 'publicationNumber'})
    if patent_number_elem:
        patent_data.patent_number = patent_number_elem.text.strip()
    else:
        # Try alternative selectors
        patent_number_elem = soup.find('meta', {'name': 'citation_patent_number'})
        if patent_number_elem:
            patent_data.patent_number = patent_number_elem.get('content', '')
    
    # Extract title
    title_elem = soup.find('span', {'itemprop': 'title'})
    if title_elem:
        patent_data.title = title_elem.text.strip()
    else:
        # Try alternative selectors
        title_elem = soup.find('meta', {'name': 'citation_title'})
        if title_elem:
            patent_data.title = title_elem.get('content', '')
    
    # Extract assignees
    assignee_elems = soup.find_all('span', {'itemprop': 'assignee'})
    for elem in assignee_elems:
        assignee_name = elem.text.strip()
        if assignee_name:
            patent_data.assignees.append(assignee_name)
    
    # Extract inventors
    inventor_elems = soup.find_all('span', {'itemprop': 'inventor'})
    for elem in inventor_elems:
        inventor_name = elem.text.strip()
        if inventor_name:
            patent_data.inventors.append(inventor_name)
    
    # Extract dates
    # Look for specific date elements
    for date_type in ['priority', 'filing', 'publication', 'grant']:
        date_elem = soup.find('dd', {'itemprop': f'{date_type}Date'})
        if date_elem:
            setattr(patent_data, f"{date_type}_date", date_elem.text.strip())
    
    # Extract abstract
    abstract_elem = soup.find('div', {'class': 'abstract'})
    if abstract_elem:
        patent_data.abstract = abstract_elem.text.strip()
    else:
        # Try alternative selectors
        abstract_elem = soup.find('section', {'itemprop': 'abstract'})
        if abstract_elem:
            patent_data.abstract = abstract_elem.text.strip()
    
    # Extract description
    description_elem = soup.find('div', {'class': 'description'})
    if description_elem:
        patent_data.description = description_elem.text.strip()
    else:
        # Try alternative selectors
        description_elem = soup.find('section', {'itemprop': 'description'})
        if description_elem:
            patent_data.description = description_elem.text.strip()
    
    # Extract claims
    claims_section = soup.find('section', {'itemprop': 'claims'})
    if claims_section:
        claim_elements = claims_section.find_all(['div', 'p'], {'class': 'claim'})
        if not claim_elements:
            claim_elements = claims_section.find_all('li')
        
        for i, elem in enumerate(claim_elements, 1):
            claim_text = elem.text.strip()
            
            # Try to extract claim number
            num_span = elem.find('span', {'class': 'claim-number'})
            claim_num = int(num_span.text.strip().replace('.', '')) if num_span else i
            
            # Check for dependency
            dependent_on = None
            if claim_num > 1 and "claim" in claim_text.lower():
                import re
                dep_match = re.search(r'(?:according to|as claimed in|of)\s+claim\s+(\d+)', claim_text.lower())
                if dep_match:
                    dependent_on = int(dep_match.group(1))
            
            patent_data.claims.append(PatentClaim(
                number=claim_num,
                text=claim_text,
                dependent_on=dependent_on
            ))
    
    return patent_data

def save_to_file(patent_data: PatentData, output_file: str) -> None:
    """
    Save extracted patent data to a text file
    
    :param patent_data: PatentData object with patent information
    :param output_file: Path to the output file
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"Patent Number: {patent_data.patent_number}\n")
        f.write(f"Title: {patent_data.title}\n")
        
        f.write("\n===== DATES =====\n")
        f.write(f"Priority Date: {patent_data.priority_date}\n")
        f.write(f"Filing Date: {patent_data.filing_date}\n")
        f.write(f"Publication Date: {patent_data.publication_date}\n")
        f.write(f"Grant Date: {patent_data.grant_date}\n")
        
        f.write("\n===== ASSIGNEES =====\n")
        for assignee in patent_data.assignees:
            f.write(f"{assignee}\n")
        
        f.write("\n===== INVENTORS =====\n")
        for inventor in patent_data.inventors:
            f.write(f"{inventor}\n")
        
        f.write("\n===== ABSTRACT =====\n")
        f.write(f"{patent_data.abstract}\n")
        
        f.write("\n===== DESCRIPTION =====\n")
        f.write(f"{patent_data.description}\n")
        
        f.write("\n===== CLAIMS =====\n")
        for claim in patent_data.claims:
            f.write(f"Claim {claim.number}: {claim.text}\n")
            if claim.dependent_on:
                f.write(f"  Dependent on claim {claim.dependent_on}\n")
            f.write("\n")

def main():
    """Main function to extract patent data from HTML file"""
    # Input and output file paths
    html_file = "test.html"
    output_file = "patent_data.txt"
    
    # Extract patent data
    patent_data = extract_patent_data(html_file)
    
    # Print summary
    print(patent_data)
    
    # Save to file
    save_to_file(patent_data, output_file)
    print(f"\nPatent data saved to {output_file}")

if __name__ == "__main__":
    main()

Patent Number: US20230241253A1
Title: NIR-conjugated tumor-specific antibodies and uses thereof
Assignees: 
Inventors: 
Priority Date: 
Filing Date: 
Publication Date: 
Grant Date: 
Abstract: Disclosed is a tumor-specific antibody and fluorophore conjugate for detecting, localizing and imagi...
Description: PRIORITY CLAIM
This application is a continuation of U.S. application Ser. No. 16/993,950, entitled ...
Claims: 7 claims found

Patent data saved to patent_data.txt
