This notebook is used for benchmarking purposes; it is not an optimized pipeline for LLM extraction.

In [None]:
from typing import List, Optional
from pydantic import BaseModel
import datetime
class Patent(BaseModel):
    """Metadata fields the LLM is asked to extract from a patent scan.

    The JSON schema generated from this model is embedded in the prompt, so
    the field names below are exactly the keys the model is told to emit.
    """

    title: str
    # Both dates are annotated Optional, so ``None`` is an accepted value.
    Application_Date: Optional[datetime.datetime]
    Publication_Date: Optional[datetime.datetime]
    Applicants: list[str]
    Inventors: list[str]

    class Config:
        # Extra keys merged verbatim into the generated JSON schema.
        schema_extra = {
            "additionalProperties": False,
        }
        # ``json_encoders`` is a Config option of its own. When it was nested
        # inside ``schema_extra`` it was copied into the schema itself, where
        # the ``datetime`` class key and the lambda cannot be dumped to JSON.
        json_encoders = {
            datetime.datetime: lambda v: v.isoformat(),
        }
# JSON-schema text of the Patent model; it is appended to the instructions
# below so the LLM sees the exact structure it has to produce.
pydantic_schema = Patent.schema_json()

# The instruction text is split over adjacent literals purely for
# readability; the resulting string is unchanged.
prompt = (
    "You are a helpful assistant that transform historical scans of patents "
    "to a JSON format. Make sure to get the dates and names correct and only "
    "include keys 'title', 'Application_Date', 'Publication_Date', "
    "'Applicants', 'Inventors'. Take a second to think about your answer. "
    "Here's the json schema you must adhere to:\n"
    f"{pydantic_schema}\n"
)

In [None]:
import os
from pymongo import UpdateOne
from tqdm import tqdm
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Handles to the local MongoDB instance: client -> database -> collection.
# `collection_json` is the collection the benchmark loops read patents from.
client = MongoClient(host="localhost", port=29012)
db = client["test-database"]
collection_json = db["collection-json"]
def generate_query(item):
    """Return the OCR text of a patent record as a single cleaned string.

    Args:
        item: Mapping with an ``'OCR'`` entry holding the OCR output, either
            as UTF-8 encoded bytes or as an already decoded ``str``.

    Returns:
        The OCR text with every blank line (``'\\n\\n'``) collapsed into a
        single space.
    """
    raw = item['OCR']
    # A str has no .decode(); only byte-like values need decoding.
    text = raw if isinstance(raw, str) else raw.decode('utf-8')
    return text.replace('\n\n', ' ')


In [None]:
from tqdm import tqdm
from google import genai
from google.genai import types
import json
import os
import datetime as _dt


def _parse_llm_json(text):
    """Best-effort extraction of the JSON object contained in an LLM reply.

    Args:
        text: Raw reply text. ``None`` (no text in the response) is treated
            as an empty reply.

    Returns:
        The decoded JSON value on success; ``None`` when the reply looked
        like JSON (a ```json fence or a leading ``{``) but could not be
        decoded; ``''`` when no JSON was recognised at all.
    """
    text = text or ''
    fence_at = text.find('```json')
    if fence_at != -1:
        try:
            # Decode exactly one JSON value starting at the first '{' after
            # the fence. raw_decode ignores whatever follows the value
            # (closing fence, trailing commentary), and unlike
            # str.strip('```json\n') it never eats characters of the payload.
            start = text.index('{', fence_at)
            parsed, _ = json.JSONDecoder().raw_decode(text, start)
        except ValueError:
            print('error parsing json but started with json')
            print(text)
            return None
        return parsed

    stripped = text.strip()
    if stripped.startswith('{'):
        # Try the reply as-is first; only fall back to swapping single quotes
        # for double quotes (pseudo-JSON) when that fails, so apostrophes
        # inside valid JSON strings are not corrupted.
        for candidate in (stripped, stripped.replace("'", '"')):
            try:
                parsed = json.loads(candidate)
                break
            except ValueError:
                continue
        else:
            print('error parsing json but started with {')
            print(stripped)
            return None
        # Some replies echo the schema layout and nest the values under
        # 'properties'; unwrap that level.
        if 'properties' in parsed:
            parsed = parsed['properties']
        return parsed

    print(f"Error parsing JSON: {text} ")
    return ''


count_token = 0
model = 'gemini-1.5-flash'
output_dir = f"/scratch/students/ndillenb/metadata/processing/llm/json_compare/{model.replace('/','-')}_json_compare"
# Creates the directory (and missing parents) and is a no-op if it exists.
os.makedirs(output_dir, exist_ok=True)

# Only US patents that have OCR text and every ground-truth field.
patent_filter = {
    'Country': "US",
    'OCR': {'$exists': True},
    'Title': {'$exists': True},
    'C_Application Date': {'$exists': True},
    'C_Publication Date': {'$exists': True},
    'clean_applicants': {'$exists': True},
    'clean_inventor': {'$exists': True},
}
# Materialised up front so tqdm knows the total.
items = list(collection_json.find(patent_filter).limit(100))

# Loop-invariant objects: one API client and one generation config.
# NOTE: this rebinds the notebook-level name `client` (previously the
# MongoClient); `collection_json` keeps its own reference to the database.
client = genai.Client(api_key="API_KEY")
generation_config = types.GenerateContentConfig(
    safety_settings=[
        types.SafetySetting(
            category=category,
            threshold=types.HarmBlockThreshold.BLOCK_NONE,
        )
        for category in (
            types.HarmCategory.HARM_CATEGORY_HARASSMENT,
            types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            types.HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY,
        )
    ],
)

for item in tqdm(items):
    query = generate_query(item)
    # The same string is sent to the model and used for the token estimate
    # (previously the sent text differed and read "latent text").
    contents = f"{prompt}\n The patent text is: {query}\n\n"
    count_token += len(contents) / 4  # rough estimate: ~4 characters per token

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generation_config,
    )
    print('--response is--')
    print(response)
    json_llm = _parse_llm_json(response.text)

    # Build the payload before opening the file so a missing key cannot
    # leave an empty output file behind.
    data = {
        'Title': item['Title'],
        'Application_Date': item['C_Application Date'],
        'Publication_Date': item['C_Publication Date'],
        'Applicants': item['clean_applicants'],
        'Inventors': item['clean_inventor'],
    }
    # Convert datetime objects to strings for JSON serialization. The class
    # is referenced through the module alias so the check does not depend on
    # whether `datetime` is currently bound to the module or to the class.
    data_serializable = {
        key: (value.isoformat() if isinstance(value, _dt.datetime) else value)
        for key, value in data.items()
    }
    # Store in json file for later evaluation
    output_path = os.path.join(output_dir, f"json_llm_{item['_id']}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({'predicted': json_llm, 'expected': data_serializable}, f)
    print(f"Currently at about {count_token} tokens")
    

# With PDFs

In [None]:
from pdf2image import convert_from_path
from PIL import Image

def generate_images(filename, pdf_dir='/scratch/students/ndillenb/metadata/processing/llm/us_sample_patents'):
    """Render every page of a patent PDF as an image.

    Args:
        filename: PDF file name without the ``.pdf`` extension.
        pdf_dir: Directory containing the PDF. Defaults to the sample
            directory this notebook was written against.

    Returns:
        Whatever ``convert_from_path`` returns for the file (a list of PIL
        images, one per page).
    """
    pdf_path = os.path.join(pdf_dir, filename + '.pdf')
    return convert_from_path(pdf_path)
    

In [None]:
from tqdm import tqdm
from google import genai
from google.genai import types
import json
import os
import datetime as _dt


def _parse_llm_json(text):
    """Best-effort extraction of the JSON object contained in an LLM reply.

    Args:
        text: Raw reply text. ``None`` (no text in the response) is treated
            as an empty reply.

    Returns:
        The decoded JSON value on success; ``None`` when the reply looked
        like JSON (a ```json fence or a leading ``{``) but could not be
        decoded; ``''`` when no JSON was recognised at all.
    """
    text = text or ''
    fence_at = text.find('```json')
    if fence_at != -1:
        try:
            # Decode exactly one JSON value starting at the first '{' after
            # the fence. raw_decode ignores whatever follows the value
            # (closing fence, trailing commentary), and unlike
            # str.strip('```json\n') it never eats characters of the payload.
            start = text.index('{', fence_at)
            parsed, _ = json.JSONDecoder().raw_decode(text, start)
        except ValueError:
            print('error parsing json but started with json')
            print(text)
            return None
        return parsed

    stripped = text.strip()
    if stripped.startswith('{'):
        # Try the reply as-is first; only fall back to swapping single quotes
        # for double quotes (pseudo-JSON) when that fails, so apostrophes
        # inside valid JSON strings are not corrupted.
        for candidate in (stripped, stripped.replace("'", '"')):
            try:
                parsed = json.loads(candidate)
                break
            except ValueError:
                continue
        else:
            print('error parsing json but started with {')
            print(stripped)
            return None
        # Some replies echo the schema layout and nest the values under
        # 'properties'; unwrap that level.
        if 'properties' in parsed:
            parsed = parsed['properties']
        return parsed

    print(f"Error parsing JSON: {text}")
    return ''


count_token = 0
model = 'gemini-2.0-flash-lite'
prefix = 'img_'
output_dir = f"/scratch/students/ndillenb/metadata/processing/llm/json_compare/{prefix}{model.replace('/','-')}_json_compare"
# Creates the directory (and missing parents) and is a no-op if it exists.
os.makedirs(output_dir, exist_ok=True)

# Only US patents that have OCR text and every ground-truth field.
patent_filter = {
    'Country': "US",
    'OCR': {'$exists': True},
    'Title': {'$exists': True},
    'C_Application Date': {'$exists': True},
    'C_Publication Date': {'$exists': True},
    'clean_applicants': {'$exists': True},
    'clean_inventor': {'$exists': True},
}
# Materialised up front so tqdm knows the total.
items = list(collection_json.find(patent_filter).limit(100))
# Number of leading records to skip; the original loop sliced with [16:].
resume_from = 16

# Loop-invariant objects: one API client and one generation config.
# NOTE: this rebinds the notebook-level name `client` (previously the
# MongoClient); `collection_json` keeps its own reference to the database.
client = genai.Client(api_key="API_KEY")
generation_config = types.GenerateContentConfig(
    safety_settings=[
        types.SafetySetting(
            category=category,
            threshold=types.HarmBlockThreshold.BLOCK_NONE,
        )
        for category in (
            types.HarmCategory.HARM_CATEGORY_HARASSMENT,
            types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            types.HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY,
        )
    ],
)

for item in tqdm(items[resume_from:]):
    document_name = f"{item['Country']}{item['Publication Number']}{item['Doc_kind']}"
    print(f"Trying to process {document_name}")
    images = generate_images(document_name)
    print("Transformed images")

    count_token += len(f"{prompt}\n")/3.  # Keep count of tokens for expenses
    for image in images:
        largest_dimension = max(image.size)
        # Rough per-image estimate scaled by the largest side.
        count_token += int(largest_dimension/768 * 258)

    # Flat list of parts: the prompt followed by one entry per page image
    # (previously the whole image list was nested as a single element).
    query = [prompt, *images]
    response = client.models.generate_content(
        model=model,
        contents=query,
        config=generation_config,
    )
    print('--response is--')
    print(response)
    json_llm = _parse_llm_json(response.text)

    # Build the payload before opening the file so a missing key cannot
    # leave an empty output file behind.
    data = {
        'Title': item['Title'],
        'Application_Date': item['C_Application Date'],
        'Publication_Date': item['C_Publication Date'],
        'Applicants': item['clean_applicants'],
        'Inventors': item['clean_inventor'],
    }
    # Convert datetime objects to strings for JSON serialization. The class
    # is referenced through the module alias so the check does not depend on
    # whether `datetime` is currently bound to the module or to the class.
    data_serializable = {
        key: (value.isoformat() if isinstance(value, _dt.datetime) else value)
        for key, value in data.items()
    }
    # Store in json file for later evaluation
    output_path = os.path.join(output_dir, f"json_llm_{item['_id']}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({'predicted': json_llm, 'expected': data_serializable}, f)
    print(f"Currently at about {count_token} tokens")
    