In [9]:
!pip install -qU \
    python-dotenv \
    langchain \
    langchain-community \
    openai \
    langchain-openai      

In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment (python-dotenv).
# A later cell reads OPENAI_API_KEY with os.getenv, so this must run first.
load_dotenv()

True

In [2]:
import fitz  # PyMuPDF
import os

In [6]:


# Source PDF and the folder that will receive the extracted images.
pdf_path = "W706408.pdf"
pdf_dir = os.path.dirname(os.path.abspath(pdf_path))  # folder that contains the PDF

# Open the document in a `with` block so its file handle is released as soon as
# extraction finishes, even if writing one of the images fails.
with fitz.open(pdf_path) as doc:
    # Walk the PDF page by page and write every embedded image to disk.
    for page_index, page in enumerate(doc):
        images = page.get_images(full=True)

        for img_index, img in enumerate(images):
            xref = img[0]  # first entry of each image record is passed to extract_image
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            # File names use 1-based page and image numbers, e.g. page1_img1.png.
            image_filename = f"page{page_index+1}_img{img_index+1}.{image_ext}"

            image_path = os.path.join(pdf_dir, image_filename)
            with open(image_path, "wb") as f:
                f.write(image_bytes)

            print(f"Saved {image_path}")


Saved d:\Langchain\page1_img1.png


In [7]:
from pydantic import BaseModel, Field
from typing import Optional, Literal

# Closed vocabularies for the two categorical fields of BoltFeatures.
HeadType = Literal["hex", "socket", "button", "pan", "flat", "round"]
DriveType = Literal["phillips", "hex", "slotted", "torx", "allen", "pozidriv", "square", "spanner"]


class BoltFeatures(BaseModel):
    # Schema for the attributes extracted from a bolt drawing.
    # Every field is optional (defaults to None) except head_type and drive_type,
    # which are required and restricted to the vocabularies above.

    # Identification text.
    part_name: Optional[str] = None
    title: Optional[str] = None

    # Dimensions are kept as free-form strings rather than numbers.
    diameter: Optional[str] = None
    length: Optional[str] = None
    thread_pitch: Optional[str] = None

    # Required categorical fields: no default, so a value must always be supplied.
    head_type: HeadType = Field(...)
    drive_type: DriveType = Field(...)

    # Material and finish.
    material: Optional[str] = None
    coating: Optional[str] = None
    is_threaded_full: Optional[bool] = None


In [8]:
from langchain.output_parsers import PydanticOutputParser
# Build a LangChain output parser bound to the BoltFeatures schema.
# NOTE(review): none of the visible cells use `parser`; the OpenAI request passes
# BoltFeatures directly as response_format. Confirm whether this is still needed.
# NOTE(review): newer LangChain releases may expose this class under a different
# module path; confirm the import above still resolves with the installed version.
parser = PydanticOutputParser(pydantic_object=BoltFeatures)

In [10]:
from openai import OpenAI
import base64
import mimetypes

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load the extracted drawing and base64-encode it so it can travel inline as a data URL.
drawing_path = "page1_img1.png"
with open(drawing_path, "rb") as image_file:
    base64_image = base64.b64encode(image_file.read()).decode("utf-8")

# Derive the media type from the file extension so the data URL label matches the
# payload (a .png file must not be announced as image/jpeg).
image_mime_type, _ = mimetypes.guess_type(drawing_path)
if image_mime_type is None:
    raise ValueError(f"Cannot determine the image media type of {drawing_path!r}")

# Define prompt
prompt = """
Extract the following features from this technical drawing of a bolt.
If the drawing uses labels instead of exact values, return those labels. In the process, please ignore /number/ format labels(e.g. /1/,/20/)
Only return a valid JSON object.
"""

# Create chat completion with image; response_format makes the SDK request output
# that conforms to the BoltFeatures schema.
response = client.chat.completions.parse(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are an expert in mechanical drawing interpretation."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{image_mime_type};base64,{base64_image}"
                    }
                },
            ],
        }
    ],
    response_format=BoltFeatures,
    max_tokens=1000,
    temperature=0.0
)

message = response.choices[0].message

# Keep the validated object for downstream use instead of re-parsing the JSON text.
# Per the OpenAI SDK docs this is None when the model refuses (see message.refusal).
bolt_features = message.parsed

# Show the raw JSON reply.
print(message.content)


{"part_name":"SCREW & WSHR M10X30 HEX P/T PLAS P","title":"SCREW & WSHR M10X30 HEX P/T PLAS P","diameter":"M10-6g","length":"30±0.42","thread_pitch":null,"head_type":"hex","drive_type":"hex","material":"SCREW WA 950 PC 8.8","coating":"PLASTIC PATCH TO WA970","is_threaded_full":null}
