In [9]:
!pip install -qU \
    python-dotenv \
    langchain \
    langchain-community \
    openai \
    langchain-openai      

In [1]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; a later cell
# reads OPENAI_API_KEY from it via os.getenv.
load_dotenv()

True

In [5]:
import fitz  # PyMuPDF
import os

In [None]:
pdf_path = "W706408.pdf"
# Extracted images are written to the folder that contains the PDF.
pdf_dir = os.path.dirname(os.path.abspath(pdf_path))

# Open the PDF in a context manager so the document is closed once extraction
# finishes, including when an error interrupts the loop.
with fitz.open(pdf_path) as doc:
    # Walk the document page by page and write every embedded image to disk.
    for page_index, page in enumerate(doc):
        images = page.get_images(full=True)

        for img_index, img in enumerate(images):
            xref = img[0]  # the first item of each image entry is passed to extract_image
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            # File names are 1-based: page<N>_img<M>.<ext>
            image_filename = f"page{page_index+1}_img{img_index+1}.{image_ext}"

            image_path = os.path.join(pdf_dir, image_filename)
            with open(image_path, "wb") as f:
                f.write(image_bytes)

            print(f"Saved {image_path}")


Saved d:\Langchain\page1_img1.png


In [7]:
from pydantic import BaseModel, Field
from typing import Optional, Literal

# Structured description of a bolt as a pydantic model.
class BoltFeatures(BaseModel):
    # Free-text identification fields; all default to None when not supplied.
    part_name: Optional[str] = None
    title: Optional[str] = None
    # Dimensions are kept as strings rather than numbers.
    # NOTE(review): units/format of these strings are not fixed here — confirm.
    diameter: Optional[str] = None  
    length: Optional[str] = None    
    thread_pitch: Optional[str] = None
    # Restricted to the listed values. Field() carries no default, so unlike the
    # Optional fields these appear to be required — confirm that is intended.
    head_type: Literal["hex", "socket", "button", "pan", "flat", "round"] = Field()
    drive_type: Literal["phillips", "hex", "slotted", "torx", "allen", "pozidriv", "square", "spanner"] = Field()
    material: Optional[str] = None
    coating: Optional[str] = None
    # None when not supplied; otherwise a boolean flag.
    is_threaded_full: Optional[bool] = None


In [8]:
from langchain.output_parsers import PydanticOutputParser
# Output parser bound to the BoltFeatures model.
# NOTE(review): `parser` is not referenced by any of the cells visible here —
# confirm it is used later or remove it.
parser = PydanticOutputParser(pydantic_object=BoltFeatures)

In [7]:
import openai
import os

In [9]:
import base64
from openai import OpenAI

# Shared OpenAI client used by analyze_with_gpt4v. The key is read from the
# OPENAI_API_KEY environment variable, so make sure it is set before running.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def analyze_with_gpt4v(image_path: str, focus_prompt: str) -> str:
    """Send one image plus a text instruction to the chat model and return its reply.

    Args:
        image_path: Path of the image file. The file is read in binary mode and
            embedded in the request as a base64 data URL.
        focus_prompt: Instruction text sent alongside the image.

    Returns:
        The text content of the first choice with surrounding whitespace
        removed, or an empty string when the response carries no text content.
    """
    import mimetypes  # stdlib; imported locally so this cell stays self-contained

    with open(image_path, "rb") as f:
        image_data = f.read()
    base64_image = base64.b64encode(image_data).decode("utf-8")

    # Label the data URL with the media type implied by the file extension
    # (e.g. image/jpeg for .jpg) instead of always claiming PNG. Fall back to
    # PNG when the extension is unknown or does not map to an image type.
    mime_type = mimetypes.guess_type(image_path)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an expert in mechanical drawing interpretation."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": focus_prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
        max_tokens=1000,
        temperature=0.0
    )

    # content can be None; avoid calling .strip() on None in that case.
    content = response.choices[0].message.content
    return (content or "").strip()


In [1]:
def run_length_agent(image_path: str) -> str:
    """Ask the vision model for the bolt length shown in the image.

    Args:
        image_path: Path of the drawing image forwarded to analyze_with_gpt4v.

    Returns:
        Whatever analyze_with_gpt4v returns for the length prompt.
    """
    # JSON skeleton the model is asked to fill in.
    answer_template = "\n".join(
        [
            "{",
            '  "answer": "<length in mm>",',
            '  "confidence": <float between 0 and 1>,',
            '  "reasoning": "<how you arrived at the answer>"',
            "}",
        ]
    )
    # The prompt sections are separated by one blank line each.
    sections = [
        "From the image below, extract the **length of the bolt**.",
        "Return your answer in this JSON format:",
        answer_template,
        "Rate your confidence based on clarity of the dimension, presence of labels, and whether your interpretation might be ambiguous.",
    ]
    return analyze_with_gpt4v(image_path, "\n\n".join(sections))


In [2]:
def run_diameter_agent(image_path: str) -> str:
    """Ask the vision model for the bolt diameter shown in the image.

    Args:
        image_path: Path of the drawing image forwarded to analyze_with_gpt4v.

    Returns:
        Whatever analyze_with_gpt4v returns for the diameter prompt.
    """
    prompt = (
        "From the image below, determine the **diameter** of the bolt (e.g., M10 = 10mm nominal). "
        # A blank line separates the instructions from the JSON format section;
        # previously the two sentences ran together as "labels.Return".
        "Explain how it was derived from the image, such as notation like 'M10', 'Ø', or dimension labels.\n\n"
        "Return your answer in this JSON format:\n\n"
        '{\n'
        # The placeholder names the quantity this agent extracts (it used to
        # say "length", copied from the length agent).
        '  "answer": "<diameter in mm>",\n'
        '  "confidence": <float between 0 and 1>,\n'
        '  "reasoning": "<how you arrived at the answer>"\n'
        '}\n\n'
        "Rate your confidence based on clarity of the dimension, presence of labels, and whether your interpretation might be ambiguous."
    )
    return analyze_with_gpt4v(image_path, prompt)


In [12]:
def run_head_type_agent(image_path: str) -> str:
    """Ask the vision model for the bolt head type shown in the drawing.

    Args:
        image_path: Path of the drawing image forwarded to analyze_with_gpt4v.

    Returns:
        Whatever analyze_with_gpt4v returns for the head-type prompt.
    """
    prompt = (
        "From the technical drawing, identify the **head type** of the bolt (e.g., hex, socket, pan, etc.). "
        # A blank line separates the instructions from the JSON format section;
        # previously the two sentences ran together as "image.Return".
        "Explain how it was derived from the image.\n\n"
        "Return your answer in this JSON format:\n\n"
        '{\n'
        # Give the answer slot a descriptive placeholder, like the other agents,
        # instead of an empty string.
        '  "answer": "<head type>",\n'
        '  "confidence": <float between 0 and 1>,\n'
        '  "reasoning": "<how you arrived at the answer>"\n'
        '}\n\n'
        "Rate your confidence based on clarity of the dimension, presence of labels, and whether your interpretation might be ambiguous."
    )
    return analyze_with_gpt4v(image_path, prompt)


In [14]:
# Drawing image handed to run_all_agents at the bottom of this cell.
# NOTE(review): this rebinds the `image_path` global that the PDF extraction
# loop also assigns — confirm "102.jpg" is the intended input.
image_path = "102.jpg"

def run_all_agents(image_path):
    """Run the length, diameter and head-type agents and print their answers.

    Args:
        image_path: Path of the drawing image passed to each agent.

    Returns:
        None. The three raw agent replies are printed under a report heading.
    """
    print("Analyzing bolt drawing with vision agents...\n")

    # Every agent is queried before any part of the report is printed.
    agents = (
        ("Length", run_length_agent),
        ("Diameter", run_diameter_agent),
        ("Head Type", run_head_type_agent),
    )
    findings = [(label, agent(image_path)) for label, agent in agents]

    print("Final Bolt Report\n")
    for label, answer in findings:
        print(f"{label}:\n", answer, "\n")


# Query all three vision agents for the configured image and print the report.
run_all_agents(image_path)

Analyzing bolt drawing with vision agents...

Final Bolt Report

Length:
 ```json
{
  "answer": "Not specified",
  "confidence": 0.5,
  "reasoning": "The drawing does not explicitly label the total length of the bolt. It provides various dimensions such as grip length, thread length, and other specific measurements, but the overall length is not directly indicated. The confidence is moderate due to the lack of a clear label for the total length."
}
``` 

Diameter:
 ```json
{
  "answer": "D",
  "confidence": 0.8,
  "reasoning": "The diameter of the bolt is indicated by the notation 'ØD' in the side view of the drawing. This notation is typically used to specify the nominal diameter of a bolt. However, the exact numerical value is not visible in the image, so the confidence is moderate."
}
``` 

Head Type:
 ```json
{
  "answer": "socket",
  "confidence": 0.9,
  "reasoning": "The technical drawing shows a bolt with a recessed hexagonal shape on the head, which is characteristic of a socke