In [2]:
# Install required packages (run this cell once)
%pip install pytesseract Pillow PyMuPDF openai python-dotenv

# === Imports ===
import pytesseract
from PIL import Image
import fitz  # PyMuPDF
from tkinter import Tk
from tkinter.filedialog import askopenfilename
import os
from openai import OpenAI
import json

# === (Optional) Set Tesseract path manually if on Windows ===
# Uncomment and edit the path below if needed
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# === File upload using file dialog ===
Tk().withdraw()  # Hide the Tkinter root window
file_name = askopenfilename(title="Select an image or PDF")

# === Text extraction from image or PDF ===
def extract_text(file_name):
    if file_name.lower().endswith(".pdf"):
        doc = fitz.open(file_name)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    else:
        img = Image.open(file_name)
        return pytesseract.image_to_string(img)

# === Clean the extracted text ===
def clean_text(raw):
    lines = raw.splitlines()
    lines = [line.strip() for line in lines if line.strip()]
    return " ".join(lines)

# === Set your GROQ API key here ===
GROQ_API_KEY = "your_groq_api_key_here"  # Replace this with your actual key
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# === OpenAI client with Groq's endpoint ===
client = OpenAI(
    api_key="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",
    base_url="https://api.groq.com/openai/v1"
)


# === Extract key legal fields using LLM ===
def extract_deed_info(cleaned_text):
    prompt = f"""
You are an intelligent assistant specialized in Indian legal land records.

Extract the following fields from the land deed text below:

- Deed Type
- Buyer Name
- Seller Name
- Survey Number
- Location
- Date of Execution
- Registration Number

Return ONLY a valid JSON response with those fields and values.

Text:
{cleaned_text}
"""
    response = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content

# === Run the process ===
raw_text = extract_text(file_name)
cleaned = clean_text(raw_text)
output = extract_deed_info(cleaned)

# === Print Result ===
print("\n📋 Extracted Legal Info:\n")
try:
    parsed_output = json.loads(output)
    for key, value in parsed_output.items():
        print(f"{key}: {value}")
except json.JSONDecodeError:
    print("⚠️ Failed to parse JSON. Raw response:\n")
    print(output)


Note: you may need to restart the kernel to use updated packages.

📋 Extracted Legal Info:

⚠️ Failed to parse JSON. Raw response:

Here is the extracted information in JSON format:

```
{
  "Deed Type": "SALE DEED",
  "Buyer Name": "Sri Prashanth A Rao and Smt. Sowmya P Rao",
  "Seller Name": "Sri Ashwin Kumar M.A.",
  "Survey Number": "Sy.Nos.38/1-B, 392, and Sy.No.39/3",
  "Location": "Arakere Village, Begur Hobli, Bangalore South Taluk, Bangalore",
  "Date of Execution": "09-07-2015",
  "Registration Number": "BGR-1-02269-2015"
}
```
