In [2]:
pip install numpy opencv-python matplotlib pillow




In [3]:
# System dependencies (for Tesseract + pyttsx3 engine support)
!apt-get update
!apt-get install -y tesseract-ocr libespeak1

# Ultralytics YOLOv10
!pip install ultralytics

# Segment Anything Model (SAM)
!pip install git+https://github.com/facebookresearch/segment-anything.git

# Transformers + timm (for CLIP and BLIP)
!pip install transformers timm

# Google Generative AI SDK (for Gemini)
!pip install google-generativeai

# Text-to-Speech
!pip install pyttsx3

# Core packages
!pip install numpy opencv-python matplotlib pillow pytesseract


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,799 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,748 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [4,579 kB]
Get

In [4]:
!wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth

--2025-06-26 16:00:15--  https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.7.50, 13.35.7.38, 13.35.7.128, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.7.50|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2564550879 (2.4G) [binary/octet-stream]
Saving to: ‘sam_vit_h_4b8939.pth’


2025-06-26 16:00:39 (102 MB/s) - ‘sam_vit_h_4b8939.pth’ saved [2564550879/2564550879]



In [5]:
!pip install gTTS

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.2.1
    Uninstalling click-8.2.1:
      Successfully uninstalled click-8.2.1
Successfully installed click-8.1.8 gTTS-2.5.4


In [7]:
from ultralytics import YOLO
import cv2
import pytesseract
import os
import google.generativeai as genai
import numpy as np
import torch
from PIL import Image
from segment_anything import sam_model_registry, SamPredictor
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
from IPython.display import Audio, display
import io

# --- Configuration ---
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
device = "cuda" if torch.cuda.is_available() else "cpu"
SAM_CHECKPOINT = "sam_vit_h_4b8939.pth"
GEMINI_API_KEY = "AIzaSyCza6dAJf2sSG5p5me1hdOyC95j2xreWxM"  # Replace with your actual API key

# --- Load Models ---
yolo_model = YOLO('yolov10n.pt')

sam = sam_model_registry["vit_h"](checkpoint=SAM_CHECKPOINT).to(device)
predictor = SamPredictor(sam)

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# --- Core Functions ---
def detect_objects(image_path):
    img = cv2.imread(image_path)
    results = yolo_model(img)
    return [result.names[int(cls)] for result in results for cls in result.boxes.cls]

def extract_text(image_path):
    gray = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray).strip()

def segment_image(image_path):
    img = cv2.imread(image_path)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    predictor.set_image(img_rgb)
    masks, _, _ = predictor.predict(point_coords=None, point_labels=None, box=None, multimask_output=True)
    return len(masks)

def generate_clip_probs(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(text=["a photo of"], images=image, return_tensors="pt", padding=True)
    outputs = clip_model(**inputs)
    return outputs.logits_per_image.softmax(dim=1)

def generate_blip_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = blip_processor(images=image, return_tensors="pt").to(device)
    output = blip_model.generate(**inputs)
    return blip_processor.decode(output[0], skip_special_tokens=True).strip()

def speak_text(text):
    tts = gTTS(text)
    tts.save("response.mp3")
    display(Audio("response.mp3"))

def build_context(image_path):
    objects = detect_objects(image_path)
    text = extract_text(image_path)
    segments = segment_image(image_path)
    clip_probs = generate_clip_probs(image_path)
    blip_caption = generate_blip_caption(image_path)

    context = ""
    context += f"Detected Objects: {', '.join(objects)}. " if objects else "No objects detected. "
    context += f"Extracted Text: {text}. " if text else "No text extracted. "
    context += f"Number of segments: {segments}. "
    context += f"CLIP Probabilities: {clip_probs.tolist()}. "
    context += f"BLIP Caption: {blip_caption}. "
    return context

def ask_gemini(query, context, image_path):
    system_prompt = """
    You are a visual reasoning assistant. You are provided an image and detailed pre-analyzed context (object detection, OCR, segmentation, etc.).
    PRIORITIZE the given context over your own interpretation of the image. Only use the image if needed to fill missing info.
    Respond accurately, clearly, and concisely.
    """
    image = Image.open(image_path).convert("RGB")
    response = gemini_model.generate_content([
        system_prompt,
        f"Image Context: {context}",
        f"User Query: {query}",
        image
    ])
    return response.text.strip() if response else "No response."

# --- Main Loop ---
def run():
    from google.colab import files
    uploaded = files.upload()
    image_path = next(iter(uploaded))

    context = build_context(image_path)
    print("\n📘 Image Context:\n", context)
    speak_text("Image analyzed. You can now ask your questions.")

    while True:
        query = input("\nAsk a question (or type 'exit'): ")
        if query.lower() == 'exit':
            break
        response = ask_gemini(query, context, image_path)
        print("🔍 Response:", response)
        speak_text(response)

run()


Saving Screenshot 2025-06-09 001602.png to Screenshot 2025-06-09 001602.png

0: 512x640 2 persons, 116.2ms
Speed: 32.4ms preprocess, 116.2ms inference, 7.0ms postprocess per image at shape (1, 3, 512, 640)

📘 Image Context:
 Detected Objects: person, person. Extracted Text: PLACEMENT PREPARATION

1. APTITUDE
2. CODING J

3.DSA

Pa a

as a
oy) 6 INTERVIEW —

oy LIVE CLASSES FROM 10 JUNE

® Complete Placement Preparation (LIVE): Aptitude,
Coding, DSA, Computer Science, Resume, Interview
KnowledgeGATE by Sanchit Sir @
2.1K views * 1 day ago. Number of segments: 3. CLIP Probabilities: [[1.0]]. BLIP Caption: a man and woman are smiling and posing for the camera. 



Ask a question (or type 'exit'): explain the scene
🔍 Response: The image is a thumbnail for a YouTube video titled "Complete Placement Preparation (LIVE): Aptitude, Coding, DSA, Computer Science, Resume, Interview" from KnowledgeGATE by Sanchit Sir. It advertises daily live classes starting from June 10th, covering aptitude, coding, DSA, DBMS, OS, CN, resume writing, and interviews. Two people are in the image as well.



Ask a question (or type 'exit'): exit


In [9]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hI

In [10]:
import streamlit as st
import tempfile
from PIL import Image
import cv2
import numpy as np
import pytesseract
import torch
from ultralytics import YOLO
from segment_anything import sam_model_registry, SamPredictor
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
import google.generativeai as genai
import os
from gtts import gTTS
import base64
import io

# --- Setup ---
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # update for your system

# Load models
YOLO_MODEL = YOLO('yolov10n.pt')

SAM_CHECKPOINT = "sam_vit_h_4b8939.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry["vit_h"](checkpoint=SAM_CHECKPOINT).to(device)
predictor = SamPredictor(sam)

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

GEMINI_API_KEY = "AIzaSyCza6dAJf2sSG5p5me1hdOyC95j2xreWxM"
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

# --- Functions ---
def detect_objects(image):
    results = YOLO_MODEL(image)
    return [results[0].names[int(cls)] for cls in results[0].boxes.cls]

def extract_text(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    return pytesseract.image_to_string(gray).strip()

def segment_image(image):
    img = np.array(image)
    predictor.set_image(img)
    masks, _, _ = predictor.predict(point_coords=None, point_labels=None, box=None, multimask_output=True)
    return len(masks)

def generate_clip_probs(image):
    inputs = clip_processor(text=["a photo of"], images=image, return_tensors="pt", padding=True)
    outputs = clip_model(**inputs)
    return outputs.logits_per_image.softmax(dim=1)

def generate_blip_caption(image):
    inputs = blip_processor(images=image, return_tensors="pt").to(device)
    output = blip_model.generate(**inputs)
    return blip_processor.decode(output[0], skip_special_tokens=True).strip()

def ask_gemini(query, context, image):
    system_prompt = """
    You are a visual reasoning assistant.
    You are provided an image and detailed pre-analyzed context (object detection, OCR, segmentation, etc.).
    PRIORITIZE the given context over your own interpretation of the image. Only use the image if needed to fill missing info.
    Respond accurately, clearly, and concisely.
    """
    response = gemini_model.generate_content([
        system_prompt,
        f"Image Context: {context}",
        f"User Query: {query}",
        image
    ])
    return response.text.strip() if response else "No response."

def speak_text(text):
    tts = gTTS(text)
    tts_buffer = io.BytesIO()
    tts.write_to_fp(tts_buffer)
    tts_buffer.seek(0)
    b64 = base64.b64encode(tts_buffer.read()).decode()
    return f'<audio controls src="data:audio/mp3;base64,{b64}"></audio>'

# --- Streamlit UI ---
st.title("🧠 Visual Scene Assistant for the Visually Impaired")
uploaded_file = st.file_uploader("Upload an image", type=['jpg', 'png', 'jpeg'])

if uploaded_file:
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    with st.spinner("Analyzing Image..."):
        objects = detect_objects(image)
        text = extract_text(image)
        segments = segment_image(image)
        clip_probs = generate_clip_probs(image)
        caption = generate_blip_caption(image)

        context = ""
        context += f"Detected Objects: {', '.join(objects)}. " if objects else "No objects detected. "
        context += f"Extracted Text: {text}. " if text else "No text extracted. "
        context += f"Number of segments: {segments}. "
        context += f"CLIP Probabilities: {clip_probs.tolist()}. "
        context += f"BLIP Caption: {caption}. "

    st.subheader("🧾 Generated Context")
    st.text(context)

    user_query = st.text_input("Ask a question about the image:")
    if user_query:
        with st.spinner("Thinking..."):
            response = ask_gemini(user_query, context, image)
        st.success("Response:")
        st.write(response)
        st.markdown(speak_text(response), unsafe_allow_html=True)


2025-06-26 16:29:23.449 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
