# BLIP

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests

# Load the BLIP captioning checkpoint and its matching processor.
# The same checkpoint id is used for both so they cannot drift apart.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Download the sample image.
# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting PIL fail
# later while trying to parse an error page as an image.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw).convert('RGB')

# Generate the caption and decode the first (only) returned sequence.
inputs = processor(image, return_tensors="pt")
out = model.generate(**inputs)
print("BLIP Caption:", processor.decode(out[0], skip_special_tokens=True))

ModuleNotFoundError: No module named 'transformers'

# QWEN2.5VL

In [None]:
pip install qwen-vl-utils

In [11]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load the 7B instruct checkpoint; device_map="auto" decides where the weights go.
QWEN_7B_CHECKPOINT = "Qwen/Qwen2.5-VL-7B-Instruct"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    QWEN_7B_CHECKPOINT, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(QWEN_7B_CHECKPOINT)

# NOTE: `url` is not assigned in this cell. It must already exist in the
# kernel (the BLIP and Florence-2 cells each assign one), so run one of them first.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": url},
            {"type": "text", "text": "Describe this image in great detail."},
        ],
    }
]

# Preparation: render the chat template to a prompt string and fetch the image.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, _ = process_vision_info(messages)
# Follow the model's own device instead of hard-coding "cuda", since
# device_map="auto" chose the placement.
inputs = processor(text=[text], images=image_inputs, return_tensors="pt").to(model.device)

# Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)
# generate() returns prompt + continuation; drop the prompt tokens so only the
# model's answer is decoded and printed.
prompt_length = inputs["input_ids"].shape[1]
new_token_ids = generated_ids[:, prompt_length:]
output_text = processor.batch_decode(new_token_ids, skip_special_tokens=True)
print("Qwen2.5-VL:", output_text[0])

ModuleNotFoundError: No module named 'torchvision'

# FLORENCE-2

In [3]:
from PIL import Image
# Image.open() is lazy and keeps the file handle open. convert() reads the
# pixels and returns a new image, so the result stays usable after the
# context manager has closed the file.
with Image.open("../image.png") as source_image:
    image = source_image.convert("RGB")

In [12]:
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM 


# Pick GPU + fp16 when CUDA is available, otherwise CPU + fp32.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32

# Florence-2 ships custom modelling code, hence trust_remote_code=True.
FLORENCE_CHECKPOINT = "microsoft/Florence-2-large"
model = AutoModelForCausalLM.from_pretrained(
    FLORENCE_CHECKPOINT, torch_dtype=torch_dtype, trust_remote_code=True
).to(device)
processor = AutoProcessor.from_pretrained(FLORENCE_CHECKPOINT, trust_remote_code=True)

# Task token used both as the prompt and for post-processing below.
prompt = "<OD>"

# Download the sample image; time out instead of hanging and fail loudly on an
# HTTP error instead of passing an error page to PIL.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Move the tensors to the model's device and cast them to its dtype.
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=4096,
    num_beams=3,
    do_sample=False
)
# Special tokens are kept because the post-processing step parses them.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

# Reuse `prompt` so the task passed here can never drift from the one generated with.
parsed_answer = processor.post_process_generation(
    generated_text, task=prompt, image_size=(image.width, image.height)
)

print(parsed_answer)


AttributeError: module 'torch' has no attribute 'cuda'

# MOLMO 7B

In [None]:
pip install einops

In [None]:

from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Checkpoint shared by the processor and the model.
MOLMO_CHECKPOINT = 'allenai/Molmo-7B-D-0924'

processor = AutoProcessor.from_pretrained(MOLMO_CHECKPOINT, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MOLMO_CHECKPOINT,
    trust_remote_code=True,
    device_map="auto",
)

# `image` is not created in this cell; it comes from whichever earlier cell
# last assigned it.
inputs = processor.process(images=[image], text="Describe this image.")
# Move every tensor to the model's device and add a leading batch dimension.
inputs = {
    name: tensor.to(model.device).unsqueeze(0)
    for name, tensor in inputs.items()
}

generation_config = GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>")
output = model.generate_from_batch(
    inputs,
    generation_config,
    tokenizer=processor.tokenizer,
)

# Keep only the tokens that follow the prompt before decoding.
prompt_length = inputs['input_ids'].size(1)
generated_tokens = output[0, prompt_length:]
generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
print("Molmo:", generated_text)

# QWEN2.5VL 3B

In [3]:
uv pip install accelerate

Note: you may need to restart the kernel to use updated packages.


c:\Users\Owais\Downloads\video-rag\.venv\Scripts\python.exe: No module named uv


In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
import torch

# Checkpoint shared by the model and the processor.
QWEN_3B_CHECKPOINT = "Qwen/Qwen2.5-VL-3B-Instruct"

# 4-bit NF4 quantization with fp16 compute: this is what lets the model fit
# on a 4GB card.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    QWEN_3B_CHECKPOINT,
    quantization_config=bnb_config,
)
processor = AutoProcessor.from_pretrained(QWEN_3B_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from PIL import Image


# `image` is not created in this cell; it comes from whichever earlier cell
# last assigned it.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# The processor does not accept a raw chat-message list: render the chat
# template to a prompt string first, then pass text and image explicitly
# (same flow as the Qwen2.5-VL 7B cell).
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Use the model's own device rather than a `device` variable that only exists
# if the Florence-2 cell happened to run first.
inputs = processor(
    text=[text],
    images=[image],
    return_tensors="pt"
).to(model.device)

with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=128
    )

# generate() returns prompt + continuation; decode only the newly generated tokens.
prompt_length = inputs["input_ids"].shape[1]
output_text = processor.batch_decode(
    output_ids[:, prompt_length:], skip_special_tokens=True
)[0]

print(output_text)


## Gemma 3

Zero-shot prompting

In [None]:
from ollama import chat
# from pathlib import Path

# Path of the image that is attached to the user message.
path ='C:\\Users\\fareh\\coding\\video-rag\\sample-2.png'

# System turn: frames the model as an image-captioning assistant.
system_message = {
    "role": "system",
    "content": "You are a helpful assistant that can analyze images and provide captions.",
}

# User turn: the question plus the image to look at.
user_message = {
    'role': 'user',
    'content': 'What is in this image? .',
    'images': [path],
}

response = chat(model='gemma3', messages=[system_message, user_message])

print(response.message.content)

Okay, let's break down what's depicted in this image. It appears to be a visual representation of forward propagation in a neural network, specifically focusing on a simplified two-layer network.

**Overall Structure:**

*   **Two Layers:**  The diagram shows two layers of neurons. Layer #1 and Layer #2.

**Detailed Breakdown:**

1.  **Inputs:**
    *   `x11`, `x12`, `x13`  - These are the inputs to the first layer.

2.  **Layer #1:**
    *   **Neurons:** The diagram shows three neurons: `x11`, `x12`, and `x13`.
    *   **Weights:**
        *   `w11`, `w12`, `w13` - These are the weights connecting the inputs to the first layer's neurons.
    *   **Calculations:**
        *   `12+2=3` - This shows the first layer's neuron '11' calculating a value using the inputs `x12` and `x11` with a sum.
        *   `6+2=8` -  This shows the first layer's neuron '12' calculating a value.
        *   `3` - This shows the first layer's neuron '13' calculating a value.

3.  **Layer #2:**
    *   **Neur

Chain-of-thought prompting

In [1]:
from ollama import chat
# from pathlib import Path

# Path of the image that is attached to the user message.
path ='C:\\Users\\fareh\\coding\\video-rag\\sample-2.png'

# System prompt: asks for a description, step-by-step reasoning and a concise
# answer, followed by a worked example of the expected format.
cot_system_prompt = '''
You are a visual reasoning assistant.
When given an image, first describe the important objects,
then reason step by step, and finally give a concise answer.
Do not hallucinate details that are not visible.
few examples:
Visible Blur: The foreground and parts of the image are out of focus or blurred, indicating either camera movement or subject motion during the shot.
Tall, Ornate Buildings: The structures have multiple floors, detailed balconies, and decorative facades, suggesting older or classic urban architecture.
Street-Level View: Parked cars line both sides of a road or narrow street, confirming an urban environment with typical city traffic and infrastructure.
Soft, Warm Light: The sunlight appears to be hitting the buildings at an angle, creating a warm glow on the façade and enhancing the sense of a real city scene rather than a staged setup.

Final Caption: A blurry photo of a city street with buildings. '''

system_message = {
    'role': 'system',
    'content': cot_system_prompt,
}
user_message = {
    'role': 'user',
    'content': 'What is in this image? .',
    'images': [path],
}

response = chat(model='gemma3', messages=[system_message, user_message])

print(response.message.content)

Here's a breakdown of the image:

**Objects:**

The image is a diagram illustrating forward propagation in a neural network. Specifically, it shows a simple two-layer neural network. 

*   **Layers:** There are two layers labeled "Layer #1" and "Layer #2”.
*   **Neurons:** Each layer contains neurons (represented by circles) connected by arrows. 
*   **Weights:** The connections between neurons have associated weights (labeled as w\_i), such as w\_11, w\_12, etc. 
*   **Activation Functions:** There is an expression for the activation function, σ, denoted as “σ(wx + b)”, where “x” represents the input and “b” is the bias.
*   **Output:** There is an output, labeled as “y”.

**Reasoning:**

The diagram represents a basic forward pass through a neural network. The arrows show the flow of data from one layer to the next. Each neuron performs a calculation (weighted sum of inputs plus bias) and then applies an activation function. The output of the network is denoted as "y".

**Concise Ans

Object detection (structured output)

In [6]:
from ollama import chat
from pydantic import BaseModel
from typing import Literal, Optional

# Path of the image that is attached to the user message.
# NOTE(review): hard-coded, machine-specific absolute path — adjust before
# running this cell on another machine.
path ='C:\\Users\\fareh\\coding\\video-rag\\sample-2.png'
# One object reported by the model; used as the element type of
# ImageDescription.objects.
class Object(BaseModel):
  name: str  # label of the detected object
  confidence: float  # presumably a score in [0, 1]; the range is not enforced — TODO confirm
  attributes: str  # free-text description of the object

# Structured description of an image. Its JSON schema is passed to the chat
# call as `format`, and the reply is validated against it.
# Documented with comments rather than a docstring so that the generated
# schema is not given an extra description.
class ImageDescription(BaseModel):
  summary: str  # overall description of the image
  objects: list[Object]  # objects the model reports
  scene: str
  colors: list[str]
  # 'Unknown' mirrors the fallback already offered by `setting`: without it the
  # schema forces the model to invent a time of day for images where none is
  # visible (e.g. diagrams or screenshots).
  time_of_day: Literal['Morning', 'Afternoon', 'Evening', 'Night', 'Unknown']
  setting: Literal['Indoor', 'Outdoor', 'Unknown']
  text_content: Optional[str] = None  # optional; defaults to None when omitted

# Single user turn: the instruction plus the image to analyse.
detection_request = {
    'role': 'user',
    'content': 'Describe this photo and list the objects you detect.',
    'images': [path],
}

# The reply is constrained to the ImageDescription JSON schema; temperature 0
# is requested through the options.
response = chat(
    model='gemma3',
    messages=[detection_request],
    options={'temperature': 0},
    format=ImageDescription.model_json_schema(),
)

# Parse and validate the JSON reply into the pydantic model.
raw_json = response.message.content
image_description = ImageDescription.model_validate_json(raw_json)
print(image_description)

summary='The image shows a screenshot of a neural network diagram, likely created in a software like MATLAB or similar. It depicts a two-layer feedforward neural network with connections and associated numerical values. The diagram includes matrices representing weights and biases, along with a sigmoid activation function.' objects=[Object(name='Neural Network Diagram', confidence=0.98, attributes='Two-layer feedforward, showing connections between nodes and matrices representing weights and biases.'), Object(name='Matrix', confidence=0.95, attributes='Represents weights (w) and biases (b) for the neural network. Includes values like w11, w12, w13, w21, w22, w23, w31, w32, w33, b11, b12, b13, b21, b22, b23, b31, b32, b33.'), Object(name='Sigmoid Activation Function', confidence=0.9, attributes="Represented by the equation 'σ(w^Tx + b)'"), Object(name='Numbers', confidence=0.99, attributes='Various numerical values are displayed within the diagram, including 1, 2, 3, 5, 7, 8, 9, 15, 12,