In [None]:
# Store the current working directory (result of the IPython `%pwd` line magic)
# in `cwd` and print it.
cwd = %pwd
print(f'Current Working Directory: {cwd}')

In [None]:
# List the contents of the current working directory (IPython `%ls` line magic).
%ls

In [None]:
# GPU status: shell out to `nvidia-smi` and show its report in the cell output.
!nvidia-smi

### Installation

In [None]:
# Install all required packages

%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

%%capture
!pip install "timm==1.0.19"   # Only for Gemma 3N
!pip install "gdown==5.2.0"
!pip install tqdm

import unsloth
# print(unsloth.__version__)
import torch; torch._dynamo.config.recompile_limit = 64;

In [None]:
# --- Unsloth must be imported before transformers, trl, peft ---
from unsloth import FastVisionModel  # or FastLanguageModel
from unsloth import get_chat_template

import os
import json
import sys
import io
from PIL import Image
import requests
import gdown
import zipfile
from tqdm import tqdm

import torch
import torch.cuda
import yaml
from transformers import TextStreamer
from huggingface_hub import login

### Download Test Set

In [None]:
def download_and_unzip(url, extract_dir="."):
    """
    Download a .zip archive from ``url`` and extract it into ``extract_dir``.

    URLs containing "drive.google.com" are fetched with ``gdown``; every other
    URL is streamed with ``requests`` behind a tqdm progress bar. The temporary
    archive is deleted afterwards, whether or not extraction succeeded.

    Errors are reported with a printed message instead of being raised, so a
    failed download does not abort the notebook; check the return value to
    find out whether the files are actually there.

    Args:
        url (str): The URL of the .zip file to download.
        extract_dir (str, optional): The directory to extract the files into.
                                     Created if it does not exist.
                                     Defaults to the current directory.

    Returns:
        bool: True if the archive was downloaded and extracted, False if any
        step failed.
    """
    from urllib.parse import urlparse  # stdlib; local so the block is self-contained

    # Create the extraction directory if it doesn't exist
    os.makedirs(extract_dir, exist_ok=True)

    is_gdrive = "drive.google.com" in url
    if is_gdrive:
        # The Drive URL carries no usable file name, so use a fixed one.
        zip_filename = "downloaded_from_gdrive.zip"
    else:
        # Use only the URL *path*: a query string ("?dl=1") must not end up in
        # the file name, and a URL ending in "/" must not yield an empty name.
        zip_filename = os.path.basename(urlparse(url).path) or "download.zip"

    zip_path = os.path.join(extract_dir, zip_filename)
    # Set once we are about to write zip_path, so cleanup never deletes a
    # same-named file that this call did not touch.
    archive_written = False

    print(f"⬇️  Starting download from: {url}")

    try:
        # --- Download Logic ---
        if is_gdrive:
            archive_written = True
            # Some gdown versions report failure by returning None, not raising.
            if gdown.download(url, zip_path, quiet=False) is None:
                raise RuntimeError("gdown did not return a downloaded file path")
        else:
            # The timeout keeps a stalled server from hanging the cell forever;
            # the context manager releases the connection when done.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()  # Raise an exception for bad status codes
                total_size = int(response.headers.get("content-length", 0))

                with tqdm(
                    total=total_size, unit="iB", unit_scale=True, desc=f"Downloading {zip_filename}"
                ) as pbar:
                    archive_written = True
                    with open(zip_path, "wb") as f:
                        for data in response.iter_content(chunk_size=1024 * 4):
                            f.write(data)
                            pbar.update(len(data))

        print(f"\n✅ Download complete. File saved to: {zip_path}")

        # --- Unzip Logic ---
        print(f"\n📦 Unzipping {zip_filename}...")
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(path=extract_dir)

        print(f"✅ Successfully extracted files to: {extract_dir}")
        return True

    except Exception as e:
        # Best-effort by design: report the problem and let the notebook go on.
        print(f"❌ An error occurred: {e}")
        return False

    finally:
        # --- Cleanup --- (also after a failure, so no partial archive is left)
        if archive_written and os.path.exists(zip_path):
            os.remove(zip_path)
            print(f"🗑️  Removed temporary zip file: {zip_path}")

In [None]:
# --- Downloading Test Set ---
# Opt-in switch: set to True to fetch the test images into `extract_dir`.
DOWNLOAD_TEST_SET = False
if DOWNLOAD_TEST_SET:
  extract_dir = "/content/vizwiz_test_images"
  # download_and_unzip() creates the directory *before* downloading and only
  # prints on failure, so a failed attempt leaves an empty directory behind.
  # Skip only when the directory actually contains something.
  if os.path.isdir(extract_dir) and os.listdir(extract_dir):
    print(f"--- Directory '{extract_dir}' already exists. Skipping download. ---")
  else:
    print("--- Downloading Test Set  ---")
    # url = "https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip"
    url = "https://drive.google.com/uc?export=download&id=1qiZVdJo9kAVYy7OUaPjY1Dbmh5UirEB9"
    download_and_unzip(url=url, extract_dir=extract_dir)
    print("\n" + "="*50 + "\n")

### Set Up Hugging Face

In [None]:
# Log in to Hugging Face

def get_hf_token():
    """
    Return the Hugging Face access token for the current runtime.

    In Google Colab (detected by ``google.colab`` being importable) the token
    is read with ``userdata.get("HF_TOKEN")``. Anywhere else it is read from
    the ``HF_TOKEN`` environment variable; if python-dotenv is installed, a
    ``.env`` file is loaded first so it can supply that variable.

    Returns:
        str: The non-empty token.

    Raises:
        RuntimeError: If no token was found.
    """
    # Check if running in Google Colab
    try:
        from google.colab import userdata
    except ImportError:
        userdata = None

    if userdata is not None:
        hf_token = userdata.get("HF_TOKEN")
        print("Running in Colab: using userdata for HF_TOKEN.")
    else:
        # python-dotenv is optional: without it, a token exported in the
        # environment must still work instead of failing on the import.
        try:
            from dotenv import load_dotenv
        except ImportError:
            print("Not in Colab: python-dotenv not installed, using environment for HF_TOKEN.")
        else:
            load_dotenv()
            print("Not in Colab: using .env for HF_TOKEN.")
        hf_token = os.getenv("HF_TOKEN")

    if not hf_token:
        raise RuntimeError("HF_TOKEN not found in Colab userdata or .env file.")
    return hf_token

# Fetch the token and authenticate this session with the Hugging Face Hub.
hf_token = get_hf_token()
login(token=hf_token)
# Never echo any part of the token: cell outputs are saved with the notebook
# and travel with it when the file is shared or committed.
print("Hugging Face login complete.")

In [None]:
def _load_rgb_image(image_source):
    """Return the image at an http(s) URL or local path as RGB, or None (after printing why) on failure."""
    try:
        print(f"Opening image: {image_source}")
        if image_source.startswith(("http://", "https://")):
            # Fetch the decoded body with a timeout and a status check, so an
            # HTML error page or a stalled server is reported as a failure.
            response = requests.get(image_source, timeout=30)
            response.raise_for_status()
            return Image.open(io.BytesIO(response.content)).convert("RGB")
        if os.path.exists(image_source):
            # convert() returns a fully loaded copy, so the file can be closed.
            with Image.open(image_source) as img:
                return img.convert("RGB")
        print(f"Error: Image not found at {image_source}")
        return None
    except Exception as e:
        print(f"Failed to load image: {e}")
        return None


def run_inference(model_name, image_url, prompt):
    """
    Load a fine-tuned vision model, describe one image, and print the result as JSON.

    Args:
        model_name (str): Model identifier passed to ``FastVisionModel.from_pretrained``.
        image_url (str): Either an http(s) URL or a local file path of the image.
        prompt (str): Instruction text sent to the model together with the image.

    Returns:
        None. The result is printed as a JSON object with the keys
        ``image_source`` and ``generated_description``. If the image cannot be
        loaded, a message is printed and nothing else happens.
    """
    # Validate the image first: there is no point in loading the model when
    # there is nothing to run it on.
    image = _load_rgb_image(image_url)
    if image is None:
        return

    print(f"Loading model: {model_name}")
    model, processor = FastVisionModel.from_pretrained(
        model_name=model_name,
        load_in_4bit=True, # Use 4bit to reduce memory use and speed up inference with Unsloth.
        dtype=None, # None for auto detection
    )
    print("Model and processor loaded successfully.")

    # Chat Template
    processor = get_chat_template(
        processor,
        "gemma-3"
    )

    # Prepare for inference
    FastVisionModel.for_inference(model)

    messages = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": prompt}],
        }
    ]

    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to("cuda")

    # Generate output
    print("Running inference...")
    # max_new_tokens bounds the length of the answer.
    outputs = model.generate(**inputs, max_new_tokens=300, use_cache=True, eos_token_id=processor.tokenizer.eos_token_id)

    # generate() returns prompt tokens followed by the new tokens. Slice the
    # prompt off by length instead of searching the decoded text for a
    # "model\n" marker, which breaks when the prompt itself contains it.
    prompt_length = inputs["input_ids"].shape[-1]
    completion = processor.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)[0]

    # Keep only the first line of the answer.
    first_line = completion.strip().split("\n")[0].strip()
    generated_text = first_line or "Could not extract generated text."

    # Prepare JSON-like output
    output_data = {
        "image_source": image_url,
        # "prompt": prompt,
        "generated_description": generated_text,
    }

    # Print the result as JSON
    print("\n--- Inference Result (JSON) ---")
    print(json.dumps(output_data, indent=4))

### Define Constants

In [None]:
# --- Inference configuration ---
# Model identifier passed to run_inference.
MODEL_NAME = "mazqoty/gemma-3n-vizWiz-finetuned"

# Image to describe: an http(s) URL or a local path. Uncomment one of the
# alternatives below to switch sources.
# IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"  # A sample image of cats
IMAGE_URL = "http://images.cocodataset.org/test-stuff2017/000000000416.jpg"

# Load image from Disk/Downloaded Test Set
# IMAGE_URL=  "/content/vizwiz_test_images/test/VizWiz_test_00000003.jpg"
# IMAGE_URL = "/content/vizwiz_test_images/test/VizWiz_test_00000052.jpg"

# Instruction sent to the model together with the image.
PROMPT = (
    "You are a helpful assistant for a visually impaired person. "
    "Your task is to describe the scene in the provided image clearly and concisely, "
    "focusing on potential obstacles or key objects."
)
# PROMPT = "Write a short, clear description of this image."

In [None]:
# Describe the configured image using the configured model and prompt.
run_inference(model_name=MODEL_NAME, image_url=IMAGE_URL, prompt=PROMPT)