In [2]:
# install moondream dependencies
!pip install --upgrade torch --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl (780.4 MB)
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.1.0+cu118
    Uninstalling torch-2.1.0+cu118:
      Successfully uninstalled torch-2.1.0+cu118
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.1.0+cu118 requires torch==2.1.0, but you have torch 2.5.1+cu121 which is incompatible.
torchvision 0.16.0+cu118 requires torch==2.1.0, but you have torch 2.5.1+cu121 which is incompatible.[0m[31m
[0mSuccessfully installed torch-2.5.1+cu121
[0m

In [3]:
!pip install flash_attn # requires torch > 2.2

Collecting flash_attn
  Using cached flash_attn-2.7.4.post1.tar.gz (6.0 MB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: flash_attn
  Building wheel for flash_attn (setup.py) ... [?25ldone
[?25h  Created wheel for flash_attn: filename=flash_attn-2.7.4.post1-cp310-cp310-linux_x86_64.whl size=187797312 sha256=b267f80a08e516292cdd748056a2178a45b8abedf7fca123292eb17c21c8c87c
  Stored in directory: /root/.cache/pip/wheels/59/ce/d5/08ea07bfc16ba218dc65a3a7ef9b6a270530bcbd2cea2ee1ca
Successfully built flash_attn
Installing collected packages: flash_attn
Successfully installed flash_attn-2.7.4.post1
[0m

In [12]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

"""
smolvlm-256m is a 0.25B text-to-image model that has several interesting capabilities trained in.
These include:
- captioning - with various lengths
- general querying (e.g., "how many people are in this image?")
- translate text on image

There are several smaller versions of the model as well.

the repo: https://huggingface.co/collections/HuggingFaceTB/smolvlm-256m-and-500m-6791fafc5bb0ab8acc960fb0
"""

# Automatically determine the best available device
if torch.backends.mps.is_available():
    device = "mps"  # Metal (Apple Silicon)
elif torch.cuda.is_available():
    device = "cuda"  # NVIDIA GPU
else:
    device = "cpu"  # Fallback to CPU

print('device is', device)

def smolvlm_image_to_text(image_path: str) -> str:
    # instantiate the model
    print("INFO: starting download or loading of model - smolVLM...")
    
    # Initialize processor and model
    model_size = "HuggingFaceTB/SmolVLM-256M-Instruct" # HuggingFaceTB/SmolVLM-500M-Instruct
    processor = AutoProcessor.from_pretrained(model_size)
    model = AutoModelForVision2Seq.from_pretrained(
        model_size,
        torch_dtype=torch.bfloat16,
        _attn_implementation="eager" #"flash_attention_2" if device == "cuda" else "eager",
    ).to(device)
    print("INFO: ... done")

    print(f"INFO: starting image to text extraction for image {image_path}...")
    image = Image.open(image_path)
    # Create input messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Can you describe this image?"}
            ]
        },
    ]
    
    # Prepare inputs
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to(device)
    
    # Generate outputs
    generated_ids = model.generate(**inputs, max_new_tokens=250)
    generated_texts = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )
    print("INFO: ... done")

    # clean up
    raw_output = generated_texts[0]

    substring = "Can you describe this image?"
    if substring in raw_output:
        raw_output = raw_output.split(substring, 1)[-1].strip()
    substring = "### Analysis and Description:"
    if substring in raw_output:
        raw_output = raw_output.split(substring, 1)[0].strip()
    substring = "Assistant: "
    if substring in raw_output:
        raw_output = raw_output.split(substring, 1)[-1].strip()
    return raw_output
    
image_path = "./examples/both pills.jpeg"
smolvlm_image_to_text(image_path)


device is cuda
INFO: starting download or loading of model - smolVLM...


Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


INFO: ... done
INFO: starting image to text extraction for image ./examples/both pills.jpeg...
INFO: ... done


'The image features a close-up of a man\'s face. He appears to be in a somewhat serious or contemplative mood, as indicated by his expression and the way his eyes are directed downwards. The man\'s expression is intense and focused, with a slight furrowed brow and a furrowed chin. His hair is short and appears to be bald, and he is wearing dark sunglasses that cover his eyes. The background of the image is blurred, which helps to focus the viewer\'s attention on the man\'s face.\n\nThe text on the image is superimposed on the image. The text is in white and is positioned at the top and bottom of the image. The text reads:\n\n"Did you just take both pills?"\n\nThe font of the text is bold and sans-serif, making it easy to read. The phrase "Did you just take both pills?" is in a conversational tone, suggesting that the man might be asking a question related to his health or medication.'