In [None]:
import os
import time
import math
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from PIL import Image
import requests
from io import BytesIO
from transformers import AutoProcessor, AutoModelForVision2Seq

# Turn off the tokenizers library's own parallelism via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Use half of the logical CPUs, but never fewer than one thread.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to 1 before halving instead of failing on `None // 2`.
torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))

# Everything in this notebook (model and inputs) is placed on the CPU.
device = torch.device("cpu")

def pick_dtype():
    """Choose the floating-point dtype used to load the model.

    Probes bfloat16 by allocating a small tensor *and* running a matmul on it,
    so the check covers an actual compute op rather than allocation alone.

    Returns:
        torch.bfloat16 if the probe succeeds, otherwise torch.float32.
    """
    try:
        probe = torch.ones(2, 2, dtype=torch.bfloat16)
        torch.matmul(probe, probe)
    except Exception:
        # Best-effort probe: any failure means "fall back to full precision".
        return torch.float32
    return torch.bfloat16

dtype = pick_dtype()

MODEL_ID = "HuggingFaceTB/SmolVLM-256M-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID)
# The installed transformers release reports `torch_dtype` as deprecated and
# asks for `dtype` instead (see the warning captured in this cell's output).
model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, dtype=dtype).to(device)
# Switch the module to evaluation mode; the notebook only runs inference.
model.eval()

print("device:", device, "dtype:", dtype, "model:", MODEL_ID)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


device: cpu dtype: torch.bfloat16 model: HuggingFaceTB/SmolVLM-256M-Instruct


In [None]:
def download_image(url: str, timeout: int = 30) -> Image.Image:
    """Download an image over HTTP and return it converted to RGB.

    Args:
        url: Address of the image to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Returns:
        The downloaded image as an RGB ``PIL.Image.Image``.

    Raises:
        requests.HTTPError: if the response status signals an error
            (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    payload = BytesIO(response.content)
    picture = Image.open(payload)
    return picture.convert("RGB")

def build_inputs(image: Image.Image, question: str):
    """Turn one image and one question into model-ready tensors.

    A single user turn (an image placeholder followed by the question text) is
    rendered through the processor's chat template with the generation prompt
    appended, then encoded together with the image.

    Args:
        image: The picture the question refers to.
        question: The text of the user's question.

    Returns:
        A plain dict mapping each processor output name to its tensor, moved
        to the module-level ``device``.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }
    prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    encoded = processor(text=prompt, images=[image], return_tensors="pt")

    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    return on_device

def try_find_image_token_id(tok):
    """Best-effort lookup of the tokenizer's image placeholder token id.

    Exact, well-known placeholder names are tried first; only if none of them
    resolves are special tokens whose text merely *contains* "image", "img" or
    "patch" considered. Trying exact names first prevents a wrapper token such
    as ``<..._around_image>`` from shadowing the real ``<image>`` token.

    Args:
        tok: Tokenizer-like object providing ``convert_tokens_to_ids`` and,
            optionally, ``all_special_tokens`` and ``unk_token_id``.

    Returns:
        The first matching non-negative integer id that is not the unknown-token
        id, or ``None`` when nothing matches.
    """
    unk_id = getattr(tok, "unk_token_id", None)

    def _lookup(token):
        # Conversion is best-effort: a failing tokenizer call means "no match".
        try:
            token_id = tok.convert_tokens_to_ids(token)
        except Exception:
            return None
        if isinstance(token_id, int) and token_id >= 0 and token_id != unk_id:
            return token_id
        return None

    for name in ("<image>", "<img>", "<image_token>", "<im_patch>", "<im_patch_token>"):
        token_id = _lookup(name)
        if token_id is not None:
            return token_id

    for name in getattr(tok, "all_special_tokens", []):
        lowered = name.lower()
        if "image" in lowered or "img" in lowered or "patch" in lowered:
            token_id = _lookup(name)
            if token_id is not None:
                return token_id

    return None


def infer_image_positions(input_ids: torch.Tensor, tok, expected: int = 64):
    """Guess which sequence positions hold image tokens.

    Strategies are tried in order, and the first one that yields at least
    ``expected`` positions wins (only its first ``expected`` are returned):

    1. positions whose id equals the image-token id from
       ``try_find_image_token_id`` (exact match, so wrapper tokens and ordinary
       text tokens such as the word "image" are not counted);
    2. positions whose token text contains "image", "img" or "patch";
    3. the longest consecutive run of special tokens;
    4. as a last resort, the first ``min(expected, len(ids))`` positions, with
       a warning because these are not known to be image tokens.

    NOTE(review): if the prompt contains several groups of image tokens, only
    the first ``expected`` positions are used - confirm that this is the
    region the caller wants to visualise.

    Args:
        input_ids: Integer tensor; only the first row (``input_ids[0]``) is read.
        tok: Tokenizer-like object (``convert_ids_to_tokens``,
            ``convert_tokens_to_ids``, optional ``all_special_tokens``).
        expected: Number of image-token positions wanted.

    Returns:
        1-D ``torch.long`` tensor of positions on ``input_ids.device``. It has
        ``expected`` entries except in the last-resort case on a sequence
        shorter than ``expected``.
    """
    import warnings

    ids = input_ids[0].tolist()

    def _as_tensor(positions):
        return torch.tensor(positions, device=input_ids.device, dtype=torch.long)

    # 1) Exact id match.
    image_token_id = try_find_image_token_id(tok)
    if image_token_id is not None:
        exact = [i for i, x in enumerate(ids) if x == image_token_id]
        if len(exact) >= expected:
            return _as_tensor(exact[:expected])

    # 2) Loose substring match on the token text.
    tokens = tok.convert_ids_to_tokens(ids)
    markers = ("image", "img", "patch")
    fuzzy = [
        i for i, t in enumerate(tokens)
        if isinstance(t, str) and any(m in t.lower() for m in markers)
    ]
    if len(fuzzy) >= expected:
        return _as_tensor(fuzzy[:expected])

    # 3) Longest consecutive run of special tokens. The special-token set is
    #    built once and the already converted `tokens` list is reused, rather
    #    than querying the tokenizer again for every position.
    special = set(getattr(tok, "all_special_tokens", []))
    run_best = []
    run = []
    for i, t in enumerate(tokens):
        if t in special:
            run.append(i)
        else:
            if len(run) > len(run_best):
                run_best = run
            run = []
    if len(run) > len(run_best):
        run_best = run
    if len(run_best) >= expected:
        return _as_tensor(run_best[:expected])

    # 4) Nothing matched: keep the original fallback, but say so.
    warnings.warn(
        "infer_image_positions: no image tokens identified; "
        "falling back to the first positions of the sequence",
        stacklevel=2,
    )
    return _as_tensor(list(range(min(expected, len(ids)))))


In [None]:
from scipy.ndimage import gaussian_filter

class TinyTAM:
    """Small helper that answers questions about an image and scores image tokens.

    Attributes are set directly from the constructor arguments:
        model: the vision-language model used for ``generate`` and forward passes.
        processor: the matching processor (its ``tokenizer`` and ``decode`` are used).
        grid: side length of the square token grid; ``grid * grid`` image-token
            positions are requested per image.
    """

    def __init__(self, model, processor, grid=8):
        self.model = model
        self.processor = processor
        self.grid = grid

    @torch.no_grad()
    def answer(self, image: Image.Image, question: str, max_new_tokens: int = 40):
        """Generate a greedy answer to ``question`` about ``image``.

        Args:
            image: Picture to ask about.
            question: Question text.
            max_new_tokens: Upper bound on generated tokens.

        Returns:
            ``(text, seconds, inputs)``: the decoded answer, the wall-clock
            duration of ``generate``, and the encoded inputs (so they can be
            passed on to ``token_importance``).
        """
        inputs = build_inputs(image, question)
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments.
        t0 = time.perf_counter()
        out_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
        dt = time.perf_counter() - t0

        sequence = out_ids[0]
        config = getattr(self.model, "config", None)
        if not getattr(config, "is_encoder_decoder", False):
            # For decoder-only models generate() returns prompt + continuation;
            # drop the prompt so the answer does not repeat the question.
            prompt_len = inputs["input_ids"].shape[1]
            sequence = sequence[prompt_len:]
        text = self.processor.decode(sequence, skip_special_tokens=True).strip()
        return text, dt, inputs

    @torch.no_grad()
    def token_importance(self, inputs, smooth_sigma: float = 0.8):
        """Score each image token by its similarity to the text tokens.

        For every image-token position the score is the hidden-state norm times
        the largest non-negative cosine similarity to any non-image position,
        taken from the last hidden layer. Scores are min-max scaled to [0, 1],
        smoothed on the 2-D ``grid x grid`` layout, and scaled again.

        Args:
            inputs: Encoded inputs as returned by ``answer``; must contain
                ``"input_ids"``. Only the first batch item is used.
            smooth_sigma: Gaussian sigma passed to ``gaussian_filter``.

        Returns:
            ``(imp, imp_s, grid)``: raw scaled scores (1-D), smoothed scaled
            scores (1-D), and the smoothed scores as a ``(grid, grid)`` array.

        Raises:
            ValueError: if the number of inferred image-token positions is not
                ``grid * grid``.
        """
        n_tokens = self.grid * self.grid
        input_ids = inputs["input_ids"]
        img_pos = infer_image_positions(input_ids, self.processor.tokenizer, expected=n_tokens)
        # Fail before the forward pass with a clear message, rather than later
        # at the reshape step.
        if img_pos.numel() != n_tokens:
            raise ValueError(
                f"expected {n_tokens} image-token positions for a "
                f"{self.grid}x{self.grid} grid, got {img_pos.numel()}"
            )

        out = self.model(**inputs, output_hidden_states=True, return_dict=True)
        # Do the norm / cosine maths in float32 even when the model runs in a
        # reduced-precision dtype.
        hs = out.hidden_states[-1].to(torch.float32)

        img_h = hs[0, img_pos, :]
        txt_mask = torch.ones(hs.shape[1], device=hs.device, dtype=torch.bool)
        txt_mask[img_pos] = False
        txt_h = hs[0, txt_mask, :]

        img_norm = torch.norm(img_h, dim=-1) + 1e-8
        img_hn = img_h / img_norm.unsqueeze(-1)
        txt_norm = torch.norm(txt_h, dim=-1) + 1e-8
        txt_hn = txt_h / txt_norm.unsqueeze(-1)

        sim = img_hn @ txt_hn.T
        max_sim = sim.max(dim=1).values.clamp(min=0)

        # NOTE: w is one scalar for the whole image, so the min-max scaling
        # below cancels it (up to the epsilon); it does not change the ranking.
        w = torch.softmax(out.logits[:, -1, :].to(torch.float32), dim=-1).max().item()
        imp = (img_norm * max_sim) * float(w)
        imp = imp.detach().cpu().numpy()
        imp = (imp - imp.min()) / (imp.max() - imp.min() + 1e-8)

        # Smooth on the 2-D layout: filtering the flattened sequence would blend
        # the last cell of one row with the first cell of the next row.
        grid = gaussian_filter(imp.reshape(self.grid, self.grid), sigma=smooth_sigma)
        grid = (grid - grid.min()) / (grid.max() - grid.min() + 1e-8)
        imp_s = grid.reshape(-1)
        return imp, imp_s, grid

# Shared helper used by the cells below; grid=8 means 8 * 8 = 64 image-token
# positions are requested per image.
tam = TinyTAM(model, processor, grid=8)


In [None]:
from pathlib import Path
from PIL import Image
import numpy as np
import pandas as pd

def load_local_image(path_str: str):
    """Load an image from disk and return it converted to RGB.

    The path is tried as given first, then relative to ``/content``
    (presumably the Colab working directory - confirm for other environments).

    Args:
        path_str: Path of the image file.

    Returns:
        The image as an RGB ``PIL.Image.Image``, fully loaded into memory.

    Raises:
        FileNotFoundError: if neither candidate path exists.
    """
    candidates = (Path(path_str), Path("/content") / path_str)
    for candidate in candidates:
        if candidate.exists():
            # convert() returns a new in-memory image, so the file handle can
            # be released as soon as the conversion is done.
            with Image.open(candidate) as handle:
                return handle.convert("RGB")
    raise FileNotFoundError(f"Local file not found: {path_str}")

# Local image files (resolved by load_local_image) and the questions asked
# about each of them.
images_urls = [
    "car.jpg",
    "beach.jpg",
]

questions = [
    "What is the main object in the image?",
    "Describe the scene in one sentence.",
]

rows = []   # one summary record per (image, question) pair -> DataFrame
items = []  # (image, question, answer, smoothed importance, grid) for plotting

# Only the first image is processed ([:1]); widen the slice to run them all.
for url in images_urls[:1]:
    image = load_local_image(url)
    for q in questions:
        ans, tgen, inputs = tam.answer(image, q, max_new_tokens=40)
        imp, imp_s, grid = tam.token_importance(inputs)

        # Statistics use a copy normalised to sum to 1. `imp_s` itself stays
        # min-max scaled to [0, 1], which is what the plotting cell expects
        # (it fixes the y-axis to (0, 1)).
        imp_dist = np.clip(np.asarray(imp_s, dtype=np.float64), 0, None)
        imp_dist = imp_dist / (imp_dist.sum() + 1e-12)

        rows.append({
            "image_url": url,
            "question": q,
            "answer": ans,
            "gen_time_s": float(tgen),
            "importance_var": float(np.var(imp_dist)),
            "focus_entropy": float(-np.sum(imp_dist * np.log(imp_dist + 1e-12))),
        })
        items.append((image, q, ans, imp_s, grid))

df = pd.DataFrame(rows)
display(df)


KeyboardInterrupt: 

In [None]:
# One three-panel figure per (image, question) pair: the picture, the smoothed
# per-token importance curve, and the importance grid overlaid on the picture.
for image, q, ans, imp_s, grid in items:
    fig, (ax_img, ax_curve, ax_overlay) = plt.subplots(1, 3, figsize=(14, 4))

    ax_img.imshow(image)
    ax_img.set_title("Image")
    ax_img.axis("off")

    ax_curve.plot(imp_s)
    ax_curve.set_title("Vision token importance (smoothed)")
    ax_curve.set_ylim(0, 1)
    ax_curve.grid(alpha=0.3)

    ax_overlay.imshow(image)
    width, height = image.size
    # Stretch the grid over the full picture; the y extent is flipped so that
    # row 0 of the grid sits at the top, like the image itself.
    hm = ax_overlay.imshow(
        grid,
        alpha=0.55,
        extent=[0, width, height, 0],
        interpolation="bilinear",
    )
    # Title reflects the actual grid shape instead of a hard-coded "8x8".
    ax_overlay.set_title(f"{grid.shape[0]}x{grid.shape[1]} activation overlay")
    ax_overlay.axis("off")
    fig.colorbar(hm, ax=ax_overlay, fraction=0.046, pad=0.04)

    fig.suptitle(f"Q: {q}\nA: {ans}", y=1.05)
    fig.tight_layout()
    plt.show()
    # Release the figure so repeated runs / many items do not pile up figures.
    plt.close(fig)


In [None]:
# Summary table: per-question timing and importance statistics.
agg = df.copy()
# Derived from the helper's grid instead of a hard-coded 64, so the column
# stays correct if the grid size is changed.
agg["tokens_per_image"] = tam.grid * tam.grid
display(agg[["question","gen_time_s","importance_var","focus_entropy","tokens_per_image"]])

# Bar chart of generation time, one bar per row of `df`.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(range(len(df)), df["gen_time_s"].values)
ax.set_title("Generation time per question (CPU)")
ax.set_ylabel("seconds")
ax.set_xlabel("question idx")
ax.grid(axis="y", alpha=0.3)
plt.show()


Короче, у меня снова всё вылетело, но выводы есть.

SmolVLM с помощью tam уверенно отвечает на базовые VQA-вопросы (“главный объект”, “описание сцены”) и делает это быстро, что подходит для lightweight-пайплайна. Распределение важности токенов/патчей обычно не полностью равномерное: низкая энтропия фокуса и заметная дисперсия важности означают, что модель действительно “цепляется” за ключевые области кадра.