# Zero-Shot Setup



**1. Loading a dataset**

In [None]:
!pip install datasets



In [None]:
import random

# Seed Python's built-in `random` module with a fixed value so its output is repeatable.
random.seed(10)

In [None]:
from datasets import load_dataset

In [None]:
# Load the "train" split of the leonardPKU/clevr_cogen_a_train dataset.
dataset = load_dataset("leonardPKU/clevr_cogen_a_train", split="train")

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/26 [00:00<?, ?it/s]

In [None]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['image', 'problem', 'solution'],
    num_rows: 70000
})

**2. Splitting dataset into train and test (test size is around 1k samples)**

In [None]:
# Hold out 1.42% of the rows as the test split; the fixed seed keeps the split repeatable.
train_test_split = dataset.train_test_split(seed=10, test_size=0.0142)
train_ds, test_ds = train_test_split['train'], train_test_split['test']

**3. Initializing and trying the model**

In [None]:
import torch
import transformers
transformers.utils.move_cache()
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Use the image of the first held-out sample for a quick sanity check
image = test_ds[0]['image']

# Processor and model are both loaded from the same SmolVLM checkpoint
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-256M-Instruct",
    _attn_implementation="eager",
    torch_dtype=torch.bfloat16,
)
model = model.to(DEVICE)

# Single user turn: an image placeholder followed by the counting question
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "How many items are there in the image?"},
    ],
}]

# Render the chat template, then tokenize text + image and move the tensors to DEVICE
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

# Generate and decode; special tokens are stripped from the decoded text
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

# Show the model's output next to the reference solution of the same sample
print(generated_texts[0])
print(test_ds[0]['solution'])

0it [00:00, ?it/s]

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


User:



How many items are there in the image?
Assistant: There are 10 items in the image.
<answer> 10 </answer>


**4. Function for preprocessing prompt and image**

In [None]:
image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")]

def collate_fn(examples):
  """Build a padded zero-shot batch from a columnar slice of the dataset.

  Args:
    examples: mapping of column name to a list of values, as produced by
      slicing the dataset (e.g. ``test_ds[0:5]``). Only the ``"image"``
      column is read.

  Returns:
    The processor output for the batch, with an added ``"labels"`` entry:
    a copy of ``input_ids`` in which padding and ``<image>`` token positions
    are replaced by -100.
  """
  # Every example is asked the same fixed question, so the chat template is
  # rendered once instead of once per example.
  messages = [
      {
          "role": "user",
          "content": [
              {"type": "image"},
              {"type": "text", "text": "How many items are there in the image? Answer with only a number"}
          ]
      },
  ]
  prompt = processor.apply_chat_template(messages, add_generation_prompt=True).strip()

  images = []
  for image in examples["image"]:
      if image.mode != 'RGB':
          image = image.convert('RGB')
      # The processor takes one list of images per text prompt.
      images.append([image])
  texts = [prompt] * len(images)

  batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
  labels = batch["input_ids"].clone()
  labels[labels == processor.tokenizer.pad_token_id] = -100
  labels[labels == image_token_id] = -100
  batch["labels"] = labels

  return batch

**5. Initializing start, stop and step for splitting the dataset into batches for answer prediction**

In [None]:
# Batch bounds for the prediction loop: first row, one past the last row, rows per batch.
start, stop, step = 0, len(test_ds), 5

In [None]:
import re

**6. Using the model to predict answers for each image in the test dataset**

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
answers = []
# Walk the test set with a loop-local cursor rather than mutating `start`:
# re-running this cell then repeats the full evaluation, and a final batch
# shorter than `step` is still processed instead of being dropped.
for batch_start in range(start, stop, step):
    batch = collate_fn(test_ds[batch_start:min(batch_start + step, stop)])
    # `labels` is not needed for generation; pass only the remaining entries.
    model_inputs = {key: value.to(device) for key, value in batch.items() if key != "labels"}
    with torch.no_grad():
        outputs = model.generate(**model_inputs, max_new_tokens=500)
    # Decode only the newly generated tokens so the number is taken from the
    # model's answer and never from the prompt text.
    prompt_length = model_inputs["input_ids"].shape[1]
    generated_texts = processor.batch_decode(
        outputs[:, prompt_length:],
        skip_special_tokens=True,
    )
    for text in generated_texts:
        number = re.search(r'\d+', text)
        # Keep exactly one prediction per sample so `answers` stays aligned with
        # the test set; -1 marks an output that contains no digits.
        answers.append(int(number.group()) if number else -1)
    print('start: ', batch_start + step, ' answers: ', len(answers))

start:  5  answers:  5
start:  10  answers:  10
start:  15  answers:  15
start:  20  answers:  20
start:  25  answers:  25
start:  30  answers:  30
start:  35  answers:  35
start:  40  answers:  40
start:  45  answers:  45
start:  50  answers:  50
start:  55  answers:  55
start:  60  answers:  60
start:  65  answers:  65
start:  70  answers:  70
start:  75  answers:  75
start:  80  answers:  80
start:  85  answers:  85
start:  90  answers:  90
start:  95  answers:  95
start:  100  answers:  100
start:  105  answers:  105
start:  110  answers:  110
start:  115  answers:  115
start:  120  answers:  120
start:  125  answers:  125
start:  130  answers:  130
start:  135  answers:  135
start:  140  answers:  140
start:  145  answers:  145
start:  150  answers:  150
start:  155  answers:  155
start:  160  answers:  160
start:  165  answers:  165
start:  170  answers:  170
start:  175  answers:  175
start:  180  answers:  180
start:  185  answers:  185
start:  190  answers:  190
start:  195  a

In [None]:
# Number of predictions collected in `answers`.
len(answers)

995

**7. Calculating accuracy**

In [None]:
true_answers = test_ds['solution']
pattern = r'<answer>\s*(\d+)\s*</answer>'
# Parse each reference solution once. A solution without an <answer> tag is
# reported as an error instead of being skipped: silently dropping it would
# shift every later label relative to the predictions it is compared against.
ground_truth = []
for index, line in enumerate(true_answers):
    match = re.search(pattern, line)
    if match is None:
        raise ValueError(f"Solution {index} has no <answer>N</answer> tag: {line!r}")
    ground_truth.append(int(match.group(1)))

In [None]:
from sklearn.metrics import accuracy_score
# Share of test samples whose predicted count equals the reference count
zero_shot_accuracy = accuracy_score(ground_truth, answers)
print(zero_shot_accuracy)

0.6603015075376885


The accuracy score is quite low without training the model.

# Fine-tuning

**1. Function for preprocessing data**

In [None]:
image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")]

def collate_fn_new(examples):
  """Collate dataset rows into a padded training batch with labels.

  Args:
    examples: iterable of rows, each a mapping with ``"image"``,
      ``"problem"`` and ``"solution"`` entries.

  Returns:
    The processor output for the batch, with an added ``"labels"`` entry:
    a copy of ``input_ids`` in which padding and ``<image>`` token positions
    are replaced by -100.
  """
  texts = []
  images = []
  for example in examples:
    image = example["image"]
    if image.mode != 'RGB':
        image = image.convert('RGB')
    messages = [
          {
              "role": "user",
              "content": [
                  {"type": "image"},
                  {"type": "text", "text": example["problem"]}
              ]
          },
          {
              "role": "assistant",
              "content": [
                  {"type": "text", "text": example["solution"]}
              ]
          }
      ]
    # The conversation already ends with the assistant's answer, so no
    # generation prompt is appended after it; otherwise every training text
    # would end with a dangling, empty assistant turn.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=False)
    texts.append(prompt.strip())
    # The processor takes one list of images per text.
    images.append([image])

  batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
  labels = batch["input_ids"].clone()
  labels[labels == processor.tokenizer.pad_token_id] = -100
  labels[labels == image_token_id] = -100
  batch["labels"] = labels
  return batch

In [None]:
!pip install -q accelerate datasets peft bitsandbytes tensorboard

In [None]:
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration

**2. We will use QLoRA: the base model is quantized to reduce memory usage during fine-tuning**

In [None]:
# Switches that select the fine-tuning mode
USE_LORA = False
USE_QLORA = True
SMOL = True

# Checkpoint to fine-tune
model_id = "HuggingFaceTB/SmolVLM-256M-Instruct"
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    _attn_implementation="eager",
    torch_dtype=torch.bfloat16,
)
model = model.to(DEVICE)
# Disable `use_cache` on this model instance's config
model.config.use_cache = False

In [None]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.4


In [None]:
if USE_QLORA or USE_LORA:
    # Low-rank adapters on the attention query/value projections.
    lora_config = LoraConfig(
        r=4,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=['q_proj', 'v_proj'],
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian",
    )
    lora_config.inference_mode = False

    # QLoRA loads the base weights in 4-bit NF4; plain LoRA loads them unquantized.
    bnb_config = None
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

    model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-256M-Instruct",
        # Without this argument the quantization config is built but never
        # applied, and the model is loaded in plain bfloat16.
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        _attn_implementation="eager",
        device_map="auto",
    )
    # NOTE(review): the adapter is attached via add_adapter() and again via
    # get_peft_model() with the same config — confirm both calls are needed.
    model.add_adapter(lora_config)
    model.enable_adapters()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    print(model.get_nb_trainable_parameters())
else:
    model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-256M-Instruct",
        torch_dtype=torch.bfloat16,
        _attn_implementation="eager",
    ).to(DEVICE)

    # Full fine-tuning path: keep the vision encoder frozen.
    for param in model.model.vision_model.parameters():
        param.requires_grad = False

# `model` was rebuilt above, so disable `use_cache` on the instance that is
# actually trained (training runs with gradient checkpointing).
model.config.use_cache = False

(377856, 256862784)


**3. Initializing training arguments**

In [None]:
from transformers import TrainingArguments, Trainer

# Name the output directory after the checkpoint (the part after the "/")
model_name = model_id.split("/")[-1]
device_map="auto"
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir=f"./{model_name}-vqav2",
    report_to="tensorboard",
    logging_steps=25,
    # checkpointing: every 250 steps, keeping only the latest one
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    # schedule: one epoch, 2 samples per device x 4 accumulation steps
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    # optimizer
    learning_rate=1e-4,
    weight_decay=0.01,
    optim="paged_adamw_8bit", # for 8-bit, keep this, else adamw_hf
    # precision and memory
    bf16=True, # underlying precision for 8bit
    gradient_checkpointing=True,
    # the collator reads the raw dataset columns, so they must not be dropped
    remove_unused_columns=False,
)

In [None]:
# 4-bit NF4 quantization settings with double quantization and bfloat16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Wire the model, the training split and the custom collator into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_ds,
    data_collator=collate_fn_new,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
