### Inference with a Hugging Face VLM

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
# Device used for the input tensors further down; the model itself is placed by `device_map`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
# `device_map="auto"` already decides where the weights live (and may shard them
# across several GPUs). Do not chain `.to(DEVICE)` after it: moving a dispatched
# model conflicts with that placement and can warn or fail when the model is
# sharded or offloaded.
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",  # distribute across gpus
    attn_implementation="eager",  # flash attention will probably not work
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.49G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [None]:
from PIL import Image
from transformers.image_utils import load_image

# Fetch the two example pictures used in the prompt below.
image1, image2 = (
    load_image(url)
    for url in (
        "https://huggingface.co/spaces/HuggingFaceTB/SmolVLM/resolve/main/example_images/rococo.jpg",
        "https://huggingface.co/spaces/HuggingFaceTB/SmolVLM/resolve/main/example_images/rococo_1.jpg",
    )
)

# Create input messages: a single user turn with one {"type": "image"} entry
# per picture, followed by the text question.
user_content = [{"type": "image"}, {"type": "image"}]
user_content.append({"type": "text", "text": "Can you describe the two images?"})
messages = [{"role": "user", "content": user_content}]

# Prepare inputs: apply the chat template, run the processor on prompt + images,
# then move the resulting batch to DEVICE.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(
    text=prompt,
    images=[image1, image2],
    return_tensors="pt",
).to(DEVICE)


In [None]:
# Generate at most 500 new tokens, then skip as many leading positions in each
# row as the prompt has tokens so that only the continuation is decoded.
prompt_length = inputs["input_ids"].shape[1]
generated_ids = model.generate(**inputs, max_new_tokens=500)[:, prompt_length:]
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts[0])


 The first image is a painting of two cherubs holding flowers, while the second image is a painting of a ship sailing on the sea.


### Using an HF model as a frozen submodule

In [None]:
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import numpy as np

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load a small subset of the SNLI dataset
def load_snli_subset(hf_dataset_id, split, sample_size):
    """Return the first ``sample_size`` rows of one split of a Hub dataset.

    Args:
        hf_dataset_id: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to load (e.g. ``"train"``).
        sample_size: Maximum number of leading rows to keep. Values larger
            than the split are clamped to the split length instead of
            raising an out-of-range error.

    Returns:
        The object returned by ``.select`` on the loaded split, holding at
        most ``sample_size`` rows.
    """
    # Request only the split that is needed rather than every split.
    split_data = load_dataset(hf_dataset_id, split=split)
    row_count = min(sample_size, len(split_data))
    return split_data.select(range(row_count))

# Torch dataset that tokenizes premise/hypothesis pairs on access
class SNLIDataset(Dataset):
    """Sentence-pair dataset built from a label-filtered subset of a Hub dataset.

    The first ``sample_size`` rows of ``split`` are loaded through
    ``load_snli_subset`` and rows whose ``label`` is outside 0..2 are dropped,
    so the final length can be smaller than ``sample_size``.
    (Presumably the dropped rows are examples without a usable label --
    verify against the dataset card.)
    """

    def __init__(self, hf_dataset_id, split, tokenizer, sample_size, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length

        subset = load_snli_subset(hf_dataset_id, split, sample_size)
        # Only labels 0, 1 and 2 are kept.
        self.data = subset.filter(lambda row: row['label'] >= 0 and row['label'] <= 2)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors plus the label.

        Returns a dict with ``input_ids`` and ``attention_mask`` (leading
        dimension of the tokenizer output squeezed away) and ``label`` as a
        ``torch.long`` tensor.
        """
        row = self.data[idx]

        # Premise and hypothesis are passed as a pair, padded/truncated to max_length.
        tokens = self.tokenizer(
            row['premise'], row['hypothesis'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {field: tokens[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(row['label'], dtype=torch.long)
        return item

# Define the model class
class EntailmentModel(nn.Module):
    """Linear classifier on top of a frozen BERT encoder.

    Args:
        bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits.

    Only ``self.classifier`` is trainable; every BERT parameter has
    ``requires_grad=False`` and the encoder is kept in eval mode at all times.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_classes=3):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        for param in self.bert.parameters():  # Freeze BERT parameters
            param.requires_grad = False
        self.bert.eval()
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def train(self, mode=True):
        """Switch the trainable head's mode while keeping the encoder in eval mode.

        ``nn.Module.train`` recurses into every submodule, which would turn the
        frozen encoder's dropout back on and feed noisy features to the
        classifier. The encoder is therefore forced back to eval mode.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of tokenized sentence pairs."""
        # The encoder is frozen, so no autograd graph is needed for it.
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Pooled sentence representation exposed by the encoder as `pooler_output`.
        cls_output = outputs.pooler_output
        logits = self.classifier(cls_output)
        return logits

In [None]:
# Training setup
def train_model(model, train_loader, val_dataloader, criterion, optimizer, device, epochs=5):
    """Train ``model`` and restore the weights with the lowest validation loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``label`` tensors) used for optimisation.
        val_dataloader: Iterable of batches in the same format, used only for
            evaluation after each epoch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, loaded with the parameters from the epoch
        that had the lowest average validation loss. If no epoch produced a
        snapshot (e.g. ``epochs=0``), the model is returned as it is.
    """
    best_val_loss = float('inf')
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        # Losses are averaged per batch, not per example.
        avg_train_loss = total_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_dataloader)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Clone every tensor: `state_dict().copy()` is a shallow copy whose
            # tensors alias the live parameters, so later optimizer steps would
            # silently overwrite the "best" snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"New best model saved with validation loss: {best_val_loss:.4f}")

    # No snapshot exists when no epoch ran or the validation loss was never finite.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model


In [None]:
# Tokenizer matching the default checkpoint used by EntailmentModel.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# `sample_size` is the number of rows taken before label filtering, so the
# resulting datasets can be slightly smaller.
train_dataset = SNLIDataset("stanfordnlp/snli", "train", tokenizer, sample_size=1008)
val_dataset = SNLIDataset("stanfordnlp/snli", "validation", tokenizer, sample_size=96)
test_dataset = SNLIDataset("stanfordnlp/snli", "test", tokenizer, sample_size=96)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation loaders keep a fixed order: the validation loss is a mean of
# per-batch means, so a reshuffled, uneven last batch would make the value used
# for best-checkpoint selection vary from run to run.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model = EntailmentModel().to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1008 [00:00<?, ? examples/s]

Filter:   0%|          | 0/96 [00:00<?, ? examples/s]

Filter:   0%|          | 0/96 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Cross-entropy on the logits; Adam receives only the classifier's parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.classifier.parameters(), lr=2e-4)

# train_model returns the model object it was given; rebind `model` to it.
model = train_model(
    model=model,
    train_loader=train_loader,
    val_dataloader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    epochs=10,
)

Epoch 1/10, Train Loss: 1.0991, Val Loss: 1.1010
New best model saved with validation loss: 1.1010
Epoch 2/10, Train Loss: 1.1023, Val Loss: 1.0903
New best model saved with validation loss: 1.0903
Epoch 3/10, Train Loss: 1.0948, Val Loss: 1.0897
New best model saved with validation loss: 1.0897
Epoch 4/10, Train Loss: 1.0973, Val Loss: 1.0895
New best model saved with validation loss: 1.0895
Epoch 5/10, Train Loss: 1.0946, Val Loss: 1.0784
New best model saved with validation loss: 1.0784
Epoch 6/10, Train Loss: 1.0940, Val Loss: 1.1062
Epoch 7/10, Train Loss: 1.0936, Val Loss: 1.0849
Epoch 8/10, Train Loss: 1.0883, Val Loss: 1.0797
Epoch 9/10, Train Loss: 1.0944, Val Loss: 1.0925
Epoch 10/10, Train Loss: 1.0870, Val Loss: 1.1167


In [None]:
# Running counts over the whole test loader.
correct_test = 0
total_test = 0

model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # The predicted class is the index of the largest logit.
        predicted_labels = model(input_ids, attention_mask).argmax(dim=-1)

        correct_test += (predicted_labels == labels).sum().item()
        total_test += len(labels)

# Fraction of test examples whose predicted class equals the label.
accuracy = correct_test / total_test
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.3684
