In [4]:
import os 
# Ask CUDA to enumerate GPUs in PCI bus order so that `cuda:<n>` indices are
# stable. Deliberately placed before `import torch` so the variable is already
# set by the time CUDA is initialised.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
import torch
from EncoderClassifier.pss_datasets.pss_multimodal_dataset import PSSMultimodalDataset
from transformers import SiglipImageProcessor, AutoProcessor, AutoModel
from sentence_transformers import SentenceTransformer

# Project checkout that holds both the dataset and the precomputed artifacts.
# Set the PSS_PROJECT_ROOT environment variable to run on another machine;
# the default keeps the original absolute location.
project_root = os.environ.get(
    'PSS_PROJECT_ROOT',
    '/home-local/mserrao/PSSComics/multimodal-comic-pss',
).rstrip('/')
encoder_data_dir = f'{project_root}/EncoderClassifier/data'

# Directory with the comic page images.
root_dir = f'{project_root}/datasets.unify/DCM/images'
# Annotation file for the test split.
annotations_path = f'{encoder_data_dir}/comics_test.json'
# NOTE: despite the `_dir` suffix, these two point at single `.pt` files.
precompute_bb_dir = f'{encoder_data_dir}/features_test.pt'
precompute_emb_dir = f'{encoder_data_dir}/test.pt'

# Index of the GPU to run on (interpreted under CUDA_DEVICE_ORDER=PCI_BUS_ID).
gpu_id = 2

if torch.cuda.is_available():
    # Fail here with a readable message instead of later, inside `.to(device)`,
    # with an "invalid device ordinal" CUDA error.
    visible_gpu_count = torch.cuda.device_count()
    if not 0 <= gpu_id < visible_gpu_count:
        raise ValueError(
            f"gpu_id={gpu_id} is out of range: "
            f"{visible_gpu_count} CUDA device(s) visible"
        )
    device = torch.device(f"cuda:{gpu_id}")
else:
    # No CUDA at all: fall back to CPU, as the original cell did.
    device = torch.device("cpu")

In [6]:
# Hugging Face id (or local path) of the visual backbone to load.
model_id = "google/siglip-so400m-patch14-384"

print(f"Loading model: {model_id}")

# Build a short tag of the form '<first token>_<last token>' from the model
# name, e.g. 'siglip_384'. The last path component is used so that ids without
# an organisation prefix and multi-level local paths are handled as well.
parts = model_id.rstrip('/').split('/')[-1].split('-')
backbone_name = f'{parts[0]}_{parts[-1]}'

# Resolve the backbone family *before* loading any weights, so an unsupported
# model fails immediately instead of after a large download / GPU transfer.
# The tuple order is the precedence used when several substrings match.
supported_families = ('dinov2', 'clip', 'siglip')
backbone_family = next(
    (family for family in supported_families if family in backbone_name),
    None,
)
if backbone_family is None:
    raise ValueError(f"Warning: Unknown backbone '{backbone_name}'")

# Load in inference mode.
backbone = AutoModel.from_pretrained(model_id).eval()

# SigLIP2 checkpoints get the image-only processor; everything else goes
# through AutoProcessor.
if 'siglip2' in backbone_name:
    processor = SiglipImageProcessor.from_pretrained(model_id)
else:
    processor = AutoProcessor.from_pretrained(model_id)

backbone.to(device)

# The config attribute that holds the visual feature size differs per family.
if backbone_family == 'dinov2':
    bb_feature_dim = backbone.config.hidden_size
elif backbone_family == 'clip':
    bb_feature_dim = backbone.config.vision_config.projection_dim
else:  # 'siglip'
    bb_feature_dim = backbone.config.vision_config.hidden_size

print(f'Loaded {backbone_name} with feature dim {bb_feature_dim}')

Loading model: google/siglip-so400m-patch14-384


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loaded siglip_384 with feature dim 1152


In [7]:
# Text embedding model. `device_map="auto"` is forwarded to the underlying
# model loader and `padding_side="left"` to the tokenizer.
# NOTE(review): device placement here does not use the `device` chosen in the
# config cell — confirm that is intended.
emb_model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B",
    model_kwargs=dict(device_map="auto"),
    tokenizer_kwargs=dict(padding_side="left"),
)

# Switch to inference mode.
emb_model.eval()

# Size of one sentence embedding, passed on to the dataset.
emb_features_dim = emb_model.get_sentence_embedding_dimension()

In [8]:
# Build the test split, wiring in the text embedding model and the visual
# backbone loaded in the previous cells.
test_dataset = PSSMultimodalDataset(
    root_dir,
    # -- Textual embedding model
    embedding_model=emb_model,
    emb_feature_dim=emb_features_dim,
    precompute_emb=False,
    precompute_emb_dir=precompute_emb_dir,
    # -- Visual backbone feature extractor
    model_id=model_id,
    backbone=backbone,
    backbone_name=backbone_name,
    bb_feature_dim=bb_feature_dim,
    processor=processor,
    precompute_visual_features=False,
    # NOTE(review): keyword spelling kept exactly as written; it has to match
    # the parameter name declared by PSSMultimodalDataset — verify there.
    precompute_visial_featres_dir=precompute_bb_dir,
    # -- Annotations and runtime settings
    annotations_path=annotations_path,
    max_seq_length=512,
    device=device,
    # -- Augmentation parameters
    augment_data=False,
)

Skipping unknown label image /home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/00411460/056.jpg
Skipping unknown label image /home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/dbbf603f/024.jpg
Skipping unknown label image /home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/dbbf603f/025.jpg
Skipping unknown label image /home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/dbbf603f/026.jpg
Skipping unknown label image /home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/dbbf603f/065.jpg
Skipping unknown label image /home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/537de305/002.jpg
Skipping unknown label image /home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/5b8add2a/012.jpg
Skipping unknown label image /home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/077e29b1/036.jpg
Skipping unknown

In [12]:
# Index of the sample to inspect; the same index is used for the raw book
# record and for the dataset item, as in the original cell.
# NOTE(review): assumes dataset items line up one-to-one with `books` — confirm.
sample_idx = 1

book = test_dataset.books[sample_idx]
# Subscript syntax dispatches to `__getitem__`; no need to call the dunder directly.
item = test_dataset[sample_idx]

# Raw book record, one (key, value) pair per line.
for book_entry in book.items():
    print(book_entry)

# Processed dataset item, one (key, value) pair per line.
for item_entry in item.items():
    print(item_entry)

print(item['textual_features'].shape)
print(item['visual_features'].shape)

('book_id', '761c7920')
('image_paths', ['/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/000.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/001.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/002.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/003.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/004.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/005.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/006.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/007.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920/008.jpg', '/home-local/mserrao/PSSComics/multimodal-comic-pss/datasets.unify/DCM/images/761c7920