In [1]:
import os
from pathlib import Path
import sys
from datasets import load_from_disk
import torch
from transformers import BertTokenizer
sys.path.append(str(Path.cwd().parent))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.models.captioning.nic.model import NeuralImageCaptioner
from src.models.captioning.utils import (
    _transform_test,
    _transform_train,
    compute_metrics,
    get_multiclassification_vit_tools,
)
from src.utils.dirutils import get_data_dir, get_models_dir

In [3]:
dataset = load_from_disk(get_data_dir() / "processed" / "captioning_dataset_augmented_processed")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
image_processor, image_encoder = get_multiclassification_vit_tools()
model = NeuralImageCaptioner(image_encoder, tokenizer.vocab_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
checkpoint = torch.load(get_models_dir() / "captioning" / "nic" / "4.pt")
model.load_state_dict(checkpoint["model_state_dict"])
model.train(False)
model.to(device)

NeuralImageCaptioner(
  (image_encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featu

In [5]:
outputs = []
from tqdm import tqdm

for example in tqdm(dataset["test"]):
    pixel_values = image_processor(images=example["image"], return_tensors="pt").pixel_values.to(device)
    output = model.generate(pixel_values, max_length=50)
    outputs.append({
        "file_name": example["file_name"],
        "caption": tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    })

100%|██████████| 17385/17385 [1:21:14<00:00,  3.57it/s]


In [6]:
import json
with open("nic_outputs.jsonl", "w") as f:
    json.dump(outputs, f, indent=4)