In [1]:
import os

# Restrict CUDA to device index 0 for this kernel process.
# NOTE(review): presumably this must run before any CUDA-using library is
# initialized, which is why it sits in the very first cell — confirm before moving it.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Datasets

In [2]:
import datasets

# STAIR Captions (v1.2.0): load the validation split, then keep only the
# first 1000 examples so the evaluation stays small.
ds_stair_captions = datasets.load_dataset(
    path="shunk031/STAIR-Captions",
    name="v1.2.0",
    split="validation",
)
ds_stair_captions = ds_stair_captions.select(range(1000))

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

In [3]:
# JDocQA-based retrieval data: the same Hub repository exposes three configs
# ("question", "context", "corpus"); the "test" split of each one is used.
ds_questions, ds_contexts, ds_corpus = (
    datasets.load_dataset("oshizo/japanese-text-image-retrieval", config_name)["test"]
    for config_name in ("question", "context", "corpus")
)

# Evaluator

In [4]:
from evaluation_utils import MultiModalInformationRetrievalEvaluator, resize_image

In [5]:
# JDocQA, question -> image retrieval: every question text is a query and the
# resized corpus images are the candidate documents.
question_image_evaluator = MultiModalInformationRetrievalEvaluator(
    name="jdocqa-question-image",
    queries={row["query_id"]: row["text"] for row in ds_questions},
    corpus={
        # 584 is the size argument handed to resize_image; its exact meaning
        # is defined in evaluation_utils.
        row["image_id"]: resize_image(row["image"], 584)
        for row in ds_corpus
    },
    # Ground truth: query id -> ids of its positive images.
    relevant_docs={row["query_id"]: row["positive_image_ids"] for row in ds_questions},
    ndcg_at_k=[1, 3, 5, 10],
    batch_size=1,
    show_progress_bar=False,
)

In [6]:
# JDocQA, context -> image retrieval: same image corpus as the question
# evaluator, but the queries are the context texts.
context_image_evaluator = MultiModalInformationRetrievalEvaluator(
    name="jdocqa-context-image",
    queries={row["query_id"]: row["text"] for row in ds_contexts},
    corpus={
        # 584 is the size argument handed to resize_image; its exact meaning
        # is defined in evaluation_utils.
        row["image_id"]: resize_image(row["image"], 584)
        for row in ds_corpus
    },
    # Ground truth: query id -> ids of its positive images.
    relevant_docs={row["query_id"]: row["positive_image_ids"] for row in ds_contexts},
    ndcg_at_k=[1, 3, 5, 10],
    batch_size=1,
    show_progress_bar=False,
)

In [7]:
def _build_stair_retrieval_inputs(dataset, image_size=224):
    """Build the evaluator inputs for caption -> image retrieval in one pass.

    The dataset rows carry no explicit ids here, so the zero-padded row index
    is used both as the query id and as the id of the image in the same row.

    Args:
        dataset: Iterable of rows exposing ``row["annotations"]["caption"]``
            (a sequence of captions) and ``row["image"]``.
        image_size: Size argument forwarded to ``resize_image``.

    Returns:
        A ``(queries, corpus, relevant_docs)`` tuple of dicts keyed by the
        zero-padded row index. Each query has exactly one relevant document:
        the image from its own row.
    """
    queries, corpus, relevant_docs = {}, {}, {}
    # Walk the dataset once instead of once per dict: the original iterated it
    # three times, the last time only to obtain the row indices.
    for index, row in enumerate(dataset):
        row_id = f"{index:05d}"
        queries[row_id] = row["annotations"]["caption"][0]  # use the first caption
        corpus[row_id] = resize_image(row["image"], image_size)
        relevant_docs[row_id] = [row_id]
    return queries, corpus, relevant_docs


stair_queries, stair_corpus, stair_relevant_docs = _build_stair_retrieval_inputs(
    ds_stair_captions
)

stair_captions_evaluator = MultiModalInformationRetrievalEvaluator(
    queries=stair_queries,
    corpus=stair_corpus,
    relevant_docs=stair_relevant_docs,
    name="stair_captions",
    batch_size=1,
    show_progress_bar=False,
    ndcg_at_k=[1, 3, 5, 10],
)

# Evaluate

## japanese-clip-qwen2_vl-exp-0101

In [8]:
from sentence_transformers import SentenceTransformer

# Load the first model under evaluation from the Hugging Face Hub.
# NOTE(review): the load logs that `trust_remote_code` "has no effect here and
# is ignored" for a non-Auto class (see the cell output) — confirm whether the
# flag is actually required for this checkpoint.
model = SentenceTransformer(
    model_name_or_path="oshizo/japanese-clip-qwen2_vl-exp-0101",
    trust_remote_code=True,
)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [9]:
# Run the JDocQA question -> image evaluation; the returned metrics dict is the cell output.
question_image_evaluator(model)

{'jdocqa-question-image_cosine_accuracy@1': 0.35374149659863946,
 'jdocqa-question-image_cosine_accuracy@3': 0.5153061224489796,
 'jdocqa-question-image_cosine_accuracy@5': 0.5858843537414966,
 'jdocqa-question-image_cosine_accuracy@10': 0.6683673469387755,
 'jdocqa-question-image_cosine_precision@1': np.float64(0.35374149659863946),
 'jdocqa-question-image_cosine_precision@3': np.float64(0.17205215419501133),
 'jdocqa-question-image_cosine_precision@5': np.float64(0.11734693877551021),
 'jdocqa-question-image_cosine_precision@10': np.float64(0.06709183673469388),
 'jdocqa-question-image_cosine_recall@1': np.float64(0.35253684807256236),
 'jdocqa-question-image_cosine_recall@3': np.float64(0.5134637188208617),
 'jdocqa-question-image_cosine_recall@5': np.float64(0.5840419501133787),
 'jdocqa-question-image_cosine_recall@10': np.float64(0.6665958049886621),
 'jdocqa-question-image_cosine_ndcg@1': np.float64(0.35374149659863946),
 'jdocqa-question-image_cosine_ndcg@3': np.float64(0.44737

In [10]:
# Run the JDocQA context -> image evaluation; the returned metrics dict is the cell output.
context_image_evaluator(model)

{'jdocqa-context-image_cosine_accuracy@1': 0.7474489795918368,
 'jdocqa-context-image_cosine_accuracy@3': 0.8341836734693877,
 'jdocqa-context-image_cosine_accuracy@5': 0.8605442176870748,
 'jdocqa-context-image_cosine_accuracy@10': 0.9030612244897959,
 'jdocqa-context-image_cosine_precision@1': np.float64(0.7474489795918368),
 'jdocqa-context-image_cosine_precision@3': np.float64(0.27976190476190477),
 'jdocqa-context-image_cosine_precision@5': np.float64(0.173469387755102),
 'jdocqa-context-image_cosine_precision@10': np.float64(0.091156462585034),
 'jdocqa-context-image_cosine_recall@1': np.float64(0.742842970521542),
 'jdocqa-context-image_cosine_recall@3': np.float64(0.8317743764172335),
 'jdocqa-context-image_cosine_recall@5': np.float64(0.858843537414966),
 'jdocqa-context-image_cosine_recall@10': np.float64(0.9019982993197279),
 'jdocqa-context-image_cosine_ndcg@1': np.float64(0.7474489795918368),
 'jdocqa-context-image_cosine_ndcg@3': np.float64(0.7956702087633971),
 'jdocqa-c

In [11]:
# Run the STAIR Captions caption -> image evaluation; the returned metrics dict is the cell output.
stair_captions_evaluator(model)

{'stair_captions_cosine_accuracy@1': 0.492,
 'stair_captions_cosine_accuracy@3': 0.712,
 'stair_captions_cosine_accuracy@5': 0.804,
 'stair_captions_cosine_accuracy@10': 0.891,
 'stair_captions_cosine_precision@1': np.float64(0.492),
 'stair_captions_cosine_precision@3': np.float64(0.2373333333333333),
 'stair_captions_cosine_precision@5': np.float64(0.1608),
 'stair_captions_cosine_precision@10': np.float64(0.08910000000000001),
 'stair_captions_cosine_recall@1': np.float64(0.492),
 'stair_captions_cosine_recall@3': np.float64(0.712),
 'stair_captions_cosine_recall@5': np.float64(0.804),
 'stair_captions_cosine_recall@10': np.float64(0.891),
 'stair_captions_cosine_ndcg@1': np.float64(0.492),
 'stair_captions_cosine_ndcg@3': np.float64(0.6198064464857181),
 'stair_captions_cosine_ndcg@5': np.float64(0.6578510347982717),
 'stair_captions_cosine_ndcg@10': np.float64(0.6866645199828476),
 'stair_captions_cosine_mrr@10': 0.6214964285714286,
 'stair_captions_cosine_map@100': np.float64(0.6

In [12]:
import gc

import torch

# Release the current model before the next one is loaded.
del model
# Collect first so objects kept alive only by reference cycles are finalized
# before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

## jina-clip-v2

In [13]:
from sentence_transformers import SentenceTransformer

# Load the second model under evaluation; `trust_remote_code=True` is passed
# through to the loader for this checkpoint.
model = SentenceTransformer(
    model_name_or_path="jinaai/jina-clip-v2",
    trust_remote_code=True,
)

  @custom_fwd
  @custom_bwd


In [14]:
# Run the JDocQA question -> image evaluation; the returned metrics dict is the cell output.
# NOTE(review): this evaluator was already called with the previous model and
# its score function is not reset here (unlike in the ColQwen2 section). The
# output keys still read `cosine` — confirm that is the intended similarity
# for this model.
question_image_evaluator(model)

{'jdocqa-question-image_cosine_accuracy@1': 0.10459183673469388,
 'jdocqa-question-image_cosine_accuracy@3': 0.17772108843537415,
 'jdocqa-question-image_cosine_accuracy@5': 0.2185374149659864,
 'jdocqa-question-image_cosine_accuracy@10': 0.29081632653061223,
 'jdocqa-question-image_cosine_precision@1': np.float64(0.10459183673469388),
 'jdocqa-question-image_cosine_precision@3': np.float64(0.059240362811791375),
 'jdocqa-question-image_cosine_precision@5': np.float64(0.04370748299319729),
 'jdocqa-question-image_cosine_precision@10': np.float64(0.029081632653061226),
 'jdocqa-question-image_cosine_recall@1': np.float64(0.10459183673469388),
 'jdocqa-question-image_cosine_recall@3': np.float64(0.17772108843537415),
 'jdocqa-question-image_cosine_recall@5': np.float64(0.2185374149659864),
 'jdocqa-question-image_cosine_recall@10': np.float64(0.29017857142857145),
 'jdocqa-question-image_cosine_ndcg@1': np.float64(0.10459183673469388),
 'jdocqa-question-image_cosine_ndcg@3': np.float64(0

In [15]:
# Run the JDocQA context -> image evaluation; the returned metrics dict is the cell output.
context_image_evaluator(model)

{'jdocqa-context-image_cosine_accuracy@1': 0.22789115646258504,
 'jdocqa-context-image_cosine_accuracy@3': 0.35799319727891155,
 'jdocqa-context-image_cosine_accuracy@5': 0.42346938775510207,
 'jdocqa-context-image_cosine_accuracy@10': 0.5034013605442177,
 'jdocqa-context-image_cosine_precision@1': np.float64(0.22789115646258504),
 'jdocqa-context-image_cosine_precision@3': np.float64(0.11961451247165532),
 'jdocqa-context-image_cosine_precision@5': np.float64(0.08486394557823129),
 'jdocqa-context-image_cosine_precision@10': np.float64(0.050510204081632655),
 'jdocqa-context-image_cosine_recall@1': np.float64(0.22732426303854875),
 'jdocqa-context-image_cosine_recall@3': np.float64(0.35685941043083896),
 'jdocqa-context-image_cosine_recall@5': np.float64(0.42233560090702943),
 'jdocqa-context-image_cosine_recall@10': np.float64(0.502267573696145),
 'jdocqa-context-image_cosine_ndcg@1': np.float64(0.22789115646258504),
 'jdocqa-context-image_cosine_ndcg@3': np.float64(0.303504109391309

In [16]:
# Run the STAIR Captions caption -> image evaluation; the returned metrics dict is the cell output.
stair_captions_evaluator(model)

{'stair_captions_cosine_accuracy@1': 0.524,
 'stair_captions_cosine_accuracy@3': 0.738,
 'stair_captions_cosine_accuracy@5': 0.813,
 'stair_captions_cosine_accuracy@10': 0.891,
 'stair_captions_cosine_precision@1': np.float64(0.524),
 'stair_captions_cosine_precision@3': np.float64(0.24599999999999997),
 'stair_captions_cosine_precision@5': np.float64(0.16260000000000002),
 'stair_captions_cosine_precision@10': np.float64(0.0891),
 'stair_captions_cosine_recall@1': np.float64(0.524),
 'stair_captions_cosine_recall@3': np.float64(0.738),
 'stair_captions_cosine_recall@5': np.float64(0.813),
 'stair_captions_cosine_recall@10': np.float64(0.891),
 'stair_captions_cosine_ndcg@1': np.float64(0.524),
 'stair_captions_cosine_ndcg@3': np.float64(0.6498538845142899),
 'stair_captions_cosine_ndcg@5': np.float64(0.6809275613463065),
 'stair_captions_cosine_ndcg@10': np.float64(0.7064990222181987),
 'stair_captions_cosine_mrr@10': 0.6474376984126983,
 'stair_captions_cosine_map@100': np.float64(0.

In [17]:
import gc

import torch

# Release the current model before the next one is loaded.
del model
# Collect first so objects kept alive only by reference cycles are finalized
# before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

## ColQwen2-v1.0

In [18]:
import torch
from evaluation_utils import ColQwen2Wrapper
from colpali_engine.models import ColQwen2, ColQwen2Processor

model_name = "vidore/colqwen2-v1.0-merged"

# Load the ColQwen2 weights in bfloat16 directly onto the first visible GPU.
colqwen2 = ColQwen2.from_pretrained(
    model_name,
    device_map="cuda:0",  # or "mps" if on Apple Silicon
    torch_dtype=torch.bfloat16,
)
colqwen2.eval()  # evaluation mode; same effect as chaining .eval() on the load call

# The processor comes from the same checkpoint id as the model.
processor = ColQwen2Processor.from_pretrained(model_name)

# Wrapper from evaluation_utils; the evaluators below are called with this object.
model = ColQwen2Wrapper(colqwen2, processor)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


In [22]:
# The score functions only need to be reset when this evaluator has already
# been run with a different model (as it has been earlier in this notebook).
# Point the evaluator at the similarity function exposed by the current model;
# the metric keys in the output are then prefixed with that function's name
# (`maxsim` here).
question_image_evaluator.score_functions = {model.similarity_fn_name: model.similarity}
question_image_evaluator.score_function_names = [model.similarity_fn_name]
# Run the evaluation; the returned metrics dict is the cell output.
question_image_evaluator(model)

{'jdocqa-question-image_maxsim_accuracy@1': 0.5221088435374149,
 'jdocqa-question-image_maxsim_accuracy@3': 0.6641156462585034,
 'jdocqa-question-image_maxsim_accuracy@5': 0.7074829931972789,
 'jdocqa-question-image_maxsim_accuracy@10': 0.766156462585034,
 'jdocqa-question-image_maxsim_precision@1': np.float64(0.5221088435374149),
 'jdocqa-question-image_maxsim_precision@3': np.float64(0.22250566893424034),
 'jdocqa-question-image_maxsim_precision@5': np.float64(0.14217687074829932),
 'jdocqa-question-image_maxsim_precision@10': np.float64(0.07712585034013604),
 'jdocqa-question-image_maxsim_recall@1': np.float64(0.5196286848072562),
 'jdocqa-question-image_maxsim_recall@3': np.float64(0.6621315192743764),
 'jdocqa-question-image_maxsim_recall@5': np.float64(0.7054988662131519),
 'jdocqa-question-image_maxsim_recall@10': np.float64(0.7638180272108843),
 'jdocqa-question-image_maxsim_ndcg@1': np.float64(0.5221088435374149),
 'jdocqa-question-image_maxsim_ndcg@3': np.float64(0.6047474554

In [23]:
# This evaluator was already run with other models, so replace its score
# function with the one exposed by the current model before evaluating.
context_image_evaluator.score_functions = {model.similarity_fn_name: model.similarity}
context_image_evaluator.score_function_names = [model.similarity_fn_name]
# Run the evaluation; the returned metrics dict is the cell output.
context_image_evaluator(model)

{'jdocqa-context-image_maxsim_accuracy@1': 0.6964285714285714,
 'jdocqa-context-image_maxsim_accuracy@3': 0.7772108843537415,
 'jdocqa-context-image_maxsim_accuracy@5': 0.8018707482993197,
 'jdocqa-context-image_maxsim_accuracy@10': 0.8282312925170068,
 'jdocqa-context-image_maxsim_precision@1': np.float64(0.6964285714285714),
 'jdocqa-context-image_maxsim_precision@3': np.float64(0.2604875283446712),
 'jdocqa-context-image_maxsim_precision@5': np.float64(0.16122448979591836),
 'jdocqa-context-image_maxsim_precision@10': np.float64(0.08333333333333333),
 'jdocqa-context-image_maxsim_recall@1': np.float64(0.6918225623582767),
 'jdocqa-context-image_maxsim_recall@3': np.float64(0.7743764172335601),
 'jdocqa-context-image_maxsim_recall@5': np.float64(0.7990362811791383),
 'jdocqa-context-image_maxsim_recall@10': np.float64(0.8256802721088435),
 'jdocqa-context-image_maxsim_ndcg@1': np.float64(0.6964285714285714),
 'jdocqa-context-image_maxsim_ndcg@3': np.float64(0.7425693367243253),
 'jdo

In [21]:
# This evaluator was already run with other models, so replace its score
# function with the one exposed by the current model before evaluating.
stair_captions_evaluator.score_functions = {model.similarity_fn_name: model.similarity}
stair_captions_evaluator.score_function_names = [model.similarity_fn_name]
# Run the evaluation; the returned metrics dict is the cell output.
stair_captions_evaluator(model)

{'stair_captions_maxsim_accuracy@1': 0.327,
 'stair_captions_maxsim_accuracy@3': 0.539,
 'stair_captions_maxsim_accuracy@5': 0.64,
 'stair_captions_maxsim_accuracy@10': 0.767,
 'stair_captions_maxsim_precision@1': np.float64(0.327),
 'stair_captions_maxsim_precision@3': np.float64(0.17966666666666667),
 'stair_captions_maxsim_precision@5': np.float64(0.128),
 'stair_captions_maxsim_precision@10': np.float64(0.0767),
 'stair_captions_maxsim_recall@1': np.float64(0.327),
 'stair_captions_maxsim_recall@3': np.float64(0.539),
 'stair_captions_maxsim_recall@5': np.float64(0.64),
 'stair_captions_maxsim_recall@10': np.float64(0.767),
 'stair_captions_maxsim_ndcg@1': np.float64(0.327),
 'stair_captions_maxsim_ndcg@3': np.float64(0.4493662191964322),
 'stair_captions_maxsim_ndcg@5': np.float64(0.4904104215148692),
 'stair_captions_maxsim_ndcg@10': np.float64(0.5315000129257906),
 'stair_captions_maxsim_mrr@10': 0.45792222222222184,
 'stair_captions_maxsim_map@100': np.float64(0.467567202498173

In [24]:
import gc

import torch

# `model` is only the wrapper: `colqwen2` and `processor` were bound as
# separate names when the model was loaded and still reference the underlying
# objects, so deleting `model` alone cannot release the weights held on the
# GPU. Drop every name created by the loading cell.
del model, colqwen2, processor
# Collect first so objects kept alive only by reference cycles are finalized
# before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): the three evaluators still hold `model.similarity` in their
# `score_functions`; if that is a bound method of the wrapper it keeps the
# model alive — confirm, and reset those attributes if memory is not released.