In [1]:
import numpy as np
import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
# Local directory holding the pretrained CLIP checkpoint; the model weights and
# the processor configuration are both loaded from this one location.
MODEL_PATH = "/home/lyz/hf-models/openai/clip-vit-base-patch16"
model = CLIPModel.from_pretrained(MODEL_PATH)
processor = CLIPProcessor.from_pretrained(MODEL_PATH)

# Smoke test: score one downloaded image against two candidate captions.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as an opaque
# "cannot identify image file" error from PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # image-text similarity scores
probs = logits_per_image.softmax(dim=1)  # softmax over the captions gives label probabilities

2025-08-11 20:40:37.450943: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
import pandas as pd
import pdfplumber

# Load the sample submission table. Later cells read its "Source" column
# (paths of the image/PDF files) and its "Caption" column (candidate captions).
df = pd.read_csv("sample_submit.csv")
# Bare expression: renders the frame as this cell's output.
df

Unnamed: 0,Source,Caption
0,./dataset/81ab3d22-0ae8-4c76-8f91-55757ec4678a...,step-wise select prompt to select next step.
1,./dataset/90a408aa-dbbf-4ade-a9c9-32abbf57f7e8...,Generated Persona
2,./dataset/0e6e79e6-6fd4-4911-bc45-dcb272e80beb...,Failure analysis across the four methods on th...
3,./dataset/bedcf7d3-a91c-4c3b-b266-9c4a24efa24c...,TTS case of Scoring Bias.
4,./dataset/de788cb6-6929-4152-8795-84208728ec9c...,3D trajectory plot plot
...,...,...
1402,./dataset/891f4550-4db7-4c2c-9d02-db3037f92c29...,DeepSeekV3
1403,./dataset/c610fb41-26ff-4ac7-a586-d86465144134...,Overview of the survey presented in this work.
1404,./dataset/ffe09dee-7de4-4400-80fa-59df54d3bdb7...,Distribution of ``All Passed Code'' (code that...
1405,./dataset/8c8e591d-d430-4c00-a36b-32f025aa181c...,Input and Output Token Cost across Various LLM...


In [18]:
# Placeholder substituted for any source that cannot be loaded, so that `images`
# stays aligned one-to-one with the rows of `df`.
FALLBACK_IMAGE_PATH = "./dataset/01a882f1-8a14-4421-8094-6f363a195971.png"

images = []
failed_sources = []  # (path, error) for every source replaced by the placeholder
for path in df["Source"].values:
    try:
        if path.endswith("pdf"):
            # PDFs: rasterize the first page only.
            with pdfplumber.open(path) as pdf:
                first_page = pdf.pages[0]
                im = first_page.to_image(resolution=150).original
        else:
            im = Image.open(path)
            # Image.open only reads the header; force the pixel data to be
            # decoded now so a truncated/corrupt file is caught by this `try`
            # rather than failing later inside the CLIP processor.
            im.load()
    except Exception as exc:
        # `Exception` (not a bare `except`) so Ctrl-C / kernel interrupt still
        # stops the loop; record the failure instead of hiding it.
        failed_sources.append((path, repr(exc)))
        im = Image.open(FALLBACK_IMAGE_PATH)
        im.load()
    images.append(im)

# Report what was substituted so silent data problems are visible.
if failed_sources:
    print(f"{len(failed_sources)} of {len(images)} sources fell back to the placeholder image:")
    for failed_path, reason in failed_sources:
        print(f"  {failed_path}: {reason}")

In [41]:
BATCH_SIZE = 20  # number of images per forward pass

# Embed every image with CLIP, one batch at a time. The text prompts are only
# there because the combined forward pass is called with both modalities; just
# `image_embeds` is read from the output.
image_embeds = []
# range(0, n, step) never yields an empty trailing batch, unlike
# `n // step + 1`, which produces one whenever n is an exact multiple of step
# (or when n is 0).
for start in range(0, len(images), BATCH_SIZE):
    inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images[start:start + BATCH_SIZE], return_tensors="pt", padding=True)
    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
        outputs = model(**inputs)
    image_embeds.append(outputs.image_embeds.detach().cpu())



In [42]:
BATCH_SIZE = 20  # number of captions per forward pass
MAX_CAPTION_CHARS = 100  # captions are cut to this many characters before tokenization
# NOTE(review): presumably the cut keeps captions inside CLIP's token limit - confirm.

# Dummy image: the combined forward pass below is called with both text and
# image inputs, but only `text_embeds` is read from its output.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Timeout + status check so a stalled or failed download errors out clearly.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

captions = df["Caption"].values
text_embeds = []
# range(0, n, step) never yields an empty trailing batch, unlike
# `n // step + 1`, which produces one whenever n is an exact multiple of step.
for start in range(0, len(captions), BATCH_SIZE):
    batch_captions = [caption[:MAX_CAPTION_CHARS] for caption in captions[start:start + BATCH_SIZE]]
    inputs = processor(text=batch_captions, images=image, return_tensors="pt", padding=True)
    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
        outputs = model(**inputs)
    text_embeds.append(outputs.text_embeds.detach().cpu())

In [43]:
# Collapse the per-batch embedding chunks into a single 2-D matrix for each
# modality by stacking them along the first axis.
image_embeds, text_embeds = np.vstack(image_embeds), np.vstack(text_embeds)

In [44]:
# Sanity check: display both matrix shapes side by side. The retrieval step
# below takes dot products between their rows, so the second dimensions must agree.
image_embeds.shape, text_embeds.shape

((1407, 512), (1407, 512))

In [46]:
from sklearn.preprocessing import normalize

# Scale every row to unit L2 length so that a plain dot product between an
# image row and a text row equals their cosine similarity.
image_embeds = normalize(image_embeds, norm="l2", axis=1)
text_embeds = normalize(text_embeds, norm="l2", axis=1)

In [51]:
# For each image, pick the caption whose text embedding has the highest dot
# product with the image embedding (cosine similarity, since both matrices
# were L2-normalized beforehand).
candidate_captions = df["Caption"].values
text_embeds_t = text_embeds.T
clip_preds = [
    candidate_captions[np.argmax(np.dot(image_embed, text_embeds_t))]
    for image_embed in image_embeds
]

In [52]:
# Replace the Caption column with the CLIP-selected caption for each row.
# NOTE(review): this mutates `df` in place. The text-embedding cell reads
# df["Caption"] as its candidate pool, so re-running it after this cell would
# embed the predictions instead of the original captions; reload the CSV first.
df['Caption'] = clip_preds

In [54]:
# Write the submission file without emitting the DataFrame's row index.
df.to_csv("submit.csv", index=False)