In [1]:
!pip install transformers torch torchvision pillow tqdm --quiet

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import requests
import torch
from transformers import CLIPProcessor, CLIPModel

# --- Locations -------------------------------------------------------------
# CSV describing the samples, the directory of pre-downloaded images, and a
# writable directory where images fetched at run time are stored.
dataset_path = "/kaggle/input/am-ml-test/test.csv"
input_images = "/kaggle/input/test-set-image-downlaod/images/"
working_images = "/kaggle/working/images/"
os.makedirs(working_images, exist_ok=True)

# --- Sample table ----------------------------------------------------------
# Each image is stored on disk under "<sample_id>.jpg".
df = pd.read_csv(dataset_path)
df['filename'] = df['sample_id'].astype(str).add('.jpg')
n_rows = df.shape[0]

# --- CLIP model ------------------------------------------------------------
# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model_name = "openai/clip-vit-large-patch14"
clip_model = CLIPModel.from_pretrained(model_name)
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained(model_name)
# Output width of the visual projection layer; used to size the buffer below.
embedding_dim = clip_model.visual_projection.out_features
print(f"Model loaded on {device} | Embedding dim: {embedding_dim}")

# --- Output buffers ----------------------------------------------------------
# One embedding row per CSV row; rows that are never filled stay all-zero and
# keep a False entry in `valid_mask`.
embeddings = np.zeros(shape=(n_rows, embedding_dim), dtype=np.float32)
valid_mask = np.full(n_rows, False, dtype=bool)

def get_image_path(fname, url):
    """Return a local path for image ``fname``, downloading it if necessary.

    ``input_images`` is checked first, then ``working_images``. Only when
    neither directory holds the file is ``url`` fetched; a successful
    download is stored in ``working_images``.

    Args:
        fname: Image file name, e.g. ``"123.jpg"``.
        url: Source URL used when the file is not available locally.

    Returns:
        Path of the local file, or ``None`` when the image is neither on
        disk nor downloadable. Failures are reported through ``tqdm.write``
        and are never raised to the caller.
    """
    for directory in (input_images, working_images):
        candidate = os.path.join(directory, fname)
        if os.path.exists(candidate):
            return candidate

    def report_failure(reason):
        # Single place for the failure message so every path logs alike.
        tqdm.write(f"‚ö†Ô∏è Download failed for {fname}: {reason}")

    # Anything that is not a non-empty string (for instance a NaN read from
    # an empty CSV cell) cannot be fetched; say so instead of letting the
    # HTTP client fail with an unrelated-looking error.
    if not isinstance(url, str) or not url.strip():
        report_failure(f"no usable URL ({url!r})")
        return None

    save_path = os.path.join(working_images, fname)
    # Write to a side file first: if writing fails midway, nothing is left
    # at `save_path` that a later call would mistake for a cached image.
    part_path = save_path + ".part"
    try:
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            report_failure(f"HTTP status {r.status_code}")
            return None
        if not r.content:
            report_failure("empty response body")
            return None
        with open(part_path, "wb") as f:
            f.write(r.content)
        os.replace(part_path, save_path)
        return save_path
    except Exception as e:
        # Deliberately broad: one bad image must not abort a long batch run.
        report_failure(e)
        try:
            os.remove(part_path)
        except OSError:
            # Nothing to clean up (file never created) or it cannot be
            # removed; either way the final path was not written.
            pass
    return None

# Names of the images already present in the input directory. A missing
# directory is treated as "nothing pre-downloaded" rather than a crash, so
# every image is then fetched into the working directory.
existing_files = set(os.listdir(input_images)) if os.path.isdir(input_images) else set()
df['exists'] = df['filename'].isin(existing_files)
missing_df = df[~df['exists']]

if not missing_df.empty:
    print(f"üîç {len(missing_df)} images missing ‚Äî attempting download...")
    # Only the file name and link are needed, so iterate those two columns
    # directly instead of materialising a Series per row.
    for fname, url in tqdm(
        zip(missing_df['filename'], missing_df['image_link']),
        total=len(missing_df),
        desc="Downloading missing images",
    ):
        # The return value is not needed here; the call is made for its
        # side effect of caching the image (failures are logged inside).
        get_image_path(fname, url)
else:
    print("‚úÖ No missing images found.")

print("\nüéØ Generating embeddings...")
clip_model.eval()
# `enumerate` yields the row *position*, which is what indexes `embeddings`
# and `valid_mask`; DataFrame index labels only equal positions while `df`
# keeps its default RangeIndex.
samples = zip(df['filename'], df['image_link'])
for pos, (fname, url) in enumerate(tqdm(samples, total=n_rows, desc="Encoding images")):
    fpath = get_image_path(fname, url)
    if not fpath:
        tqdm.write(f"‚ö†Ô∏è Missing {fname}")
        continue
    try:
        # Context manager closes the file handle even if decoding fails;
        # `convert` returns an independent in-memory image.
        with Image.open(fpath) as raw:
            img = raw.convert("RGB")
        inputs = clip_processor(images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            emb = clip_model.get_image_features(**inputs)
            emb = emb / emb.norm(p=2, dim=-1, keepdim=True)  # L2-normalize
        # A single image was encoded, so take the first (only) row explicitly
        # instead of relying on broadcasting into the 1-D destination.
        embeddings[pos] = emb[0].cpu().numpy()
        valid_mask[pos] = True
    except Exception as e:
        # Best-effort: an unreadable image is logged and left as an all-zero
        # row with valid_mask False; the run continues.
        tqdm.write(f"‚ö†Ô∏è Skipped {fname}: {e}")

# Persist both arrays; the mask records which embedding rows were filled.
for out_file, array in (
    ("/kaggle/working/image_embeddings.npy", embeddings),
    ("/kaggle/working/valid_mask.npy", valid_mask),
):
    np.save(out_file, array)

# Short run summary.
n_valid = valid_mask.sum()
print("\n‚úÖ Done")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Valid images: {n_valid} / {n_rows}")
print("Files saved in /kaggle/working/")


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m13.8/13.8 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.6/24.6 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m883.7/883.7 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[

2025-10-12 23:20:22.896005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760311223.072837      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760311223.125129      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Model loaded on cuda | Embedding dim: 768
🔍 1 images missing — attempting download...


Downloading missing images: 100%|██████████| 1/1 [00:00<00:00,  4.25it/s]



🎯 Generating embeddings...


Encoding images:  56%|█████▌    | 42047/75000 [1:36:49<1:15:43,  7.25it/s]

⚠️ Missing 286800.jpg


Encoding images: 100%|██████████| 75000/75000 [2:46:34<00:00,  7.50it/s]



✅ Done
Embeddings shape: (75000, 768)
Valid images: 74999 / 75000
Files saved in /kaggle/working/
