In [None]:
import os
import torch
import clip
from PIL import Image
import pandas as pd
from tqdm import tqdm

# Script purpose: walk an archive laid out as <archive>/<category>/<image>,
# compute one CLIP image embedding per image file, and write all of them to
# "image_embeddings.csv" with columns image_path, category, dim_0..dim_{N-1}.

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Path to the archive folder
archive_path = "path/to/archive"  # Change this to your actual path

# File extensions treated as images (compared against the lower-cased name)
image_extensions = ('.png', '.jpg', '.jpeg', '.webp')

# List all category folders (e.g., beds, chairs, etc.).
# os.listdir returns entries in arbitrary order, so sort to keep the row
# order of the output CSV reproducible across runs and machines.
categories = sorted(os.listdir(archive_path))

# Store embeddings: one row per image -> [image_path, category, *embedding]
image_data = []

# Process each category folder
for category in categories:
    category_path = os.path.join(archive_path, category)
    if not os.path.isdir(category_path):
        continue  # Skip if not a folder

    print(f"Processing category: {category}...")

    # Process each image in the category folder (sorted for reproducibility)
    for image_name in tqdm(sorted(os.listdir(category_path))):
        # Ensure it's an image file before doing any further work
        if not image_name.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(category_path, image_name)

        try:
            # Load and preprocess image. The context manager closes the
            # underlying file handle as soon as the pixel data has been
            # converted, instead of leaking one handle per image.
            with Image.open(image_path) as raw_image:
                image = preprocess(raw_image.convert("RGB")).unsqueeze(0).to(device)

            # Generate embedding (inference only, so no gradient tracking)
            with torch.no_grad():
                embedding = model.encode_image(image).cpu().numpy().flatten()

            # Store data
            image_data.append([image_path, category] + embedding.tolist())

        except Exception as e:
            # Best-effort: report the unreadable/corrupt image and keep going
            print(f"Error processing {image_name}: {e}")

# Convert to DataFrame
if image_data:
    embedding_df = pd.DataFrame(image_data)
    embedding_df.columns = ["image_path", "category"] + [
        f"dim_{i}" for i in range(embedding_df.shape[1] - 2)
    ]
else:
    # With no rows, pd.DataFrame([]) has zero columns and assigning the two
    # metadata column names would raise a length-mismatch ValueError. Build
    # an empty, correctly-labelled frame so the script still finishes.
    print("No images were embedded; writing a CSV with headers only.")
    embedding_df = pd.DataFrame(columns=["image_path", "category"])

# Save to CSV
embedding_df.to_csv("image_embeddings.csv", index=False)
print("Embeddings saved to image_embeddings.csv")

 14%|█████▍                                | 48.5M/338M [10:50<54:03, 93.5kiB/s]