In [None]:
import torch
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm
from clip_classifier import CustomAttributeClassifier

# Candidate labels for each attribute. This mapping is what gets passed to the
# classifier as ``attribute_values``; list order defines each label's index.
custom_attributes = dict(
    ethnicity=["African", "Asian", "European"],
    age=["young", "middle aged", "elderly"],
    expression=["happy", "neutral", "sad"],
)

# Directory handed to the classifier as ``img_dir``. The literal is kept
# exactly as written, including the space inside the "jungo " component.
img_dir = (
    "/home/omrid/Desktop/jungo /projectCLIPvae/celeba_dataset/img_align_celeba/img_align_celeba"
)

def resolve_attribute_label(labels, class_id):
    """Return the display label for one predicted class id.

    Args:
        labels: Candidate labels for an attribute, in class-index order.
        class_id: A value taken from a results column; normally an integer
            index into ``labels``.

    Returns:
        ``labels[class_id]`` when ``class_id`` is a whole number inside
        ``range(len(labels))`` (plain ints, NumPy ints and whole-number
        floats all qualify). Anything else -- a label string, a negative or
        out-of-range id, a fractional value -- comes back as
        ``str(class_id)`` so the summary can still be printed instead of
        aborting after the CSV has already been written.
    """
    if isinstance(class_id, str):
        # The column already holds label text; show it unchanged.
        return class_id
    try:
        position = int(class_id)
    except (TypeError, ValueError):
        # Not convertible to an index (e.g. None or NaN).
        return str(class_id)
    if position != class_id or not 0 <= position < len(labels):
        # Fractional or out-of-range ids have no label; negative ids are
        # deliberately not wrapped around to the end of the list.
        return str(class_id)
    return labels[position]


try:
    # Initialize classifier
    print("Initializing CLIP classifier...")
    classifier = CustomAttributeClassifier()

    # NOTE: this counts every directory entry, so it is only an upper bound
    # on the number of images if the folder holds anything else.
    print(f"\nStarting classification for {len(os.listdir(img_dir))} images...")

    # Run classification over the whole directory for all configured attributes.
    results_df = classifier.classify_images(
        img_dir=img_dir,
        attribute_values=custom_attributes,
        batch_size=16,  # Smaller batch size for stability
    )

    # Save results before summarizing so a summary failure cannot lose them.
    output_path = "celeba_custom_attributes.csv"
    results_df.to_csv(output_path, index=False)
    print(f"\nResults saved to {output_path}")

    # Display, per attribute, how many rows fell into each class.
    print("\nDataset summary:")
    total_rows = len(results_df)
    for attr, labels in custom_attributes.items():
        print(f"\n{attr.capitalize()} distribution:")
        # value_counts() omits missing values, so percentages may sum to
        # less than 100% when some rows have no prediction.
        for class_id, count in results_df[attr].value_counts().items():
            attr_value = resolve_attribute_label(labels, class_id)
            percentage = (count / total_rows) * 100
            print(f"{attr_value}: {count} images ({percentage:.1f}%)")

except Exception as e:
    # Top-level boundary of the notebook cell: keep the cell from raising,
    # but also emit the traceback so the failing step can be located.
    import traceback

    print(f"Error during classification: {e}")
    traceback.print_exc()

Initializing CLIP classifier...
Initializing CLIP model on cuda

Starting classification for 202599 images...
Scanning directory: /home/omrid/Desktop/jungo /projectCLIPvae/celeba_dataset/img_align_celeba/img_align_celeba
Found 202599 images


Classifying images:   1%|          | 119/12663 [00:12<21:50,  9.57it/s]