In [1]:
import os
import torch
import clip
import pandas as pd
from PIL import Image
from torchvision import transforms

# Extract one CLIP image embedding per image file found under `image_folder`
# (searched recursively) and write them to a CSV with one row per image:
# image_path, feature_0, feature_1, ...

# Load the CLIP model; run on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Set the path to your image archive
image_folder = "/Users/n.eltahhan/Downloads/archive"

# Rows of [image_path, feature values...] collected during the walk.
image_data = []

# File name suffixes (compared lower-cased) that are treated as images.
image_extensions = (".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif")

# Stop with an exception rather than exit(): raising ends this cell with a
# visible error, while exit() is aimed at the interpreter/session itself.
# NOTE(review): exact exit() behaviour depends on the notebook front end.
# isdir() also rejects a path that exists but is a regular file.
if not os.path.isdir(image_folder):
    raise FileNotFoundError(f"Error: The folder '{image_folder}' does not exist.")

# Walk through all subfolders and find image files
for root, _, files in os.walk(image_folder):
    for img_name in files:
        img_path = os.path.join(root, img_name)

        # Skip non-image files
        if not img_name.lower().endswith(image_extensions):
            print(f"Skipping non-image file: {img_name}")
            continue

        try:
            # Open the file in a context manager so its handle is released
            # as soon as preprocessing has produced the input tensor.
            with Image.open(img_path) as pil_image:
                image = preprocess(pil_image).unsqueeze(0).to(device)

            # Extract CLIP features (inference only, so no gradients).
            with torch.no_grad():
                features = model.encode_image(image).cpu().numpy().flatten()

            # Store image path + features as one flat row.
            image_data.append([img_path] + features.tolist())
            print(f"Processed: {img_path}")

        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error processing {img_path}: {e}")

# Nothing to save if every file was skipped or failed.
if not image_data:
    raise RuntimeError("No valid images were processed. Check image formats or permissions.")

# Create a DataFrame; the first column is the path, the rest are features.
num_features = len(image_data[0]) - 1
columns = ["image_path"] + [f"feature_{i}" for i in range(num_features)]
df = pd.DataFrame(image_data, columns=columns)

# Save to CSV
output_csv = "image_features.csv"
df.to_csv(output_csv, index=False)

print(f"Feature extraction complete! Data saved to {output_csv}")

Skipping non-image file: .DS_Store
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1502.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_811.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1264.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_805.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1516.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1258.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_193.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_187.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_839.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_178.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_636.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_150.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_144.jpg
Processed: /Users/n.eltahhan/Downloads/archive/wadrobe/image_622

In [3]:
from PIL import Image

# Manual spot check: try to open and display one image to see whether
# Pillow can read it at all.
image_path = "/Users/n.eltahhan/Downloads/archive/wadrobe/image_1462.jpg"

try:
    # The context manager closes the file handle once the image has been
    # handed to the viewer, instead of leaving it open in the kernel.
    with Image.open(image_path) as img:
        img.show()  # Opens the image to check if it's viewable
except Exception as e:
    print(f"Image is unreadable: {e}")

Image is unreadable: cannot identify image file '/Users/n.eltahhan/Downloads/archive/wadrobe/image_1462.jpg'


In [4]:
import os
from PIL import Image, UnidentifiedImageError

# Walk `image_folder` recursively, verify every image file with Pillow and
# permanently delete the ones that cannot be read as images.
# WARNING: destructive — removed files are not moved to a trash folder.

# Set the path to your image dataset
image_folder = "/Users/n.eltahhan/Downloads/archive"

# Counter for deleted files
deleted_count = 0

# File name suffixes (compared lower-cased) that are treated as images.
image_extensions = (".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif")

# Walk through all subfolders and delete corrupted images
for root, _, files in os.walk(image_folder):
    for img_name in files:
        img_path = os.path.join(root, img_name)

        # Skip non-image files
        if not img_name.lower().endswith(image_extensions):
            continue

        try:
            # Try opening the image
            with Image.open(img_path) as img:
                img.verify()  # Verify if the image is valid
        except (PermissionError, FileNotFoundError) as e:
            # The file could not be accessed, which says nothing about its
            # contents, so it must not be treated as corrupted and deleted.
            print(f"Skipping inaccessible file (not deleted): {img_path}: {e}")
            continue
        except (UnidentifiedImageError, OSError, SyntaxError):
            # NOTE(review): SyntaxError is included because verify() may raise
            # exception types other than OSError for damaged data — confirm
            # against the installed Pillow version.
            try:
                os.remove(img_path)
            except OSError as e:
                # A failed delete should not abort the rest of the sweep.
                print(f"Could not delete {img_path}: {e}")
                continue
            deleted_count += 1
            print(f"Deleted corrupted image: {img_path}")

print(f"Finished! Total corrupted images deleted: {deleted_count}")

Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1462.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_585.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_751.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_424.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_912.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1372.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1164.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1554.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1053.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_238.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_36.jpg
Deleted corrupted image: /Users/n.eltahhan/Downloads/archive/wadrobe/image_1268.jpg

In [5]:
import pandas as pd

# Read the saved feature table back from disk.
features_csv = "image_features.csv"
df = pd.read_csv(features_csv)

# Print the first five rows as a quick sanity check.
preview = df.head()
print(preview)

                                          image_path  feature_0  feature_1  \
0  /Users/n.eltahhan/Downloads/archive/wadrobe/im...  -0.135900   0.182124   
1  /Users/n.eltahhan/Downloads/archive/wadrobe/im...   0.118107   0.128994   
2  /Users/n.eltahhan/Downloads/archive/wadrobe/im...  -0.165156   0.366288   
3  /Users/n.eltahhan/Downloads/archive/wadrobe/im...   0.150303  -0.043835   
4  /Users/n.eltahhan/Downloads/archive/wadrobe/im...  -0.285136   0.221046   

   feature_2  feature_3  feature_4  feature_5  feature_6  feature_7  \
0   0.149584   0.140109   0.529317  -0.454870  -0.008258   0.348099   
1   0.122769  -0.145706   0.509572  -0.087755  -0.085169   0.259595   
2   0.137222   0.104013   0.274260   0.108726  -0.370014  -0.016387   
3  -0.558581  -0.126324   0.182259  -0.274206  -0.012932   0.064413   
4   0.122178   0.066603   0.218777   0.145738  -0.182190   0.042205   

   feature_8  ...  feature_502  feature_503  feature_504  feature_505  \
0  -0.104106  ...     0.776255 

In [7]:
import faiss
import numpy as np
import pandas as pd
import torch
import clip
from PIL import Image

# Rebuild the similarity-search index from the saved feature table:
# load CLIP, read the CSV, and add every embedding to a flat L2 FAISS index.

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load image features from CSV
df = pd.read_csv("image_features.csv")

# Extract image paths (first column) and embeddings (all remaining columns).
image_paths = df["image_path"].values

# Force a C-contiguous float32 matrix: the array handed back by pandas is not
# guaranteed to have that memory layout, and the index is fed the raw buffer.
image_embeddings = np.ascontiguousarray(df.iloc[:, 1:].values, dtype="float32")

# Create FAISS index for fast similarity search; the index dimension is the
# number of feature columns.
index = faiss.IndexFlatL2(image_embeddings.shape[1])
index.add(image_embeddings)

# Report what the index actually holds rather than the size of the input.
print(f"FAISS index created with {index.ntotal} items.")

FAISS index created with 11641 items.


In [9]:
# Import smoke test: if any of these packages is missing, the corresponding
# import raises and the message below is never printed. This only shows the
# packages are importable in this kernel; it does not exercise them.
import streamlit
import faiss
import torch
print("All packages installed successfully!")

All packages installed successfully!
