In [8]:
import torch
import clip
from PIL import Image
import os
import matplotlib.pyplot as plt  # Import the matplotlib library for image visualization
import json

# Run on the GPU when CUDA is usable; fall back to the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load the CLIP checkpoint named below onto `device`; the call returns the model
# together with the preprocessing callable that is applied to input images
CLIP_MODEL_NAME = "ViT-B/32"
model, preprocess = clip.load(CLIP_MODEL_NAME, device=device)

In [4]:
# Candidate labels the image is scored against. The order matters: the indices
# returned by the top-k step are looked up in this list.
clothing_items = [
    "Saree", "Lehenga", "Women Kurta", "Dupatta", "Gown",
    "Nehru Jacket", "Sherwani", "Men Kurta", "Men Mojari", "Leggings and Salwar",
    "Blouse", "Palazzo", "Dhoti Pants", "Petticoat", "Women Mojari",
]

In [9]:
# Location of the test-set metadata file (parsed one JSON object per line)
json_path = "archive/test_data.json"


In [10]:
# Read the metadata file as JSON Lines: one JSON object per line.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale, and blank lines (e.g. a trailing newline at the end of the file)
# are skipped instead of raising json.JSONDecodeError.
with open(json_path, 'r', encoding='utf-8') as f:
    input_data = [json.loads(line) for line in f if line.strip()]

In [12]:
# Peek at a single record to see which fields each entry carries
input_data[500]

{'image_url': 'https://m.media-amazon.com/images/I/81aTdjyWllL._AC_UL320_.jpg',
 'image_path': 'images/test/500.jpeg',
 'brand': 'Cotton colors',
 'product_title': "Plain Cloth Pieces for Women's Blouse of Silk 1 Meter_H!",
 'class_label': 'blouse'}

In [37]:
# Index of the record in 'input_data' to classify
index_ = 499

# Metadata for the selected record; the lines below read its 'image_path' and
# 'class_label' keys
image_json = input_data[index_]

# Prefix the record's relative 'image_path' with the "archive/" directory
image_path = os.path.join("archive/", image_json['image_path'])

# Class label recorded for this image in the metadata
image_class = image_json['class_label']

# Preprocess the image into a batch of one on the target device (CPU or GPU).
# The context manager closes the underlying file handle as soon as preprocessing
# has consumed the pixels, rather than leaving it open until garbage collection.
with Image.open(image_path) as pil_image:
    image = preprocess(pil_image).unsqueeze(0).to(device)

# Tokenize one "a photo of a <label>" prompt per clothing item in a single call
# (clip.tokenize accepts a list of strings) and move the tokens to the device
text = clip.tokenize([f"a photo of a {c}" for c in clothing_items]).to(device)

In [38]:
# Run the model with gradient tracking disabled
with torch.no_grad():
    # Embeddings for the image and for every label prompt
    image_features, text_features = model.encode_image(image), model.encode_text(text)

    # Joint forward pass yielding image-to-text and text-to-image logits; the
    # image-side logits are turned into per-label probabilities as a NumPy array.
    # NOTE(review): model(image, text) presumably re-runs both encoders
    # internally, duplicating the two calls above - confirm against the CLIP source.
    logits_per_image, logits_per_text = model(image, text)
    probs = torch.softmax(logits_per_image, dim=-1).cpu().numpy()

In [39]:
# Divide each embedding by its norm along the last dimension, in place, so the
# image/text dot products computed from them are cosine similarities
image_features.div_(image_features.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))

In [40]:
# Turn the scaled image-text dot products into a probability distribution over
# the labels, then keep the five most likely labels for the first (only) image
similarity = torch.softmax(100.0 * image_features @ text_features.T, dim=-1)
values, indices = torch.topk(similarity[0], k=5)

In [41]:
# Print the top predictions, one right-aligned "label: probability%" row each.
# The label column is as wide as the longest name in clothing_items (and never
# narrower than 16 characters) so that long labels such as "Leggings and Salwar"
# stay aligned with the other rows.
label_width = max(16, max(map(len, clothing_items), default=0))
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{clothing_items[index]:>{label_width}s}: {100 * value.item():.2f}%")


Top predictions:

           Saree: 57.96%
Leggings and Salwar: 31.03%
         Dupatta: 4.33%
         Lehenga: 3.71%
     Dhoti Pants: 1.10%
