In [24]:
import pickle
import os
import numpy as np
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import io
import random

import torch
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F

import clip
from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation

# Load the image processor and the semantic-segmentation model for the
# "mattmdjaga/segformer_b2_clothes" checkpoint.
# NOTE(review): the Configuration cell further down loads this same checkpoint again as
# `seg_processor` / `seg_model`, and those are the objects the evaluation loop passes to
# get_segmentation_mask — confirm whether `processor` / `model` are still needed.
processor = SegformerImageProcessor.from_pretrained("mattmdjaga/segformer_b2_clothes")
model = AutoModelForSemanticSegmentation.from_pretrained("mattmdjaga/segformer_b2_clothes")

  return func(*args, **kwargs)


# File Organization

In [25]:
def rename_files(source_dir):
    """
    Renames all jpg files in the source directory with their Design Labels.

    The design label is the part of the file name that starts at the second
    occurrence of "VL" and runs up to the next "." (or to the end of the name
    when there is no further "."). Files that do not end in ".jpg", or that
    contain fewer than two "VL" markers, are left untouched.

    A rename is skipped (with a printed notice) when the target name already
    exists, so two source files that map to the same design label can no
    longer silently overwrite each other.

    Parameters:
    source_dir: str, the path to the directory containing the jpg files.

    Returns:
    None
    """

    for file in os.listdir(source_dir):

        # Only jpg files are renamed
        if not file.endswith('.jpg'):
            continue

        # Keep the original extension for the new name
        _, ext = os.path.splitext(file)

        # The first "VL" is skipped; the design label starts at the second one
        first_vl_index = file.find('VL')
        if first_vl_index == -1:
            continue
        start_index = file.find('VL', first_vl_index + 2)
        if start_index == -1:
            continue

        # The label ends at the first "." after its start
        end_index = file.find('.', start_index)
        new_name = file[start_index:end_index] if end_index != -1 else file[start_index:]

        original_file_path = os.path.join(source_dir, file)
        new_file_path = os.path.join(source_dir, new_name + ext)

        # Never clobber an existing file: os.rename would overwrite it on POSIX
        # and raise on Windows.
        if os.path.exists(new_file_path):
            print(f"Skipping {file}: {new_name + ext} already exists")
            continue

        os.rename(original_file_path, new_file_path)

# Helpers

In [26]:
def open_image(image_path, convert_mode):
    """
    Opens an image from the given path and converts it to the requested mode.

    The file is opened inside a context manager so the underlying file handle
    is released as soon as the converted copy has been created (Image.open is
    lazy and otherwise keeps the file open).

    Parameters:
    image_path: str, the path to the image.
    convert_mode: str, the mode to convert the image to. Options are "RGB" and "L".

    Returns:
    image: Image, the opened image converted to convert_mode.

    Raises:
    AssertionError: if convert_mode is not "RGB" or "L".
    """

    assert convert_mode in ["RGB", "L"], "Invalid convert mode. Options are 'RGB' and 'L'."

    # convert() loads the pixel data and returns a new image, so the result
    # stays usable after the source file is closed
    with Image.open(image_path) as source_image:
        image = source_image.convert(convert_mode)

    return image

def display_image(image):
    """
    Shows an image by delegating to its own show() method.

    Parameters:
    image: Image, the image to display.

    Returns:
    None
    """

    # The viewer call's return value is intentionally discarded
    image.show()
    return None

def create_reference_embeddings(source_dir, CLIP_model, CLIP_transform, convert_mode):
    """
    Embeds every file in the source directory and pickles the embeddings together with their labels.

    Each entry of source_dir (except ".DS_Store") is opened with open_image, encoded with
    image_encoder, and its file name is used as the label. The two lists are written to
    '../data/design_embeddings_<convert_mode>.pkl' and '../data/design_labels_<convert_mode>.pkl'.

    Parameters:
    - source_dir: str, the path to the directory containing the images.
    - CLIP_model: CLIP model, the CLIP model to use for encoding.
    - CLIP_transform: CLIP transforms, the CLIP transformation to apply to the images.
    - convert_mode: str, the mode to convert the image to. Options are "RGB" and "L".

    Returns:
    None
    """

    # Parallel lists: embeddings[i] belongs to labels[i]
    design_features_list = []
    design_labels_list = []

    for entry in os.listdir(source_dir):
        # macOS folder metadata is not an image
        if entry != ".DS_Store":
            print(f"Processing {entry}...")

            # Open, convert and embed the image
            loaded_image = open_image(os.path.join(source_dir, entry), convert_mode)
            embedding = image_encoder(loaded_image, CLIP_model, CLIP_transform)

            # Record the embedding and the file name it came from
            design_features_list.append(embedding)
            design_labels_list.append(entry)

    # Persist the embeddings, then the labels
    with open(f'../data/design_embeddings_{convert_mode}.pkl', 'wb') as f:
        pickle.dump(design_features_list, f)
    with open(f'../data/design_labels_{convert_mode}.pkl', 'wb') as f:
        pickle.dump(design_labels_list, f)

def get_palette(num_cls):
    """ Builds the flat color map used to visualize a segmentation mask.

    Every class index is turned into one (R, G, B) triple by spreading the
    index's bits over the three channels, starting at each channel's most
    significant bit.

    Args:
        num_cls: Number of classes
    Returns:
        The color map as a flat list of length num_cls * 3
    """
    palette = []
    for class_index in range(num_cls):
        channels = [0, 0, 0]
        remaining = class_index
        bit_position = 7
        # Consume the index three bits at a time: one bit per channel per round
        while remaining:
            for channel in range(3):
                channels[channel] |= ((remaining >> channel) & 1) << bit_position
            bit_position -= 1
            remaining >>= 3
        palette.extend(channels)
    return palette

# Functions

In [27]:
import torch
import numpy as np

def crop_image(image, model, target_classes=None):
    """
    Crops an image around the objects detected by a YOLO model.

    The image is resized so both sides are multiples of 32 (at least 32),
    run through the model, and one crop is returned for every detection whose
    class name is in target_classes. Box coordinates refer to the resized
    image, and the crops are taken from that resized image.

    Parameters:
    - image: PIL.Image object, the image to crop. Must be in RGB mode.
    - model: YOLO object, the YOLO model to use for detection.
    - target_classes: optional collection of class names to keep. Defaults to
      'person', 'surfboard', 'tie', 'tennis racket', 'sports ball' and 'frisbee'.
      NOTE(review): the non-person classes are presumably common
      misclassifications of patterned fabric — confirm.

    Returns:
    - cropped_images: list, the cropped images. Each element is a PIL.Image object.

    Raises:
    AssertionError: if image is not a PIL.Image or is not in RGB mode.
    """

    # Ensure the image is in RGB format and an Image object
    assert isinstance(image, Image.Image), "image must be a PIL.Image object"
    assert image.mode == "RGB", "Image is not in RGB format"

    if target_classes is None:
        target_classes = ('person', 'surfboard', 'tie', 'tennis racket', 'sports ball', 'frisbee')
    target_classes = set(target_classes)

    # Resize the image to make its dimensions divisible by 32; clamp to 32 so
    # images smaller than 32 px on a side do not end up with a zero dimension
    new_width = max(32, (image.width // 32) * 32)
    new_height = max(32, (image.height // 32) * 32)
    image = image.resize((new_width, new_height))

    # Convert the image to a normalized (1, C, H, W) float tensor
    img = torch.from_numpy(np.array(image)).float()
    img /= 255.0
    img = img.permute((2, 0, 1)).unsqueeze(0)

    # Inference
    results = model(img, verbose=False)
    boxes = results[0].boxes.xyxy.cpu().tolist()
    classes = results[0].boxes.cls.cpu().tolist()

    cropped_images = []

    for box, cls in zip(boxes, classes):
        # tolist() yields floats; the class table is looked up by integer id
        class_name = model.names[int(cls)]

        if class_name in target_classes:
            cropped_images.append(image.crop((box[0], box[1], box[2], box[3])))
        else:
            print(f"Skipping {class_name}")

    return cropped_images

def image_encoder(image, model, transform, device=None):
    """
    Use CLIP model to encode the image.

    Parameters:
    - image: PIL.Image object, the image to encode.
    - model: CLIP model; must provide eval(), to() and encode_image().
    - transform: callable that turns the image into a tensor (the preprocessing
      returned alongside the CLIP model).
    - device: optional device to run on. When omitted, the notebook-level
      DEVICE constant is used, which must be defined before the first call.

    Returns:
    - image_features: torch.Tensor, the encoded image (batch dimension of 1).
    """

    # Fall back to the notebook-wide device when none is given
    if device is None:
        device = DEVICE

    # Put the model in inference mode on the target device
    model = model.eval().to(device)

    # Preprocess the image and add the batch dimension
    batch = transform(image).unsqueeze(0).to(device)

    # Encode the image without tracking gradients
    with torch.no_grad():
        image_features = model.encode_image(batch)

    return image_features

def get_segmentation_mask(image, processor, model, label_ids=(4, 5, 6, 7, 8, 16, 17)):
    """
    Function to segment clothes in an image.

    The model's logits are upsampled to the image resolution, reduced to a
    per-pixel class id, and then binarized: pixels whose class id is in
    label_ids become 255, all other pixels become 0.

    Parameters:
    - image: PIL.Image object, the image to segment.
    - processor: SegformerImageProcessor object, the processor used to preprocess the image.
    - model: AutoModelForSemanticSegmentation object, the model used to segment the image.
    - label_ids: iterable of int, the class ids to keep in the mask.
      NOTE(review): for the "mattmdjaga/segformer_b2_clothes" checkpoint used in this
      notebook the defaults presumably are upper-clothes, skirt, pants, dress, belt,
      bag and scarf — confirm against the model card.

    Returns:
    - pred_seg: torch.Tensor of shape (H, W) holding 255 for kept pixels and 0 elsewhere.
    """
    inputs = processor(images=image, return_tensors="pt")

    # Inference only: skip autograd bookkeeping, as image_encoder does
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits.cpu()

    # PIL reports size as (W, H); interpolate expects (H, W)
    upsampled_logits = nn.functional.interpolate(
        logits,
        size=image.size[::-1],
        mode="bilinear",
        align_corners=False,
    )

    pred_seg = upsampled_logits.argmax(dim=1)[0]

    # Build the mask from the class ids BEFORE overwriting pred_seg
    mask = torch.zeros_like(pred_seg, dtype=torch.bool)
    for label_id in label_ids:
        mask |= pred_seg == label_id

    # Binarize: everything outside the selected labels is 0, the rest is 255
    pred_seg[~mask] = 0
    pred_seg[mask] = 255

    return pred_seg
    
    # plt.imshow(pred_seg)

## Triplet Functions

In [33]:
def image_to_tensor(image):
    """
    Turns a PIL image into a tensor using torchvision's ToTensor transform.

    Parameters:
    image: PIL Image, the image to convert.

    Returns:
    tensor: Tensor, the converted tensor.
    """

    to_tensor = transforms.ToTensor()
    return to_tensor(image)

def tensor_to_image(tensor):
    """
    Turns a tensor into a PIL image using torchvision's ToPILImage transform.

    Parameters:
    tensor: Tensor, the tensor image to convert.

    Returns:
    image: PIL Image, the converted image.
    """

    to_pil = transforms.ToPILImage()
    return to_pil(tensor)


def downsample_and_upsample(image_tensor, downsample_level=5):
    """
    Downsamples an input tensor to a specified level and then upsamples it to the original size.

    Both steps use bilinear interpolation, so the result has the input's shape
    but has lost fine detail. A proper range for downsample_level is 5 to 10.

    Parameters:
    image_tensor: Tensor, the input image tensor. The last two dimensions are
        treated as (height, width); a batch dimension is added internally.
    downsample_level: int, the factor by which to downsample. Must be >= 1.

    Returns:
    upsampled_tensor: Tensor, the upsampled image tensor (same shape as the input).

    Raises:
    ValueError: if downsample_level is smaller than 1.
    """
    if downsample_level < 1:
        raise ValueError("downsample_level must be >= 1")

    # Get the original size of the image tensor
    original_size = tuple(image_tensor.shape[-2:])

    # Never shrink a side below one pixel: interpolate rejects a zero-sized
    # target, which used to happen when a side was smaller than the factor
    downsampled_size = tuple(max(1, int(side // downsample_level)) for side in original_size)

    # Downsample, then upsample back to the original size
    batched = image_tensor.unsqueeze(0)
    downsampled = F.interpolate(batched, size=downsampled_size, mode='bilinear', align_corners=False)
    upsampled = F.interpolate(downsampled, size=original_size, mode='bilinear', align_corners=False)

    return upsampled.squeeze(0)


def gaussian_blur(image_tensor, kernel_size=5, sigma=1.0):
    """
    Blurs a tensor with torchvision's GaussianBlur transform.

    Parameters:
    image_tensor: Tensor, the input image tensor.
    kernel_size: int, the size of the Gaussian kernel.
    sigma: float, the standard deviation of the Gaussian kernel.

    Returns:
    blurred_tensor: Tensor, the blurred image tensor.
    """
    # Build the transform with the requested kernel, then run it once
    blur_transform = transforms.GaussianBlur(kernel_size=kernel_size, sigma=sigma)
    return blur_transform(image_tensor)


def random_jpeg_compression(image_tensor, min_quality=30, max_quality=70):
    """
    Round-trips a tensor through an in-memory JPEG at a random quality to introduce compression artifacts.

    Parameters:
    image_tensor: Tensor, the input image tensor.
    min_quality: int, the minimum JPEG quality.
    max_quality: int, the maximum JPEG quality.

    Returns:
    compressed_tensor: Tensor, the compressed image tensor.
    """
    # Tensor -> PIL so it can be written as a JPEG
    pil_image = transforms.ToPILImage()(image_tensor)

    # Draw the quality level (both bounds inclusive)
    jpeg_quality = random.randint(min_quality, max_quality)

    # Encode into memory and rewind so the bytes can be read back
    jpeg_bytes = io.BytesIO()
    pil_image.save(jpeg_bytes, format='JPEG', quality=jpeg_quality)
    jpeg_bytes.seek(0)

    # Decode the JPEG and go back to a tensor
    decoded_image = Image.open(jpeg_bytes)
    return transforms.ToTensor()(decoded_image)


def random_mask(image_tensor, mask_size=500, area_to_mask=4000000):
    """
    Randomly masks out square regions of the image tensor by setting them to zero.

    The number of masks is derived from the requested area:
    area_to_mask // (area of one mask). With the defaults (500 px masks,
    4,000,000 px of area) that is 16 masks. Masks may overlap, so the area
    actually zeroed can be smaller than area_to_mask. A proper range for
    mask_size is 500 to 1000.

    A mask larger than the image is clipped to the image size instead of
    raising, so small images can be processed too.

    Parameters:
    image_tensor: Tensor, the input image tensor with shape (C, H, W).
    mask_size: int, the side length of each square mask in pixels. Must be > 0.
    area_to_mask: int, the total area (in pixels) the masks should add up to.

    Returns:
    masked_tensor: Tensor, the masked image tensor. The input is not modified.

    Raises:
    ValueError: if mask_size is not positive.
    """
    if mask_size <= 0:
        raise ValueError("mask_size must be a positive integer")

    # Get the dimensions of the image tensor
    _, height, width = image_tensor.shape

    # Work on a copy so the caller's tensor stays intact
    masked_tensor = image_tensor.clone()

    # Clip the mask to the image; random.randint(0, negative) would raise otherwise
    mask_height = min(mask_size, height)
    mask_width = min(mask_size, width)
    mask_area = mask_height * mask_width

    # Number of masks needed to cover the requested area
    num_masks = area_to_mask // mask_area if mask_area else 0

    print(f"Number of masks: {num_masks}")

    for _ in range(num_masks):
        # Randomly select the top-left corner so the mask stays inside the image
        top = random.randint(0, height - mask_height)
        left = random.randint(0, width - mask_width)

        # Zero the selected region in every channel
        masked_tensor[:, top:top + mask_height, left:left + mask_width] = 0

    return masked_tensor


def add_synthetic_shadows(image_tensor, num_shadows=3, shadow_intensity=0.5, shadow_color=(0, 0, 0)):
    """
    Adds synthetic shadows to an image tensor to mimic uneven lighting conditions.

    Each shadow is a randomly placed, randomly rotated ellipse. Inside the
    ellipse every pixel is blended towards shadow_color:
    pixel * (1 - shadow_intensity) + shadow_color * shadow_intensity.

    Parameters:
    - image_tensor (torch.Tensor): The input image tensor with shape (C, H, W).
    - num_shadows (int): Number of shadow shapes to add.
    - shadow_intensity (float): The intensity of the shadows (0 = no shadow, 1 = fully shadow_color).
    - shadow_color (tuple): The color of the shadow, one value per channel (RGB by default).
      Only the first min(C, len(shadow_color)) channels are shaded.

    Returns:
    - torch.Tensor: The image tensor with synthetic shadows. The input is not modified.
    """

    num_channels, H, W = image_tensor.shape
    shadow_image = image_tensor.clone()
    device = shadow_image.device

    # Shade only the channels that have a matching color component
    shaded_channels = min(num_channels, len(shadow_color))
    color = torch.tensor(
        [float(component) for component in shadow_color[:shaded_channels]],
        dtype=torch.float32,
        device=device,
    ).view(-1, 1, 1)

    # The pixel grid does not change between shadows, so build it once
    Y, X = torch.meshgrid(
        torch.arange(H, dtype=torch.float32, device=device),
        torch.arange(W, dtype=torch.float32, device=device),
        indexing='ij',
    )

    for _ in range(num_shadows):
        # Randomly generate an ellipse; semi-axes are kept >= 1 so tiny images
        # cannot produce a zero axis (division by zero)
        center_x = np.random.randint(0, W)
        center_y = np.random.randint(0, H)
        min_axis_x = max(1, W // 8)
        min_axis_y = max(1, H // 8)
        axis_x = np.random.randint(min_axis_x, max(min_axis_x + 1, W // 2))
        axis_y = np.random.randint(min_axis_y, max(min_axis_y + 1, H // 2))

        # The angle is drawn in degrees; cos/sin need radians
        theta = np.deg2rad(np.random.uniform(0, 180))
        cos_t = float(np.cos(theta))
        sin_t = float(np.sin(theta))

        # Rotated ellipse equation: values <= 1 lie inside the ellipse
        dx = X - center_x
        dy = Y - center_y
        ellipse = ((dx * cos_t + dy * sin_t) ** 2) / axis_x ** 2 + \
                  ((dx * sin_t - dy * cos_t) ** 2) / axis_y ** 2
        mask = ellipse <= 1

        # Blend the masked pixels towards the shadow color, all channels at once
        region = shadow_image[:shaded_channels]
        blended = (region * (1 - shadow_intensity) + color * shadow_intensity).to(shadow_image.dtype)
        shadow_image[:shaded_channels] = torch.where(mask, blended, region)

    return shadow_image

## Test Cell

In [42]:
# Load an image and convert to tensor
# The path is relative to the notebook, matching the "../data/designs" directory used in the
# Configuration cell, so the cell does not depend on one machine's absolute home directory.
# open_image forces RGB, which the augmentation functions below expect (3 channels).
image = open_image(os.path.join("..", "data", "designs", "VL00815.jpg"), "RGB")
image_tensor = transforms.ToTensor()(image)

In [11]:
# Blur the image by shrinking it 10x and scaling it back up
downsampled_upsampled_tensor = downsample_and_upsample(image_tensor, downsample_level=10)
print("Done!")

# Turn the result into a PIL image and open it in the default viewer
downsampled_upsampled_image = tensor_to_image(downsampled_upsampled_tensor)
display_image(downsampled_upsampled_image)

Done!


In [32]:
# Re-encode the image as a JPEG with a random quality between 30 and 50
compressed_tensor = random_jpeg_compression(image_tensor, min_quality=30, max_quality=50)
print("Done!")

# Turn the result into a PIL image and open it in the default viewer
compressed_image = tensor_to_image(compressed_tensor)
display_image(compressed_image)

Done!


In [23]:
# Zero out random 900 px squares of the image
masked_tensor = random_mask(image_tensor, mask_size=900)
print("Done!")

# Turn the result into a PIL image and open it in the default viewer
masked_image = tensor_to_image(masked_tensor)
display_image(masked_image)

Number of masks: 4
Done!


In [35]:
# Add a single, very faint synthetic shadow to the image
shadowed_tensor = add_synthetic_shadows(image_tensor, shadow_intensity=0.0005, num_shadows=1)
print("Done!")

# Turn the result into a PIL image and open it in the default viewer
shadowed_image = tensor_to_image(shadowed_tensor)
display_image(shadowed_image)

Done!


In [44]:
# Add synthetic shadows using the function's default settings
shadowed_tensor = add_synthetic_shadows(image_tensor)
print("Done!")

# Turn the result into a PIL image and open it in the default viewer
shadowed_image = tensor_to_image(shadowed_tensor)
display_image(shadowed_image)

Done!


# Configuration

In [5]:
# Pick the best available accelerator instead of assuming Apple-silicon MPS:
# MPS first (the original setting), then CUDA, then CPU.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Source directory containing the scraped folders
source_dir = "../data/designs"

# Color mode every image is converted to before embedding ("RGB" or "L")
convert_mode = "RGB"

# Load the CLIP model (image_encoder moves it to DEVICE on use)
CLIP_model, CLIP_transform = clip.load("ViT-L/14@336px")

# Segmentation model initialization
seg_processor = SegformerImageProcessor.from_pretrained("mattmdjaga/segformer_b2_clothes")
seg_model = AutoModelForSemanticSegmentation.from_pretrained("mattmdjaga/segformer_b2_clothes")




# # Create the reference embeddings
# create_reference_embeddings(source_dir, CLIP_model, CLIP_transform, convert_mode=convert_mode)

# # Load the YOLO model
# YOLO_model = YOLO("yolov10x.pt").to(DEVICE)

In [6]:
# Load the design database embeddings and labels
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files that were
# produced by create_reference_embeddings on a trusted machine.
with open(f'../data/design_embeddings_{convert_mode}.pkl', 'rb') as f:
    design_embeddings = pickle.load(f)
with open(f'../data/design_labels_{convert_mode}.pkl', 'rb') as f:
    design_labels = pickle.load(f)

# The retrieval loop maps similarity indices back to labels, so the two lists must line up
assert len(design_embeddings) == len(design_labels), \
    "Embeddings and labels are out of sync; regenerate them with create_reference_embeddings."

print(f'Total number of embeddings: {len(design_embeddings)}')
print(f'Type of design embeddings: {type(design_embeddings)}')
print(f'Design Labels: {design_labels}')
print(f'Length of Design Labels: {len(design_labels)}')

Total number of embeddings: 31
Type of design embeddings: <class 'list'>
Design Labels: ['VL0H516.jpg', 'VL00562.jpg', 'VL03916.jpg', 'VL00760.jpg', 'VL49600.jpg', 'VL58650.jpg', 'VL08932.jpg', 'VL44050.jpg', 'VL00564.jpg', 'VL54350.jpg', 'VL08759.jpg', 'VL03816.jpg', 'VL03784.jpg', 'VL02918.jpg', 'VL03541.jpg', 'VL03999.jpg', 'VL48350.jpg', 'VL73650.jpg', 'VL8870.jpg', 'VL04009.jpg', 'VL2961R.jpg', 'VLH1167.jpg', 'VL01201.jpg', 'VLA0020.jpg', 'VLS8589.jpg', 'VL80021.jpg', 'VL2961Rotated.jpg', 'VL04490.jpg', 'VL00815.jpg', 'VL00633.jpg', 'VL65450.jpg']
Length of Design Labels: 31


In [7]:
# Evaluate retrieval: for every model photo, keep only the clothing pixels, embed the result
# with CLIP and check whether the matching design is among the k most similar reference designs.
source_dir = "../data/models"
sub_files = os.listdir(source_dir)

# Make sure the output directory for the clothing-only images exists before saving into it
filtered_dir = "../data/filtered_images"
os.makedirs(filtered_dir, exist_ok=True)

# Number of nearest designs to retrieve, capped so topk never asks for more entries than exist
k = min(15, len(design_embeddings))

# Initialize the vars to keep track of stats
match = 0
ds_strore_count = 0
failed_files = []
for file in sub_files:
    if file == ".DS_Store":
        ds_strore_count += 1
        continue

    # Load the model photo in the configured color mode
    image_path = os.path.join(source_dir, file)
    image = open_image(image_path, convert_mode=convert_mode)

    # Get cloth segmentation mask (255 = clothing, 0 = everything else) as a uint8 array
    segmented_image = get_segmentation_mask(image, seg_processor, seg_model)
    segmented_image = segmented_image.cpu().numpy().astype(np.uint8)

    # Repeat the mask over three channels and black out every non-clothing pixel
    segmented_image_3ch = np.stack([segmented_image] * 3, axis=-1)
    filtered_image_np = np.where(segmented_image_3ch == 255, np.array(image), 0)

    # Convert the filtered image back to PIL format (uint8 H x W x 3 is read as RGB)
    filtered_image = Image.fromarray(filtered_image_np.astype(np.uint8))

    # Save the filtered image; splitext handles extensions of any length
    file_stem = os.path.splitext(file)[0]
    filtered_image.save(os.path.join(filtered_dir, f"{file_stem}_filtered.jpg"))

    # Embed the image
    image_features = image_encoder(filtered_image, CLIP_model, CLIP_transform)

    # Cosine similarity against every design embedding, arranged as one row per query
    similarities = [torch.nn.functional.cosine_similarity(image_features, t) for t in design_embeddings]
    similarities = torch.stack(similarities)

    # Get the k most similar designs and their labels
    top_k_similarities = similarities.T.topk(k)
    top_k_design_labels = [design_labels[i] for i in top_k_similarities.indices[0]]

    # A hit means some retrieved label shares its first six characters with the file name.
    # NOTE(review): most design labels are seven characters long (e.g. 'VL00562' vs 'VL00564'),
    # so a six-character prefix can count a different design as a match — confirm this is intended.
    is_match = any(design_label[:6] == file[:6] for design_label in top_k_design_labels)

    if is_match:
        match += 1
        print(f"MATCH: {match} in {file}   || Top K similarity values: {top_k_similarities.values}")
    else:
        print(f"{file}")
        print(f"Top K similarity values: {top_k_similarities.values}")
        failed_files.append(file)


print(f"Match: {match}/{len(sub_files)-ds_strore_count}")
print(f"Failed files: {failed_files}")

MATCH: 1 in VL0H516.jpg   || Top K similarity values: tensor([[0.7430, 0.7414, 0.7175, 0.7074, 0.7039, 0.6978, 0.6916, 0.6910, 0.6760, 0.6747, 0.6700, 0.6664, 0.6644, 0.6595, 0.6570]], device='mps:0')
VLXXXBeatlesCROPPED.png
Top K similarity values: tensor([[0.7188, 0.7029, 0.7028, 0.6957, 0.6882, 0.6856, 0.6838, 0.6761, 0.6751, 0.6749, 0.6631, 0.6555, 0.6530, 0.6517, 0.6457]], device='mps:0')
MATCH: 2 in VL00562.jpg   || Top K similarity values: tensor([[0.7314, 0.7084, 0.7044, 0.7041, 0.7018, 0.7013, 0.7007, 0.6967, 0.6908, 0.6893, 0.6864, 0.6807, 0.6787, 0.6734, 0.6636]], device='mps:0')
MATCH: 3 in VL03916.jpg   || Top K similarity values: tensor([[0.7601, 0.7437, 0.7270, 0.7234, 0.7125, 0.7038, 0.7004, 0.6951, 0.6946, 0.6915, 0.6892, 0.6889, 0.6884, 0.6861, 0.6843]], device='mps:0')
MATCH: 4 in VL00760.jpg   || Top K similarity values: tensor([[0.7787, 0.7510, 0.7396, 0.7393, 0.7377, 0.7325, 0.7315, 0.7202, 0.7202, 0.7195, 0.7174, 0.7058, 0.7040, 0.7003, 0.6964]], device='mps:0')
