In [14]:
import os
import pandas as pd
from PIL import Image

In [2]:
# Root folder that holds the label CSV and both image directories.
datasetPath = "data/"

# Every dataset location is derived from the root, in this order:
# label CSV, training image folder, test image folder.
labelsCSVPath, trainImagesPath, testImagesPath = (
    os.path.join(datasetPath, leaf)
    for leaf in ("product_list.csv", "DAM", "test_image_headmind")
)

In [5]:
# Read the product list CSV into a DataFrame.
df = pd.read_csv(labelsCSVPath)

# Number of data rows in the CSV (one DataFrame row per CSV line).
num_lines_csv = df.shape[0]

# Categories are the distinct values of the 'Product_BusinessUnitDesc' column
# (missing values, if any, are not counted by nunique()).
num_categories = df['Product_BusinessUnitDesc'].nunique()

# Report both figures.
print(f"Number of lines: {num_lines_csv}")
print(f"Number of categories: {num_categories}")

Number of lines: 2767
Number of categories: 6


In [6]:
# Distinct business-unit labels, in order of first appearance in the CSV.
categories = pd.unique(df['Product_BusinessUnitDesc'])
print(categories)

['W RTW' 'W SLG' 'W Bags' 'W Shoes' 'Watches' 'W Accessories']


In [7]:
# Tally the regular files sitting directly in the training folder.
# Sub-directories are not counted, and the file type is not inspected.
num_train_images = sum(
    os.path.isfile(os.path.join(trainImagesPath, entry))
    for entry in os.listdir(trainImagesPath)
)
print(f"Number of training images: {num_train_images}")

Number of training images: 2766


In [8]:
# Tally the regular files sitting directly in the test folder.
# Sub-directories are not counted, and the file type is not inspected.
num_test_images = sum(
    os.path.isfile(os.path.join(testImagesPath, entry))
    for entry in os.listdir(testImagesPath)
)
print(f"Number of test images: {num_test_images}")

Number of test images: 80


In [None]:
# Known discrepancy: the CSV has one more row than there are training images.
# That product is ignored for the rest of the project; this cell checks that
# every other CSV row and image file line up, in both directions.
csv_mmcs = set(df['MMC'])

# Image stems (file name without its extension). Only regular files are kept,
# so a stray sub-directory can never be mistaken for an image.
image_files = {
    os.path.splitext(name)[0]
    for name in os.listdir(trainImagesPath)
    if os.path.isfile(os.path.join(trainImagesPath, name))
}

# MMCs listed in the CSV that have no image file on disk.
missing_images = csv_mmcs - image_files

# Reverse check: image files whose stem is not an MMC present in the CSV.
unlabelled_images = image_files - csv_mmcs

# MMCs that have both a CSV row and an image file.
num_matching_data = len(csv_mmcs & image_files)
print(f"Number of matching data: {num_matching_data}")

# Print the MMCs that do not have a corresponding image file
print("MMCs without corresponding image files:")
print(missing_images)

# Only reported when something is wrong, so a clean dataset prints exactly
# the two results above.
if unlabelled_images:
    print("Image files without a corresponding MMC in the CSV:")
    print(unlabelled_images)

Number of matching data: 2766
MMCs without corresponding image files:
{'CD040112A0030000'}


In [16]:
def get_image_sizes(image_path):
    """Collect the pixel dimensions of every file stored directly in a folder.

    Args:
        image_path: Directory to scan. Only regular files directly inside it
            are opened; sub-directories are skipped.

    Returns:
        A list holding one ``img.size`` value per file, in ``os.listdir``
        order. Pillow documents ``size`` as a ``(width, height)`` tuple.
    """
    # Build each full path once, lazily, and keep only regular files.
    candidates = (os.path.join(image_path, entry) for entry in os.listdir(image_path))
    dimensions = []
    for full_path in filter(os.path.isfile, candidates):
        # The context manager releases the file once the size has been read.
        with Image.open(full_path) as picture:
            dimensions.append(picture.size)
    return dimensions

# Size of every training image, one entry per file.
train_image_sizes = get_image_sizes(trainImagesPath)

# Distinct sizes present in the training folder.
train_size_set = set(train_image_sizes)

if not train_size_set:
    # No files were found; set.pop() on an empty set would raise KeyError.
    print("No train images found.")
elif len(train_size_set) > 1:
    # min/max compare the size tuples lexicographically (first element, then
    # second), so this is the smallest and largest tuple, not independent
    # per-dimension bounds.
    print(f"Train image sizes vary. Range: {min(train_size_set)} to {max(train_size_set)}")
else:
    # Read the single size without removing it, so train_size_set keeps its
    # content for any later cell that uses it.
    print(f"All train images have the same size: {next(iter(train_size_set))}")

All train images have the same size: (256, 256)


In [17]:
# Size of every test image, one entry per file.
test_image_sizes = get_image_sizes(testImagesPath)

# Distinct sizes present in the test folder.
test_size_set = set(test_image_sizes)

if not test_size_set:
    # No files were found; set.pop() on an empty set would raise KeyError.
    print("No test images found.")
elif len(test_size_set) > 1:
    # min/max compare the size tuples lexicographically (first element, then
    # second), so this is the smallest and largest tuple, not independent
    # per-dimension bounds.
    print(f"Test image sizes vary. Range: {min(test_size_set)} to {max(test_size_set)}")
else:
    # Read the single size without removing it, so test_size_set keeps its
    # content for any later cell that uses it.
    print(f"All test images have the same size: {next(iter(test_size_set))}")

Test image sizes vary. Range: (1536, 2048) to (4032, 3024)
