In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from PIL import Image, ImageOps
import sys
import re
from sklearn.model_selection import GroupShuffleSplit
import os
from tqdm import tqdm

# Let pandas show up to 500 rows when a DataFrame is displayed in the notebook.
pd.set_option('display.max_rows', 500)


# Preprocess CSV:

### Fabric types:
  0 denim, 1 cotton, 2 leather, 3 furry, 4 knitted, 5 chiffon, 6 other, 7 NA

### Pattern types:
  0 floral, 1 graphic, 2 striped, 3 pure color, 4 lattice, 5 other, 6 color block, 7 NA

In [48]:
# Load the three annotation tables. Column names are supplied explicitly through `names=`.

# Fabric data: one fabric code per garment slot (upper / lower / outer) for each image.
fabric_headers = ['img_name', 'upper_fabric', 'lower_fabric', 'outer_fabric']
fabric_ann = pd.read_csv('/vast/amr10211/cv_data/labels/texture/fabric_ann.txt', delimiter=' ', names=fabric_headers)

# Pattern data: one pattern code per garment slot (upper / lower / outer) for each image.
pattern_headers = ['img_name', 'upper_pattern', 'lower_pattern', 'outer_pattern']
pattern_ann = pd.read_csv('/vast/amr10211/cv_data/labels/texture/pattern_ann.txt', delimiter=' ', names=pattern_headers)

# Keypoints data (for filtering for full body images):
# the image name followed by 21 coordinate pairs x_1, y_1, ..., x_21, y_21.
keypoint_headers = ['img_name'] + [f'{axis}_{point}' for point in range(1, 22) for axis in ('x', 'y')]
# `delim_whitespace=True` is deprecated in recent pandas releases; `sep=r'\s+'` is the
# documented equivalent and avoids the FutureWarning.
keypoints_loc = pd.read_csv('/vast/amr10211/cv_data/keypoints/keypoints_loc.txt', sep=r'\s+', names=keypoint_headers)

# Names of all images that have a keypoint annotation; used later as the filter set.
img_names = keypoints_loc['img_name'].unique()


  keypoints_loc = pd.read_csv('/vast/amr10211/cv_data/keypoints/keypoints_loc.txt', delim_whitespace=True, names=keypoint_headers)


In [49]:
# Merge and filter data

# Join fabric and pattern annotations on the image name (default inner join).
data_df = fabric_ann.merge(pattern_ann, on='img_name')

# Filter for full body images: keep only rows whose image also appears in the keypoints file.
has_keypoints = data_df['img_name'].isin(img_names)
data_df = data_df[has_keypoints]


In [50]:
def get_image_id(img_name):
    """Return the part of ``img_name`` that comes before its last dash.

    Example: 'MEN-Denim-id_00000080-01_7_additional.jpg' -> 'MEN-Denim-id_00000080'.

    Note: when the name contains no dash, ``rfind`` yields -1 and the slice
    therefore drops only the final character.
    """
    return img_name[:img_name.rfind("-")]

# Derive the img_id column from each row's file name (text before the last dash).
data_df['img_id'] = data_df['img_name'].apply(get_image_id)

In [51]:
def get_image_num(img_name):
    """Return the text between the last dash and the first underscore after it.

    Example: 'MEN-Denim-id_00000080-01_7_additional.jpg' -> '01'.
    """
    dash_pos = img_name.rfind("-")
    # Search for the underscore starting at the dash so earlier underscores are ignored.
    underscore_pos = img_name.find("_", dash_pos)
    return img_name[dash_pos + 1:underscore_pos]

# Derive the img_num column from each row's file name (text between the last dash and the next underscore).
data_df['img_num'] = data_df['img_name'].apply(get_image_num)

In [52]:
def get_image_view(img_name):
    """Return the text between the second underscore after the last dash and the last period.

    Example: 'MEN-Denim-id_00000080-01_7_additional.jpg' -> 'additional'.
    """
    dot_pos = img_name.rfind(".")
    dash_pos = img_name.rfind("-")

    # First underscore following the last dash, then the underscore after that one.
    first_underscore = img_name.find("_", dash_pos)
    second_underscore = img_name.find("_", first_underscore + 1)

    return img_name[second_underscore + 1:dot_pos]

    
# Derive the img_view column from each row's file name (text after the second underscore following the last dash, up to the extension).
data_df['img_view'] = data_df['img_name'].apply(get_image_view)

In [53]:
def get_segm_name(img_name):
    """Build the segmentation file name for an image file name.

    The last four characters of ``img_name`` are removed and '_segm.png' is
    appended, e.g. 'abc.jpg' -> 'abc_segm.png'.
    """
    stem = img_name[:-4]
    return f"{stem}_segm.png"

# Derive the expected segmentation file name for every image.
data_df['segm_name'] = data_df['img_name'].apply(get_segm_name)

In [54]:
# Filter out images without segmentations.
# NOTE: this directory is the output_directory of the segmentation preprocessing cell
# further down in this notebook, so that cell must have been run beforehand.
segm_dir = '/vast/amr10211/cv_data/segm_preprocessed'

# Keep regular files only (sub-directories, if any, are skipped).
segm_names = [
    entry
    for entry in os.listdir(segm_dir)
    if os.path.isfile(os.path.join(segm_dir, entry))
]

has_segmentation = data_df['segm_name'].isin(segm_names)
data_df = data_df[has_segmentation]

In [55]:
# Drop every row that is missing at least one of the six fabric/pattern labels.
columns_to_check = [
    'upper_fabric', 'lower_fabric', 'outer_fabric',
    'upper_pattern', 'lower_pattern', 'outer_pattern',
]
data_df = data_df.dropna(subset=columns_to_check, how='any')

In [56]:
# Save the merged, filtered annotation table into the current working directory.
# NOTE(review): a later cell reads '../data/full_data.csv' (a different path), and another
# later cell writes 'full_data.csv' again with the per-garment label table. Confirm which
# file each step is meant to use.
data_df.to_csv('full_data.csv')

In [57]:
# Build one row per masked garment image and attach the image-level annotations.
img_dir = '/vast/amr10211/cv_data/masked_images'
masked_image_list = [f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))]

# segm_name is recovered by cutting the last 6 characters of the masked file name
# and putting '.png' back, e.g. '..._segm_1.png' -> '..._segm.png'.
masked_data_df = pd.DataFrame({'masked_img_name': masked_image_list}).assign(
    segm_name=lambda frame: frame['masked_img_name'].str[:-6] + '.png'
)

# NOTE(review): the annotation table is read from '../data/full_data.csv', while the cell
# that saves data_df writes 'full_data.csv' in the working directory. Confirm the path.
data = pd.read_csv( '../data/full_data.csv')
# Left join keeps every masked image; rows without annotations get NaN in the joined columns.
data = masked_data_df.merge(data, on='segm_name', how='left')

In [58]:
# Encode the clothing type from the digit that comes right before ".png" in masked_img_name:
# 1 (upper)
# 2 (outer)
# 3 (lower)
# 4 (lower)
# 5 (upper)
# 6 (upper)

# .str[-5] selects the single character in front of the 4-character ".png" suffix.
data['clothing_type'] = data['masked_img_name'].str[-5].astype(int)

def map_fabric_pattern(data):
    """Copy the fabric/pattern labels of the matching garment slot into 'fabric' and 'pattern'.

    ``data`` is one row (anything supporting item get/set). Its 'clothing_type'
    selects the slot: 1, 5, 6 -> upper; 3, 4 -> lower; 2 -> outer. For any other
    clothing type the row is returned without 'fabric'/'pattern' being set.
    The row is modified in place and also returned.
    """
    slot_by_type = {
        1: 'upper', 5: 'upper', 6: 'upper',
        3: 'lower', 4: 'lower',
        2: 'outer',
    }
    slot = slot_by_type.get(int(data['clothing_type']))

    if slot is not None:
        data['fabric'] = data[slot + '_fabric']
        data['pattern'] = data[slot + '_pattern']

    return data

# Columns kept for the per-garment label table.
label_columns = ['masked_img_name', 'clothing_type', 'fabric', 'pattern', 'segm_name', 'img_id']

# Resolve fabric/pattern row by row, keep only the label columns, drop rows with any
# missing value among them, and renumber the index from 0.
data = (
    data.apply(map_fabric_pattern, axis=1)[label_columns]
    .dropna()
    .reset_index(drop=True)
)

In [59]:
# Fabric types:
# 0 denim, 1 cotton, 2 leather, 3 furry, 4 knitted, 5 chiffon, 6 other, 7 NA
# Keep only rows whose fabric code is in the list below; codes 3 (furry) and 6 (other)
# are not in the list and are therefore dropped.
fabrics = [0,1,2,4,5,7]
data = data[data['fabric'].isin(fabrics)]

def map_labels(label):
    """Renumber the retained fabric codes to a gap-free range.

    0 -> 0, 1 -> 1, 2 -> 2, 4 -> 3, 5 -> 4, 7 -> 5. Any other label is returned unchanged.
    """
    retained_codes = (0, 1, 2, 4, 5, 7)
    dense_index = {code: position for position, code in enumerate(retained_codes)}
    return dense_index.get(label, label)

# Replace the original fabric codes with the renumbered ones from map_labels.
# NOTE: this is not idempotent. Applying it again to already-renumbered data would shift
# labels a second time (e.g. 4 -> 3, 5 -> 4), so run it once on freshly built `data`.
data['fabric'] = data['fabric'].apply(map_labels)

In [60]:
# Display the per-garment label table for a visual check.
data

Unnamed: 0,masked_img_name,clothing_type,fabric,pattern,segm_name,img_id
0,MEN-Denim-id_00000080-01_7_additional_segm_1.png,1,1,3,MEN-Denim-id_00000080-01_7_additional_segm.png,MEN-Denim-id_00000080
1,MEN-Denim-id_00000080-01_7_additional_segm_4.png,4,1,4,MEN-Denim-id_00000080-01_7_additional_segm.png,MEN-Denim-id_00000080
2,MEN-Denim-id_00000089-01_7_additional_segm_1.png,1,1,3,MEN-Denim-id_00000089-01_7_additional_segm.png,MEN-Denim-id_00000089
3,MEN-Denim-id_00000089-01_7_additional_segm_4.png,4,1,3,MEN-Denim-id_00000089-01_7_additional_segm.png,MEN-Denim-id_00000089
4,MEN-Denim-id_00000089-02_7_additional_segm_1.png,1,1,2,MEN-Denim-id_00000089-02_7_additional_segm.png,MEN-Denim-id_00000089
...,...,...,...,...,...,...
25076,WOMEN-Tees_Tanks-id_00007970-01_7_additional_s...,4,0,3,WOMEN-Tees_Tanks-id_00007970-01_7_additional_s...,WOMEN-Tees_Tanks-id_00007970
25077,WOMEN-Tees_Tanks-id_00007976-01_4_full_segm_1.png,1,1,6,WOMEN-Tees_Tanks-id_00007976-01_4_full_segm.png,WOMEN-Tees_Tanks-id_00007976
25078,WOMEN-Tees_Tanks-id_00007976-01_4_full_segm_4.png,4,0,3,WOMEN-Tees_Tanks-id_00007976-01_4_full_segm.png,WOMEN-Tees_Tanks-id_00007976
25079,WOMEN-Tees_Tanks-id_00007979-03_4_full_segm_6.png,6,1,3,WOMEN-Tees_Tanks-id_00007979-03_4_full_segm.png,WOMEN-Tees_Tanks-id_00007979


In [61]:
# Save the per-garment label table into the current working directory.
# NOTE(review): this reuses the file name 'full_data.csv' that an earlier cell used for the
# merged annotation table (data_df), so that earlier file is overwritten when both cells
# run in the same directory. Confirm this is intended.
data.to_csv('full_data.csv')

# Split CSV:

In [62]:
# 80/10/10 train/val/test split with no data leakage of images with multiple poses:
# rows are split by img_id group, first 80/20, then the 20% part is halved.

group_shuffle_split_80_20 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=47)
group_shuffle_split_50_50 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=47)

# n_splits=1, so each splitter yields exactly one (first part, second part) pair of positions.
train_index, temp_index = next(group_shuffle_split_80_20.split(data, groups=data['img_id']))
train_df, temp_df = data.iloc[train_index], data.iloc[temp_index]

val_index, test_index = next(group_shuffle_split_50_50.split(temp_df, groups=temp_df['img_id']))
val_df, test_df = temp_df.iloc[val_index], temp_df.iloc[test_index]


In [63]:
# Check for leakage: an img_id must not appear in more than one of the three splits.
train_image_ids_set = set(train_df['img_id'])
val_image_ids_set = set(val_df['img_id'])
test_image_ids_set = set(test_df['img_id'])

intersection_train_val = train_image_ids_set.intersection(val_image_ids_set)
intersection_train_test = train_image_ids_set.intersection(test_image_ids_set)
intersection_val_test = val_image_ids_set.intersection(test_image_ids_set)

# Report every pair of splits that shares image_ids.
if intersection_train_val:
    print("Matching image_ids between train_set and val_set:", intersection_train_val)

if intersection_train_test:
    print("Matching image_ids between train_set and test_set:", intersection_train_test)

if intersection_val_test:
    print("Matching image_ids between val_set and test_set:", intersection_val_test)

# The all-clear is printed only when all three pairwise intersections are empty. Tying it
# to the val/test check alone would report a clean split even when train overlaps val or test.
if not (intersection_train_val or intersection_train_test or intersection_val_test):
    print("No matching image_ids found between any sets.")

No matching image_ids found between any sets.


In [64]:
# Save each split to its own CSV file in the current working directory.
for split_df, csv_name in (
    (train_df, 'train.csv'),
    (val_df, 'val.csv'),
    (test_df, 'test.csv'),
):
    split_df.to_csv(csv_name)

In [14]:
# Make small data sources for model building: a 2.5% row sample of each split,
# drawn with a fixed seed so the samples are reproducible.
sample_kwargs = {'frac': 0.025, 'random_state': 47}

train_sample = train_df.sample(**sample_kwargs)
val_sample = val_df.sample(**sample_kwargs)
test_sample = test_df.sample(**sample_kwargs)

for sample_df, csv_name in (
    (train_sample, 'train_sample.csv'),
    (val_sample, 'val_sample.csv'),
    (test_sample, 'test_sample.csv'),
):
    sample_df.to_csv(csv_name)

## Preprocess Image Segmentations

In [28]:
# Pixel-value remapping applied to every segmentation PNG.
# Source values that are not listed here are written as 0.
mapping = {
    1: 1,
    2: 2,
    3: 3,
    5: 4,
    6: 4,
    4: 5,
    21: 6
}

input_directory = '/vast/amr10211/cv_data/segm'
output_directory = '/vast/amr10211/cv_data/segm_preprocessed'
# Create the output folder up front so saving cannot fail on a missing directory.
os.makedirs(output_directory, exist_ok=True)
png_files = [filename for filename in os.listdir(input_directory) if filename.endswith('.png')]


def remap_segmentation_labels(label_array, label_mapping):
    """Return a uint8 array of the same shape with values translated through ``label_mapping``.

    Every element equal to a key of ``label_mapping`` becomes the corresponding
    value; all other elements become 0. Comparisons are always made against the
    original ``label_array``, so one rule can never feed into another
    (e.g. 4 -> 5 is not affected by 5 -> 4).

    This replaces a per-pixel ``np.vectorize(lambda ...)`` call, which runs a
    Python function for every pixel, with a handful of whole-array operations.
    """
    remapped = np.zeros(label_array.shape, dtype=np.uint8)
    for source_value, target_value in label_mapping.items():
        remapped[label_array == source_value] = target_value
    return remapped


for filename in tqdm(png_files, desc="Converting and mapping PNG files"):
    input_filepath = os.path.join(input_directory, filename)
    output_filepath = os.path.join(output_directory, filename)

    # Read the pixel data and release the file handle right away.
    with Image.open(input_filepath) as img:
        img_array = np.array(img)

    # Apply the mapping to each pixel value
    mapped_array = remap_segmentation_labels(img_array, mapping)

    # Convert the mapped array back to an image and save it as PNG
    mapped_img = Image.fromarray(mapped_array)
    mapped_img.save(output_filepath)


Converting and mapping PNG files: 100%|██████████| 12701/12701 [23:25<00:00,  9.04it/s]


In [34]:

# Sanity check: count how often each pixel value occurs across all preprocessed segmentations.
input_directory = '/vast/amr10211/cv_data/segm_preprocessed'
png_files = [filename for filename in os.listdir(input_directory) if filename.endswith('.png')]

# Maps pixel value -> total number of pixels with that value (plain Python ints).
unique_value_counts = {}

# The progress label describes what this loop does (counting, not converting).
for filename in tqdm(png_files, desc="Counting pixel values in PNG files"):
    input_filepath = os.path.join(input_directory, filename)

    # Read the pixel data and release the file handle right away.
    with Image.open(input_filepath) as img:
        img_array = np.array(img)

    # Count unique values in the array
    unique_values, counts = np.unique(img_array, return_counts=True)

    # Accumulate into the running totals; .tolist() yields Python ints, so the dictionary
    # keys are ordinary ints and the totals cannot overflow a fixed-width integer.
    for value, count in zip(unique_values.tolist(), counts.tolist()):
        unique_value_counts[value] = unique_value_counts.get(value, 0) + count

# Print the final count per unique value (in order of first appearance)
for value, count in unique_value_counts.items():
    print(f"Value: {value}, Count: {count}")

Converting and mapping PNG files: 100%|██████████| 12701/12701 [02:56<00:00, 71.88it/s]

Value: 0, Count: 9088520009
Value: 1, Count: 400516322
Value: 4, Count: 399366922
Value: 2, Count: 191766313
Value: 6, Count: 70451055
Value: 5, Count: 277766047
Value: 3, Count: 52070582



