In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from PIL import Image, ImageOps
import sys
import re
from sklearn.model_selection import GroupShuffleSplit

pd.set_option('display.max_rows', 500)


# Preprocess CSV:

### Fabric types:
  0 denim, 1 cotton, 2 leather, 3 furry, 4 knitted, 5 chiffon, 6 other, 7 NA

### Pattern types:
  0 floral, 1 graphic, 2 striped, 3 pure color, 4 lattice, 5 other, 6 color block, 7 NA

In [2]:
# Root directory of the dataset; every input file below lives underneath it.
DATA_DIR = '/vast/amr10211/cv_data'

#Fabric Data:
# Space-delimited file without a header row: image name followed by the
# fabric label of the upper, lower and outer garment.
fabric_headers = ['img_name', 'upper_fabric', 'lower_fabric', 'outer_fabric']
fabric_ann = pd.read_csv(f'{DATA_DIR}/labels/texture/fabric_ann.txt', delimiter=' ', names=fabric_headers)

#Pattern Data:
# Same layout as the fabric file, holding the pattern label of each garment.
pattern_headers = ['img_name', 'upper_pattern', 'lower_pattern', 'outer_pattern']
pattern_ann = pd.read_csv(f'{DATA_DIR}/labels/texture/pattern_ann.txt', delimiter=' ', names=pattern_headers)

#Keypoints Data (for filtering for full body images):
# Column names are 'img_name' followed by x_1, y_1, x_2, y_2, ..., x_21, y_21.
keypoint_headers = ['img_name'] + [f'{axis}_{idx}' for idx in range(1, 22) for axis in ('x', 'y')]
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
keypoints_loc = pd.read_csv(f'{DATA_DIR}/keypoints/keypoints_loc.txt', sep=r'\s+', names=keypoint_headers)

# Names of all images that have a keypoint annotation; used later to filter the label table.
img_names = keypoints_loc['img_name'].unique()


  keypoints_loc = pd.read_csv('/vast/amr10211/cv_data/keypoints/keypoints_loc.txt', delim_whitespace=True, names=keypoint_headers)


In [3]:
#Merge and filter Data

# Join fabric and pattern labels on the image name, then keep only the rows
# whose image also appears in the keypoints file (full body images).
# Written as one chain so no intermediate, unfiltered frame is kept around.
data_df = (
    fabric_ann
    .merge(pattern_ann, on='img_name')
    .loc[lambda merged: merged['img_name'].isin(img_names)]
)

In [4]:
def get_image_id(img_name):
    """Return the part of ``img_name`` that precedes its last dash.

    Example: ``"MEN-Denim-id_00000080-01_7_additional.jpg"`` gives
    ``"MEN-Denim-id_00000080"``.

    Args:
        img_name: Image file name as a string.

    Returns:
        The substring before the last ``"-"``. A name that contains no dash
        is returned unchanged instead of being silently truncated.
    """
    prefix, dash, _ = img_name.rpartition("-")

    # rpartition reports an empty separator when there is no dash at all;
    # in that case there is nothing to strip, so keep the whole name.
    return prefix if dash else img_name

# Add img_id: for each row, the portion of img_name before its last dash.
data_df['img_id'] = data_df['img_name'].apply(get_image_id)

In [5]:
def get_image_num(img_name):
    """Return the text between the last dash and the first underscore after it.

    Example: ``"MEN-Denim-id_00000080-01_7_additional.jpg"`` gives ``"01"``.

    Args:
        img_name: Image file name as a string.

    Returns:
        The substring that starts right after the last ``"-"`` and stops
        just before the next ``"_"``.
    """
    dash_pos = img_name.rfind("-")

    # The underscore search starts at the dash itself, so underscores that
    # occur earlier in the name are ignored.
    stop = img_name.find("_", dash_pos)
    start = dash_pos + 1

    return img_name[start:stop]

# Add img_num: for each row, the text between the last dash of img_name and the underscore that follows it.
data_df['img_num'] = data_df['img_name'].apply(get_image_num)

In [6]:
def get_image_view(img_name):
    """Return the view label that sits just before the file extension.

    Example: ``"MEN-Denim-id_00000080-01_7_additional.jpg"`` gives
    ``"additional"``.

    Args:
        img_name: Image file name as a string.

    Returns:
        The substring between the second underscore that follows the last
        ``"-"`` and the last ``"."`` of the name.
    """
    dash_pos = img_name.rfind("-")

    # Walk forward from the last dash: first underscore, then the one after it.
    first_underscore = img_name.find("_", dash_pos)
    second_underscore = img_name.find("_", first_underscore + 1)

    # The label ends where the extension begins.
    dot_pos = img_name.rfind(".")

    return img_name[second_underscore + 1:dot_pos]

    
# Add img_view: for each row, the label between the second underscore after the last dash and the file extension.
data_df['img_view'] = data_df['img_name'].apply(get_image_view)

In [7]:
# Display the merged label table together with the derived img_id, img_num and img_view columns.
data_df

Unnamed: 0,img_name,upper_fabric,lower_fabric,outer_fabric,upper_pattern,lower_pattern,outer_pattern,img_id,img_num,img_view
0,MEN-Denim-id_00000080-01_7_additional.jpg,1,1,7,3,4,7,MEN-Denim-id_00000080,01,additional
1,MEN-Denim-id_00000089-01_7_additional.jpg,1,1,7,3,3,7,MEN-Denim-id_00000089,01,additional
2,MEN-Denim-id_00000089-02_7_additional.jpg,1,1,7,2,3,7,MEN-Denim-id_00000089,02,additional
3,MEN-Denim-id_00000089-03_7_additional.jpg,1,1,7,3,3,7,MEN-Denim-id_00000089,03,additional
4,MEN-Denim-id_00000089-04_7_additional.jpg,0,1,7,3,3,7,MEN-Denim-id_00000089,04,additional
...,...,...,...,...,...,...,...,...,...,...
44074,WOMEN-Tees_Tanks-id_00007969-04_7_additional.jpg,1,1,0,3,0,3,WOMEN-Tees_Tanks-id_00007969,04,additional
44078,WOMEN-Tees_Tanks-id_00007970-01_7_additional.jpg,1,0,7,1,3,7,WOMEN-Tees_Tanks-id_00007970,01,additional
44082,WOMEN-Tees_Tanks-id_00007976-01_4_full.jpg,1,0,7,6,3,7,WOMEN-Tees_Tanks-id_00007976,01,full
44086,WOMEN-Tees_Tanks-id_00007979-03_4_full.jpg,1,1,7,3,3,7,WOMEN-Tees_Tanks-id_00007979,03,full


# Split CSV:

In [10]:
#80/10/10 train/val/test split with no data leakage of images with multiple poses
# Rows are grouped by img_id, so all rows sharing an img_id land in the same split.

group_shuffle_split_80_20 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=47)
group_shuffle_split_50_50 = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=47)

# Each splitter is configured with n_splits=1, so taking the first (and only)
# pair of positional index arrays is all that is needed.
train_index, temp_index = next(group_shuffle_split_80_20.split(data_df, groups=data_df['img_id']))
train_df = data_df.iloc[train_index]
temp_df = data_df.iloc[temp_index]

# Divide the held-out part in half to obtain the validation and test sets.
val_index, test_index = next(group_shuffle_split_50_50.split(temp_df, groups=temp_df['img_id']))
val_df = temp_df.iloc[val_index]
test_df = temp_df.iloc[test_index]


In [11]:
#Check for leakage:
# Every img_id must belong to exactly one split, so all three pairwise
# intersections of the per-split id sets are expected to be empty.
train_image_ids_set = set(train_df['img_id'])
val_image_ids_set = set(val_df['img_id'])
test_image_ids_set = set(test_df['img_id'])

intersection_train_val = train_image_ids_set & val_image_ids_set
intersection_train_test = train_image_ids_set & test_image_ids_set
intersection_val_test = val_image_ids_set & test_image_ids_set

# Report each overlapping pair of splits individually.
if intersection_train_val:
    print("Matching image_ids between train_set and val_set:", intersection_train_val)

if intersection_train_test:
    print("Matching image_ids between train_set and test_set:", intersection_train_test)

if intersection_val_test:
    print("Matching image_ids between val_set and test_set:", intersection_val_test)

# Print the all-clear only when *every* intersection is empty. Hanging this
# message off the last `if` as an `else` would also print it when train
# overlaps val or test but val and test happen to be disjoint.
if not (intersection_train_val or intersection_train_test or intersection_val_test):
    print("No matching image_ids found between any sets.")

No matching image_ids found between any sets.


In [12]:
#Save to CSV:
# Write each split to its own file in the current working directory,
# using to_csv's default settings.
for split_df, csv_path in ((train_df, 'train.csv'), (val_df, 'val.csv'), (test_df, 'test.csv')):
    split_df.to_csv(csv_path)