### Shopee

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm, trange
import albumentations as A
from IPython.display import display
# Do not truncate long cell values (e.g. file paths) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)



In [None]:
# Create the Shopee output image folder; exist_ok=True makes re-running the cell safe.
os.makedirs('final_data_224/Shopee/images', exist_ok=True)

In [None]:
# Build the Shopee subset:
#   1. de-duplicate the metadata by image name,
#   2. keep only label groups with at least 3 images,
#   3. write a (possibly resized) copy of every readable image,
#   4. save the filtered metadata and check it against the files on disk.
df = pd.read_csv('Shopee/train.csv')
df.drop_duplicates(subset='image', inplace=True)
all_groups = []
for label, group in tqdm(df.groupby('label_group')):
    if len(group)<3:
        continue
    group = group.reset_index(drop=True)
    del_idx = []
    for idx, row in group.iterrows():
        file = row['image']
        path = 'Shopee/train_images/'+file
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None (it does not raise) for a missing or
            # corrupt file; drop the row instead of crashing on img.shape.
            del_idx.append(idx)
            continue
        min_ = min(img.shape[:2])
        max_ = max(img.shape[:2])
        if min_>224:
            # Both sides exceed 224 px: pure downscale, so use INTER_AREA.
            img = cv2.resize(img, (224, 224), interpolation = cv2.INTER_AREA)
        elif max_>224:
            # Only the longer side exceeds 224 px: default interpolation.
            img = cv2.resize(img, (224, 224))
        # Images with both sides <= 224 px are written at their original size.
        new_path = 'final_data_224/Shopee/images/'+file
        if not cv2.imwrite(new_path, img):
            raise IOError(f'Could not write image: {new_path}')
    if del_idx:
        tqdm.write(f'{label}: dropped {len(del_idx)} unreadable image(s)')
        group = group.drop(index=del_idx)
    all_groups.append(group)

df_n100_224 = pd.concat(all_groups, axis=0).reset_index(drop=True)
df_n100_224.to_csv('final_data_224/Shopee/Shopee_final_224.csv', index=False)

# Sanity check: exactly one file on disk per metadata row.
all_files = glob('final_data_224/Shopee/images/*')
assert len(all_files)==df_n100_224.shape[0], (len(all_files), df_n100_224.shape[0])
# Rows / distinct labels before and after filtering.
print(df.shape[0], df['label_group'].nunique())
print('--->>>')
print(df_n100_224.shape[0], df_n100_224['label_group'].nunique())
df_n100_224.head(1)

### JD-product-10K

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm, trange
import albumentations as A
# Create the Products10K output image folder; exist_ok=True makes re-running the cell safe.
os.makedirs('final_data_224/Products10K/images', exist_ok=True)

In [None]:
# Build the Products10K subset: keep classes with at least 5 images, write a
# (possibly resized) copy of every readable image, save the filtered metadata
# and check it against the files on disk.
df = pd.read_csv('products10k/train.csv')
all_groups = []
for label, group in tqdm(df.groupby('class')):
    if len(group)<5:
        continue
    group = group.reset_index(drop=True)
    del_idx = []
    for idx, row in group.iterrows():
        file = row['name']
        path = 'products10k/train/'+file
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None (it does not raise) for a missing or
            # corrupt file; drop the row instead of crashing on img.shape.
            del_idx.append(idx)
            continue
        min_ = min(img.shape[:2])
        max_ = max(img.shape[:2])
        if min_>224:
            # Both sides exceed 224 px: pure downscale, so use INTER_AREA.
            img = cv2.resize(img, (224, 224), interpolation = cv2.INTER_AREA)
        elif max_>224:
            # Only the longer side exceeds 224 px: default interpolation.
            img = cv2.resize(img, (224, 224))
        # Images with both sides <= 224 px are written at their original size.
        new_path = 'final_data_224/Products10K/images/'+f'{file}'
        if not cv2.imwrite(new_path, img):
            raise IOError(f'Could not write image: {new_path}')
    if del_idx:
        tqdm.write(f'{label}: dropped {len(del_idx)} unreadable image(s)')
        group = group.drop(index=del_idx)
    all_groups.append(group)

df_n100_224 = pd.concat(all_groups, axis=0).reset_index(drop=True)
df_n100_224.to_csv('final_data_224/Products10K/products10K_final_224.csv', index=False)
# Sanity check: exactly one file on disk per metadata row.
all_files = glob('final_data_224/Products10K/images/*')
assert len(all_files)==df_n100_224.shape[0]
# Rows / distinct classes before and after filtering.
print(df.shape[0], df['class'].nunique())
print('--->>>')
print(df_n100_224.shape[0], df_n100_224['class'].nunique())
df_n100_224.head(5)

### Stanford_Products

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm, trange
import albumentations as A
# Create the Stanford_Products output image folder; exist_ok=True makes re-running the cell safe.
os.makedirs('final_data_224/Stanford_Products/images', exist_ok=True)

In [None]:
# Build the Stanford Online Products subset: keep labels with at least 5
# images, write a (possibly resized) copy of every readable image, point
# 'image_files' at the copy and save the filtered metadata.
df = pd.read_csv('Stanford_Online_Products/Stanford_Products_extracted.csv')
all_groups = []
for label, group in tqdm(df.groupby('labels')):
    if len(group)<5:
        continue
    group = group.reset_index(drop=True)
    del_idx = []
    for idx, row in group.iterrows():
        path = row['image_files']
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None (it does not raise) for a missing or
            # corrupt file; drop the row instead of crashing on img.shape.
            del_idx.append(idx)
            continue
        min_ = min(img.shape[:2])
        max_ = max(img.shape[:2])
        if min_>224:
            # Both sides exceed 224 px: pure downscale, so use INTER_AREA.
            img = cv2.resize(img, (224, 224), interpolation = cv2.INTER_AREA)
        elif max_>224:
            # Only the longer side exceeds 224 px: default interpolation.
            img = cv2.resize(img, (224, 224))
        # Images with both sides <= 224 px are written at their original size.
        # Output name = parent folder name + file name of the source path.
        new_path = 'final_data_224/Stanford_Products/images/'+path.split('/')[-2]+path.split('/')[-1]
        if not cv2.imwrite(new_path, img):
            raise IOError(f'Could not write image: {new_path}')
        group.loc[idx, 'image_files'] = new_path
    if del_idx:
        tqdm.write(f'{label}: dropped {len(del_idx)} unreadable image(s)')
        group = group.drop(index=del_idx)
    all_groups.append(group)

df_n100_224 = pd.concat(all_groups, axis=0).reset_index(drop=True)
df_n100_224.to_csv('final_data_224/Stanford_Products/Stanford_Products_final_224.csv', index=False)



In [None]:
# Sanity check: the number of written files must equal the number of metadata rows.
all_files = glob('final_data_224/Stanford_Products/images/*')
print(len(all_files))
assert len(df_n100_224) == len(all_files)
# Row count / distinct labels in the source metadata ...
print(len(df), df['labels'].nunique())
print('--->>>')
# ... and in the filtered metadata.
print(len(df_n100_224), df_n100_224['labels'].nunique())
df_n100_224.head()

### Deep Fashion (Consumer-to-shop)

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm, trange
import albumentations as A
# Create the deepFashion output image folder; exist_ok=True makes re-running the cell safe.
os.makedirs('final_data_224/deepFashion/images', exist_ok=True)

In [None]:
# Build the DeepFashion subset: keep labels with at least 3 images, write a
# (possibly resized) copy of every readable image, point 'image_files' at the
# copy and save the filtered metadata.
df = pd.read_csv('deepFashion/deepFashion_extracted.csv')
all_groups = []
for label, group in tqdm(df.groupby('labels')):
    if len(group)<3:
        continue
    group = group.reset_index(drop=True)
    del_idx = []
    for idx, row in group.iterrows():
        path = row['image_files']
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None (it does not raise) for a missing or
            # corrupt file; drop the row instead of crashing on img.shape.
            del_idx.append(idx)
            continue
        min_ = min(img.shape[:2])
        max_ = max(img.shape[:2])
        if min_>224:
            # Both sides exceed 224 px: pure downscale, so use INTER_AREA.
            img = cv2.resize(img, (224, 224), interpolation = cv2.INTER_AREA)
        elif max_>224:
            # Only the longer side exceeds 224 px: default interpolation.
            img = cv2.resize(img, (224, 224))
        # Images with both sides <= 224 px are written at their original size.
        # The label and the in-group index are prefixed to the source file name.
        new_path = 'final_data_224/deepFashion/images/'+f'f_{label}_{idx}_'+path.split('/')[-1]
        if not cv2.imwrite(new_path, img):
            raise IOError(f'Could not write image: {new_path}')
        group.loc[idx, 'image_files'] = new_path
    if del_idx:
        tqdm.write(f'{label}: dropped {len(del_idx)} unreadable image(s)')
        group = group.drop(index=del_idx)
    all_groups.append(group)

df_n100_224 = pd.concat(all_groups, axis=0).reset_index(drop=True)
df_n100_224.to_csv('final_data_224/deepFashion/DeepFashion_final_224.csv', index=False)



In [None]:
# Sanity check: the number of written files must equal the number of metadata
# rows; both counts are reported if the check fails.
all_files = glob('final_data_224/deepFashion/images/*')
assert len(all_files) == len(df_n100_224), (len(all_files), len(df_n100_224))
# Row count / distinct labels in the source metadata ...
print(len(df), df['labels'].nunique())
print('--->>>')
# ... and in the filtered metadata.
print(len(df_n100_224), df_n100_224['labels'].nunique())
df_n100_224.head(10)

### Grocery Store

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm, trange
import albumentations as A
# Create the grocery_dataset output image folder; exist_ok=True makes re-running the cell safe.
os.makedirs('final_data_224/grocery_dataset/images', exist_ok=True)


In [None]:
# Build the Grocery Store subset: labels with fewer than 5 images are left
# out, every readable image is copied (resized when larger than 224 px) and
# its 'image_files' entry is redirected to the copy; rows whose image cannot
# be read are removed from the metadata.
df = pd.read_csv('GroceryStoreDataset/grocery_dataset_extracted.csv')
all_groups = []
for label, group in tqdm(df.groupby('labels')):
    if len(group) < 5:
        continue
    group = group.reset_index(drop=True)
    del_idx = []
    for idx, row in group.iterrows():
        path = row['image_files']
        img = cv2.imread(path)
        if img is None:
            # Unreadable image: remember the row so it can be dropped below.
            del_idx.append(idx)
            continue
        height, width = img.shape[:2]
        if min(height, width) > 224:
            # Both sides exceed 224 px: downscale with INTER_AREA.
            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_AREA)
        elif max(height, width) > 224:
            # Only the longer side exceeds 224 px: default interpolation.
            img = cv2.resize(img, (224, 224))
        # Output name = 4th-from-last path component + file name.
        parts = path.split('/')
        new_path = 'final_data_224/grocery_dataset/images/' + parts[-4] + parts[-1]
        cv2.imwrite(new_path, img)
        group.loc[idx, 'image_files'] = new_path
    if del_idx:
        group = group.drop(index=del_idx)
    all_groups.append(group)

df_n100_224 = pd.concat(all_groups).reset_index(drop=True)
df_n100_224.to_csv('final_data_224/grocery_dataset/grocery_dataset_final_224.csv', index=False)



In [None]:
# Sanity check: compare the files on disk with the saved metadata rows.
all_files = glob('final_data_224/grocery_dataset/images/*')
print(len(all_files), len(df_n100_224))
# Spot-check a single row of the filtered metadata.
print(df_n100_224.iloc[6])
assert len(df_n100_224) == len(all_files)
# Row count / distinct labels in the source metadata ...
print(len(df), df['labels'].nunique())
print('--->>>')
# ... and in the filtered metadata.
print(len(df_n100_224), df_n100_224['labels'].nunique())
df_n100_224.head(10)

### Fashion200K

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm, trange
import albumentations as A
# Create the Fashion_200K output image folder; exist_ok=True makes re-running the cell safe.
os.makedirs('final_data_224/Fashion_200K/images', exist_ok=True)

In [None]:
# Build the Fashion200K subset: keep labels with at least 3 images, write a
# (possibly resized) copy of every readable image, point 'image_files' at the
# copy, save the filtered metadata and check it against the files on disk.
df = pd.read_csv('Fashion_200K/Fashion_200K_extracted.csv')
all_groups = []
for label, group in tqdm(df.groupby('labels')):
    if len(group)<3:
        continue
    group = group.reset_index(drop=True)
    del_idx = []
    for idx, row in group.iterrows():
        path = row['image_files']
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None (it does not raise) for a missing or
            # corrupt file; drop the row instead of crashing on img.shape.
            del_idx.append(idx)
            continue
        min_ = min(img.shape[:2])
        max_ = max(img.shape[:2])
        if min_>224:
            # Both sides exceed 224 px: pure downscale, so use INTER_AREA.
            img = cv2.resize(img, (224, 224), interpolation = cv2.INTER_AREA)
        elif max_>224:
            # Only the longer side exceeds 224 px: default interpolation.
            img = cv2.resize(img, (224, 224))
        # Images with both sides <= 224 px are written at their original size.
        # Output name = 3rd-from-last path component + file name.
        new_path = 'final_data_224/Fashion_200K/images/'+path.split('/')[-3]+path.split('/')[-1]
        if not cv2.imwrite(new_path, img):
            raise IOError(f'Could not write image: {new_path}')
        group.loc[idx, 'image_files'] = new_path
    if del_idx:
        tqdm.write(f'{label}: dropped {len(del_idx)} unreadable image(s)')
        group = group.drop(index=del_idx)
    all_groups.append(group)
df_n100_224 = pd.concat(all_groups, axis=0).reset_index(drop=True)
df_n100_224.to_csv('final_data_224/Fashion_200K/Fashion_200K_final_224.csv', index=False)

# Sanity check: exactly one file on disk per metadata row.
all_files = glob('final_data_224/Fashion_200K/images/*')
assert len(all_files)==df_n100_224.shape[0], (len(all_files), df_n100_224.shape[0])
# Rows / distinct labels before and after filtering.
print(df.shape[0], df['labels'].nunique())
print('--->>>')
print(df_n100_224.shape[0], df_n100_224['labels'].nunique())
df_n100_224.head(5)

### RP2K

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm, trange
import albumentations as A
# Create the rp2k output image folder; exist_ok=True makes re-running the cell safe.
os.makedirs('final_data_224/rp2k/images', exist_ok=True)

In [None]:
# Build the RP2K subset: keep labels with at least 5 images, write a (possibly
# resized) copy of every readable image, point 'image_files' at the copy,
# drop rows whose image cannot be read and save the filtered metadata.
df = pd.read_csv('rp2k/rp2k_extracted.csv')
all_groups = []
# Running counter prefixed to every output name. Naming outputs by the source
# file name alone lets images from different folders that share a file name
# overwrite each other, leaving duplicate 'image_files' entries in the CSV.
n = 0
for label, group in tqdm(df.groupby('labels')):
    if len(group)<5:
        continue
    group = group.reset_index(drop=True)
    del_idx = []
    for idx, row in group.iterrows():
        path = row['image_files']
        img = cv2.imread(path)
        if img is not None:
            min_ = min(img.shape[:2])
            max_ = max(img.shape[:2])
            if min_>224:
                # Both sides exceed 224 px: pure downscale, so use INTER_AREA.
                img = cv2.resize(img, (224, 224), interpolation = cv2.INTER_AREA)
            elif max_>224:
                # Only the longer side exceeds 224 px: default interpolation.
                img = cv2.resize(img, (224, 224))
            # Images with both sides <= 224 px are written at their original size.
            new_path = f'final_data_224/rp2k/images/{n}_'+path.split('/')[-1]
            if not cv2.imwrite(new_path, img):
                raise Exception("Could not write image")
            group.loc[idx, 'image_files'] = new_path
            n += 1
        else:
            # Unreadable image: remember the row so it can be dropped below.
            del_idx.append(idx)
    if(len(del_idx)>0):
        group = group.drop(index=del_idx)
    all_groups.append(group)

df_n100_224 = pd.concat(all_groups, axis=0).reset_index(drop=True)
df_n100_224.to_csv('final_data_224/rp2k/rp2k_final_224.csv', index=False)

In [None]:
all_files = glob('final_data_224/rp2k/images/*')
# The file-count check below is disabled: it was observed to fail even though
# the images are present. NOTE(review): presumably the file count differs from
# the row count because output names collide or stale files remain - confirm
# before re-enabling.
# assert len(all_files) == len(df_n100_224), (len(all_files), len(df_n100_224))
# Row count / distinct labels in the source metadata ...
print(len(df), df['labels'].nunique())
print('--->>>')
# ... and in the filtered metadata.
print(len(df_n100_224), df_n100_224['labels'].nunique())
df_n100_224.head()

### DeepFashion2 (hard-triplets)

In [None]:
from glob import glob
import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm, trange
import albumentations as A
# Do not truncate long cell values (e.g. file paths) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Create the DeepFashion2 output image folder; exist_ok=True makes re-running the cell safe.
os.makedirs('final_data_224/DeepFashion2/images', exist_ok=True)

In [None]:
# Build the DeepFashion2 subset: keep labels with at least 3 images, write a
# (possibly resized) copy of every readable image under a unique name and
# point 'image_files' at the copy.
df = pd.read_csv('deepFashion2/deepFashion2_extracted.csv')
all_groups = []
# Running counter prefixed to every output name so that names never collide.
n = 0
for label, group in tqdm(df.groupby('labels')):
    if len(group)<3:
        continue
    group = group.reset_index(drop=True)
    del_idx = []
    for idx, row in group.iterrows():
        path = row['image_files']
        img = cv2.imread(path)
        if img is not None:
            min_ = min(img.shape[:2])
            max_ = max(img.shape[:2])
            if min_>224:
                # Both sides exceed 224 px: pure downscale, so use INTER_AREA.
                img = cv2.resize(img, (224, 224), interpolation = cv2.INTER_AREA)
            elif max_>224:
                # Only the longer side exceeds 224 px: default interpolation.
                img = cv2.resize(img, (224, 224))
            # Images with both sides <= 224 px are written at their original size.
            new_path = f'final_data_224/DeepFashion2/images/{n}_'+path.split('/')[-1]
            if not cv2.imwrite(new_path, img):
                raise IOError(f'Could not write image: {new_path}')
            group.loc[idx, 'image_files'] = new_path
            n+=1
        else:
            # Unreadable image: without this the row would stay in the
            # metadata still pointing at the unprocessed source path.
            del_idx.append(idx)
    if del_idx:
        tqdm.write(f'{label}: dropped {len(del_idx)} unreadable image(s)')
        group = group.drop(index=del_idx)
    all_groups.append(group)



In [None]:
# Inspection only: display the list of per-label DataFrames collected above.
all_groups

In [None]:
# Merge the per-label groups into a single table and save it.
df_n100_224 = pd.concat(all_groups).reset_index(drop=True)
df_n100_224.to_csv('final_data_224/DeepFashion2/DeepFashion2_final_224.csv', index=False)

all_files = glob('final_data_224/DeepFashion2/images/*')
# File-count check, currently disabled:
# assert len(all_files) == len(df_n100_224), (len(all_files), len(df_n100_224))
# Row count / distinct labels in the source metadata ...
print(len(df), df['labels'].nunique())
print('--->>>')
# ... and rows / distinct labels / distinct image paths in the filtered metadata.
print(len(df_n100_224), df_n100_224['labels'].nunique(), df_n100_224['image_files'].nunique())
df_n100_224.head()