# 数据集预处理

使用的数据集是DeepFashion中的`in-shop_clothes_retrieval_benchmark`数据集，在预处理前，需要制作数据集软链接到`../data/input`目录

1. DeepFashion 子数据集 in-shop 目录结构如下：

```shell
.
├── Anno
│   ├── list_attr_cloth.txt            # 衣服所有的属性列表
│   ├── list_attr_items.txt            # 每个商品id对应的属性列表
│   ├── list_bbox_inshop.txt           # 每张图片对应的bbox
│   ├── list_description_inshop.json   # 每个商品id详细信息文字描述
│   ├── list_item_inshop.txt           # 所有的商品id列表
│   └── list_landmarks_inshop.txt      # 每张图片对应的关键点位置
├── Eval
│   └── list_eval_partition.txt        # 每张图片的划分标签（train、query、gallery）
├── Img
│   └── img
│       ├── MEN
│       │   ├── Denim
│       │   │    ├── id_0000xxx
│       │   │    ├── ...
│       │   │    └── id_0000xxx
│       │   ├── ...
│       │   └── Tees_Tanks
│       └── WOMEN
│           ├── Blouses_Shirts
│           ├── ...
│           └── Tees_Tanks
├── README.txt
├── test.csv                           # 预处理后的test数据集
├── train.csv                          # 预处理后的train数据集
└── val.csv                            # 预处理后的val数据集
```

2. `../data/output/`文件结构如下：

```shell
.
├── cache                              # 缓存文件夹
├── logs                               # 日志文件夹
│   └── fashionnet                     # 使用的网络名称
│       ├── events.out.tfevents.1554190389.mysd-desktop  # 中间结果tensorboard文件
│       └── fashionnet.csv             # 中间结果csv文件
├── models                             # 模型文件
│   ├── fashionnet                     # 使用的网络名称，指定网络保存在该文件夹下
│   └── resnet50
└── submits                            # 预测结果保存的文件夹
```


## dependency

In [10]:
import re
from pathlib import Path
from multiprocessing.pool import Pool

from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imgaug as ia
from imgaug import augmenters as iaa
from tqdm import tqdm_notebook as tqdm

In [13]:
# Annotation files read by this notebook.
LANDMARKS_INSHOP = '../data/input/Anno/list_landmarks_inshop.txt'
EVAL_PARTITION = '../data/input/Eval/list_eval_partition.txt'
ATTR_ITEMS = '../data/input/Anno/list_attr_items.txt'

# CSV files written by the final cell of this notebook.
TRAIN_DF = '../data/input/train.csv'
VAL_DF = '../data/input/val.csv'
TEST_DF = '../data/input/test.csv'

# Category folder names. Several names occur twice ('Denim', 'Pants',
# 'Shorts', 'Sweaters', 'Sweatshirts_Hoodies', 'Tees_Tanks'); a lookup with
# CATEGORY.index() always resolves to the first occurrence, so reordering or
# de-duplicating this list changes the generated category labels.
# NOTE(review): the first nine entries look like the MEN folders and the rest
# like the WOMEN folders -- confirm against the dataset directory.
CATEGORY = [
    'Denim', 'Jackets_Vests', 'Pants',
    'Shirts_Polos', 'Shorts', 'Suiting',
    'Sweaters', 'Sweatshirts_Hoodies', 'Tees_Tanks',
    'Blouses_Shirts', 'Cardigans', 'Denim',
    'Dresses', 'Graphic_Tees', 'Jackets_Coats',
    'Leggings', 'Pants', 'Rompers_Jumpsuits',
    'Shorts', 'Skirts', 'Sweaters',
    'Sweatshirts_Hoodies', 'Tees_Tanks',
]

# Number of attribute columns expected per row of ATTR_ITEMS.
num_attr = 463

## 读取数据集

In [14]:
# Landmark annotations. header=1 uses the second line of the file as the
# column header and discards the line before it.
landmarks_inshop = pd.read_csv(
    LANDMARKS_INSHOP,
    sep=r' +',
    header=1,
    engine='python'
)
# Per-image partition table, parsed the same way as the landmark file.
eval_partition = pd.read_csv(
    EVAL_PARTITION,
    sep=r' +',
    header=1,
    engine='python'
)
# Per-item attribute flags. The first two lines are skipped and the columns
# are named explicitly: 'item_id' followed by the integers 0..num_attr-1.
attr_items = pd.read_csv(
    ATTR_ITEMS,
    sep=r' ',
    header=None,
    skiprows=[0, 1],
    names=['item_id', ]+list(range(num_attr)),
    engine='python'
)

# Merge all attribute columns into one space-separated string per item, then
# drop the individual columns. The column list is derived from num_attr
# instead of a second hard-coded 463, so the two can never disagree.
attr_columns = list(range(num_attr))
attr_items['attribute_labels'] = [
    ' '.join(map(str, row))
    for row in attr_items[attr_columns].itertuples(index=False, name=None)
]
attr_items = attr_items.drop(columns=attr_columns)

# Lookup table: image_name -> evaluation_status.
eval_dict = dict(zip(eval_partition['image_name'],
                     eval_partition['evaluation_status']))

## 编辑数据集

In [16]:
def edit_attrimg(index, items=None):
    """Convert one item's attribute flags into the positions of its set flags.

    The 'attribute_labels' value of the selected row is a whitespace-separated
    list of flags (e.g. '1 -1 1'). The result lists the 0-based positions
    whose flag is exactly '1' (e.g. '0 2').

    Arguments
        index: label of the row to convert, looked up with ``.loc``
        items: optional DataFrame with an 'attribute_labels' column; when
            omitted, the module-level ``attr_items`` is used

    Return
        str: space-separated positions of the '1' flags; an empty string when
        no flag is set or when the stored value is not a string
    """
    frame = attr_items if items is None else items
    labels = frame['attribute_labels'].loc[index]

    if not isinstance(labels, str):
        # Best effort: report the bad row and emit "no attributes" instead of
        # aborting the whole conversion.
        print("Error: attribute_labels at index {} is not a string: {!r}"
              .format(index, labels))
        return ''

    # str.split() ignores leading/trailing whitespace, so stray padding cannot
    # introduce an empty first token that would shift every position by one.
    flags = labels.split()
    return ' '.join(str(pos) for pos, flag in enumerate(flags) if flag == '1')


# Convert every item's flag string in parallel worker processes.
# edit_attrimg reads the module-level attr_items and looks rows up with
# .loc, so the positions 0..len-1 passed here must be valid index labels.
# NOTE(review): the workers presumably inherit attr_items via the fork start
# method -- verify before running on a platform that uses spawn.
# NOTE: this overwrites attribute_labels in place; re-running the cell on
# already converted labels produces wrong values, so re-run the loading cell
# first.
pool = Pool()
try:
    labels = pool.map(edit_attrimg, range(len(attr_items)))
finally:
    # Always release the worker processes, even when map() raises.
    pool.close()
    pool.join()
attr_items['attribute_labels'] = labels

## 合并数据集

In [17]:
# Join the partition table with the landmarks (per image) and the attribute
# strings (per item), discard the two type columns and replace missing values
# with 0.
df = (
    eval_partition
    .merge(landmarks_inshop, on='image_name')
    .merge(attr_items, on='item_id')
    .drop(columns=['clothes_type', 'variation_type'])
    .fillna(0)
)

# The category name is the third-from-last component of the image path; its
# label is the position of the first matching entry in CATEGORY.
df['category_label'] = df['image_name'].map(
    lambda image_path: CATEGORY.index(image_path.split('/')[-3])
)

df.head()

Unnamed: 0,image_name,item_id,evaluation_status,landmark_visibility_1,landmark_location_x_1,landmark_location_y_1,landmark_visibility_2,landmark_location_x_2,landmark_location_y_2,landmark_visibility_3,...,landmark_location_x_6,landmark_location_y_6,landmark_visibility_7,landmark_location_x_7,landmark_location_y_7,landmark_visibility_8,landmark_location_x_8,landmark_location_y_8,attribute_labels,category_label
0,img/WOMEN/Dresses/id_00000002/02_1_front.jpg,id_00000002,train,1,109,63,0,156,70,0,...,161.0,136.0,0.0,89.0,234.0,0.0,206.0,230.0,0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...,12
1,img/WOMEN/Dresses/id_00000002/02_2_side.jpg,id_00000002,train,1,127,59,0,145,61,1,...,141.0,125.0,1.0,123.0,217.0,0.0,132.0,229.0,0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...,12
2,img/WOMEN/Dresses/id_00000002/02_4_full.jpg,id_00000002,train,0,123,46,0,148,51,0,...,149.0,96.0,0.0,101.0,149.0,0.0,157.0,156.0,0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...,12
3,img/WOMEN/Dresses/id_00000002/02_7_additional.jpg,id_00000002,train,0,153,58,0,112,61,0,...,108.0,141.0,0.0,175.0,228.0,0.0,91.0,233.0,0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...,12
4,img/WOMEN/Skirts/id_00000003/02_1_front.jpg,id_00000003,train,1,79,131,1,125,130,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 1 2 6 12 13 20 23 42 55 84 87 113 152 171 19...,19


## 分割数据集

In [18]:
# Count how many rows carry each evaluation_status value.
df['evaluation_status'].value_counts()

train      26338
query      14441
gallery    12811
Name: evaluation_status, dtype: int64

In [19]:
# Split the merged table by evaluation_status: 'train' rows become train_df,
# 'query' rows become val_df and 'gallery' rows become test_df. The status
# column is dropped and each part gets a fresh 0-based index.
train_df, val_df, test_df = (
    df[df['evaluation_status'] == status]
    .drop(columns=['evaluation_status'])
    .reset_index(drop=True)
    for status in ('train', 'query', 'gallery')
)

# Row counts of the three parts, shown as the cell output.
tuple(len(part) for part in (train_df, val_df, test_df))

(26338, 14441, 12811)

In [20]:
# Write each split to its CSV file, leaving out the index column.
train_df.to_csv(path_or_buf=TRAIN_DF, index=False)
val_df.to_csv(path_or_buf=VAL_DF, index=False)
test_df.to_csv(path_or_buf=TEST_DF, index=False)