In [1]:
from glob import glob
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

In [2]:
def _parse_bndbox(box):
    """Return ``(xmin, ymin, xmax, ymax)`` as ``np.int32`` values from a ``<bndbox>`` element."""
    nodes = [box.find(tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
    if any(node is None for node in nodes):
        # Coordinate tags are not named as expected: fall back to reading the
        # children in document order, which is what the positional unpacking
        # used before relied on.
        nodes = list(box)
    xmin, ymin, xmax, ymax = (np.int32(node.text) for node in nodes)
    return xmin, ymin, xmax, ymax


def _last_food_box(root):
    """Return the box of the last ``<object>`` whose name is not 'dish', or ``None`` if there is none."""
    box = None
    for obj in root.findall('./object'):
        name = obj.find('name')
        bndbox = obj.find('bndbox')
        # Read name and box from the same <object> so they can never be
        # paired across different objects.
        if name is None or bndbox is None:
            continue
        if name.text != 'dish':
            box = _parse_bndbox(bndbox)
    return box


def make_df(root_dir):
    """Collect one bounding box per annotation file into a DataFrame.

    Class names are taken from the entries of ``<root_dir>/image``; for each
    class, every ``*.xml`` file under ``<root_dir>/label/xml/<class>`` is
    parsed. From each file the function reads ``filename`` (extension
    stripped), ``size/width``, ``size/height`` and the bounding box of the
    last ``object`` whose ``name`` is not ``'dish'``.

    Parameters
    ----------
    root_dir : str
        Directory containing the ``image`` and ``label/xml`` sub-directories.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_id, width, height, xmin, ymin, xmax, ymax``. Per-class
        frames are concatenated without resetting the index. The frame is
        empty (with these columns) when no annotation yields a row.

    Notes
    -----
    * Annotation files without a non-'dish' object are skipped with a
      warning instead of silently reusing the box of the previous file.
    * Classes without annotation files contribute no rows.
    * Directory and file listings are sorted so that the row order does not
      depend on the filesystem.
    """
    import warnings

    columns = ['image_id', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']
    image_dir = os.path.join(root_dir, 'image')
    anno_dir = os.path.join(root_dir, 'label', 'xml')

    df_list = []
    for label in sorted(os.listdir(image_dir)):
        anno_path = os.path.join(anno_dir, label)

        rows = []
        for fpath in sorted(glob(os.path.join(anno_path, '*.xml'))):
            root = ET.parse(fpath).getroot()

            box = _last_food_box(root)
            if box is None:
                warnings.warn(
                    "No non-'dish' object with a bounding box in {}; skipping.".format(fpath)
                )
                continue

            # splitext copes with extensions of any length ('.jpg', '.jpeg', ...).
            image_id = os.path.splitext(root.find('./filename').text)[0]
            width = np.int32(root.find('./size/width').text)
            height = np.int32(root.find('./size/height').text)
            rows.append((image_id, width, height) + box)

        # Build the per-class frame once, and only when the class produced rows.
        if rows:
            data = {col: list(values) for col, values in zip(columns, zip(*rows))}
            df_list.append(pd.DataFrame(data))

    if not df_list:
        # pd.concat raises on an empty list; return an empty, well-formed frame instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(df_list)

In [3]:
def preprocess_df(df, seed=123):
    """Attach a food-name ``label`` column and return the rows in shuffled order.

    The food code is the third ``_``-separated field of ``image_id``; it is
    mapped to a name through a fixed table. Codes that are not in the table
    (or ids with fewer than three fields) get a missing label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``image_id``. It is not modified.
    seed : int, optional
        Seed for the shuffle. The default, 123, yields the same permutation
        as seeding the global NumPy RNG with 123 before ``df.sample(frac=1)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``label`` column added, all rows shuffled and
        the index reset to ``0..n-1``.
    """
    code_to_name = {'01011001':'쌀밥','01012001':'잡곡밥','01014001':'김치볶음밥',
                    '01014005':'비빔밥','02011027':'짜장면','02011034':'크림파스타',
                    '04012008':'시레기된장국', '06012004':'떡갈비', '08014001':'떡볶이', 
                    '09011002':'갈치조림', '10012002':'치킨', '10012003':'돈까스'}

    codes = df['image_id'].str.split('_').str[2]
    # assign() returns a new frame, so the caller's DataFrame keeps its columns.
    labelled = df.assign(label=codes.map(code_to_name))
    # random_state keeps the shuffle reproducible without reseeding the
    # process-wide NumPy generator.
    return labelled.sample(frac=1, random_state=seed).reset_index(drop=True)

In [4]:
# Root folders of the train / validation / test splits of the image dataset.
root_train_dir, root_valid_dir, root_test_dir = [
    f'./data/img_dataset/{split}' for split in ('train', 'valid', 'test')
]

In [5]:
# Build one annotation table per split, in train / valid / test order.
train_df, valid_df, test_df = [
    make_df(split_root)
    for split_root in (root_train_dir, root_valid_dir, root_test_dir)
]

In [6]:
# Run preprocess_df on each split's table and rebind the names to the results.
train_df, valid_df, test_df = [
    preprocess_df(split_df) for split_df in (train_df, valid_df, test_df)
]

In [7]:
# Training split: print its (rows, columns) shape, then preview the first five rows.
print(tuple(train_df.shape))
train_df.head(n=5)

(2520, 8)


Unnamed: 0,image_id,width,height,xmin,ymin,xmax,ymax,label
0,01_012_01012001_160448635580781_0,4608,3456,1430,813,3531,2785,잡곡밥
1,02_021_02011034_160318072711998_0,4000,3000,1209,516,3002,2409,크림파스타
2,06_062_06012004_160276148265185,4032,3024,608,784,3547,2197,떡갈비
3,08_084_08014001_160474542070584_0,4032,1908,875,8,2867,1905,떡볶이
4,10_102_10012003_160508777675598_1,3831,2268,1784,239,3293,1142,돈까스


In [8]:
# Validation split: print its (rows, columns) shape, then preview the first five rows.
print(tuple(valid_df.shape))
valid_df.head(n=5)

(720, 8)


Unnamed: 0,image_id,width,height,xmin,ymin,xmax,ymax,label
0,01_014_01014005_161063200164014_0,2268,4032,427,1032,1871,2476,비빔밥
1,08_084_08014001_160596605459444_1,2160,2160,178,296,2104,1781,떡볶이
2,02_021_02011027_160560635077176_1,4656,3492,918,296,4022,2439,짜장면
3,06_062_06012004_160750900885284_0,3024,3024,925,875,2328,2400,떡갈비
4,02_021_02011034_160614232612514_0,4032,3024,683,416,2981,2747,크림파스타


In [9]:
# Test split: print its (rows, columns) shape, then preview the first five rows.
print(tuple(test_df.shape))
test_df.head(n=5)

(360, 8)


Unnamed: 0,image_id,width,height,xmin,ymin,xmax,ymax,label
0,09_091_09011002_160776000564622_0,4248,5664,851,2013,3322,4222,갈치조림
1,02_021_02011034_160967366187078_0,4248,5664,611,1191,3741,4113,크림파스타
2,01_012_01012001_160497208627264_0,500,500,74,89,419,403,잡곡밥
3,01_011_01011001_160384788075037_0,2201,1960,413,156,2085,1760,쌀밥
4,08_084_08014001_160325313779958_1,4000,3000,561,534,3259,2492,떡볶이


In [10]:
# Write each split's table to ./data/<split>_df.csv, leaving the index out of the file.
for split_name, split_df in (('train', train_df), ('valid', valid_df), ('test', test_df)):
    split_df.to_csv(f'./data/{split_name}_df.csv', index=False)