In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [2]:
xml_list = glob('./VOC2012/Annotations/*.xml')
xml_list = list(map(lambda x: x.replace('\\','/'),xml_list))

In [3]:
xml_list[:25]

['./VOC2012/Annotations/2007_000027.xml',
 './VOC2012/Annotations/2007_000032.xml',
 './VOC2012/Annotations/2007_000033.xml',
 './VOC2012/Annotations/2007_000039.xml',
 './VOC2012/Annotations/2007_000042.xml',
 './VOC2012/Annotations/2007_000061.xml',
 './VOC2012/Annotations/2007_000063.xml',
 './VOC2012/Annotations/2007_000068.xml',
 './VOC2012/Annotations/2007_000121.xml',
 './VOC2012/Annotations/2007_000123.xml',
 './VOC2012/Annotations/2007_000129.xml',
 './VOC2012/Annotations/2007_000170.xml',
 './VOC2012/Annotations/2007_000175.xml',
 './VOC2012/Annotations/2007_000187.xml',
 './VOC2012/Annotations/2007_000241.xml',
 './VOC2012/Annotations/2007_000243.xml',
 './VOC2012/Annotations/2007_000250.xml',
 './VOC2012/Annotations/2007_000256.xml',
 './VOC2012/Annotations/2007_000272.xml',
 './VOC2012/Annotations/2007_000323.xml',
 './VOC2012/Annotations/2007_000332.xml',
 './VOC2012/Annotations/2007_000333.xml',
 './VOC2012/Annotations/2007_000346.xml',
 './VOC2012/Annotations/2007_00036

In [4]:
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    image_name = root.find('filename').text
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
    return parser

In [5]:
# example
extract_text('./VOC2012/Annotations/2007_000061.xml')

[['2007_000061.jpg', '500', '333', 'boat', '274', '437', '11', '279'],
 ['2007_000061.jpg', '500', '333', 'boat', '184', '281', '214', '252']]

In [6]:
parser_all = list(map(extract_text,xml_list))

In [7]:
len(parser_all)

17125

In [8]:
dataset = reduce(lambda x,y : x+y, parser_all)

In [9]:
dataset[:25]

[['2007_000027.jpg', '486', '500', 'person', '174', '349', '101', '351'],
 ['2007_000032.jpg', '500', '281', 'aeroplane', '104', '375', '78', '183'],
 ['2007_000032.jpg', '500', '281', 'aeroplane', '133', '197', '88', '123'],
 ['2007_000032.jpg', '500', '281', 'person', '195', '213', '180', '229'],
 ['2007_000032.jpg', '500', '281', 'person', '26', '44', '189', '238'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '9', '499', '107', '263'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '421', '482', '200', '226'],
 ['2007_000033.jpg', '500', '366', 'aeroplane', '325', '411', '188', '223'],
 ['2007_000039.jpg', '500', '375', 'tvmonitor', '156', '344', '89', '279'],
 ['2007_000042.jpg', '500', '335', 'train', '263', '500', '32', '295'],
 ['2007_000042.jpg', '500', '335', 'train', '1', '235', '36', '299'],
 ['2007_000061.jpg', '500', '333', 'boat', '274', '437', '11', '279'],
 ['2007_000061.jpg', '500', '333', 'boat', '184', '281', '214', '252'],
 ['2007_000063.jpg', '500', '375', 'do

In [10]:
len(dataset)

40138

In [11]:
df = pd.DataFrame(dataset, columns = ['filename','width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [12]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,2007_000027.jpg,486,500,person,174,349,101,351
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123
3,2007_000032.jpg,500,281,person,195,213,180,229
4,2007_000032.jpg,500,281,person,26,44,189,238


In [13]:
df.shape

(40138, 8)

In [14]:
df['name'].value_counts()

name
person         17401
chair           3056
car             2492
dog             1598
bottle          1561
cat             1277
bird            1271
pottedplant     1202
sheep           1084
boat            1059
aeroplane       1002
tvmonitor        893
sofa             841
bicycle          837
horse            803
motorbike        801
diningtable      800
cow              771
train            704
bus              685
Name: count, dtype: int64

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40138 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  40138 non-null  object
 1   width     40138 non-null  object
 2   height    40138 non-null  object
 3   name      40138 non-null  object
 4   xmin      40138 non-null  object
 5   xmax      40138 non-null  object
 6   ymin      40138 non-null  object
 7   ymax      40138 non-null  object
dtypes: object(8)
memory usage: 2.4+ MB


In [18]:
# type conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40138 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  40138 non-null  object
 1   width     40138 non-null  int32 
 2   height    40138 non-null  int32 
 3   name      40138 non-null  object
 4   xmin      40138 non-null  int32 
 5   xmax      40138 non-null  int32 
 6   ymin      40138 non-null  int32 
 7   ymax      40138 non-null  int32 
dtypes: int32(6), object(2)
memory usage: 1.5+ MB


In [19]:
# centre x,y of the bounding boxes - normalized
df['centre_x'] = ((df['xmin'] + df['xmax'])/2)/df['width']
df['centre_y'] = ((df['ymin'] + df['ymax'])/2)/df['height']

# width,height of bounding box - normalized
df['w'] = (df['xmax'] - df['xmin'])/df['width']
df['h'] = (df['ymax'] - df['ymin'])/df['height']

In [20]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377


In [27]:
images = df['filename'].unique()

In [28]:
len(images)

17125

In [33]:
# we have 17125 unique images test - train 80-20
img_df = pd.DataFrame(images, columns = ['filename'])
train = tuple(img_df.sample(frac=0.8)['filename'])
print(train[:10])

('2011_003016.jpg', '2007_004830.jpg', '2008_006361.jpg', '2010_000097.jpg', '2010_000117.jpg', '2011_002629.jpg', '2009_000954.jpg', '2008_003709.jpg', '2012_002075.jpg', '2008_007537.jpg')


In [36]:
test = tuple(img_df.query(f'filename not in {train}')['filename'])
test[:10]

('2007_000323.jpg',
 '2007_000491.jpg',
 '2007_000504.jpg',
 '2007_000559.jpg',
 '2007_000636.jpg',
 '2007_000676.jpg',
 '2007_000733.jpg',
 '2007_000876.jpg',
 '2007_001027.jpg',
 '2007_001149.jpg')

In [37]:
print(len(test), len(train), len(test)+len(train))

3425 13700 17125


In [38]:
train_df = df.query(f'filename in {train}')
test_df = df.query(f'filename in {test}')

In [40]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377


In [41]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,w,h
41,2007_000323.jpg,500,375,person,277,500,3,375,0.777,0.504,0.446,0.992
42,2007_000323.jpg,500,375,person,12,305,3,375,0.317,0.504,0.586,0.992
66,2007_000491.jpg,500,334,cow,2,459,2,334,0.461,0.502994,0.914,0.994012
67,2007_000504.jpg,500,412,cow,341,500,102,406,0.841,0.616505,0.318,0.737864
68,2007_000504.jpg,500,412,person,36,79,34,123,0.115,0.190534,0.086,0.216019


In [42]:
df['name'].unique()

array(['person', 'aeroplane', 'tvmonitor', 'train', 'boat', 'dog',
       'chair', 'bird', 'bicycle', 'bottle', 'sheep', 'diningtable',
       'horse', 'motorbike', 'sofa', 'cow', 'car', 'cat', 'bus',
       'pottedplant'], dtype=object)

In [45]:
# label encoding
labels_dictionary = {'person':0, 'car':1, 'chair':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6,
'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 'cat':12, 'tvmonitor':13,
'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}

train_df.loc[:, 'id'] = train_df['name'].map(labels_dictionary)
test_df.loc[:, 'id'] = test_df['name'].map(labels_dictionary)

In [48]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,w,h,id
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5,0
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665,16
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555,16
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377,0
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377,0
5,2007_000033.jpg,500,366,aeroplane,9,499,107,263,0.508,0.505464,0.98,0.42623,16
6,2007_000033.jpg,500,366,aeroplane,421,482,200,226,0.903,0.581967,0.122,0.071038,16
7,2007_000033.jpg,500,366,aeroplane,325,411,188,223,0.736,0.561475,0.172,0.095628,16
8,2007_000039.jpg,500,375,tvmonitor,156,344,89,279,0.5,0.490667,0.376,0.506667,13
9,2007_000042.jpg,500,335,train,263,500,32,295,0.763,0.48806,0.474,0.785075,17


In [49]:
test_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,centre_x,centre_y,w,h,id
41,2007_000323.jpg,500,375,person,277,500,3,375,0.777,0.504,0.446,0.992,0
42,2007_000323.jpg,500,375,person,12,305,3,375,0.317,0.504,0.586,0.992,0
66,2007_000491.jpg,500,334,cow,2,459,2,334,0.461,0.502994,0.914,0.994012,14
67,2007_000504.jpg,500,412,cow,341,500,102,406,0.841,0.616505,0.318,0.737864,14
68,2007_000504.jpg,500,412,person,36,79,34,123,0.115,0.190534,0.086,0.216019,0
69,2007_000504.jpg,500,412,person,2,36,44,120,0.038,0.199029,0.068,0.184466,0
70,2007_000504.jpg,500,412,person,29,41,42,64,0.07,0.128641,0.024,0.053398,0
79,2007_000559.jpg,500,370,bottle,36,79,250,354,0.115,0.816216,0.086,0.281081,3
80,2007_000559.jpg,500,370,tvmonitor,160,371,26,241,0.531,0.360811,0.422,0.581081,13
87,2007_000636.jpg,500,335,train,79,392,96,300,0.471,0.591045,0.626,0.608955,17


Saving Images and labels as text

In [50]:
from shutil import move

In [53]:
train_folder = 'YOLO_data/train'
test_folder = 'YOLO_data/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [58]:
# grouping the following info belonging to each image together
cols = ['filename','id','centre_x','centre_y','w','h']
groupby_train = train_df[cols].groupby('filename')
groupby_test = test_df[cols].groupby('filename')

In [65]:
# save each image in train/test_folder and respective labels in .txt
def save_data(filename, folder_path, group_obj):
    # moving image from source to destination using move
    src = os.path.join('VOC2012/JPEGImages',filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst)

    # saving the labels - id,centre_x,centre_y,w,h for each object in an image in a single filename.txt file
    text_filename = os.path.splitext(filename)[0]+'.txt'  # splitting the extension jpg from the filename and assigning the image no. as text_filename
    text_filename = os.path.join(folder_path, text_filename) # destination file path
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,index=False,header=False,sep=' ')    

In [66]:
train_keys = pd.Series(groupby_train.groups.keys())

In [67]:
train_keys.apply(save_data,args=(train_folder, groupby_train))

0        None
1        None
2        None
3        None
4        None
         ... 
13695    None
13696    None
13697    None
13698    None
13699    None
Length: 13700, dtype: object

In [68]:
test_keys = pd.Series(groupby_test.groups.keys())

In [69]:
test_keys.apply(save_data,args=(test_folder, groupby_test))

0       None
1       None
2       None
3       None
4       None
        ... 
3420    None
3421    None
3422    None
3423    None
3424    None
Length: 3425, dtype: object