In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [2]:
# Collect every annotation file. glob() makes no promise about ordering, so
# the paths are sorted to keep the row order of the resulting table stable
# from one run (and one machine) to the next.
xmlfiles = sorted(
    # data cleaning: replace \\ with / so Windows paths look like POSIX ones
    path.replace('\\', '/') for path in glob('./data_images/*.xml')
)

In [3]:
xmlfiles

['./data_images/01.xml',
 './data_images/02.xml',
 './data_images/03.xml',
 './data_images/04.xml',
 './data_images/05.xml',
 './data_images/06.xml',
 './data_images/07.xml',
 './data_images/08.xml',
 './data_images/09.xml',
 './data_images/10.xml',
 './data_images/11.xml',
 './data_images/12.xml',
 './data_images/13.xml',
 './data_images/14.xml',
 './data_images/15.xml',
 './data_images/16.xml',
 './data_images/17.xml',
 './data_images/18.xml',
 './data_images/19.xml',
 './data_images/20.xml',
 './data_images/21.xml',
 './data_images/22.xml',
 './data_images/23.xml',
 './data_images/24.xml',
 './data_images/25.xml',
 './data_images/26.xml',
 './data_images/27.xml',
 './data_images/28.xml',
 './data_images/29.xml',
 './data_images/30.xml',
 './data_images/31.xml',
 './data_images/32.xml',
 './data_images/33.xml',
 './data_images/34.xml',
 './data_images/35.xml',
 './data_images/36.xml',
 './data_images/37.xml',
 './data_images/38.xml',
 './data_images/39.xml',
 './data_images/40.xml',


In [4]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    """Read one annotation XML file and list its labelled objects.

    Parameters
    ----------
    filename : source handed to ``ElementTree.parse``.

    Returns
    -------
    list of lists, one per ``<object>`` element, each laid out as
    ``[image_name, width, height, name, xmin, xmax, ymin, ymax]``.
    Every value is the raw element text; no numeric conversion happens here.
    """
    root = et.parse(filename).getroot()

    # image-level fields, repeated on every row produced for this file
    image_name = root.find('filename').text
    size = root.find('size')
    width = size.find('width').text
    height = size.find('height').text

    rows = []
    for obj in root.findall('object'):
        label = obj.find('name').text
        box = obj.find('bndbox')
        # same column order as the returned rows: xmin, xmax, ymin, ymax
        coords = [box.find(tag).text for tag in ('xmin', 'xmax', 'ymin', 'ymax')]
        rows.append([image_name, width, height, label] + coords)

    return rows

In [5]:
# one list of object rows per annotation file, in the same order as xmlfiles
parser_all = [extract_text(xml_path) for xml_path in xmlfiles]

In [6]:
# Flatten the per-file lists into a single list of rows. Unlike
# reduce(lambda x,y: x+y, ...) this also works when parser_all is empty
# (reduce without an initial value raises TypeError) and does not re-copy the
# accumulated list at every step.
data = [row for file_rows in parser_all for row in file_rows]

In [7]:
# one row per annotated object; columns follow the layout built by extract_text
column_names = ['filename','width','height','name','xmin','xmax','ymin','ymax']
df = pd.DataFrame(data, columns=column_names)

In [8]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,01.jpg,1280,719,car,120,1198,64,637
1,02.jpg,1280,853,car,551,996,269,532
2,03.jpg,1280,1707,car,372,535,1351,1501
3,04.jpg,1280,2145,car,390,1232,1330,1688
4,05.jpg,1280,853,car,249,1070,57,749


In [9]:
df.shape

(104, 8)

In [10]:
df['name'].value_counts()

name
person     37
sheep      11
car         9
cat         9
bicycle     6
horse       6
boat        6
bus         6
dog         5
train       5
chair       2
bottle      1
bird        1
Name: count, dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  104 non-null    object
 1   width     104 non-null    object
 2   height    104 non-null    object
 3   name      104 non-null    object
 4   xmin      104 non-null    object
 5   xmax      104 non-null    object
 6   ymin      104 non-null    object
 7   ymax      104 non-null    object
dtypes: object(8)
memory usage: 6.6+ KB


In [12]:
# type conversion: the size and box columns were read from XML as text,
# so cast them to integers before doing arithmetic on them
cols = ['width','height','xmin','xmax','ymin','ymax']
df = df.astype({col: int for col in cols})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  104 non-null    object
 1   width     104 non-null    int32 
 2   height    104 non-null    int32 
 3   name      104 non-null    object
 4   xmin      104 non-null    int32 
 5   xmax      104 non-null    int32 
 6   ymin      104 non-null    int32 
 7   ymax      104 non-null    int32 
dtypes: int32(6), object(2)
memory usage: 4.2+ KB


In [13]:
# Box geometry expressed as fractions of the image size.
# pixel extents of each box
box_w_px = df['xmax'] - df['xmin']
box_h_px = df['ymax'] - df['ymin']
# center x, center y: midpoint of the box divided by the image dimension
df['center_x'] = ((df['xmax'] + df['xmin']) / 2) / df['width']
df['center_y'] = ((df['ymax'] + df['ymin']) / 2) / df['height']
# w, h: box extent divided by the image dimension
df['w'] = box_w_px / df['width']
df['h'] = box_h_px / df['height']

In [14]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,01.jpg,1280,719,car,120,1198,64,637,0.514844,0.487483,0.842187,0.79694
1,02.jpg,1280,853,car,551,996,269,532,0.604297,0.469519,0.347656,0.308324
2,03.jpg,1280,1707,car,372,535,1351,1501,0.354297,0.835384,0.127344,0.087873
3,04.jpg,1280,2145,car,390,1232,1330,1688,0.633594,0.703497,0.657813,0.1669
4,05.jpg,1280,853,car,249,1070,57,749,0.515234,0.47245,0.641406,0.811254


In [15]:
# distinct image file names, in order of first appearance in df
images = pd.unique(df['filename'])

In [16]:
len(images)

61

In [17]:
# 80% train and 20% test, split by image so all boxes of one image stay together
img_df = pd.DataFrame(images,columns=['filename'])
# Shuffle and pick 80% of the images. The fixed random_state makes the split
# repeatable; without it every run selects a different set of images, so the
# files moved into the train/test folders could not be reproduced.
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['filename'])

In [18]:
# Take the rest (~20%) of the images. A boolean isin() mask is used instead of
# formatting the tuple's repr into a query string, so the file names are
# compared as data rather than being parsed as expression source.
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train), 'filename'])

In [19]:
len(img_train),len(img_test)

(49, 12)

In [20]:
# Rows belonging to the train / test images. The explicit .copy() makes both
# frames independent of df, so adding columns to them later does not trip
# pandas' chained-assignment check; isin() avoids building the filter from
# the tuple's repr inside a query string.
train_df = df[df['filename'].isin(img_train)].copy()
test_df = df[df['filename'].isin(img_test)].copy()

In [21]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,01.jpg,1280,719,car,120,1198,64,637,0.514844,0.487483,0.842187,0.79694
1,02.jpg,1280,853,car,551,996,269,532,0.604297,0.469519,0.347656,0.308324
3,04.jpg,1280,2145,car,390,1232,1330,1688,0.633594,0.703497,0.657813,0.1669
6,07.jpg,1280,852,car,39,1211,106,760,0.488281,0.508216,0.915625,0.767606
7,08.jpg,1280,853,car,619,1231,290,593,0.722656,0.517585,0.478125,0.355217


In [22]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
2,03.jpg,1280,1707,car,372,535,1351,1501,0.354297,0.835384,0.127344,0.087873
4,05.jpg,1280,853,car,249,1070,57,749,0.515234,0.47245,0.641406,0.811254
5,06.jpg,1280,851,car,45,1225,189,834,0.496094,0.601058,0.921875,0.757932
25,16.jpg,1280,854,person,357,535,529,823,0.348438,0.791569,0.139063,0.344262
26,16.jpg,1280,854,person,635,786,554,817,0.555078,0.802693,0.117969,0.307963


In [23]:
# label encoding
def label_encoding(x):
    """Return the integer id of class name ``x``.

    The 20 class names are numbered 0-19 in the order listed below.
    A name that is not in the list raises ``KeyError``.
    """
    class_names = (
        'person', 'car', 'chair', 'bottle', 'pottedplant',
        'bird', 'dog', 'sofa', 'bicycle', 'horse',
        'boat', 'motorbike', 'cat', 'tvmonitor', 'cow',
        'sheep', 'aeroplane', 'train', 'diningtable', 'bus',
    )
    # position in the tuple is the class id
    lookup = {class_name: idx for idx, class_name in enumerate(class_names)}
    return lookup[x]

In [25]:
# Add the numeric class id. assign() returns a new frame that is rebound to
# the same name, so nothing is written into a frame that pandas may regard as
# a slice of df (the .loc[:, 'id'] = ... form can emit SettingWithCopyWarning).
train_df = train_df.assign(id=train_df['name'].apply(label_encoding))
test_df = test_df.assign(id=test_df['name'].apply(label_encoding))

In [26]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,01.jpg,1280,719,car,120,1198,64,637,0.514844,0.487483,0.842187,0.79694,1
1,02.jpg,1280,853,car,551,996,269,532,0.604297,0.469519,0.347656,0.308324,1
3,04.jpg,1280,2145,car,390,1232,1330,1688,0.633594,0.703497,0.657813,0.1669,1
6,07.jpg,1280,852,car,39,1211,106,760,0.488281,0.508216,0.915625,0.767606,1
7,08.jpg,1280,853,car,619,1231,290,593,0.722656,0.517585,0.478125,0.355217,1
8,09.jpg,1280,1920,person,14,1258,301,1915,0.496875,0.577083,0.971875,0.840625,0
9,10.jpg,1280,853,person,277,1041,82,848,0.514844,0.545135,0.596875,0.898007,0
10,11.jpg,1280,828,person,130,357,185,825,0.190234,0.609903,0.177344,0.772947,0
11,11.jpg,1280,828,person,361,528,204,823,0.347266,0.620169,0.130469,0.747585,0
12,11.jpg,1280,828,person,512,762,234,828,0.497656,0.641304,0.195312,0.717391,0


In [27]:
import os
from shutil import move

In [28]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# as soon as the folders are already there (e.g. on a second Run All).
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [29]:
# keep only the fields written to the label files, grouped per image
cols = ['filename','id','center_x','center_y','w','h']
groupby_obj_train, groupby_obj_test = (
    frame[cols].groupby('filename') for frame in (train_df, test_df)
)

In [30]:
#groupby_obj_train.get_group('02.jpg').set_index('filename').to_csv('sample.txt', index=False,header=False)
# save each image in train/test folder and respective labels in .txt
def save_data(filename, folder_path, group_obj):
    """Move one image into ``folder_path`` and write its label file beside it.

    Parameters
    ----------
    filename : image file name; the image is looked up under ``data_images``
        and the same value is the key passed to ``group_obj.get_group``.
    folder_path : destination folder for both the image and the ``.txt`` file.
    group_obj : groupby object whose groups are keyed by file name.

    The label file shares the image's base name, has no header, and holds one
    space-separated line per row of the group with every column except
    ``filename``.
    """
    # relocate the image itself
    move(os.path.join('data_images', filename),
         os.path.join(folder_path, filename))

    # labels go next to the image: same base name, .txt extension
    stem, _ext = os.path.splitext(filename)
    label_path = os.path.join(folder_path, stem + '.txt')
    label_rows = group_obj.get_group(filename).drop(columns='filename')
    label_rows.to_csv(label_path, sep=' ', index=False, header=False)




In [31]:
# file names of the training images (the keys of the per-image groups)
filename_series = pd.Series(list(groupby_obj_train.groups))

In [32]:
# Call save_data once per training image: each call moves the image into
# train_folder and writes its label file there. save_data has no return
# statement, which is why the resulting Series contains only None.
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
41    None
42    None
43    None
44    None
45    None
46    None
47    None
48    None
dtype: object

In [33]:
# Same procedure for the test images: take the file names from the test groups,
# then move each image into test_folder and write its label file there.
# save_data has no return statement, so the resulting Series contains only None.
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
dtype: object