In [103]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [104]:
# load all xml files and store in a list
xmlfiles = glob('./data_images/*.xml')
# data cleaning reolace \\ with /
xmlfiles = list(map(lambda x: x.replace('\\','/'),xmlfiles))

In [105]:
xmlfiles

['./data_images/01.xml',
 './data_images/02.xml',
 './data_images/03.xml',
 './data_images/04.xml',
 './data_images/05.xml',
 './data_images/06.xml',
 './data_images/07.xml',
 './data_images/08.xml',
 './data_images/09.xml',
 './data_images/10.xml',
 './data_images/11.xml',
 './data_images/12.xml',
 './data_images/13.xml',
 './data_images/14.xml',
 './data_images/15.xml',
 './data_images/16.xml',
 './data_images/17.xml',
 './data_images/20.xml',
 './data_images/21.xml',
 './data_images/22.xml',
 './data_images/23.xml',
 './data_images/24.xml',
 './data_images/25.xml']

In [106]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:   
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name,width,height,name,xmin,xmax,ymin,ymax])
    
    return parser

In [107]:
parser_all = list(map(extract_text,xmlfiles))

In [108]:
data = reduce(lambda x,y : x+y,parser_all)

In [109]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [110]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,01.jpg,1024,657,car,15,306,330,507
1,01.jpg,1024,657,car,269,578,335,481
2,01.jpg,1024,657,car,505,803,344,444
3,01.jpg,1024,657,car,716,1012,342,435
4,02.jpg,799,533,car,49,758,175,420


In [111]:
df.shape

(60, 8)

In [112]:
df['name'].value_counts()

name
person       18
car           7
horse         6
boat          4
sheep         4
train         4
bicycle       3
bus           3
dog           3
motorbike     2
bottle        2
bird          1
cat           1
sofa          1
aeroplane     1
Name: count, dtype: int64

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  60 non-null     object
 1   width     60 non-null     object
 2   height    60 non-null     object
 3   name      60 non-null     object
 4   xmin      60 non-null     object
 5   xmax      60 non-null     object
 6   ymin      60 non-null     object
 7   ymax      60 non-null     object
dtypes: object(8)
memory usage: 3.9+ KB


In [114]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  60 non-null     object
 1   width     60 non-null     int32 
 2   height    60 non-null     int32 
 3   name      60 non-null     object
 4   xmin      60 non-null     int32 
 5   xmax      60 non-null     int32 
 6   ymin      60 non-null     int32 
 7   ymax      60 non-null     int32 
dtypes: int32(6), object(2)
memory usage: 2.5+ KB


In [115]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [116]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,01.jpg,1024,657,car,15,306,330,507,0.156738,0.636986,0.28418,0.269406
1,01.jpg,1024,657,car,269,578,335,481,0.413574,0.621005,0.301758,0.222222
2,01.jpg,1024,657,car,505,803,344,444,0.638672,0.599696,0.291016,0.152207
3,01.jpg,1024,657,car,716,1012,342,435,0.84375,0.591324,0.289062,0.141553
4,02.jpg,799,533,car,49,758,175,420,0.505006,0.558161,0.887359,0.459662


In [117]:
images = df['filename'].unique()

In [118]:
len(images)

23

In [119]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [120]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [121]:
len(img_train),len(img_test)

(18, 5)

In [122]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [123]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
6,03.jpg,799,543,bird,218,473,120,443,0.432416,0.518416,0.319149,0.594843
7,04.jpg,640,800,cat,115,573,31,740,0.5375,0.481875,0.715625,0.88625
8,05.jpg,800,600,boat,380,569,186,339,0.593125,0.4375,0.23625,0.255
9,06.jpg,800,600,sheep,128,645,140,493,0.483125,0.5275,0.64625,0.588333
11,08.jpg,800,571,sheep,213,469,202,563,0.42625,0.669877,0.32,0.632224


In [124]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,01.jpg,1024,657,car,15,306,330,507,0.156738,0.636986,0.28418,0.269406
1,01.jpg,1024,657,car,269,578,335,481,0.413574,0.621005,0.301758,0.222222
2,01.jpg,1024,657,car,505,803,344,444,0.638672,0.599696,0.291016,0.152207
3,01.jpg,1024,657,car,716,1012,342,435,0.84375,0.591324,0.289062,0.141553
4,02.jpg,799,533,car,49,758,175,420,0.505006,0.558161,0.887359,0.459662


In [125]:
# label encoding
def label_encoding(x):
    labels = {'person':0, 'car':1, 'chair':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6,
              'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 'cat':12, 'tvmonitor':13,
              'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}
    return labels[x]

In [127]:
train_df.loc[:,'id'] = train_df['name'].apply(label_encoding)
test_df.loc[:,'id'] = test_df['name'].apply(label_encoding)

In [128]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
6,03.jpg,799,543,bird,218,473,120,443,0.432416,0.518416,0.319149,0.594843,5
7,04.jpg,640,800,cat,115,573,31,740,0.5375,0.481875,0.715625,0.88625,12
8,05.jpg,800,600,boat,380,569,186,339,0.593125,0.4375,0.23625,0.255,10
9,06.jpg,800,600,sheep,128,645,140,493,0.483125,0.5275,0.64625,0.588333,15
11,08.jpg,800,571,sheep,213,469,202,563,0.42625,0.669877,0.32,0.632224,15
12,08.jpg,800,571,sheep,36,407,142,364,0.276875,0.443082,0.46375,0.388792,15
13,08.jpg,800,571,sheep,423,766,172,427,0.743125,0.524518,0.42875,0.446585,15
15,10.jpg,800,535,bicycle,298,502,249,381,0.5,0.588785,0.255,0.246729,8
16,10.jpg,800,535,bicycle,459,609,234,367,0.6675,0.561682,0.1875,0.248598,8
17,10.jpg,800,535,bicycle,590,791,259,385,0.863125,0.601869,0.25125,0.235514,8


In [129]:
import os
from shutil import move

In [131]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'data_images/train'

In [132]:
cols = ['filename','id','center_x','center_y','w','h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [133]:
#groupby_obj_train.get_group('02.jpg').set_index('filename').to_csv('sample.txt', index=False,header=False)
# save each image in train/test folder and respective labels in .txt
def save_data(filename, folder_path, group_obj):
    #move image
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move images to the destination folder
    #save the labels 
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)




In [134]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [135]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
dtype: object

In [136]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0    None
1    None
2    None
3    None
4    None
dtype: object