In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as ET

In [2]:
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

Current working directory: /Users/paul.serban/Desktop/documents/repos/wbk--dl-ml-ai--playground/alogrithms/detection-algorithms/yolo-for-object-detection


In [3]:
# Load all XML files and store in a list
xml_list = glob('./01_data-preparation/data_images/*.xml')
print(f"Number of XML files: {len(xml_list)}")

# data cleaning - replace \\ with / - for windows
xml_lis = map(lambda x: x.replace("\\", "/"), xml_list)

Number of XML files: 350


In [4]:
# read XML files
# from each XML file, extract the following fields:
# - filename, size(width, height), object(name, xmin, ymin, xmax, ymax)

def extract_text(filename):
    tree  = ET.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    size = root.find('size')
    width = size.find('width').text
    height = size.find('height').text
    objects = root.findall('object')
    parser  = []
    for obj in objects:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        ymin = bndbox.find('ymin').text
        xmax = bndbox.find('xmax').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, ymin, xmax, ymax])

    return parser


In [5]:
parser_all = list(map(extract_text, xml_list))

In [6]:
# flatten the list of lists
data = reduce(lambda x, y: x+y, parser_all)

In [7]:
# create the dataframe
df = pd.DataFrame(data, columns=['filename', 'width', 'height', 'name', 'xmin', 'ymin', 'xmax', 'ymax'])
df.head()
df.shape

(1107, 8)

In [8]:
df['name'].value_counts()

name
person         420
car            145
chair          121
bottle          35
boat            33
horse           33
bird            31
dog             29
sofa            27
cat             25
aeroplane       24
tvmonitor       24
train           24
pottedplant     23
bicycle         23
motorbike       21
sheep           21
diningtable     21
cow             14
bus             13
Name: count, dtype: int64

In [9]:
# get class names
class_names = df['name'].unique()
class_names, len(class_names)

(array(['car', 'person', 'boat', 'bottle', 'sheep', 'tvmonitor', 'dog',
        'chair', 'aeroplane', 'motorbike', 'bicycle', 'train', 'bus',
        'horse', 'sofa', 'cat', 'cow', 'pottedplant', 'diningtable',
        'bird'], dtype=object),
 20)

In [10]:
# type conversion
cols = ['width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107 entries, 0 to 1106
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1107 non-null   object
 1   width     1107 non-null   int64 
 2   height    1107 non-null   int64 
 3   name      1107 non-null   object
 4   xmin      1107 non-null   int64 
 5   ymin      1107 non-null   int64 
 6   xmax      1107 non-null   int64 
 7   ymax      1107 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 69.3+ KB


In [11]:
# define center x, center y, width, height
df['center_x'] = (df['xmin'] + df['xmax']) / 2
df['center_y'] = (df['ymin'] + df['ymax']) / 2
df['w'] = df['xmax'] - df['xmin']
df['h'] = df['ymax'] - df['ymin']

# normalize the values
df['center_x'] = df['center_x'] / df['width']
df['center_y'] = df['center_y'] / df['height']
df['w'] = df['w'] / df['width']
df['h'] = df['h'] / df['height']

In [12]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,ymin,xmax,ymax,center_x,center_y,w,h
0,000620.jpg,500,344,car,114,182,303,309,0.417,0.713663,0.378,0.369186
1,000620.jpg,500,344,car,51,267,71,280,0.122,0.795058,0.04,0.037791
2,000620.jpg,500,344,car,19,272,36,284,0.055,0.80814,0.034,0.034884
3,000146.jpg,374,500,person,155,208,252,408,0.544118,0.616,0.259358,0.4
4,000608.jpg,500,375,boat,139,14,220,142,0.359,0.208,0.162,0.341333


# Split data into TRAIN & TEST sets

In [13]:
images = df['filename'].unique()
len(images)

350

In [14]:
# 80% train, 20% test
image_df = pd.DataFrame(images, columns=['filename'])
image_df.head()
image_train = image_df.sample(frac=0.8, random_state=42) # shuffle and pick 80% of the data
image_test = image_df.drop(image_train.index) # drop the train data to get the test data

image_train = tuple(image_train['filename'])
image_test = tuple(image_test['filename'])

len(image_train), len(image_test)

(280, 70)

In [15]:
train_df = df[df['filename'].isin(image_train)]
test_df = df[df['filename'].isin(image_test)]

In [16]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,ymin,xmax,ymax,center_x,center_y,w,h
0,000620.jpg,500,344,car,114,182,303,309,0.417,0.713663,0.378,0.369186
1,000620.jpg,500,344,car,51,267,71,280,0.122,0.795058,0.04,0.037791
2,000620.jpg,500,344,car,19,272,36,284,0.055,0.80814,0.034,0.034884
4,000608.jpg,500,375,boat,139,14,220,142,0.359,0.208,0.162,0.341333
5,000608.jpg,500,375,boat,12,110,420,286,0.432,0.528,0.816,0.469333


In [17]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,ymin,xmax,ymax,center_x,center_y,w,h
3,000146.jpg,374,500,person,155,208,252,408,0.544118,0.616,0.259358,0.4
26,000225.jpg,500,333,sheep,125,96,355,263,0.48,0.539039,0.46,0.501502
27,000225.jpg,500,333,sheep,2,22,47,81,0.049,0.154655,0.09,0.177177
47,000190.jpg,500,375,bus,426,197,500,273,0.926,0.626667,0.148,0.202667
48,000190.jpg,500,375,bus,235,216,323,258,0.558,0.632,0.176,0.112


# Label Encoding
- it is converting the categorical data into numerical data for the model to understand the data.

In [18]:
# Function to create a label dictionary from a unique set of labels
def labels_dictionary(data_frame_column):
    labels = data_frame_column.unique()
    return {label: i for i, label in enumerate(labels)}

# Create the labels dictionary based on the 'name' column in train_df
labels = labels_dictionary(train_df['name'])

# Label encoding function
def label_encoding(x):
    return labels[x]

# Apply label encoding using .loc to ensure modifications are explicit
train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)


In [19]:
train_df

Unnamed: 0,filename,width,height,name,xmin,ymin,xmax,ymax,center_x,center_y,w,h,id
0,000620.jpg,500,344,car,114,182,303,309,0.417,0.713663,0.378,0.369186,0
1,000620.jpg,500,344,car,51,267,71,280,0.122,0.795058,0.040,0.037791,0
2,000620.jpg,500,344,car,19,272,36,284,0.055,0.808140,0.034,0.034884,0
4,000608.jpg,500,375,boat,139,14,220,142,0.359,0.208000,0.162,0.341333,1
5,000608.jpg,500,375,boat,12,110,420,286,0.432,0.528000,0.816,0.469333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,000349.jpg,500,375,chair,321,126,479,363,0.800,0.652000,0.316,0.632000,6
1091,000349.jpg,500,375,chair,284,76,360,144,0.644,0.293333,0.152,0.181333,6
1092,000349.jpg,500,375,chair,100,91,206,298,0.306,0.518667,0.212,0.552000,6
1093,000349.jpg,500,375,chair,167,161,326,375,0.493,0.714667,0.318,0.570667,6


# Save IMage and Labels in Text

In [20]:
import os
from shutil import copyfile

In [21]:
train_folder = './01_data-preparation/data_images/train'
test_folder = './01_data-preparation/data_images/test'

# Create the train and test directories
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [22]:
# Move the images to the respective folders
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')


In [23]:
# save each image in the respective folder and respective labels in the txt file
def save_data(filename, folder_path, group_obj):
    for filename, group in group_obj:
        # save the labels in the txt file
        group.set_index('filename').to_csv(f'{folder_path}/{filename[:-4]}.txt', sep=' ', index=False, header=False)
        # save the images in the respective folder
        src = f'./01_data-preparation/data_images/{filename}'
        target = f'{folder_path}/{filename}'
        copyfile(src, target)

save_data('filename', train_folder, groupby_obj_train)
save_data('filename', test_folder, groupby_obj_test)