# 1. Import Libraries

The following libraries are essential for handling files, data manipulation, and XML parsing.

In [None]:
import os                   # Importing the OS module to interact with the operating system
from glob import glob       # Importing the 'glob' function to find all the pathnames matching a specified pattern
import pandas as pd         # Importing the Pandas library for data manipulation and analysis, aliased as 'pd'
from functools import reduce # Importing the 'reduce' function to apply a function of two arguments cumulatively to the items of an iterable
from xml.etree import ElementTree as et # Importing the ElementTree module for XML parsing, aliased as 'et'

# 2. Preparing the Data

This section of the code is responsible for loading all XML files from a specified directory and cleaning the file paths.

In [None]:
# Collect every XML annotation file in the 'data_images' directory.
# Sorting makes the file order (and everything derived from it) deterministic across runs.
xml_list = sorted(glob('./data_images/*.xml'))

# Data cleaning: normalise Windows path separators to forward slashes.
# "\\" is ONE backslash character. The previous pattern "\\\\" only matched two
# consecutive backslashes, so the single separators glob returns on Windows were never replaced.
xml_list = [path.replace("\\", "/") for path in xml_list]

Check the contents of `xml_list`:

In [None]:
xml_list

The extract_text function reads an XML file and extracts key details related to image annotation. The details include the filename, image dimensions (width and height), and annotations for objects present in the image. Each object's annotation includes the name and bounding box coordinates.

In [None]:
def extract_text(filename: str) -> list:
    """Read one annotation XML file and flatten it into per-object rows.

    Args:
        filename (str): Path to the XML file.

    Returns:
        list: One entry per <object> element, each of the form
            [filename, width, height, name, xmin, xmax, ymin, ymax].
            Every value is kept as the text read from the XML (no type conversion).
    """
    root = et.parse(filename).getroot()

    # Image-level fields that are repeated on every object row.
    size_node = root.find('size')
    image_fields = [
        root.find('filename').text,
        size_node.find('width').text,
        size_node.find('height').text,
    ]

    # Order of the coordinate columns in each output row.
    box_tags = ('xmin', 'xmax', 'ymin', 'ymax')

    rows = []
    for obj_node in root.findall('object'):
        label = obj_node.find('name').text
        box_node = obj_node.find('bndbox')
        box_values = [box_node.find(tag).text for tag in box_tags]
        rows.append(image_fields + [label] + box_values)

    return rows


This part of the code takes all the XML files listed in xml_list and applies the previously defined extract_text function to extract the required details from each file. These details are then combined into a single list named data.

In [None]:
# Parse every XML file in 'xml_list'.
# The result is a list of lists: one inner list of per-object rows for each XML file.
parser_all = [extract_text(xml_path) for xml_path in xml_list]

# Flatten the per-file lists into a single list of rows named 'data'.
# The nested comprehension is linear in the number of rows and yields [] when there are
# no files; reduce(lambda x, y: x + y, parser_all) re-copied the accumulated list at every
# step and raised TypeError on an empty sequence because it had no initial value.
data = [row for file_rows in parser_all for row in file_rows]

In [None]:
# Preview only the first rows; displaying the full list would print thousands of entries.
data[:5]

[['000001.jpg', '1024', '657', 'car', '14', '301', '335', '522'],
 ['000001.jpg', '1024', '657', 'car', '269', '571', '345', '489'],
 ['000001.jpg', '1024', '657', 'car', '502', '798', '342', '450'],
 ['000001.jpg', '1024', '657', 'car', '709', '1009', '333', '438'],
 ['000002.jpg', '800', '600', 'car', '41', '768', '240', '497'],
 ['000002.jpg', '800', '600', 'car', '533', '722', '236', '299'],
 ['000007.jpg', '500', '333', 'car', '141', '500', '50', '330'],
 ['000009.jpg', '500', '375', 'horse', '69', '270', '172', '330'],
 ['000009.jpg', '500', '375', 'person', '150', '229', '141', '284'],
 ['000009.jpg', '500', '375', 'person', '285', '327', '201', '331'],
 ['000009.jpg', '500', '375', 'person', '258', '297', '198', '329'],
 ['000012.jpg', '500', '333', 'car', '156', '351', '97', '270'],
 ['000016.jpg', '334', '500', 'bicycle', '92', '305', '72', '473'],
 ['000017.jpg', '480', '364', 'person', '185', '279', '62', '199'],
 ['000017.jpg', '480', '364', 'horse', '90', '403', '78', '33

# 3. Visualisation and Information About the Data

In [None]:
# Build the annotation table; the column order mirrors the row layout produced by extract_text.
df = pd.DataFrame(
    data=data,
    columns=[
        'filename', 'width', 'height', 'name',
        'xmin', 'xmax', 'ymin', 'ymax',
    ],
)

In [None]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,000001.jpg,1024,657,car,14,301,335,522
1,000001.jpg,1024,657,car,269,571,345,489
2,000001.jpg,1024,657,car,502,798,342,450
3,000001.jpg,1024,657,car,709,1009,333,438
4,000002.jpg,800,600,car,41,768,240,497


In [None]:
df.shape

(15663, 8)

In [None]:
df['name'].value_counts()

name
person         5447
car            1650
chair          1427
bottle          634
pottedplant     625
bird            599
dog             538
sofa            425
bicycle         418
horse           406
boat            398
motorbike       390
cat             389
tvmonitor       367
cow             356
sheep           353
aeroplane       331
train           328
diningtable     310
bus             272
Name: count, dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15663 entries, 0 to 15662
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  15663 non-null  object
 1   width     15663 non-null  object
 2   height    15663 non-null  object
 3   name      15663 non-null  object
 4   xmin      15663 non-null  object
 5   xmax      15663 non-null  object
 6   ymin      15663 non-null  object
 7   ymax      15663 non-null  object
dtypes: object(8)
memory usage: 979.1+ KB


In [None]:
# Type conversion: the values were read from XML as text, so cast the numeric columns to int.
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df = df.astype({column: int for column in cols})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15663 entries, 0 to 15662
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  15663 non-null  object
 1   width     15663 non-null  int32 
 2   height    15663 non-null  int32 
 3   name      15663 non-null  object
 4   xmin      15663 non-null  int32 
 5   xmax      15663 non-null  int32 
 6   ymin      15663 non-null  int32 
 7   ymax      15663 non-null  int32 
dtypes: int32(6), object(2)
memory usage: 612.0+ KB


In [None]:
# So far each bounding box is described by: width, height, xmin, xmax, ymin, ymax.
# The YOLO model works with (X, Y, W, H) instead, all normalised by the image size:
#   - X and Y are the centre of the bounding box
#   - W and H are the width and height of the bounding box
#
#   center_x = ((xmin + xmax) / 2) / width of the image
#   center_y = ((ymin + ymax) / 2) / height of the image
#   w        = (xmax - xmin) / width of the image
#   h        = (ymax - ymin) / height of the image
#
# (These notes used to live in a bare triple-quoted string, which is an expression
# statement rather than a comment.)

# center x, center y
df['center_x'] = ((df['xmax'] + df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax'] + df['ymin'])/2)/df['height']

# W & H
df['w'] = (df['xmax'] - df['xmin'])/df['width']
df['h'] = (df['ymax'] - df['ymin'])/df['height']

We are extracting the following details from the images in this project:
- width
- height
- X_min
- X_max
- Y_min
- Y_max

However, the YOLO model works with $(X, Y, W, H)$ where:
- $X$ and $Y$ are the center coordinates of the bounding box.
- $W$ and $H$ represent the width and height of the bounding box.

Formulas to convert the coordinates:
- $\text{center}_x = \frac{(X_{min} + X_{max})/2}{\text{width of the image}}$
- $\text{center}_y = \frac{(Y_{min} + Y_{max})/2}{\text{height of the image}}$
- $W = \frac{X_{max} - X_{min}}{\text{width of the image}}$
- $H = \frac{Y_{max} - Y_{min}}{\text{height of the image}}$



In [None]:
# Box centre (X, Y): midpoint of the min/max coordinates on each axis,
# divided by the image width (for x) or the image height (for y).
x_midpoint = df['xmax'].add(df['xmin']).div(2)
y_midpoint = df['ymax'].add(df['ymin']).div(2)
df['center_x'] = x_midpoint.div(df['width'])
df['center_y'] = y_midpoint.div(df['height'])

# Box size (W, H): extent of the box on each axis,
# divided by the image width (for W) or the image height (for H).
box_width_px = df['xmax'].sub(df['xmin'])
box_height_px = df['ymax'].sub(df['ymin'])
df['w'] = box_width_px.div(df['width'])
df['h'] = box_height_px.div(df['height'])


In [None]:
# Display the first 5 rows of the DataFrame 'df'.
# This is useful to quickly inspect the structure and first few records of the data.

df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,000001.jpg,1024,657,car,14,301,335,522,0.153809,0.652207,0.280273,0.284627
1,000001.jpg,1024,657,car,269,571,345,489,0.410156,0.634703,0.294922,0.219178
2,000001.jpg,1024,657,car,502,798,342,450,0.634766,0.60274,0.289062,0.164384
3,000001.jpg,1024,657,car,709,1009,333,438,0.838867,0.586758,0.292969,0.159817
4,000002.jpg,800,600,car,41,768,240,497,0.505625,0.614167,0.90875,0.428333


# 4. Split Data into Train and Test

In [None]:
# Extract unique filenames from the 'filename' column of the DataFrame 'df'.
images = pd.unique(df['filename'])

In [None]:
len(images)

5012

Split the unique images into 80% for training and 20% for testing.

In [None]:
# One row per unique image, so the split happens at image level and all boxes of an
# image end up on the same side of the split.
img_df = pd.DataFrame(images, columns=['filename'])

# Randomly pick 80% of the images for the training set.
# A fixed random_state makes the split reproducible after a kernel restart.
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['filename'])

# The remaining 20% of the images form the testing set.
# A boolean isin mask avoids interpolating thousands of filenames into a query string
# that pandas then has to parse.
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train), 'filename'])


In [None]:
# Calculate the number of images in the training and testing sets.
len(img_train), len(img_test)

(4010, 1002)

In [None]:
# Create the training DataFrame by including only the rows with filenames that are in the 'img_train' set.
train_df = df.query(f'filename in {img_train}')

# Create the testing DataFrame by including only the rows with filenames that are in the 'img_test' set.
test_df = df.query(f'filename in {img_test}')

# Calculate the number of rows in the training and testing DataFrames.
len(train_df), len(test_df)

(12491, 3172)

In [None]:
# Display the first 5 rows of the training DataFrame 'train_df'.
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
6,000007.jpg,500,333,car,141,500,50,330,0.641,0.570571,0.718,0.840841
7,000009.jpg,500,375,horse,69,270,172,330,0.339,0.669333,0.402,0.421333
8,000009.jpg,500,375,person,150,229,141,284,0.379,0.566667,0.158,0.381333
9,000009.jpg,500,375,person,285,327,201,331,0.612,0.709333,0.084,0.346667
10,000009.jpg,500,375,person,258,297,198,329,0.555,0.702667,0.078,0.349333


In [None]:
# Display the first 5 rows of the test DataFrame 'test_df'.
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,000001.jpg,1024,657,car,14,301,335,522,0.153809,0.652207,0.280273,0.284627
1,000001.jpg,1024,657,car,269,571,345,489,0.410156,0.634703,0.294922,0.219178
2,000001.jpg,1024,657,car,502,798,342,450,0.634766,0.60274,0.289062,0.164384
3,000001.jpg,1024,657,car,709,1009,333,438,0.838867,0.586758,0.292969,0.159817
4,000002.jpg,800,600,car,41,768,240,497,0.505625,0.614167,0.90875,0.428333


**Assign id number to object names**

The label_encoding function defined below is responsible for converting categorical labels (such as object names) into numerical values. This is known as label encoding and is a crucial step in preparing categorical data for machine learning algorithms.

* This encoding allows algorithms to work with numerical representations of categorical data, facilitating mathematical computations and model training.


* The function can be applied to a column of labels in a DataFrame to transform the entire column into numerical form.



In [None]:
def label_encoding(x):
    """
    Map an object class name to its integer class id.

    Args:
    x (str): The label to be encoded (e.g., 'person', 'car', etc.).

    Returns:
    int: The id of the label, i.e. its position in the class list below.

    Raises:
    KeyError: If the label is not one of the 20 known class names.

    Example:
    >>> label_encoding('person')
    0
    """
    # The position in this tuple is the class id.
    class_names = (
        'person', 'car', 'chair', 'bottle', 'pottedplant',
        'bird', 'dog', 'sofa', 'bicycle', 'horse',
        'boat', 'motorbike', 'cat', 'tvmonitor', 'cow',
        'sheep', 'aeroplane', 'train', 'diningtable', 'bus',
    )
    name_to_id = {class_name: class_id for class_id, class_name in enumerate(class_names)}
    return name_to_id[x]



In [None]:
# Apply the label_encoding function to the 'name' column of the training DataFrame.
# Store the resulting numerical labels in a new 'id' column within the training DataFrame.
# Encode the 'name' column of the training DataFrame into a numeric 'id' column.
# assign() builds a new frame instead of writing into a frame derived from 'df',
# which avoids SettingWithCopyWarning.
train_df = train_df.assign(id=train_df['name'].map(label_encoding))

# Same encoding for the testing DataFrame.
test_df = test_df.assign(id=test_df['name'].map(label_encoding))

In [None]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
6,000007.jpg,500,333,car,141,500,50,330,0.641,0.570571,0.718,0.840841,1
7,000009.jpg,500,375,horse,69,270,172,330,0.339,0.669333,0.402,0.421333,9
8,000009.jpg,500,375,person,150,229,141,284,0.379,0.566667,0.158,0.381333,0
9,000009.jpg,500,375,person,285,327,201,331,0.612,0.709333,0.084,0.346667,0
10,000009.jpg,500,375,person,258,297,198,329,0.555,0.702667,0.078,0.349333,0


In [None]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,000001.jpg,1024,657,car,14,301,335,522,0.153809,0.652207,0.280273,0.284627,1
1,000001.jpg,1024,657,car,269,571,345,489,0.410156,0.634703,0.294922,0.219178,1
2,000001.jpg,1024,657,car,502,798,342,450,0.634766,0.60274,0.289062,0.164384,1
3,000001.jpg,1024,657,car,709,1009,333,438,0.838867,0.586758,0.292969,0.159817,1
4,000002.jpg,800,600,car,41,768,240,497,0.505625,0.614167,0.90875,0.428333,1


# 5. Save Images and Labels as Text Files

In [None]:
import os
from shutil import move # Import the 'move' function from the 'shutil' module to enable moving files from one directory to another.

In [None]:
# Define the paths for the training and testing folders within the 'data_images' directory.
train_folder = 'data_images/train'
test_folder = 'data_images/test'

# Create the 'train_folder' directory to store the training images.
os.mkdir(train_folder)

# Create the 'test_folder' directory to store the testing images.
os.mkdir(test_folder)

In [None]:
# Define the columns of interest that likely represent image annotations.
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']

# Group the training DataFrame by the 'filename' column, focusing on the selected columns.
# This will allow us to work with all records related to a specific training image together.
groupby_obj_train = train_df[cols].groupby('filename')

# Group the testing DataFrame by the 'filename' column, focusing on the selected columns.
# This will allow us to work with all records related to a specific testing image together.
groupby_obj_test = test_df[cols].groupby('filename')


In the code snippet below, we define a function save_data that performs two essential tasks: moving image files to a specific folder (e.g., training or testing) and saving their corresponding labels to a text file.



In [None]:
def save_data(filename, folder_path, group_obj):
    """
    Move an image file to a specified folder and save its corresponding labels to a text file.

    The image is expected at 'data_images/<filename>'. The labels are written next to
    the moved image as '<image stem>.txt', one row per object, space-separated and
    without a header or the filename column.

    Args:
    filename (str): The name of the image file.
    folder_path (str): The destination folder path where the image will be moved.
    group_obj (pandas GroupBy): Annotations grouped by 'filename'; the group for
        `filename` supplies the label rows.

    Raises:
    FileNotFoundError: If the image is neither in 'data_images' nor already in `folder_path`.
    KeyError: If `group_obj` has no group for `filename`.

    Example:
    >>> save_data('000009.jpg', 'data_images/train', groupby_obj_train)
    """

    # Construct the source and destination paths for the image.
    src = os.path.join('data_images', filename)
    dst = os.path.join(folder_path, filename)

    # Move the image file to the destination folder.
    # Skip the move only when the image was already moved by a previous run, so that
    # the cell can be re-executed; a genuinely missing image still raises from move().
    already_moved = os.path.exists(dst) and not os.path.exists(src)
    if not already_moved:
        move(src, dst)

    # Construct the filename for the text file to save the labels.
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')

    # Retrieve the labels for this image and drop the grouping column explicitly
    # (errors='ignore' tolerates a group that does not carry it).
    labels = group_obj.get_group(filename).drop(columns='filename', errors='ignore')

    # The labels are saved without the header or index, space-separated.
    labels.to_csv(text_filename, sep=' ', index=False, header=False)


In [None]:
# Create a Pandas Series containing the unique filenames from the training groupby object (groupby_obj_train).
# This Series represents the different groups within the training data and can be used for further processing.
filename_series = pd.Series(list(groupby_obj_train.groups))

In [None]:
filename_series.head(5) # Preview only the first 5 filenames; display `filename_series` itself to see all of them.

0    000007.jpg
1    000009.jpg
2    000012.jpg
3    000017.jpg
4    000019.jpg
dtype: object

In [None]:
# Move every training image and write its label file.
# A plain loop is used because save_data is called only for its side effects;
# Series.apply would build and display a Series holding nothing but None.
for image_name in filename_series:
    save_data(image_name, train_folder, groupby_obj_train)

0       None
1       None
2       None
3       None
4       None
        ... 
4005    None
4006    None
4007    None
4008    None
4009    None
Length: 4010, dtype: object

In [None]:
# Create a Pandas Series containing the unique filenames from the testing groupby object (groupby_obj_test).
filename_series_test = pd.Series(groupby_obj_test.groups.keys())

# Move every testing image and write its label file.
# A plain loop is used because save_data is called only for its side effects;
# Series.apply would build and display a Series holding nothing but None.
for image_name in filename_series_test:
    save_data(image_name, test_folder, groupby_obj_test)


0       None
1       None
2       None
3       None
4       None
        ... 
997     None
998     None
999     None
1000    None
1001    None
Length: 1002, dtype: object