# Data Preparation

## Libraries

### We will start by installing the library to download the images

In [None]:
# Upgrade pip itself first, then quietly (-q) install the openimages downloader
!pip install --upgrade pip
!pip install -q openimages

### We must also make sure we have the right version of opencv

In [None]:
# Remove the regular OpenCV build quietly and without a confirmation prompt (-qy),
# then install the headless build in its place
!pip uninstall -qy opencv-python
!pip install -q opencv-python-headless # headless build, meant to be used in a containerized environment

## Dataset

We will now download the dataset for 3 classes: Bicycle, Car, and Traffic sign. We download only 300 images per class to limit the processing time in this example. However, to achieve a robust YOLOv5 model, it is recommended to train with over 1,500 images per class and more than 10,000 instances per class.

We specify the darknet format (`--format darknet`), which is the format YOLO can handle. This creates one folder per class, each containing a `darknet` folder (the label files) and an `images` folder.

In [None]:
# Class names to download; "Traffic sign" is quoted so the shell passes it as a single argument
labels = 'Bicycle Car "Traffic sign"'
# Maximum number of images to download per class
limit = 300
# IPython substitutes {labels} and {limit} into the shell command before running it
!oi_download_dataset --base_dir download --csv_dir download --labels {labels} --format darknet --limit {limit}

### Let's have a look at a sample image.

In [None]:
import os
import random
from PIL import Image, ImageDraw

def show_bbox(image_path):
    """Display an image with its darknet bounding boxes outlined in red.

    The label file is located by replacing ``/images/`` with ``/darknet/``
    and ``.jpg`` with ``.txt`` in ``image_path``. Every non-blank line of
    that file must hold five whitespace-separated fields:
    ``label x_center y_center width height``, where the last four are
    multiplied by the image width/height to obtain pixel coordinates.

    Args:
        image_path: Path to a ``.jpg`` image stored inside an ``images`` folder.

    Raises:
        ValueError: If a non-blank label line does not contain exactly five
            fields, or a coordinate field is not a valid float.
    """
    # Convert image path to label path
    label_path = image_path.replace('/images/', '/darknet/')
    label_path = label_path.replace('.jpg', '.txt')

    # Open the image and create an ImageDraw object for drawing
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)

    # The image size does not change, so read it once instead of per box
    W, H = image.size

    with open(label_path, 'r') as f:
        for line in f:
            # Split on any run of whitespace so that trailing spaces or
            # repeated separators do not produce empty fields
            fields = line.split()
            if not fields:
                # Blank line (e.g. at the end of the file): nothing to draw
                continue

            _label, x, y, w, h = fields

            # Convert strings into floats
            x, y, w, h = float(x), float(y), float(w), float(h)

            # Convert center position, width, height into
            # top-left and bottom-right pixel coordinates
            x1 = (x - w/2) * W
            y1 = (y - h/2) * H
            x2 = (x + w/2) * W
            y2 = (y + h/2) * H

            # Draw the bounding box with red lines
            draw.rectangle((x1, y1, x2, y2),
                           outline=(255, 0, 0), # Red in RGB
                           width=5)             # Line width
    image.show()

In [None]:
# Pick one downloaded car image at random and display it with its boxes
random_file = random.choice(os.listdir('download/car/images'))
sample_path = 'download/car/images/' + random_file
show_bbox(sample_path)

### Now, let's prepare our training data structure

In [None]:
# Create a folder structure for YOLOv5 training.
# exist_ok=True keeps this cell safe to re-run and also fills in any
# subfolder missing from a partially created `data` directory; a missing
# destination folder would otherwise make shutil.copy() write a plain file
# with the folder's name.
for folder in ('images', 'labels'):
    for split in ('train', 'val', 'test'):
        os.makedirs(f'data/{folder}/{split}', exist_ok=True)

### As all images will end up in the same folder, we must check for duplicate images (an image can contain multiple classes).

In [None]:
import glob

def get_filenames(folder):
    """Return the set of ``.jpg`` file names found directly inside *folder*.

    Only the final path component of each match is kept, so the result
    holds bare file names rather than full paths.
    """
    pattern = os.path.join(folder, '*.jpg')
    return {os.path.basename(match) for match in glob.glob(pattern)}


# One set of image file names per downloaded class
(bicycle_images,
 car_images,
 traffic_sign_images) = map(get_filenames, (
    'download/bicycle/images',
    'download/car/images',
    'download/traffic sign/images',
))

In [None]:
# Find the file names shared by each pair of classes
duplicates1 = bicycle_images.intersection(car_images)
duplicates2 = car_images.intersection(traffic_sign_images)
duplicates3 = traffic_sign_images.intersection(bicycle_images)

# Show each pairwise overlap on its own line
for shared_names in (duplicates1, duplicates2, duplicates3):
    print(shared_names)

In [None]:
# Cleanup duplicates so that every image ends up in exactly one class set.
# Pairwise overlaps are resolved as follows: bicycle/car -> kept in car,
# car/traffic sign -> kept in traffic sign, traffic sign/bicycle -> kept
# in bicycle.
bicycle_images -= duplicates1
car_images -= duplicates2
# An image that belongs to all three classes is present in every duplicates
# set and has already been removed from the bicycle and car sets above.
# Removing it here too would drop it from the dataset entirely, so images
# that are also car/traffic-sign duplicates stay in the traffic sign set.
traffic_sign_images -= duplicates3 - duplicates2

In [None]:
# Report how many images remain in each class set after the cleanup
for image_set in (bicycle_images, car_images, traffic_sign_images):
    print(len(image_set))

### We can now randomly split all our images in train/val/test

Here we will use a standard split scheme: 75% train, 12.5% validation, and 12.5% test (0.75, 0.125, 0.125).

In [None]:
import numpy as np

# Sort before converting: the iteration order of a set of strings can differ
# between interpreter runs (string hashing is randomized), so without a
# deterministic starting order the fixed seed below would not reproduce the
# same shuffle.
bicycle_images = np.array(sorted(bicycle_images))
car_images = np.array(sorted(car_images))
traffic_sign_images = np.array(sorted(traffic_sign_images))

# Use the same random seed for reproducibility
np.random.seed(42)
np.random.shuffle(bicycle_images)
np.random.shuffle(car_images)
np.random.shuffle(traffic_sign_images)

In [None]:
import shutil
import math


def split_dataset(item, image_names, train_size, val_size,
                  source_dir='download', target_dir='data'):
    """Copy one class's images and labels into train/val/test folders.

    The first ``train_size`` names go to ``train``, the next ``val_size``
    names go to ``val`` and all remaining names go to ``test``. Images are
    read from ``<source_dir>/<item>/images`` and labels from
    ``<source_dir>/<item>/darknet``; they are copied to
    ``<target_dir>/images/<split>`` and ``<target_dir>/labels/<split>``.

    Args:
        item: Name of the class folder inside ``source_dir``.
        image_names: Ordered sequence of image file names for that class.
        train_size: Number of leading images assigned to the train split.
        val_size: Number of following images assigned to the val split.
        source_dir: Root folder holding the per-class download folders.
        target_dir: Root folder receiving the split dataset.

    Raises:
        FileNotFoundError: If an image or its label file does not exist.
    """
    # Make sure every destination folder exists: shutil.copy() treats a
    # missing destination folder as a file name and would silently write a
    # single file there instead of copying into a directory.
    for kind in ('images', 'labels'):
        for split in ('train', 'val', 'test'):
            os.makedirs(os.path.join(target_dir, kind, split), exist_ok=True)

    for i, image_name in enumerate(image_names):
        # The label file shares the image's base name; swap only the extension
        label_name = os.path.splitext(image_name)[0] + '.txt'

        # Split into train, val, or test
        if i < train_size:
            split = 'train'
        elif i < train_size + val_size:
            split = 'val'
        else:
            split = 'test'

        # Source paths
        source_image_path = os.path.join(source_dir, item, 'images', image_name)
        source_label_path = os.path.join(source_dir, item, 'darknet', label_name)

        # Destination folders
        target_image_folder = os.path.join(target_dir, 'images', split)
        target_label_folder = os.path.join(target_dir, 'labels', split)

        # Copy files
        shutil.copy(source_image_path, target_image_folder)
        shutil.copy(source_label_path, target_label_folder)


# Share of each class assigned to train and val; the remainder goes to test
train_ratio, val_ratio = 0.75, 0.125

# Bicycle data: sizes are rounded down to whole images
bicycle_train_size, bicycle_val_size = (
    math.floor(ratio * len(bicycle_images)) for ratio in (train_ratio, val_ratio)
)
split_dataset(
    'bicycle',
    bicycle_images,
    train_size=bicycle_train_size,
    val_size=bicycle_val_size,
)

# Car data
car_train_size, car_val_size = (
    math.floor(ratio * len(car_images)) for ratio in (train_ratio, val_ratio)
)
split_dataset(
    'car',
    car_images,
    train_size=car_train_size,
    val_size=car_val_size,
)

# Traffic sign data
traffic_sign_train_size, traffic_sign_val_size = (
    math.floor(ratio * len(traffic_sign_images)) for ratio in (train_ratio, val_ratio)
)
split_dataset(
    'traffic sign',
    traffic_sign_images,
    train_size=traffic_sign_train_size,
    val_size=traffic_sign_val_size,
)

### Our dataset is now ready to use for training!