# We will create a dataset for the training of an object detection model using Detectron2

We have a classification dataset with cropped insects and a detection dataset with bounding boxes without labels.

We need to find check if the classification crops are from the detection dataset and match them so that we end up with labeled bounding boxes.

In [1]:
import os
import shutil
import glob
from IPython.display import clear_output
import cv2
import numpy as np
import json
import pandas as pd
from skimpy import skim
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
det_dir = 'detection_data'
class_dir = 'classification_data'
gen_dir = 'generated_data'

#flower backgrounds
det_dir_flowers = 'detection_data_flowers'
class_dir_flowers = 'classification_data_flowers'
gen_dir_flowers = 'generated_data_flowers'

In [3]:
jpg_files = glob.glob(det_dir + '/images/*.jpg')
file_names = [file.split('/')[-1] for file in jpg_files]
file_names_without_extension = [os.path.splitext(file)[0] for file in file_names]
len(file_names_without_extension)

3237

We have `3237` detection images with bboxes in total.

In [4]:
jpg_files = glob.glob(class_dir + '/images/*.jpg')
file_names = [file.split('/')[-1] for file in jpg_files]
crops_without_extension = [os.path.splitext(file)[0] for file in file_names]
cropped_names = [name.split('_', 2)[0] + '_' + name.split('_', 2)[1] for name in crops_without_extension]
len(cropped_names)

39445

And `39445` classification crops in total.

In [5]:
len(set(cropped_names))

2973

Those `39445` crops have been taken from `2973` images in total.

In [6]:
len(list(set(cropped_names).intersection(set(file_names_without_extension))))

2973

All of those `2973` images that have been used for the crops are present in the detection dataset.

In [7]:
def create_dataset_directory(dir='generated_data'):
    """
    Creates a directory structure for the dataset.

    This function creates a directory named 'gen_data' and its subdirectories 'images' and 'annotations'.
    It also creates an empty file named 'comparison.csv' inside the 'gen_data' directory.

    Returns:
        None
    """
    if os.path.exists(dir):
        shutil.rmtree(dir)
    os.makedirs(dir)
    os.makedirs(dir + '/images')
    os.makedirs(dir + '/annotations')
    open(dir + '/comparison.csv', 'w').close()

# Call the function to create the dataset directory
create_dataset_directory()


In [8]:
test = file_names_without_extension[:3].copy()
test.append('192_20200715165926')
test

['153_20200618121032',
 '249_20210828004742',
 '211_20210827035008',
 '192_20200715165926']

In [9]:
len(file_names_without_extension)

3237

In [10]:
import time

def timer(func):
  def wrapper(*args, **kwargs):
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {round(execution_time/60, 1)} minutes")
    return result
  return wrapper

In [11]:
@timer
def create_dataset(class_dir=class_dir, det_dir=det_dir, gen_dir=gen_dir, file_names_without_extension=file_names_without_extension, crops_without_extension=crops_without_extension):
    """
    Create a dataset by performing image processing and comparison.

    Args:
        class_dir (str): The directory path where the classification labels and images are stored.
        det_dir (str): The directory path where the original images and annotations are stored.
        gen_dir (str): The directory path where the generated images and annotations will be saved.
        test (list): A list of file names to process.
        crops_without_extension (list): A list of crop names without file extensions.

    Returns:
        None
    """

    # Read the classification labels from a CSV file
    labels_df = pd.read_csv(class_dir + '/classification_labels.csv')

    # Create an empty DataFrame to store comparison results
    comparison_df = pd.DataFrame(columns=['name', 'org_boxes', 'gen_boxes', 'similarity_pct'])

    total_anns = len(labels_df[labels_df['basename'].apply(lambda x: '_'.join(x.split('_', 2)[:2]) in file_names_without_extension)])
    current_anns = 0

    total_files = len(list(set(cropped_names).intersection(set(file_names_without_extension))))
    current_file = 0

    # for file in file_names_without_extension:
    for file in list(set(cropped_names).intersection(set(file_names_without_extension))):
        current_file += 1

        temp_crops = [crop for crop in crops_without_extension if file in crop]

        # Read the original image
        img_rgb = cv2.imread(det_dir + '/images/' + file + '.jpg')

        # Save the plain original image in `gen_dir/images`
        cv2.imwrite(gen_dir + '/images/' + file + '.jpg', img_rgb)

        gen_rgb = img_rgb.copy()
        org_rgb = img_rgb.copy()
        img_gray = cv2.cvtColor(img_rgb, cv2.COLOR_BGR2GRAY)

        gen_i = 0
        org_i = 0

        bbox_dict = {}
        img_h, img_w = img_rgb.shape[0], img_rgb.shape[1]

        for crop in temp_crops:
            current_anns += 1
            clear_output(wait=True)
            print(f'Processing file {current_file} of {total_files}, (' + str(round( current_anns / total_anns * 100)) + '%)')
            
            # Read the template image
            template = cv2.imread(class_dir + '/images/' + crop + '.jpg', 0)
            h, w = template.shape[::]

            # Perform template matching
            res = cv2.matchTemplate(img_gray, template, cv2.TM_SQDIFF)
            min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)
            top_left = min_loc
            bottom_right = (top_left[0] + w, top_left[1] + h)

            # Draw bounding box on the generated image
            cv2.rectangle(gen_rgb, top_left, bottom_right, (0, 0, 255), 2)
            gen_i += 1

            # Store bounding box coordinates as floats in [0, 1] relative to the dimensions of the image in a dictionary
            # bbox_dict[crop] = {'label': labels_df.deepest_name[labels_df.basename == crop].values[0], 'x': top_left[0], 'y': top_left[1], 'w': w, 'h': h}
            bbox_dict[crop] = {'label': labels_df.deepest_name[labels_df.basename == crop].values[0], 'bbox':[top_left[0]/img_w, top_left[1]/img_h, w/img_w, h/img_h]}

        # Save the bounding box information as a JSON file
        with open(gen_dir + '/annotations/' + file + '.json', 'w') as outfile:
            json.dump(bbox_dict, outfile, indent=2)

        # Read the annotations from the original image
        with open(det_dir + '/annotations/' + file + '.json') as json_file:
            data = json.load(json_file)
            for obj in data['annotations']:
                x = obj['shape']['x']
                y = obj['shape']['y']
                w = obj['shape']['width']
                h = obj['shape']['height']

                # Draw bounding box on the original image
                cv2.rectangle(org_rgb, (x, y), (x+w, y+h), (0,0,255), 2)
                org_i += 1

        # Calculate similarity percentage between the original and generated images
        imgA = cv2.cvtColor(org_rgb, cv2.COLOR_BGR2GRAY)
        imgB = cv2.cvtColor(gen_rgb, cv2.COLOR_BGR2GRAY)
        sub = cv2.subtract(imgA, imgB)
        if len(np.argwhere(sub > 0)) == 0:
            similarity = 100
        else:
            similarity = round(100 - len(np.argwhere(sub > 0)) / imgA.size * 100, 2)
            if similarity == 100:
                similarity = 100

        # Concatenate comparison_df with file, org_i, gen_i, similarity
        comparison_df = pd.concat([comparison_df, pd.DataFrame([{'name': file, 'org_boxes': org_i, 'gen_boxes': gen_i, 'similarity_pct': similarity}])], ignore_index=True)

    # Save the comparison results as a CSV file
    sorted_comp_df = comparison_df.sort_values('similarity_pct', ascending=True)
    sorted_comp_df.to_csv(gen_dir + '/comparison.csv', index=False)

In [12]:
create_dataset()

Processing file 3237 of 2973, (100%)
Execution time: 119.5 minutes


In [24]:
comparison_df = pd.read_csv(gen_dir + '/comparison.csv')
comparison_df

Unnamed: 0,name,org_boxes,gen_boxes,similarity_pct
0,192_20200723034614,513,0,99.17
1,174_20200801021643,180,0,99.72
2,108_20200730222724,105,105,99.88
3,178_20200618011145,638,638,99.90
4,141_20200715233601,30,0,99.93
...,...,...,...,...
3232,217_20210723032624,1,1,100.00
3233,216_20210824221614,2,2,100.00
3234,245_20210829221417,1,1,100.00
3235,249_20210806000713,8,8,100.00


!!! Made a mistake and took all detection images instead of only the needed ones.. We will remove the useless ones instead of running it again.

================== START OF CORRECTION ==================

In [47]:
# i = 0
# for name in list(set(file_names_without_extension).difference(set(cropped_names))):
#   os.remove(gen_dir + '/images/' + name + '.jpg')
#   os.remove(gen_dir + '/annotations/' + name + '.json')
#   i += 1
# print(i)

264


In [28]:
# comparisin_df = comparison_df[~comparison_df.name.isin(list(set(file_names_without_extension).difference(set(cropped_names))))]
# comparisin_df.to_csv(gen_dir + '/comparison.csv', index=False)

In [34]:
# comparison_df = pd.read_csv(gen_dir + '/comparison.csv')
# comparison_df

Unnamed: 0,name,org_boxes,gen_boxes,similarity_pct
0,108_20200730222724,105,105,99.88
1,178_20200618011145,638,638,99.90
2,148_20200621231631,17,8,99.96
3,295_20210805045410,34,34,99.96
4,261_20210918205358,69,69,99.96
...,...,...,...,...
2968,217_20210723032624,1,1,100.00
2969,216_20210824221614,2,2,100.00
2970,245_20210829221417,1,1,100.00
2971,249_20210806000713,8,8,100.00


================== END OF CORRECTION ==================

In [48]:
comparison_df[comparison_df.gen_boxes != comparison_df.org_boxes]

Unnamed: 0,name,org_boxes,gen_boxes,similarity_pct
2,148_20200621231631,17,8,99.96
11,172_20200808225425,50,20,99.98
15,191_20200822000531,10,7,99.98
46,159_20200723234609,16,2,99.99
53,261_20210905001440,18,17,99.99
...,...,...,...,...
2843,216_20210811230533,2,1,100.00
2877,167_20200601005407,16,8,100.00
2882,253_20210802064326,11,10,100.00
2953,290_20210911082351,5,4,100.00


In [50]:
comparison_df[comparison_df.gen_boxes != comparison_df.org_boxes].org_boxes.sum() - comparison_df[comparison_df.gen_boxes != comparison_df.org_boxes].gen_boxes.sum()

301

We have 74 images where not all of the insects were drawn, because not all cutout images were available. This amounts to 301 missing annotations. We will have to look at the total amounts of insects per label after transforming the dataset to a popular format.

Now that we have generated our dataset files and made the comparison with the expected annotation results, we need to open them with `FiftyOne` to convert the custom annotation format to a common one that can be used directly to train a detection model.

In [2]:
import fiftyone as fo

In [13]:
fo.delete_dataset("IBDM-OD-DS")

In [14]:
dataset_imgs = glob.glob(gen_dir_flowers + '/images/*.jpg')

# Create samples for your data
samples = []
for filepath in dataset_imgs:
    filename = filepath.split('/')[-1]
    filename = os.path.splitext(filename)[0]

    sample = fo.Sample(filepath=filepath)

    # Convert detections to FiftyOne format
    detections = []
    with open(gen_dir_flowers + '/annotations/' + filename + '.json') as json_file:
        annotations = json.load(json_file)
        json_file.close()
    for key, values in annotations.items():
        label = values["label"]
        bounding_box = values["bbox"]

        detections.append(
            fo.Detection(label=label, bounding_box=bounding_box)
        )

    # Store detections in a field name of your choice
    sample["ground_truth"] = fo.Detections(detections=detections)

    samples.append(sample)

# Create dataset
dataset = fo.Dataset("IBDM-OD-DS")
dataset.add_samples(samples)

 100% |███████████████| 2973/2973 [5.0s elapsed, 0s remaining, 577.4 samples/s]      


['66452d6bc3b1d6227c9aaeab',
 '66452d6bc3b1d6227c9aaeac',
 '66452d6bc3b1d6227c9aaead',
 '66452d6bc3b1d6227c9aaeae',
 '66452d6bc3b1d6227c9aaeaf',
 '66452d6bc3b1d6227c9aaeb0',
 '66452d6bc3b1d6227c9aaeb1',
 '66452d6bc3b1d6227c9aaeb2',
 '66452d6bc3b1d6227c9aaeb3',
 '66452d6bc3b1d6227c9aaeb4',
 '66452d6bc3b1d6227c9aaeb5',
 '66452d6bc3b1d6227c9aaeb6',
 '66452d6bc3b1d6227c9aaeb7',
 '66452d6bc3b1d6227c9aaeb8',
 '66452d6bc3b1d6227c9aaeb9',
 '66452d6bc3b1d6227c9aaeba',
 '66452d6bc3b1d6227c9aaebb',
 '66452d6bc3b1d6227c9aaebc',
 '66452d6bc3b1d6227c9aaebd',
 '66452d6bc3b1d6227c9aaebe',
 '66452d6bc3b1d6227c9aaebf',
 '66452d6bc3b1d6227c9aaec0',
 '66452d6bc3b1d6227c9aaec1',
 '66452d6bc3b1d6227c9aaec2',
 '66452d6bc3b1d6227c9aaec3',
 '66452d6bc3b1d6227c9aaec4',
 '66452d6bc3b1d6227c9aaec5',
 '66452d6bc3b1d6227c9aaec6',
 '66452d6bc3b1d6227c9aaec7',
 '66452d6bc3b1d6227c9aaec8',
 '66452d6bc3b1d6227c9aaec9',
 '66452d6bc3b1d6227c9aaeca',
 '66452d6bc3b1d6227c9aaecb',
 '66452d6bc3b1d6227c9aaecc',
 '66452d6bc3b1

In [15]:
dataset.tags.append('greenScreen')
dataset.save()

In [59]:
counts = dataset.count_values("ground_truth.detections.label")
counts = pd.DataFrame(list(counts.items()), columns=['label', 'count']).sort_values('count', ascending=False).reset_index(drop=True)
counts

Unnamed: 0,label,count
0,Chironomidae,11405
1,Caenidae,10351
2,Cicadellidae,3070
3,Diptera,2434
4,Insecta,2235
...,...,...
79,Anisopodidae,1
80,Nepticulidae,1
81,Cercopidae,1
82,Sciaridae,1


In [16]:
# Launch the App in a dedicated browser tab
session = fo.launch_app(dataset, auto=False)
session.open_tab()

Session launched. Run `session.show()` to open the App in a cell output.


<IPython.core.display.Javascript object>

In [70]:
export_dir = "coco_data"
label_field = "ground_truth"
dataset_type = fo.types.COCODetectionDataset

dataset.export(
    export_dir=export_dir,
    dataset_type=dataset_type,
    label_field=label_field,
)

 100% |███████████████| 2973/2973 [10.6s elapsed, 0s remaining, 278.3 samples/s]      


In [8]:
export_dir = "yolov4_data"
label_field = "ground_truth"
dataset_type = fo.types.YOLOv4Dataset

dataset.export(
    export_dir=export_dir,
    dataset_type=dataset_type,
    label_field=label_field,
)

 100% |███████████████| 2973/2973 [9.9s elapsed, 0s remaining, 206.8 samples/s]       


In [9]:
export_dir = "yolov5_data"
label_field = "ground_truth"
dataset_type = fo.types.YOLOv5Dataset

dataset.export(
    export_dir=export_dir,
    dataset_type=dataset_type,
    label_field=label_field,
)

 100% |███████████████| 2973/2973 [14.4s elapsed, 0s remaining, 200.1 samples/s]      
