# Dataset Generator (Object Detection)

In [1]:
import conf
import helper

## Definitions

### bounding box map (from GDT-HWD to AWS format)

```json
{
    "left": xmin
    "top": ymin
    "width": xmax - xmin
    "height": yman - ymin
}
```

### Classes 

```json
{
    "none": 0
    "red": 1
    "yellow": 2
    "white": 3
    "blue": 4
}
```

## Step by step transformations (concept)

- Create necessary folders

In [2]:
!rm -rf train train_annotation validation validation_annotation

In [3]:
!mkdir train train_annotation validation validation_annotation

### 1. read all annotations ordered (**GDUT-HWD/Annotations**) 

In [4]:
import os

raw_annotation_path = conf.raw_annotation_path
annotations = os.listdir(raw_annotation_path)

annotations.sort()
jpgs = [x.replace('xml', 'jpg') for x in annotations]
jsons = [x.replace('jpg', 'json') for x in annotations]

### 2. Split 80% to train e 20% to test

- Where X are the jpgs and Y are the annotations

In [5]:
from sklearn.model_selection import train_test_split
train_annotation, validation_annotation, train_jpgs, validation_jpgs = train_test_split(annotations,
                                                                                        jpgs,
                                                                                        test_size=0.2,
                                                                                        random_state=42)

### Disclaimer!

You should update in conf.py the following variables:

- num_training_samples


In [6]:
num_training_samples = len(train_annotation)
print('HEY! num_training_samples to {} in conf.py'.format(num_training_samples))

HEY! num_training_samples to 2540 in conf.py


### 3. Annotation from XML to Json in the template.json format

In [7]:
import xml.etree.ElementTree as ET

#### 3.1 Generate train_annotation

In [8]:
def generate_annotation(path, annotation_list):
    for filename in annotation_list:
        if filename != '.ipynb_checkpoints':
            image_id = filename.split('.')[0]
            root = ET.parse(conf.raw_annotation_path + '/{}'.format(filename)).getroot()

            annotations = []
            categories = []

            for size in root.findall('size'):
                W = size.find('width').text
                H = size.find('height').text
                depth = size.find('depth').text



            for ob in root.findall('object'):
                name = ob.find('name').text
                bndbox = ob.find('bndbox')
                xmin = int(bndbox.find('xmin').text)
                ymin = int(bndbox.find('ymin').text)
                xmax = int(bndbox.find('xmax').text)
                ymax = int(bndbox.find('ymax').text)

                left = xmin
                top = ymin
                width = xmax - xmin
                height = ymax - ymin

                annotation = helper.generate_annotation(name, left, top, width, height)
                categorie =  helper.generate_categorie(name, conf.object_categories)

                annotations.append(annotation)
                categories.append(categorie)

            categories = list({c['class_id']:c for c in categories}.values())

            data = helper.generate_annotation_file_dict(W, H, depth, image_id, annotations, categories)
            helper.dict_to_json(path+'_annotation', image_id, data)

In [9]:
%time
generate_annotation('train', train_annotation)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [10]:
%time
generate_annotation('validation', validation_annotation)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


### 4. copy imagem from Its folder (train or validation)
    
    - Template must be: "<folder>/<id>.jpg"
        - folder: train or validation
        - id: image name

In [11]:
for filename in train_annotation:
    im = filename.replace('xml', 'jpg')
    resp = ! cp GDUT-HWD/JPEGImages/$im train/
    

for filename in validation_annotation:
    im = filename.replace('xml', 'jpg')
    resp = ! cp GDUT-HWD/JPEGImages/$im validation/

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


### 5. Move All folders to S3

In [12]:
resp = ! aws s3 cp train s3://hardhat-dataset-sagemaker-object-detector-solvimm/train --recursive

^C


In [13]:
resp = ! aws s3 cp validation s3://hardhat-dataset-sagemaker-object-detector-solvimm/validation --recursive

^C


In [14]:
resp = ! aws s3 cp train_annotation s3://hardhat-dataset-sagemaker-object-detector-solvimm/train_annotation --recursive

^C


In [15]:
resp = ! aws s3 cp validation_annotation s3://hardhat-dataset-sagemaker-object-detector-solvimm/validation_annotation --recursive

^C
