# Image Dataset COCO JSON Generator

In [None]:
import json
import os
from PIL import Image
from datetime import datetime

# Build a COCO-style JSON file that lists every image in `img_dir`.
# The file contains image entries and categories only; 'annotations' is
# written as an empty list.
img_dir = '/content/chula-parasite-dataset/Chula-ParasiteEgg-11_test/test/data'
classes = ('Ascaris lumbricoides', 'Capillaria philippinensis', 'Enterobius vermicularis', 'Fasciolopsis buski',
           'Hookworm egg', 'Hymenolepis diminuta', 'Hymenolepis nana', 'Opisthorchis viverrine',
           'Paragonimus spp', 'Taenia spp. egg', 'Trichuris trichiura')
dataset_name = 'ParEgg'
image_extensions = ('.jpg', '.png', '.jpeg', '.bmp')

# List all images in the directory. The extension is compared
# case-insensitively so files such as 'IMG_001.JPG' are not silently
# skipped. os.scandir yields entries in arbitrary order, so they are sorted
# by file name to make the assigned image ids the same on every run.
img_paths = sorted(
    ([f.path, f.name] for f in os.scandir(img_dir)
     if f.is_file() and os.path.splitext(f.name)[1].lower() in image_extensions),
    key=lambda entry: entry[1],
)

images = []
# Image ids start at 1.
for image_id, (img_path, img_name) in enumerate(img_paths, start=1):
    # The context manager closes the underlying file as soon as the size has
    # been read, instead of leaving one open handle per image.
    with Image.open(img_path) as img:
        width, height = img.size
    images.append({'id': image_id,
                   'file_name': img_name,
                   'height': height,
                   'width': width,
                   'license': None,
                   'coco_url': None})

# Add some general info.
# NOTE(review): the key 'describtion' is misspelled; it is kept unchanged
# here because downstream code may read it under this name - confirm before
# renaming.
info = {'date': datetime.today().strftime('%Y-%m-%d'),
        'author': 'tureckova',
        'describtion': dataset_name}

# Category ids are the 0-based positions of the names in `classes`.
categories = [{'id': ind, 'name': cls} for ind, cls in enumerate(classes)]

coco_json = {'info': info,
             'licenses': [],
             'categories': categories,
             'images': images,
             'annotations': []}

# Save the output JSON file.
with open('/content/chula-parasite-dataset/Chula-ParasiteEgg-11_test/test/' + 'test.json', 'w') as file:
    json.dump(coco_json, file)

# COCO Annotation Splitter for K-Fold Cross-Validation

In [None]:
import json
from sklearn.model_selection import KFold, train_test_split
import os

def get_annotation_split(annotations, image_index):
    """Return a COCO dictionary restricted to a subset of images.

    Args:
        annotations: COCO dictionary with the keys 'info', 'licenses',
            'images', 'annotations' and 'categories'.
        image_index: iterable of positions into ``annotations['images']``
            selecting the images wanted in the output split.

    Returns:
        A new dictionary with the same five keys. 'images' holds the
        selected image entries in the order given by ``image_index``;
        'annotations' holds every annotation whose 'image_id' equals the
        'id' of a selected image, in their original order. 'info',
        'licenses' and 'categories' are passed through unchanged (the same
        objects, not copies).
    """
    img_list = [annotations['images'][ind] for ind in image_index]
    # A set gives constant-time membership tests; checking each annotation
    # against a list would cost O(images * annotations).
    selected_image_ids = {img['id'] for img in img_list}
    ann_list = [ann for ann in annotations['annotations']
                if ann['image_id'] in selected_image_ids]
    return {'info': annotations['info'],
            'licenses': annotations['licenses'],
            'images': img_list,
            'annotations': ann_list,
            'categories': annotations['categories']}

# Split one COCO annotation file into n_splits train/val pairs, written to
# <output_folder>/<n_splits>-fold/fold-<k>/{train,val}.json.

# File with all annotations.
annotations_file = '/content/chula-parasite-dataset/Chula-ParasiteEgg-11/Chula-ParasiteEgg-11/Chula-ParasiteEgg-11/labels.json'
annotations_file_cut = 'Chula-ParasiteEgg-11/cut_960x1280__coco.json'
output_folder = '/content/output_directory/'
n_splits = 5
# None keeps the shuffle different on every run; set an int to make the
# folds reproducible.
random_state = None

# Load annotations.
with open(annotations_file, 'r') as file:
    annotations = json.load(file)

base_dir = output_folder + str(n_splits) + '-fold/'
# makedirs also creates `output_folder` itself when it is missing, where
# os.mkdir would raise FileNotFoundError; exist_ok makes re-runs harmless.
os.makedirs(base_dir, exist_ok=True)

# Folds are drawn over image positions, not image ids.
images_index = list(range(len(annotations['images'])))
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
for counter, (train_index, val_index) in enumerate(kf.split(images_index)):
    train_annotations = get_annotation_split(annotations, train_index)
    val_annotations = get_annotation_split(annotations, val_index)
    split_dir = base_dir + 'fold-' + str(counter) + '/'
    os.makedirs(split_dir, exist_ok=True)
    with open(split_dir + 'val.json', 'w') as val_file:
        json.dump(val_annotations, val_file)
    with open(split_dir + 'train.json', 'w') as train_file:
        json.dump(train_annotations, train_file)

# COCO Dataset Slicing for Data Augmentation

In [None]:
from sahi.slicing import slice_coco
import os

# Slice the train and val annotation files of one fold with sahi's
# slice_coco, then record the settings used in a text file inside the fold
# directory.

# Image directory and the folder holding the per-fold annotation files.
data_root = '/content/chula-parasite-dataset/Chula-ParasiteEgg-11/Chula-ParasiteEgg-11/Chula-ParasiteEgg-11/data'
n_fold_folder = '/content/output_directory/5-fold/'

# Slicing parameters forwarded to slice_coco.
slice_height = 960
slice_width = 1280
overlap_height_ratio = 0.3
overlap_width_ratio = 0.3
min_area_ratio = 0.3
ignore_negative_samples = True

# Specify the fold you want to process.
fold_to_process = 'fold-0'
fold_dir = os.path.join(n_fold_folder, fold_to_process)

for split in ('train', 'val'):
    # The same path is passed both as the annotation file name and as the
    # output directory.
    sliced_target = os.path.join(fold_dir, f'cut_{slice_height}x{slice_width}_{split}')
    source_annotations = os.path.join(fold_dir, f'{split}.json')
    coco_dict, coco_path = slice_coco(
        coco_annotation_file_path=source_annotations,
        image_dir=data_root,
        output_coco_annotation_file_name=sliced_target,
        output_dir=sliced_target,
        slice_height=slice_height,
        slice_width=slice_width,
        overlap_height_ratio=overlap_height_ratio,
        overlap_width_ratio=overlap_width_ratio,
        min_area_ratio=min_area_ratio,
        ignore_negative_samples=ignore_negative_samples,
        verbose=True,
    )

# Settings written as one 'name=value' line each, in this order.
slicing_settings = (
    ('data_root', data_root),
    ('n_fold_folder', n_fold_folder),
    ('slice_height', slice_height),
    ('slice_width', slice_width),
    ('overlap_height_ratio', overlap_height_ratio),
    ('overlap_width_ratio', overlap_width_ratio),
    ('min_area_ratio', min_area_ratio),
    ('ignore_negative_samples', ignore_negative_samples),
)
file_slicing_info = os.path.join(fold_dir, f'cut_{slice_height}x{slice_width}.txt')
with open(file_slicing_info, 'w') as file:
    for setting_name, setting_value in slicing_settings:
        file.write(f'{setting_name}={setting_value}\n')
