In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image

%matplotlib inline

In [7]:
cwd = Path.cwd()
from collections import defaultdict
from random import random

def folder2coco(folder, map_file, pct=0.2):
    data_dir = cwd/'data'
    
    # Create class_id to species mapping
    mapper = pd.read_csv(data_dir/map_file)
    id2species = {idx: species for idx, species in zip(mapper['class_idx'].values, mapper['original_class'].values)}
    
    # Define the coco format
    train = defaultdict(list)
    valid = defaultdict(list)
    info = {
        'description': 'The 2019 Snake Species Identification Challenge',
        'url': 'https://www.aicrowd.com/challenges/snake-species-identification-challenge',
        'version': 1.0,
        'date_created': '2019-05-10'
    }
    train['info'] = info
    valid['info'] = info

    counter = 0
    for idx, species_dir in enumerate((data_dir/folder).iterdir()):
        train['categories'].append({'id': idx, 'name': id2species[int(species_dir.stem.split('-')[-1])]})
        valid['categories'].append({'id': idx, 'name': id2species[int(species_dir.stem.split('-')[-1])]})
        for image_path in species_dir.iterdir():
            try:
                coco = train if random() > pct else valid
                (w, h) = Image.open(image_path).size
                coco['images'].append({'id': counter, 'file_name': f'/{folder}/{species_dir.name}/{image_path.name}', 'width': w, 'height': h})
                coco['annotations'].append({'id': counter, 'image_id': counter, 'category_id': idx})
                counter += 1
            except OSError as e:
                pass
    
    return train, valid

In [8]:
train, valid = folder2coco('train', 'class_id_mapping.csv')

In [9]:
json.dump(train, (cwd/'data'/'train.json').open('wt', encoding='utf-8'))
json.dump(valid, (cwd/'data'/'valid.json').open('wt', encoding='utf-8'))

In [10]:
len(train['categories']), len(valid['categories'])

(45, 45)

In [11]:
len(train['images']), len(valid['images'])

(65964, 16453)

In [7]:
len(train['annotations']), len(valid['annotations'])

(66054, 16363)

In [8]:
train.keys()

dict_keys(['info', 'categories', 'images', 'annotations'])

In [9]:
train['categories'][:2]

[{'id': 0, 'name': 'pantherophis_vulpinus'},
 {'id': 1, 'name': 'nerodia_erythrogaster'}]

In [10]:
train['annotations'][:2]

[{'id': 0, 'image_id': 0, 'category_id': 0},
 {'id': 3, 'image_id': 3, 'category_id': 0}]

In [11]:
train['images'][:2]

[{'id': 0,
  'file_name': '/train/class-543/ca34c7358cec2385f4f47ecd3a6a160e.jpg',
  'width': 2000,
  'height': 1500},
 {'id': 3,
  'file_name': '/train/class-543/49cdb6ea29d4b007cb32b840eaca7061.jpg',
  'width': 1500,
  'height': 2000}]