In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from nbdev import *

In [None]:
import numpy as np
import pandas as pd
import ipywidgets as widgets
from tqdm import tqdm
from pathlib import Path
from ipyannotator.im2im_annotator import Im2ImAnnotator

# Select Dataset

In [None]:
# Choose one of the three downloadable datasets: 'cifar10', 'oxford_flowers' or 'CUB_200'.
# By default an artificially generated classification dataset is used; it requires no download.
# NOTE: the literal 'artifical' (sic) is compared against in later cells, so keep this spelling.
dataset = 'artifical'
# dataset = 'cifar10'
# dataset = 'oxford_flowers'
# dataset = 'CUB_200'

## prepare dataset

In [None]:
# Create the download directory without shelling out, so the cell also works
# where `mkdir -p` is unavailable; an existing directory is not an error.
Path('data').mkdir(parents=True, exist_ok=True)

In [None]:
from ipyannotator.datasets.generators import create_color_classification

if dataset == 'artifical':
    # `json` is imported here because this cell runs before the notebook's other
    # `import json`; without it a fresh "Restart & Run All" fails with NameError
    # at the `json.dump` call below.
    import json
    import tempfile
    from PIL import Image

    # Keep a reference to the TemporaryDirectory object for the whole session:
    # the directory is removed once the object is cleaned up.
    tmp_dir = tempfile.TemporaryDirectory()
    path = Path(tmp_dir.name)

    create_color_classification(path=path, n_samples=50, size=(500, 500))
    annotations = pd.read_json(path/'annotations.json').T

    # Convert the generated annotations to the ipyannotator format in place:
    # keys become paths under <path>/images, values become one-element lists
    # holding the class image name ('<value>.jpg').
    anno = annotations.T.to_dict('records')[0]
    anno = {str(path / 'images' / k): [f'{v}.jpg'] for k, v in anno.items()}
    with open(path/'annotations.json', 'w') as f:
        json.dump(anno, f)

    # Settings consumed by the annotator cells below.
    project_path = path
    project_file = path/'annotations.json'
    image_dir = 'images'
    label_dir = 'class_images'
    im_width=50
    im_height=50
    label_width=30
    label_height=30
    n_cols = 3
    

In [None]:
from ipyannotator.datasets.download import get_cifar10, get_cub_200_2011, get_oxford_102_flowers

# Exactly one branch can match, since `dataset` holds a single name.
# Every branch downloads into 'data', points the annotator at the test split's
# annotation file and leaves `label_dir` unset (None).
if dataset == 'cifar10':
    cifar_train_p, cifar_test_p = get_cifar10('data')
    project_path, project_file = 'data/cifar10/', cifar_test_p
    image_dir, label_dir = 'test', None

    im_width, im_height = 50, 50
    label_width, label_height = 140, 30
    n_cols = 2

elif dataset == 'oxford_flowers':
    flowers102_train_p, flowers102_test_p = get_oxford_102_flowers('data')
    project_path, project_file = 'data/oxford-102-flowers', flowers102_test_p
    image_dir, label_dir = 'jpg', None

    im_width, im_height = 50, 50
    label_width, label_height = 40, 30
    n_cols = 7

elif dataset == 'CUB_200':
    cub200_train_p, cub200_test_p = get_cub_200_2011('data')
    project_path, project_file = 'data/CUB_200_2011', cub200_test_p
    image_dir, label_dir = 'images', None

    im_width, im_height = 50, 50
    label_width, label_height = 450, 15
    n_cols = 7

### ToDo convert datasets / create helper function

Each of the three datasets has a different file/folder structure:

- should be possible to either look at train or test images
- should be possible to look at unlabeled or labeled data

Comment: maybe we can borrow code from fastai, whose `DataBunch` supports all of these file/folder structures.
However, we shouldn't add fastai as a dependency, because it would also require PyTorch, which is fairly big.

# explore

Let's visualize an existing annotated dataset.

As we don't have an image for each class, we pass `label_dir=None` to ipyannotator, so class labels are generated automatically from the annotations file.

We use `results_dir` param to indicate directory where `annotation.json` file with existing annotations is located.

You can explore dataset with `next/previous` buttons to check visualized labels.

In [None]:
# `project_path` is a directory, so `cat` on it only prints an error.
# List the directory's entries instead (pure Python, works on any platform).
sorted(entry.name for entry in Path(project_path).iterdir())

In [None]:
# Build the annotator from the dataset settings chosen above and display it.
im2im = Im2ImAnnotator(
    project_path=project_path,
    file_name=project_file,
    image_dir=image_dir,
    label_dir=label_dir,
    step_down=True,
    im_width=im_width,
    im_height=im_height,
    label_width=label_width,
    label_height=label_height,
    n_cols=n_cols,
)
im2im

In [None]:
# Explore only a subset of the dataset

import json
from random import sample

data = json.loads(project_file.read_text())

# Every distinct label that occurs in any file's label list.
all_labels = data.values()
unique_labels = {label for item_labels in all_labels for label in item_labels}

# Pick <some> random labels and write an annotation file restricted to them.
some = 3
assert some <= len(unique_labels)
subset_labels = sample([[a] for a in unique_labels], k=some)
# A file is kept only when its whole label list equals one of the sampled
# single-label lists.
subset_annotations = {
    img: labels for img, labels in data.items() if labels in subset_labels
}

subset_file = Path(project_path) / 'subset_anno.json'
subset_file.write_text(
    json.dumps(subset_annotations, ensure_ascii=False, indent=4),
    encoding='utf-8',
)

# Use the subset file in the annotator.
im2im = Im2ImAnnotator(
    project_path=project_path,
    file_name=subset_file,
    image_dir=image_dir,
    label_dir=label_dir,
    step_down=True,
    im_width=im_width,
    im_height=im_height,
    label_width=label_width,
    label_height=label_height,
    n_cols=n_cols,
    label_autosize=False,
)
im2im

# create

Load unannotated dataset and create classification labels.

- real
- generated

Now we set `label_dir='class_images'`, because we have an existing folder in which one correspondingly named image per class was saved beforehand.

Also, by setting `results_dir="out"` we define that the final `annotation.json` file will be generated from scratch and saved to the `{project_path}/out` directory.

Try to annotate some pieces incorrectly, so that you prepare a good set for the `improve` step below.

In [None]:
# Real datasets come without class label images, so merge their train and test
# annotations into one file from which the labels can be generated.

all_annotations = Path(project_path) / "annotations.json"

if dataset != 'artifical':  # only real datasets are split into train/test

    import json
    import glob

    with open(Path(project_path) / "annotations_train.json", "rb") as train:
        tr = json.load(train)

    with open(Path(project_path) / "annotations_test.json", "rb") as test:
        te = json.load(test)

    # On a duplicate key the test entry overrides the train entry.
    result = dict(tr)
    result.update(te)

    with open(all_annotations, "w") as outfile:
        outfile.write(json.dumps(result))

In [None]:
# Instantiate an annotator over the combined annotations, then reuse the name
# of the label directory its model reports for the following cells.
gen_class_labels = Im2ImAnnotator(
    project_path=project_path,
    image_dir=image_dir,
    file_name=all_annotations,
    label_dir=label_dir,
    results_dir=None,
    im_width=im_width,
    im_height=im_height,
    label_width=label_width,
    label_height=label_height,
    n_cols=n_cols,
    question="Classification",
)
label_dir = gen_class_labels._model.label_dir.stem
label_dir

In [None]:
# Now we can generate a new annotation file from scratch, by using an empty
# folder for <results_dir> and the <label_dir> from the previous step.
import shutil

output_dir = 'results'
results_path = Path(project_path) / output_dir
print(results_path)
# Remove results from earlier runs. `ignore_errors=True` mirrors `rm -rf`
# (a missing directory is fine) without depending on a shell or on the path
# containing no characters the shell would split or expand.
shutil.rmtree(results_path, ignore_errors=True)

In [None]:
# No input annotation file (file_name=None): results are written under
# `output_dir`, which was just emptied.
im2im = Im2ImAnnotator(
    project_path=project_path,
    image_dir=image_dir,
    file_name=None,
    label_dir=label_dir,
    results_dir=output_dir,
    im_width=im_width,
    im_height=im_height,
    label_width=label_width,
    label_height=label_height,
    n_cols=n_cols,
    question="Classification",
)

im2im

In [None]:
# Label files reported by the annotator's model (presumably one per class —
# confirm against ipyannotator); used below as the pool for random labels.
all_labelss = im2im._model.labels_files
all_labelss[:3]

In [None]:
# Reference annotations, used below as the ground truth for the simulated labelling.
anno_ = json.loads(all_annotations.read_text())

In [None]:
import numpy as np

# Share of annotations that will get a random (possibly wrong) label.
label_noise = 0.1

# One uniform draw per annotated file; draws below `label_noise` select the
# files to spoil.
filt = np.random.uniform(low=0, high=1, size=len(anno_))

In [None]:
# Dummy annotator
from random import choice


def get_random_class():
    """Return one randomly chosen entry of the notebook-level `all_labelss`."""
    picked = choice(all_labelss)
    return picked


get_random_class()

In [None]:
# Imitate human work with a <label_noise> share of errors: files whose draw
# falls below the threshold get a random label, the rest keep their own.

filtererd = {
    img: ([get_random_class()] if draw < label_noise else labels)
    for (img, labels), draw in zip(anno_.items(), filt)
}

In [None]:
# Copy the simulated labels into the annotator's annotations (files without a
# simulated label get an empty list) and save through the widget's button.

im2im._model.annotations.update(
    (img, filtererd.get(img, [])) for img in im2im._model.annotations.keys()
)
im2im._save_btn.click()

In [None]:
# im2im._model.annotations

In [None]:
# check annotation file on disk
# !cat {im2im._model.annotation_file_path}

In [None]:
#same in memory
from IPython import display
# im2im.to_dict()

# improve

Load annotated dataset and mark wrongly annotated samples.

- real
- generated

Let's create corresponding map for each class from annotations obtained on `create` step above

In [None]:
# Load the labels written during the [create] step

created_annotations_file = Path(project_path) / output_dir / 'annotations.json'
with open(created_annotations_file) as created_json:
    loaded_image_annotations = json.load(created_json)

In [None]:
# loaded_image_annotations

In [None]:
from collections import defaultdict

def group_files_by_class(annotations):
    """Invert a file -> labels mapping into a label -> files mapping.

    `annotations` maps each file to an iterable of class labels. The result is
    a `defaultdict(list)` keyed by label; each list holds the files carrying
    that label, in the iteration order of `annotations`.
    """
    files_by_class = defaultdict(list)
    label_file_pairs = (
        (label, fname)
        for fname, file_labels in annotations.items()
        for label in file_labels
    )
    for label, fname in label_file_pairs:
        files_by_class[label].append(fname)
    return files_by_class

In [None]:
# Map each class label to the list of files annotated with it in the [create] step.
classes_to_files = group_files_by_class(loaded_image_annotations) 

Let's group some annotators together, so we can go through all annotated images, but for each class separately.

Each grid shows images belonging to the __same__ class. 

You should __mark all errors__ (images that belong to a __different__ class).

In [None]:
from ipyannotator.capture_annotator import CaptureAnnotator

!! Don't forget to click the __SAVE__ button when you have finished with each class:

In [None]:
# One CaptureAnnotator per class, restricted to the files labelled with that
# class. `class_name[:-4]` drops the last four characters of the label name
# (presumably the '.jpg' extension) for the question text and results folder.
items = [
    CaptureAnnotator(
        project_path, image_dir, 50, 50, 2, 5,
        question=f'Check incorrect annotation for [{class_name[:-4]}] class',
        filter_files=class_anno,
        results_dir=f'{output_dir}/missed/{class_name[:-4]}',
    )
    for class_name, class_anno in tqdm(classes_to_files.items())
]

In [None]:
# Display the annotators of the first two classes in one container so that their errors can be marked.
widgets.VBox(children = items[:2])

In [None]:
# Imitate a human reviewer: flag every image whose noisy label from the create
# step (`filtererd`) differs from the reference annotation (`anno_`), then save
# each class annotator through its button.

for annotator in tqdm(items):
    for img_key, _ in annotator._model.annotations.items():
        is_wrong = anno_[img_key] != filtererd[img_key]
        annotator._model.annotations[img_key] = {'answer': is_wrong}
        annotator._model._update_state()
    annotator._save_btn.click()
#     print(i._model.annotations)

Now we can get list of all marked images, which should be reclassified:

In [None]:
# One list per class annotator, holding the files whose saved answer is truthy,
# i.e. the files marked as wrongly labelled.
reclasify_this = [[c for c, v in i.to_dict().items() if v['answer']] for i in items]

#  show 10 files with incorrect label for the first class
# (index 0: the original used index 1, which is the second class and fails
#  when only one class exists)
reclasify_this[0][:10]

Also, the automatically generated JSON file can be used for each class.

Let's load one random json and select filenames marked as incorrect on previous step for this class:

In [None]:

from glob import glob

# Pick one of the per-class result folders under <output_dir>/missed at random.
missed_root = Path(project_path) / output_dir / 'missed'
random_class = sample(glob(str(missed_root) + '/*'), 1)[0]
print(random_class)

# Transposed so that the index holds the JSON's top-level keys (the file names).
random_class_annotation = pd.read_json(Path(random_class) / 'annotations.json').T

is_marked = random_class_annotation['answer'] == True
random_misssed = list(random_class_annotation[is_marked].index.values)

# Show 10 files with an incorrect label for the random class
random_misssed[:10]