## 1. Install VGG Image Annotator
http://www.robots.ox.ac.uk/~vgg/software/via/

## 2. Configure locally
- run the local HTML version
- set up project by sourcing image dir(s) to be labeled; configure project filename
- set up labels like this:
<img src="vgg_config.png" width="200"/>
- in the settings, you can choose to display the label above each box (useful as a sanity check)
- you can also set it so that the label radio buttons pop up right next to a box after you create/select it
- go through and label all the images
- periodically save and export your labels as json

## 3. Split files

In [10]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pathlib
import shutil
import cv2
import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Root directory of the raw images -- change this to your own location.
# It must contain the three subdirectories listed in `dirs` below.
raw_img_root = '../../raw_images/'

dirs = {'sugarcane': 'raw_class_images/sugarcane',
        'weeds': 'raw_class_images/weeds',
        'joint': 'SugarcaneWeeds'}
# Join with pathlib so a missing trailing slash on raw_img_root is harmless
data_dirs = {name: pathlib.Path(raw_img_root) / subdir for name, subdir in dirs.items()}

# Glob each directory only once; sorting gives a deterministic order to shuffle from
file_lists = {name: sorted(data_dirs[name].glob('*.jpg')) for name in dirs}
img_counts = {name: len(paths) for name, paths in file_lists.items()}
for name, count in img_counts.items():
    print(f'{name} images: {count}')


# Shuffle files - seed to make it consistent
np.random.seed(32)
for name in dirs:
    np.random.shuffle(file_lists[name])

sugarcane images: 869
weeds images: 856
joint images: 415


In [3]:
# Number of images to label for each class
label_counts = {'sugarcane': 100,
                'weeds': 100,
                'joint': 200}

# For every class: take the first N paths of the shuffled list, then cut that
# selection in half -- first half goes to files_a, second half to files_b.
label_paths = {}
files_a = {}
files_b = {}
for cls in dirs:
    chosen = file_lists[cls][:label_counts[cls]]
    midpoint = len(chosen) // 2
    label_paths[cls] = chosen
    files_a[cls] = chosen[:midpoint]
    files_b[cls] = chosen[midpoint:]

## 4. Copy files to a new directory (careful, do this once)

In [8]:
# Mike --> files_a; Cody --> files_b (change this below)

# input your destination directory; it needs three empty subdirs: 'sugarcane', 'weeds', 'joint'

In [9]:
# Copy this labeler's share of the selected images (files_b) into one
# subdirectory per class underneath dest_dir.
dest_dir = '/Users/mschoder/weeding_project/box_label_task_b'

for x in dirs:
    class_dir = pathlib.Path(dest_dir) / x
    # Create the class subdirectory if it is missing so copyfile cannot fail on it
    class_dir.mkdir(parents=True, exist_ok=True)
    for file in files_b[x]:
        shutil.copyfile(file, class_dir / file.name)


## 5. Post-Labeling -- Merge Labels into single file

In [93]:
# Locations of the two exported label files (one per labeler)
labels_a_path = './labels_a.json'
labels_b_path = './labels_b.json'

# Only the '_via_img_metadata' section of each file is kept
with open(labels_a_path) as fh_a:
    labels_a = json.load(fh_a)['_via_img_metadata']

with open(labels_b_path) as fh_b:
    labels_b = json.load(fh_b)['_via_img_metadata']

In [108]:
# Remove images present in both files from labels_b, so the labels_a version
# is the one kept in the merge
dupe_labels_b = set(labels_b) & set(labels_a)
for uk in dupe_labels_b:
    del labels_b[uk]
# Merge
labels_dict_tmp = {**labels_a, **labels_b}

# Re-key every entry by its own 'filename' field. The raw keys are longer than
# the filename (the previous version sliced them to 9 characters, which only
# works for names shaped like ixxxx.jpg).
labels_dict = {v['filename']: v for v in labels_dict_tmp.values()}

# Report only after labels_dict exists (printing earlier raised NameError on a fresh kernel)
print("Unique labeled images: ", len(labels_dict))

Unique labeled images:  385


In [120]:
# Print the key of every image that has no labeled regions (expect no output)
for img_key, entry in labels_dict.items():
    regions = entry['regions']
    if regions is not None and len(regions) > 0:
        continue
    print(img_key)

In [114]:
# Example data structure: display the merged label entry stored for one image
labels_dict['i2825.jpg']

{'filename': 'i2825.jpg',
 'size': 5877352,
 'regions': [{'shape_attributes': {'name': 'rect',
    'x': 15,
    'y': 46,
    'width': 739,
    'height': 728},
   'region_attributes': {'label': 'weed'}},
  {'shape_attributes': {'name': 'rect',
    'x': 1605,
    'y': 12,
    'width': 2464,
    'height': 1421},
   'region_attributes': {'label': 'weed'}},
  {'shape_attributes': {'name': 'rect',
    'x': 1143,
    'y': 96,
    'width': 527,
    'height': 431},
   'region_attributes': {'label': 'weed'}},
  {'shape_attributes': {'name': 'rect',
    'x': 31,
    'y': 631,
    'width': 1979,
    'height': 2060},
   'region_attributes': {'label': 'sugarcane'}},
  {'shape_attributes': {'name': 'rect',
    'x': 1748,
    'y': 1370,
    'width': 2310,
    'height': 1678},
   'region_attributes': {'label': 'sugarcane'}},
  {'shape_attributes': {'name': 'rect',
    'x': 4131,
    'y': 73,
    'width': 447,
    'height': 1752},
   'region_attributes': {'label': 'sugarcane'}}],
 'file_attributes': {}}

In [115]:
# Save the merged, re-keyed labels as a single JSON file
serialized_labels = json.dumps(labels_dict)
with open("labels.json", "w") as labels_file:
    labels_file.write(serialized_labels)

## 6. Match labeled files from raw images and copy into a new dir 
(since we messed up our labeling) 

In [113]:
# Filenames of all labeled images, taken from the label entries
labeled_files = [v['filename'] for v in labels_dict.values()]
# Build the lookup set once instead of once per candidate path
labeled_names = set(labeled_files)
all_filepaths = file_lists['sugarcane'] + file_lists['weeds'] + file_lists['joint']

# Keep only the raw image paths whose filename has labels
labeled_img_paths = [path for path in all_filepaths if path.name in labeled_names]

In [106]:
# Copy every labeled image into one flat destination directory
dest_dir = '/Users/mschoder/weeding_project/box_labeled_data/jpg_images'
for src_path in labeled_img_paths:
    shutil.copyfile(src_path, f'{dest_dir}/{src_path.name}')

## 7. Build separate dataset + labels for weed-only detections
(exclude sugarcane only images)

In [127]:
# Keep only the images that contain at least one region labeled 'weed'
weed_labels_dict = {}
for k, v in labels_dict.items():
    region_labels = {region['region_attributes']['label'] for region in v['regions']}
    if 'weed' in region_labels:
        weed_labels_dict[k] = v

# Write the filtered labels out to clean json
# (the previous version dumped the unfiltered labels_dict here)
with open("weed_labels.json", "w") as outfile:
    json.dump(weed_labels_dict, outfile)
    

In [128]:
# Match corresponding images and save to the weeds_only dir
labeled_files = [v['filename'] for v in weed_labels_dict.values()]
# Build the lookup set once instead of once per candidate path
labeled_names = set(labeled_files)
# Sugarcane-only images are excluded, so only these two source lists are searched
all_filepaths = file_lists['weeds'] + file_lists['joint']
labeled_img_paths = [path for path in all_filepaths if path.name in labeled_names]

dest_dir = '/Users/mschoder/weeding_project/box_labeled_data/jpg_images_allweeds'
for file in labeled_img_paths:
    shutil.copyfile(file, f'{dest_dir}/{file.name}')