In [None]:
import logging
import sys
sys.path.append('/home/mbewley/Development/benthoz')
logger = logging.getLogger()
handler = logging.FileHandler('notebook_log.txt')
handler.setLevel(logging.DEBUG)
logger.handlers = [handler]

%matplotlib inline
import matplotlib.pyplot as plt
import os
import glob
import pandas as pd
import numpy as np
import imageio
import cv2

import prep.image_fetcher
import prep.label_converter


os.chdir('/home/mbewley/Development/benthoz')
IMAGE_PATH = 'images'
if not os.path.exists(IMAGE_PATH):
    os.mkdir(IMAGE_PATH)

IMAGE_LIST_FILE = 'BENTHOZ-2015-imagelist.csv'

# Preparation
The first step is to:

1. Acquire the images.
2. Check they have been successfully acquired, by comparing them against the training set.

In [None]:
# Step 1 of the preparation: acquire the images named in IMAGE_LIST_FILE into IMAGE_PATH.
# NOTE(review): presumably a network download, so this cell may be slow - confirm in prep.image_fetcher.
prep.image_fetcher.fetch_all_images(IMAGE_LIST_FILE, IMAGE_PATH)

In [None]:
# List the downloaded images (file name without the '.tif' extension) and count them.
# os.path.splitext removes exactly the extension. str.strip('.tif') would instead
# strip any run of the characters '.', 't', 'i', 'f' from BOTH ends of the name,
# corrupting image names that start or end with one of those characters.
downloaded_images = [os.path.splitext(f)[0] for f in os.listdir(IMAGE_PATH) if f.endswith('.tif')]
len(downloaded_images)

In [None]:
# Load the public training labels. Later cells read the columns
# image_name, row, col and label from this frame.
df_training = pd.read_csv('data_splits/public_labels_train.csv')
# Preview the first rows only.
df_training.head()

In [None]:
# Every distinct image referenced by the training labels.
training_images = df_training.image_name.unique().tolist()

# Training images that have no downloaded counterpart on disk.
missing_images = list(set(training_images).difference(downloaded_images))
# The second '_'-separated field of the image name is reported as the date.
missing_image_dates = {name.split('_')[1] for name in missing_images}
print(f'Missing {len(missing_images)} images on dates: {missing_image_dates}')

# Defining the Hierarchy and Classes
First, we need to look up the class IDs used in the data set, referencing them against the class names given in `idlookups.csv` (also part of the data set paper).
We then need to build the hierarchy to define parent/child relationships.

In [None]:
# Build the class table from 'idlookups.csv' using the packaged helper and display it.
# NOTE(review): the next cell reassigns df_id_lookups from the raw CSV and the cells
# after it rebuild the hierarchy by hand, so this result is only shown here - confirm
# whether this cell or the manual walk-through below is the intended source of truth.
df_id_lookups = prep.label_converter.build_hierarchy_from_id_lookup(id_lookup_file='idlookups.csv')
df_id_lookups

In [None]:
# Load the raw id lookup table; the first CSV column becomes the index, which
# later cells use as the class ID (e.g. df_id_lookups.loc[class_id, ...]).
df_id_lookups = pd.read_csv('idlookups.csv', index_col=0)
print(f'There are {len(df_id_lookups)} classes defined in the id lookup list')
# Random preview: the ten rows shown change on every run.
df_id_lookups.sample(10)

The naming convention separates layers of the hierarchy with a colon followed by a space (`': '`), so we can break each name into the list of nodes on the path from the top of the tree down to the class, and calculate the class's depth in the tree.

In [None]:
# Split each class name on ': ' into its list of hierarchy levels (outermost first).
df_id_lookups['parsed_name'] = df_id_lookups.name.map(lambda full_name: full_name.split(': '))
# Depth of a class = number of levels in its parsed name.
df_id_lookups['depth'] = df_id_lookups.parsed_name.map(len)
# Random preview of ten rows.
df_id_lookups.sample(10)

The two top nodes "Biota" and "Physical" are not prepended to their children, so we need to do this manually.

In [None]:
# Top level classes
# Names whose parsed path has a single element, i.e. names containing no ': ' separator.
df_id_lookups.query('depth == 1').name.values

In [None]:
# First-level names that belong under each of the two root nodes (defined by hand).
biota_kids = ['Worms', 'Sponges', 'Seagrasses', 'Molluscs', 'Macroalgae', 'Jellies', 'Fishes', 'Echinoderms', 'Crustacea',
              'Cnidaria', 'Bryozoa', 'Bioturbation', 'Bacterial mats', 'Ascidians']

physical_kids = ['Substrate']


def _prepend_root(root_name, members):
    """Put `root_name` in front of every parsed name whose first element is in `members`.

    Updates df_id_lookups in place: the selected rows get their depth increased
    by one and `root_name` prepended to parsed_name. Returns the boolean row
    selector that was used.
    """
    selected = df_id_lookups.parsed_name.apply(lambda parts: parts[0] in members)
    df_id_lookups.loc[selected, 'depth'] += 1
    df_id_lookups.loc[selected, 'parsed_name'] = (
        df_id_lookups.loc[selected, 'parsed_name'].apply(lambda parts: [root_name] + parts)
    )
    return selected


biota_inds = _prepend_root('Biota', biota_kids)
physical_inds = _prepend_root('Physical', physical_kids)

# The last element of the parsed name is the class's own (leaf) name.
df_id_lookups['child_name'] = df_id_lookups.parsed_name.apply(lambda parts: parts[-1])

display(df_id_lookups.head(10))
display(df_id_lookups.sample(10))

In [None]:
# Convert "parsed_name" to a list of node IDs, instead of strings

def get_ancestor_ids(child_id):
    """Return the class IDs along the path from the top of the tree down to `child_id`.

    Reads the module-level df_id_lookups, which must be indexed by class ID and
    carry a `parsed_name` column holding each class's list of level names.
    Every prefix of the child's parsed name is looked up among the rows; the
    ID of the first row with exactly that parsed name is collected.

    Returns a list of IDs, outermost ancestor first, ending with `child_id`'s
    own entry. Raises IndexError if some prefix matches no row.
    """
    full_path = df_id_lookups.loc[child_id, 'parsed_name']
    ids_along_path = []
    for n_levels in range(1, len(full_path) + 1):
        prefix = full_path[:n_levels]
        is_match = df_id_lookups.parsed_name.apply(lambda candidate: candidate == prefix)
        ids_along_path.append(df_id_lookups[is_match].index[0])
    return ids_along_path


# Resolve every class's root-to-node path into class IDs. This is done before
# the re-sort so the computed values line up with the current row order.
df_id_lookups['ancestor_id_list'] = list(map(get_ancestor_ids, df_id_lookups.index))

# Order rows by class ID, then number them 0..N-1; that number is the class's
# position in the per-pixel bit vector.
df_id_lookups = df_id_lookups.sort_index()
n_classes = len(df_id_lookups)
df_id_lookups['bit_vector_idx'] = range(n_classes)
df_id_lookups

In [None]:
# Inspect every class whose raw name begins with 'Substrate' - the first-level
# name that physical_kids places under 'Physical'.
df_id_lookups[df_id_lookups.name.str.startswith('Substrate')]

We now want to represent this class hierarchy as a bit-vector. Each class has a unique bit in the vector (its position is stored in the `bit_vector_idx` column of `df_id_lookups`). A root level class will turn on a single bit. A depth 4 class will turn on 4 bits.

This means that for an image, the label mask will be a 3D matrix of shape (`len(df_id_lookups)`, `im.shape[0]`, `im.shape[1]`)

## Sparse Points to Image Masks
The next step is to convert the sparse lists of points in (image, row, column, label_id) form, to a 1:1 label mask per image. Initially, we will do this just as a classic multi class problem, with label_id = 0 as the placeholder for "unknown" pixels which should not be considered by the loss function. Later, we will use the hierarchical relationships between the labels to set it up as a multi-label problem.

In [None]:
# Per-pixel, per-class label states stored in the masks.
DONT_KNOW = -1
NO = 0
YES = 1

def long_labels_to_multilabel_masks(rows, cols, label_ids, im_shape):
    """Convert sparse point labels into one dense mask per class.

    Reads the module-level df_id_lookups (indexed by class ID, with the
    columns `ancestor_id_list` and `bit_vector_idx`).

    Parameters
    ----------
    rows, cols, label_ids : iterables
        Parallel sequences giving, for each labelled point, its pixel row,
        pixel column and class ID. They are consumed with zip, so extra
        entries in a longer sequence are ignored.
    im_shape : sequence
        Image shape; only the first two entries (height, width) are used.

    Returns
    -------
    numpy.ndarray of int8, shape (len(df_id_lookups), height, width)
        Every pixel starts as DONT_KNOW in every class layer. At a labelled
        pixel:
          * the labelled class and all its ancestors are YES,
          * descendants of the labelled class stay DONT_KNOW - a coarse label
            such as "Biota" must not rule out any of its finer sub-classes,
          * every other class is NO.
        If the same pixel is labelled more than once, the last label wins.

    Raises
    ------
    KeyError
        If a label ID (or one of its ancestors) is not in df_id_lookups.
    """
    # Pull the lookups out of the DataFrame once instead of per point.
    bit_index_by_id = df_id_lookups['bit_vector_idx'].to_dict()
    ancestors_by_id = df_id_lookups['ancestor_id_list'].to_dict()

    label_masks = np.full((len(df_id_lookups), im_shape[0], im_shape[1]), DONT_KNOW, dtype=np.int8)

    # label_id -> (bits to switch on, bits to leave unknown); computed on first use.
    bits_by_label = {}
    for row, col, label_id in zip(rows, cols, label_ids):
        if label_id not in bits_by_label:
            yes_bits = [bit_index_by_id[ancestor_id] for ancestor_id in ancestors_by_id[label_id]]
            # A descendant is any other class whose root-to-node path passes through label_id.
            unknown_bits = [
                bit_index_by_id[class_id]
                for class_id, path in ancestors_by_id.items()
                if class_id != label_id and label_id in path
            ]
            # Explicit integer dtype so an empty list is still a valid index array.
            bits_by_label[label_id] = (
                np.array(yes_bits, dtype=np.intp),
                np.array(unknown_bits, dtype=np.intp),
            )
        yes_bits, unknown_bits = bits_by_label[label_id]

        # This pixel is now known: rule everything out, then restore the
        # unknown descendants and switch on the labelled path.
        label_masks[:, row, col] = NO
        label_masks[unknown_bits, row, col] = DONT_KNOW
        label_masks[yes_bits, row, col] = YES
    return label_masks


In [None]:
# Convert long-form sparse point labels to "don't know" images. Treat this as multi-class for now (single image, with an indep label assigned to each point)

def visualise_label_layer(label_id, im, labels):
    """Plot an image with the point labels of one class overlaid.

    Reads the module-level df_id_lookups to find the class's layer
    (`bit_vector_idx`) and its display name (`name`).

    label_id: class ID (an index value of df_id_lookups) to display.
    im: image array passed to plt.imshow.
    labels: per-class label masks, indexed as labels[bit_index, row, col].

    Pixels equal to 1 in the layer are drawn with the 'brg' colour map and
    pixels equal to 0 with 'seismic'; all other pixels are left transparent.
    A new 20x20 figure is created on every call.
    """
    class_row = df_id_lookups.loc[label_id]
    bitvector_ind = class_row['bit_vector_idx']
    print(f'Label {label_id} - displaying layer from bitvector_index {bitvector_ind}')
    layer = labels[bitvector_ind, :, :]
    label_name = class_row['name']

    plt.figure(figsize=(20, 20))
    plt.imshow(im)

    # Isolated labelled pixels are too small to see, so each one is dilated
    # with a 9x9 kernel. The dilated 0/1 mask doubles as the alpha channel,
    # keeping unlabelled pixels fully transparent.
    kernel = np.ones((9, 9), np.uint8)
    for target_value, colour_map in ((1, 'brg'), (0, 'seismic')):
        hits = (layer == target_value).astype(np.uint8)
        hits_dilated = cv2.dilate(hits, kernel)
        plt.imshow(hits_dilated, alpha=hits_dilated, cmap=colour_map)

    plt.title(label_name)
    

# Preview settings. The original loop stopped after index 4, i.e. five images.
N_PREVIEW_IMAGES = 5
PREVIEW_LABEL_ID = 400  # class ID whose layer is overlaid on each preview

n_shown = 0
for image_name, g in df_training.groupby('image_name'):

    # Get original image
    image_file = os.path.join(IMAGE_PATH, image_name + '.tif')
    im = cv2.imread(image_file)
    if im is None:
        # cv2.imread returns None (it does not raise) when the file is missing
        # or unreadable; some training images may not have been downloaded.
        print(f'Skipping {image_name}: could not read {image_file}')
        continue

    # OpenCV loads colour images as BGR, matplotlib displays RGB.
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

    labels = long_labels_to_multilabel_masks(g.row, g.col, g.label, im.shape[:2])
    visualise_label_layer(PREVIEW_LABEL_ID, im, labels)

    n_shown += 1
    if n_shown >= N_PREVIEW_IMAGES:
        break
    