In [84]:
import logging
import sys
# Make the project checkout importable so that the `prep` package imported below resolves.
sys.path.append('/home/mbewley/Development/benthoz')

# Root-logger records go only to this file handler (it replaces any handlers already
# attached). The handler accepts DEBUG and above; the root logger's own level is untouched.
handler = logging.FileHandler('notebook_log.txt')
handler.setLevel(logging.DEBUG)
logger = logging.getLogger()
logger.handlers = [handler]

%matplotlib inline
import matplotlib.pyplot as plt
import os
import glob
import pandas as pd
import numpy as np
import imageio
import cv2
from tqdm.notebook import tqdm_notebook as tqdm

import prep.image_fetcher
import prep.label_converter


# Work from the project checkout so that the relative paths below resolve.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
os.chdir('/home/mbewley/Development/benthoz')
IMAGE_PATH = 'images'
LABELS_PATH = 'labels'

# Image size as (rows, cols); used to clip point coordinates and to size the label masks.
IM_SHAPE = (1024, 1360)

# Create the output directories if needed. exist_ok=True replaces the separate
# "exists?" check, so there is no window between checking and creating.
os.makedirs(IMAGE_PATH, exist_ok=True)
os.makedirs(LABELS_PATH, exist_ok=True)

IMAGE_LIST_FILE = 'BENTHOZ-2015-imagelist.csv'

# Defining the Hierarchy and Classes
First, we need to look up the class IDs used in the data set, referencing them against class name as per `id_lookups.csv` (also part of the data set paper).
We then need to build the hierarchy to define parent/child relationships.

In [124]:
# Build the class-hierarchy table from the id lookup CSV using the project helper.
# NOTE(review): the following cells re-read the CSV and reassign `df_id_lookups`, so this
# result is overwritten - presumably they walk through the same steps by hand; confirm.
df_id_lookups = prep.label_converter.build_hierarchy_from_id_lookup(id_lookup_file='idlookups.csv')
df_id_lookups

Unnamed: 0_level_0,name,parsed_name,depth,child_name,ancestor_id_list
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Not Considered,[Not Considered],1,Not Considered,[1]
2,Biota,[Biota],1,Biota,[2]
238,Physical,[Physical],1,Physical,[238]
3,Worms,"[Biota, Worms]",2,Worms,"[2, 3]"
13,Sponges,"[Biota, Sponges]",2,Sponges,"[2, 13]"
...,...,...,...,...,...
161,Cnidaria: Corals: Black & Octocorals: Branchin...,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",7,Arborescent,"[2, 118, 126, 143, 152, 159, 161]"
149,Cnidaria: Corals: Black & Octocorals: Fan (2D)...,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",7,Simple,"[2, 118, 126, 143, 146, 148, 149]"
150,Cnidaria: Corals: Black & Octocorals: Fan (2D)...,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",7,Complex,"[2, 118, 126, 143, 146, 148, 150]"
156,Cnidaria: Corals: Black & Octocorals: Branchin...,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",8,Simple,"[2, 118, 126, 143, 152, 153, 155, 156]"


In [125]:
# Load the raw id -> name lookup table, using the first CSV column as the index.
df_id_lookups = pd.read_csv('idlookups.csv', index_col=0)
print(f'There are {len(df_id_lookups)} classes defined in the id lookup list')
# Show a random sample of rows (no seed is set, so the sample differs between runs).
df_id_lookups.sample(10)

There are 148 classes defined in the id lookup list


Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
73,Jellies
238,Physical
41,Macroalgae: Sheet-like / membraneous: Red
3,Worms
252,Substrate: Consolidated (hard)
115,Crustacea: Barnacles
164,Cnidaria: Colonial anemones: Corallimorphs
24,Sponges: Erect forms: Simple
2,Biota
68,Macroalgae: Encrusting: Green


The naming convention separates layers of the hierarchy with a colon ':', so we can split each name into the list of levels along its path (from the top-most level down to the class itself), and take the length of that list as the depth in the tree.

In [126]:
# Split each colon-separated name into its hierarchy levels (top-most level first).
# Each level is stripped because some source names carry stray whitespace (for example a
# trailing space after "Massive"), which would otherwise leak into the level names and
# break exact name comparisons later on.
df_id_lookups['parsed_name'] = df_id_lookups['name'].apply(
    lambda s: [level.strip() for level in s.split(': ')]
)
# Depth is simply the number of levels in the path.
df_id_lookups['depth'] = df_id_lookups['parsed_name'].apply(len)
df_id_lookups.sample(10)

Unnamed: 0_level_0,name,parsed_name,depth
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
240,Substrate: Unconsolidated (soft),"[Substrate, Unconsolidated (soft)]",2
310,Cnidaria: Corals: Black & Octocorals: Massive,"[Cnidaria, Corals, Black & Octocorals, Massive ]",4
160,Cnidaria: Corals: Black & Octocorals: Branchin...,"[Cnidaria, Corals, Black & Octocorals, Branchi...",6
19,Sponges: Massive forms: Barrels,"[Sponges, Massive forms, Barrels]",3
400,Macroalgae: Large canopy-forming: Ecklonia rad...,"[Macroalgae, Large canopy-forming, Ecklonia ra...",3
249,Substrate: Unconsolidated (soft): Pebble / gra...,"[Substrate, Unconsolidated (soft), Pebble / gr...",5
22,Sponges: Hollow forms: Cups and alikes,"[Sponges, Hollow forms, Cups and alikes]",3
45,Macroalgae: Large canopy-forming: Brown,"[Macroalgae, Large canopy-forming, Brown]",3
62,Macroalgae: Erect coarse branching,"[Macroalgae, Erect coarse branching]",2
12,Worms: Acorn worms,"[Worms, Acorn worms]",2


The two top nodes "Biota" and "Physical" are not prepended to their children, so we need to do this manually.

In [127]:
# Top level classes: names made of a single level (depth == 1 at this point means the
# raw name contained no ': ' separator).
df_id_lookups.query('depth == 1').name.values

array(['Not Considered', 'Biota', 'Physical', 'Worms', 'Sponges',
       'Seagrasses', 'Molluscs', 'Macroalgae', 'Jellies', 'Fishes',
       'Echinoderms', 'Crustacea', 'Cnidaria', 'Bryozoa', 'Bioturbation',
       'Bacterial mats', 'Ascidians', 'Substrate'], dtype=object)

In [128]:
# Hand-written lists of the top-level names that belong under each of the two root nodes.
biota_kids = [
    'Worms', 'Sponges', 'Seagrasses', 'Molluscs', 'Macroalgae', 'Jellies', 'Fishes',
    'Echinoderms', 'Crustacea', 'Cnidaria', 'Bryozoa', 'Bioturbation', 'Bacterial mats',
    'Ascidians',
]
physical_kids = ['Substrate']


def _prepend_root(lookups, root_name, kid_names):
    """Put `root_name` in front of every path whose first level is in `kid_names`.

    Updates the 'parsed_name' and 'depth' columns of `lookups` in place and returns
    the boolean mask of the rows that were changed.
    """
    selected = lookups.parsed_name.apply(lambda path: path[0] in kid_names)
    lookups.loc[selected, 'depth'] += 1
    lookups.loc[selected, 'parsed_name'] = lookups.loc[selected, 'parsed_name'].apply(
        lambda path: [root_name] + path
    )
    return selected


biota_inds = _prepend_root(df_id_lookups, 'Biota', biota_kids)
physical_inds = _prepend_root(df_id_lookups, 'Physical', physical_kids)

# The last level of the path is the node's own short name.
df_id_lookups['child_name'] = df_id_lookups.parsed_name.apply(lambda path: path[-1])

display(df_id_lookups.head(10))
display(df_id_lookups.sample(10))

Unnamed: 0_level_0,name,parsed_name,depth,child_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Not Considered,[Not Considered],1,Not Considered
2,Biota,[Biota],1,Biota
238,Physical,[Physical],1,Physical
3,Worms,"[Biota, Worms]",2,Worms
13,Sponges,"[Biota, Sponges]",2,Sponges
30,Seagrasses,"[Biota, Seagrasses]",2,Seagrasses
32,Molluscs,"[Biota, Molluscs]",2,Molluscs
39,Macroalgae,"[Biota, Macroalgae]",2,Macroalgae
73,Jellies,"[Biota, Jellies]",2,Jellies
81,Fishes,"[Biota, Fishes]",2,Fishes


Unnamed: 0_level_0,name,parsed_name,depth,child_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
129,Cnidaria: Corals: Stony corals: Sub-massive,"[Biota, Cnidaria, Corals, Stony corals, Sub-ma...",5,Sub-massive
38,Molluscs: Bivalves,"[Biota, Molluscs, Bivalves]",3,Bivalves
90,Echinoderms: Sea urchins: Regular urchins,"[Biota, Echinoderms, Sea urchins, Regular urch...",4,Regular urchins
32,Molluscs,"[Biota, Molluscs]",2,Molluscs
159,Cnidaria: Corals: Black & Octocorals: Branchin...,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",6,Fleshy
141,Cnidaria: Corals: Stony corals: Branching,"[Biota, Cnidaria, Corals, Stony corals, Branch...",5,Branching
54,Macroalgae: Filamentous / filiform,"[Biota, Macroalgae, Filamentous / filiform]",3,Filamentous / filiform
59,Macroalgae: Erect fine branching: Red,"[Biota, Macroalgae, Erect fine branching, Red]",4,Red
154,Cnidaria: Corals: Black & Octocorals: Branchin...,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",7,Bushy
56,Macroalgae: Filamentous / filiform: Green,"[Biota, Macroalgae, Filamentous / filiform, Gr...",4,Green


In [129]:
# Translate the name-based path in "parsed_name" into the ids of the nodes along it.

def get_ancestor_ids(label_id):
    """Return the ids of all ancestors of `label_id`, top-most first, excluding itself.

    Every prefix of the label's `parsed_name` is compared with the `parsed_name`
    column of the notebook-level `df_id_lookups`; the id of the first matching row
    is used. An IndexError is raised when a prefix matches no row.
    """
    full_path = df_id_lookups.loc[label_id, 'parsed_name']
    path_ids = []
    for n_levels in range(1, len(full_path) + 1):
        prefix = full_path[:n_levels]
        matches_prefix = df_id_lookups.parsed_name.apply(lambda candidate: candidate == prefix)
        path_ids.append(df_id_lookups[matches_prefix].index[0])
    # The final entry is the label itself; its ancestors are everything before it.
    return path_ids[:-1]


df_id_lookups['ancestor_id_list'] = [get_ancestor_ids(label_id) for label_id in df_id_lookups.index]
# Sort by id before numbering, so that bit positions follow ascending id order.
df_id_lookups = df_id_lookups.sort_index()
df_id_lookups['bit_vector_idx'] = range(len(df_id_lookups))
df_id_lookups

Unnamed: 0_level_0,name,parsed_name,depth,child_name,ancestor_id_list,bit_vector_idx
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Not Considered,[Not Considered],1,Not Considered,[],0
2,Biota,[Biota],1,Biota,[],1
3,Worms,"[Biota, Worms]",2,Worms,[2],2
4,Worms: Polychaetes,"[Biota, Worms, Polychaetes]",3,Polychaetes,"[2, 3]",3
5,Worms: Polychaetes: Tube worms,"[Biota, Worms, Polychaetes, Tube worms]",4,Tube worms,"[2, 3, 4]",4
...,...,...,...,...,...,...
310,Cnidaria: Corals: Black & Octocorals: Massive,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",5,Massive,"[2, 118, 126, 143]",143
391,Cnidaria: Corals: Stony corals: Branching: Poc...,"[Biota, Cnidaria, Corals, Stony corals, Branch...",6,Pocillopora damicornis,"[2, 118, 126, 127, 141]",144
400,Macroalgae: Large canopy-forming: Ecklonia rad...,"[Biota, Macroalgae, Large canopy-forming, Eckl...",4,Ecklonia radiata,"[2, 39, 44]",145
402,Echinoderms: Sea urchins: Regular urchins: Cen...,"[Biota, Echinoderms, Sea urchins, Regular urch...",5,Centrostephanus rodgersii,"[2, 88, 89, 90]",146


In [130]:
# Record, for every node, the ids of all nodes that sit beneath it in the hierarchy.

def get_descendant_ids(label_id):
    """Return an array of the ids whose `ancestor_id_list` contains `label_id`.

    The label itself is not included, because a node is never in its own ancestor list.
    """
    lists_label_as_ancestor = df_id_lookups.ancestor_id_list.apply(
        lambda ancestors: label_id in ancestors
    )
    return df_id_lookups[lists_label_as_ancestor].index.values


df_id_lookups['descendant_id_list'] = [get_descendant_ids(label_id) for label_id in df_id_lookups.index]
df_id_lookups

Unnamed: 0_level_0,name,parsed_name,depth,child_name,ancestor_id_list,bit_vector_idx,descendant_id_list
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Not Considered,[Not Considered],1,Not Considered,[],0,[]
2,Biota,[Biota],1,Biota,[],1,"[3, 4, 5, 7, 8, 12, 13, 14, 15, 16, 17, 18, 19..."
3,Worms,"[Biota, Worms]",2,Worms,[2],2,"[4, 5, 7, 8, 12]"
4,Worms: Polychaetes,"[Biota, Worms, Polychaetes]",3,Polychaetes,"[2, 3]",3,[5]
5,Worms: Polychaetes: Tube worms,"[Biota, Worms, Polychaetes, Tube worms]",4,Tube worms,"[2, 3, 4]",4,[]
...,...,...,...,...,...,...,...
310,Cnidaria: Corals: Black & Octocorals: Massive,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",5,Massive,"[2, 118, 126, 143]",143,[]
391,Cnidaria: Corals: Stony corals: Branching: Poc...,"[Biota, Cnidaria, Corals, Stony corals, Branch...",6,Pocillopora damicornis,"[2, 118, 126, 127, 141]",144,[]
400,Macroalgae: Large canopy-forming: Ecklonia rad...,"[Biota, Macroalgae, Large canopy-forming, Eckl...",4,Ecklonia radiata,"[2, 39, 44]",145,[]
402,Echinoderms: Sea urchins: Regular urchins: Cen...,"[Biota, Echinoderms, Sea urchins, Regular urch...",5,Centrostephanus rodgersii,"[2, 88, 89, 90]",146,[]


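We now want to represent this class hierarchy as a bit-vector. Each class has its own position in the vector (given by the `bit_vector_idx` column). A label switches on its own bit and the bits of all of its ancestors, so a root level class will turn on a single bit and a depth 4 class will turn on 4 bits. The bits belonging to the label's descendants are set to -1 ("don't know"), since the label does not say which, if any, of its sub-classes applies.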

This means that for an image, the label mask will be a 3D matrix of shape (`len(df_id_lookups)`, `im.shape[0]`, `im.shape[1]`)

In [131]:
# The three values an entry of a bit vector (and later a label mask) can take.
DONT_KNOW = -1
NO = 0
YES = 1


def get_bit_vector(label_id):
    """Encode `label_id` as an int8 vector with one entry per row of `df_id_lookups`.

    The label's own position and the positions of its ancestors are YES, the
    positions of its descendants are DONT_KNOW, and every other position is NO.
    Positions come from the `bit_vector_idx` column.
    """
    position_of = df_id_lookups['bit_vector_idx']
    bits = np.full(len(df_id_lookups), NO, dtype=np.int8)

    bits[position_of.loc[label_id]] = YES
    for ancestor_id in df_id_lookups.loc[label_id, 'ancestor_id_list']:
        bits[position_of.loc[ancestor_id]] = YES
    for descendant_id in df_id_lookups.loc[label_id, 'descendant_id_list']:
        bits[position_of.loc[descendant_id]] = DONT_KNOW

    return bits


df_id_lookups['bit_vector'] = [get_bit_vector(label_id) for label_id in df_id_lookups.index]
df_id_lookups

Unnamed: 0_level_0,name,parsed_name,depth,child_name,ancestor_id_list,bit_vector_idx,descendant_id_list,bit_vector
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Not Considered,[Not Considered],1,Not Considered,[],0,[],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Biota,[Biota],1,Biota,[],1,"[3, 4, 5, 7, 8, 12, 13, 14, 15, 16, 17, 18, 19...","[0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,..."
3,Worms,"[Biota, Worms]",2,Worms,[2],2,"[4, 5, 7, 8, 12]","[0, 1, 1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0..."
4,Worms: Polychaetes,"[Biota, Worms, Polychaetes]",3,Polychaetes,"[2, 3]",3,[5],"[0, 1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
5,Worms: Polychaetes: Tube worms,"[Biota, Worms, Polychaetes, Tube worms]",4,Tube worms,"[2, 3, 4]",4,[],"[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
310,Cnidaria: Corals: Black & Octocorals: Massive,"[Biota, Cnidaria, Corals, Black & Octocorals, ...",5,Massive,"[2, 118, 126, 143]",143,[],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
391,Cnidaria: Corals: Stony corals: Branching: Poc...,"[Biota, Cnidaria, Corals, Stony corals, Branch...",6,Pocillopora damicornis,"[2, 118, 126, 127, 141]",144,[],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
400,Macroalgae: Large canopy-forming: Ecklonia rad...,"[Biota, Macroalgae, Large canopy-forming, Eckl...",4,Ecklonia radiata,"[2, 39, 44]",145,[],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
402,Echinoderms: Sea urchins: Regular urchins: Cen...,"[Biota, Echinoderms, Sea urchins, Regular urch...",5,Centrostephanus rodgersii,"[2, 88, 89, 90]",146,[],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Sparse Points to Image Masks
The next step is to convert the sparse lists of points in (image, row, column, label_id) form into a dense label mask per image, with the same height and width as the image. Pixels without an annotation are filled with the "don't know" value (-1), so that they can be ignored by the loss function. Each annotated pixel receives the bit vector of its label, which uses the hierarchical relationships between the labels to set this up as a multi-label problem.

In [132]:
# The three values an entry of a label mask can take.
DONT_KNOW = -1
NO = 0
YES = 1


def long_labels_to_multilabel_masks(rows, cols, label_ids, im_shape, id_lookups=None):
    """Turn sparse point labels into a dense multi-label mask for one image.

    Parameters
    ----------
    rows, cols : iterables of int
        Pixel coordinates of the labelled points (same length as `label_ids`).
    label_ids : iterable
        Label id of each point; every id must be in the index of the lookup table.
    im_shape : sequence
        Image size; only `im_shape[0]` (rows) and `im_shape[1]` (cols) are used.
    id_lookups : pandas.DataFrame, optional
        Table indexed by label id with a 'bit_vector' column. Defaults to the
        notebook-level `df_id_lookups`.

    Returns
    -------
    numpy.ndarray
        int8 array of shape (len(id_lookups), im_shape[0], im_shape[1]). Every pixel
        starts as DONT_KNOW; each labelled pixel is overwritten with the bit vector
        of its label. If a pixel is listed more than once, the last entry wins.

    Raises
    ------
    KeyError
        If a label id is not present in the lookup table.
    """
    if id_lookups is None:
        id_lookups = df_id_lookups
    bit_vectors = id_lookups['bit_vector']

    # np.full fills the mask in a single allocation; building it as np.ones(...) * DONT_KNOW
    # would create a second full-size temporary array for every image.
    label_masks = np.full(
        (len(id_lookups), im_shape[0], im_shape[1]), DONT_KNOW, dtype=np.int8
    )

    for row, col, label_id in zip(rows, cols, label_ids):
        # This pixel is annotated: replace its whole class column with the label's
        # bit vector (explicit YES / NO, plus DONT_KNOW for the label's descendants).
        label_masks[:, row, col] = bit_vectors.loc[label_id]

    return label_masks

# Data Set Preparation
The first step is to:

1. Acquire the images.
2. Check they have been successfully acquired, by comparing them against the training set.

In [None]:
# Fetch the images listed in IMAGE_LIST_FILE into IMAGE_PATH.
# NOTE(review): the helper lives in prep.image_fetcher (not shown here) - presumably a
# long-running download; confirm before re-running this cell.
prep.image_fetcher.fetch_all_images(IMAGE_LIST_FILE, IMAGE_PATH)

In [2]:
# Names (without extension) of the images that have been downloaded, and how many there are.
# os.path.splitext removes exactly the extension. str.strip('.tif') would instead strip any
# run of the characters '.', 't', 'i', 'f' from BOTH ends of the name, corrupting names that
# begin or end with one of those characters.
downloaded_images = [
    os.path.splitext(f)[0]
    for f in os.listdir(IMAGE_PATH)
    if f.endswith('.tif')
]
len(downloaded_images)

9436

In [122]:
# Load the public training split: one row per annotated point (image_name, row, col, label).
df_training = pd.read_csv('data_splits/public_labels_train.csv')
df_training.head()

Unnamed: 0,image_name,row,col,label
0,PR_20081006_232302_383_LC16,82,1255,2
1,PR_20081006_232302_383_LC16,114,586,2
2,PR_20081006_232302_383_LC16,119,647,2
3,PR_20081006_232302_383_LC16,120,1011,2
4,PR_20081006_232302_383_LC16,155,50,241


In [123]:
# Some row/col values slightly exceed the image dimensions, so clamp them onto the pixel grid.
# The lower bound keeps a negative coordinate from silently wrapping round to the opposite
# edge when it is later used as an array index, and both columns are cast to int so that
# they can be used as indices.
df_training['row'] = df_training['row'].clip(lower=0, upper=IM_SHAPE[0] - 1).astype(int)
df_training['col'] = df_training['col'].clip(lower=0, upper=IM_SHAPE[1] - 1).astype(int)
df_training.head()

Unnamed: 0,image_name,row,col,label
0,PR_20081006_232302_383_LC16,82,1255,2
1,PR_20081006_232302_383_LC16,114,586,2
2,PR_20081006_232302_383_LC16,119,647,2
3,PR_20081006_232302_383_LC16,120,1011,2
4,PR_20081006_232302_383_LC16,155,50,241


In [4]:
training_images = df_training.image_name.unique().tolist()

# Training images for which no downloaded file was found.
missing_images = list(set(training_images) - set(downloaded_images))
# The second "_"-separated field of an image name (e.g. 20081006 in PR_20081006_...).
missing_image_dates = {image_name.split('_')[1] for image_name in missing_images}
print(f'Missing {len(missing_images)} images on dates: {missing_image_dates}')

Missing 310 images on dates: {'20110416', '20110415'}


In [133]:
# Convert the long-form sparse point labels into one multi-label mask per image (pixels without a label stay "don't know") and save each mask to disk.

def visualise_label_layer(label_id, im, labels):
    """Overlay one class layer of a label mask on top of its image.

    `labels` is indexed as labels[bit_vector_idx, row, col]. Pixels equal to 1 in the
    layer for `label_id` are drawn with the 'brg' colormap and pixels equal to 0 with
    'seismic'; all other pixels are left transparent. The figure is titled with the
    label's name from `df_id_lookups`.
    """
    lookup_row = df_id_lookups.loc[label_id]
    bitvector_ind = lookup_row['bit_vector_idx']
    print(f'Label {label_id} - displaying layer from bitvector_index {bitvector_ind}')
    layer = labels[bitvector_ind, :, :]

    plt.figure(figsize=(20, 20))
    plt.imshow(im)

    # Isolated labelled pixels are too small to see, so grow each one with a 9x9 kernel.
    kernel = np.ones((9, 9), np.uint8)
    for mask_value, cmap_name in ((1, 'brg'), (0, 'seismic')):
        marked = cv2.dilate((layer == mask_value).astype(np.uint8), kernel)
        # The dilated 0/1 mask doubles as the alpha channel: only marked pixels are opaque.
        plt.imshow(marked, alpha=marked, cmap=cmap_name)

    plt.title(lookup_row['name'])
    

# The progress bar is given the number of images explicitly as its total.
num_images = len(df_training.image_name.unique())
image_groups = df_training.groupby('image_name')

for image_name, points in tqdm(image_groups, total=num_images):

    # Build the dense multi-label mask from this image's labelled points.
    labels = long_labels_to_multilabel_masks(points.row, points.col, points.label, IM_SHAPE)

    # Store the mask as a compressed .npz named after the image, under LABELS_PATH.
    label_file = os.path.join(LABELS_PATH, image_name+'.npz')
    np.savez_compressed(label_file, labels=labels)

    # Optional visual check - uncomment to overlay one class on the first image and stop:
#     im = cv2.imread(os.path.join(IMAGE_PATH, image_name+'.tif'))
#     visualise_label_layer(45, im, labels)
#     break

    

HBox(children=(FloatProgress(value=0.0, max=6853.0), HTML(value='')))


