In [None]:
from pycocotools.coco import COCO
from pathlib import Path

from pycocotools import mask as maskUtils
import numpy as np
import matplotlib.pyplot as plt

from spectral.io.envi import open as envi_open

from itertools import islice

In [41]:
# Parameters
SAMPLES_FRAC = 0.05
WAVELENGTHS_PER_PIXEL = 250

In [None]:
# Define paths
MASSIMAL_ROOT_DIR = Path('/data/massimal/seabee-minio/')

In [None]:
# Find all annotation files in the dataset
annotation_files = list(MASSIMAL_ROOT_DIR.rglob('*/annotations_json/*_coco.json'))
print('Found annotation files:')
for annotation_file in annotation_files:
       print(f'<dataset root> / {annotation_file.relative_to(MASSIMAL_ROOT_DIR)}')

Found annotation files:
<dataset root> / larvik/kongsbakkebukta/aerial/hsi/20230830/massimal_larvik_kongsbakkebukta_20230830_hsi_annotation/annotations_json/massimal_larvik_kongsbakkebukta_20230830_coco.json
<dataset root> / larvik/olbergholmen/aerial/hsi/20230830/massimal_larvik_olbergholmen_20230830_hsi_annotation/annotations_json/massimal_larvik_olbergholmen_20230830_coco.json
<dataset root> / smola/maholmen/aerial/hsi/20230621/massimal_smola_maholmen_20230621_hsi_annotation/annotations_json/massimal_smola_maholmen_20230621_coco.json
<dataset root> / vega/sola/aerial/hsi/20220823/massimal_vega_sola_20220823_hsi_annotation/annotations_json/massimal_vega_sola_20220823_coco.json
<dataset root> / bodo/sandsund/aerial/hsi/20210602/massimal_bodo_sandsund_20210602_hsi_annotation/annotations_json/massimal_bodo_sandsund_20210602_coco.json
<dataset root> / bodo/juvika/aerial/hsi/20220624/massimal_bodo_juvika_20220624_hsi_annotation/annotations_json/massimal_bodo_juvika_20220624_coco.json


In [44]:
# Determine total number of annotated images pixels across all annotation files
total_pixels = 0
for annotation_file in annotation_files:
    coco = COCO(annotation_file)
    for ann in coco.dataset['annotations']:
        mask = maskUtils.decode(ann['segmentation'])
        total_pixels += np.sum(mask)
total_bytes = total_pixels * WAVELENGTHS_PER_PIXEL * 4  # 4 bytes per float32 sample

print('')
print(f'Total annotated pixels across all annotation files: {total_pixels}')
print(f'Total size for all annotated spectra: {total_bytes / (1024**3):.2f} GB')
print(f'Total size for sampled subset of spectra: {total_bytes * SAMPLES_FRAC / (1024**3):.2f} GB')

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!

Total annotated pixels across all annotation files: 104806257
Total size for all annotated spectra: 97.61 GB
Total size for sampled subset of spectra: 4.88 GB


In [None]:
# Loop through annotation files and collect example spectra
data = []

for annotation_file in annotation_files[0:1]: # Limit to first file for testing
    print(f'Processing annotation file: {annotation_file}')
    dataset_root_dir = annotation_file.parent.parent.parent

    # Load the COCO annotations       
    coco = COCO(annotation_file)
    
    # Loop through images in the COCO dataset
    for img_id, img_info in list(islice(coco.imgs.items(), 2)): # Limit to a few images for testing
        # Get image file name and stem
        img_file_stem = img_info['file_name'].split('.')[0]
        
        # Load annotations for the image
        anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        print(f'Image ID: {img_id}, Annotations: {len(anns)}, File: {img_file_stem}')

        # Find matching hyperspectral image file
        hyspec_im_path = list(dataset_root_dir.rglob(img_file_stem + '*.hdr'))
        if len(hyspec_im_path) >= 1:
            hyspec_im_path = hyspec_im_path[0] # Take the first match (any files with identical names are identical)
            print(f'Found matching hyperspectral image : {hyspec_im_path.name}')
        else:
            print(f'No matching hyperspectral image found for {img_file_stem} in {dataset_root_dir}')
            continue
        
        # Open hyperspectral image
        print(f'Reading hyperspectral file: {hyspec_im_path}')
        hyspec_im_handle = envi_open(hyspec_im_path)
        hyspec_im = np.array(hyspec_im_handle.load())

        # Create empty list for spectra and annotation categories
        spectra = []
        categories = []

        # Loop through annotations in image
        for ann in anns:
            ann_binary_mask = maskUtils.decode(ann['segmentation']).astype(bool) # type: ignore
            ann_spectra = hyspec_im[ann_binary_mask]
            ann_category = ann['category_id']

            n_random_samp = np.int64(SAMPLES_FRAC*len(ann_spectra))
            if n_random_samp > 0:
                random_indices = np.random.choice(len(ann_spectra), size=n_random_samp, replace=False)
                ann_spectra = ann_spectra[random_indices]
            else:
                continue
            spectra.append(ann_spectra)
            categories.append([ann_category]*len(ann_spectra))
        
        data.append({
            'annotation_file': annotation_file,
            'image_file_name': img_file_stem,
            'spectra': spectra,
            'categories': categories
        })

    print("\n")


Processing annotation file: /data/massimal/seabee-minio/larvik/kongsbakkebukta/aerial/hsi/20230830/massimal_larvik_kongsbakkebukta_20230830_hsi_annotation/annotations_json/massimal_larvik_kongsbakkebukta_20230830_coco.json
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Image ID: 1, Annotations: 5, File: massimal_larvik_kongsbakkebukta_202308301328_hsi_005
Found matching hyperspectral image : massimal_larvik_kongsbakkebukta_202308301328_hsi_005_reflectance.bip.hdr
Reading hyperspectral file: /data/massimal/seabee-minio/larvik/kongsbakkebukta/aerial/hsi/20230830/massimal_larvik_kongsbakkebukta_202308301328_hsi/2a_reflectance/massimal_larvik_kongsbakkebukta_202308301328_hsi_005_reflectance.bip.hdr
Image ID: 2, Annotations: 10, File: massimal_larvik_kongsbakkebukta_202308301328_hsi_006
Found matching hyperspectral image : massimal_larvik_kongsbakkebukta_202308301328_hsi_006_reflectance.bip.hdr
Reading hyperspectral file: /data/massimal/seabee-minio/larvi