In [1]:
import numpy as np
import skimage.io
from pathlib import Path
import tqdm
from massimal import hyspec_io
import datetime

In [2]:
# Parameters
# Fraction of the annotated (non-zero label) pixels of each image that is
# randomly drawn into the sampled subset.
samples_frac = 0.05

In [4]:
# Input locations: root folder that is searched recursively for the
# hyperspectral image headers, and the folder holding the annotation PNGs
hyspec_base = Path('/home/mha114/data/massimal/vega_sola/hyperspec/images/2a_rrs/')
annotation_dir = Path("/home/mha114/data/massimal/vega_sola/hyperspec/annotation/v2.1/png_gray/train_val/")

# Output location for the sampled spectra; created (with any missing parents)
# up front so the save step further down cannot fail on a missing folder
sampled_spectra_dir = Path('/home/mha114/data/massimal/vega_sola/hyperspec/sampled_spectra/sola_sampled_spectra_v1')
sampled_spectra_dir.mkdir(parents=True, exist_ok=True)


In [5]:
# Collect the annotation (label) images in sorted order and list their names.
# sorted() already returns a list, so no extra comprehension is needed.
label_image_paths = sorted(annotation_dir.glob('*.png'))
for path in label_image_paths:
    print(path.stem)

Vega_SolaNW_Coast1_Pika_L_10-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_11-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_12-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_13-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_15-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_16-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_17-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_18-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_22-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_23-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_24-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_6-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_7-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_8-Crop Wavelengths.bip
Vega_SolaNW_Coast1_Pika_L_9-Crop Wavelengths.bip
Vega_SolaNW_Coast2_Pika_L_15-Crop Wavelengths.bip
Vega_SolaNW_Coast2_Pika_L_16-Crop Wavelengths.bip
Vega_SolaNW_Coast2_Pika_L_17-Crop Wavelengths.bip
Vega_SolaNW_Coast2_Pika_L_19-Crop Wavelengths.bip
Vega_SolaNW_Coast2_Pika_L_20-Crop Wavelengths.bip
Vega

In [6]:
# Create random number generator
# The generator is seeded explicitly so that re-running the notebook draws the
# same pixel subset; an unseeded generator would write a different dataset to
# disk on every run. Change random_seed to draw a different subset.
random_seed = 42
rng = np.random.default_rng(random_seed)

In [10]:
# For every annotated image: load the label image and the matching
# hyperspectral image, keep only annotated pixels, and draw a random
# fraction (samples_frac) of them. Results are collected per image and
# concatenated afterwards.
spectra_subset_list = []
labels_subset_list = []

# Scan the image tree once instead of once per label image. Sorting makes the
# selection deterministic if more than one header shares the same name prefix.
hyspec_header_paths = sorted(hyspec_base.rglob('*.hdr'))

for label_image_path in tqdm.tqdm(label_image_paths):
    # The header belonging to a label image has a file name that starts with
    # the label image stem. Plain prefix matching (rather than using the stem
    # as a glob pattern) also works if the stem contains glob metacharacters.
    matching_headers = [path for path in hyspec_header_paths
                        if path.name.startswith(label_image_path.stem)]
    if not matching_headers:
        raise FileNotFoundError(
            f"No ENVI header file starting with '{label_image_path.stem}' "
            f"found under {hyspec_base}")
    hyspec_image_path = matching_headers[0]

    label_image = skimage.io.imread(label_image_path)
    (hyspec_image,_,_,_) = hyspec_io.load_envi_image(str(hyspec_image_path))

    # The boolean mask below must match the leading axes of the hyperspectral
    # image; fail early with the file names instead of a bare indexing error.
    if hyspec_image.shape[:label_image.ndim] != label_image.shape:
        raise ValueError(
            f"Label image {label_image_path.name} has shape {label_image.shape}, "
            f"which does not match hyperspectral image {hyspec_image_path.name} "
            f"with shape {hyspec_image.shape}")

    # Create annotation mask (label value 0 is treated as "not annotated")
    annotation_mask = label_image!=0

    # Get spectra and labels for annotated pixels only
    spectra = hyspec_image[annotation_mask]
    labels = label_image[annotation_mask]

    # Create random indices for subset (without replacement). The count is
    # truncated towards zero, so images with very few annotated pixels may
    # contribute no samples.
    n_random_samp = int(samples_frac*spectra.shape[0])
    random_indices = rng.choice(spectra.shape[0],size=n_random_samp,replace=False,shuffle=False)

    # Extract spectra and labels, and append to lists
    spectra_subset_list.append(spectra[random_indices])
    labels_subset_list.append(labels[random_indices])

100%|██████████| 73/73 [04:36<00:00,  3.79s/it]


In [11]:
# Stack the per-image subsets along the sample axis into one spectra array (X)
# and one label array (y)
X = np.concatenate(spectra_subset_list, axis=0)
y = np.concatenate(labels_subset_list, axis=0)

In [12]:
# Show the shapes of the spectra and label arrays, one per line
print(X.shape, y.shape, sep='\n')

(2061350, 255)
(2061350,)


In [16]:
# Size of X in memory: total number of bytes divided by 2**20
x_size_mb = X.nbytes/(2**20)
print(f"Total memory needed for X: {x_size_mb} MB")

Total memory needed for X: 2005.173683166504 MB


In [19]:
# Current local time as text, used to make the output file name unique
timestamp = f"{datetime.datetime.now():%Y-%m-%d-%Hh%Mm%Ss}"

In [20]:
# Save to file
# The file name is a fixed prefix followed directly by the timestamp; both
# arrays are stored in one .npz archive under the keys 'X' and 'y'.
spectra_file_name = '20220823_Vega_Sola_ExtractedSpectraAndLabels' + timestamp + '.npz'
spectra_file = str(sampled_spectra_dir / spectra_file_name)
np.savez(spectra_file, X=X, y=y)

In [21]:
# Read the saved archive back as a check: list the stored array names and
# load every array into the `data` dict while the file is still open.
data = {}
with np.load(spectra_file) as npz_files:
    stored_names = npz_files.files
    print('Found the following Numpy arrays in the saved file:')
    for variable_name in stored_names:
        print(variable_name)
        data[variable_name] = npz_files[variable_name]

Found the following Numpy arrays in the saved file:
X
y
