# Ethiopia Dataset Gen

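Generate and prepare the Ethiopia dataset: sample per-class pixel values from each image/label raster pair and collect them into a single table.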

In [1]:
import os
import glob
import xarray as xr
import numpy as np
import matplotlib as plt
import pandas as pd
from sklearn.utils import shuffle

## Define Variables

In [2]:
# Pixel value treated as "no data": sampled points whose first band equals
# this value are skipped during dataset generation.
nodata_val = -10001

# Number of label classes; class ids run from 0 to max_classes - 1.
max_classes = 6

# One column name per image band, in the order the bands appear in the raster.
bands = 'CB B G Y R RE NIR1 NIR2'.split()

# Number of points to sample from each image, keyed first by the image
# file name and then by class id (as a string).
dataset_metadata = {
    'WV03_20141024_M1BS_1040010003ACA100-toa_Gonji_5class.tif': {
        '0': 75000, '1': 75000, '2': 0,
        '3': 825, '4': 100000, '5': 0,
    },
    'WV03_20141118_M1BS_1040010004D47B00-toa_Gonji_5class.tif': {
        '0': 75000, '1': 75000, '2': 100000,
        '3': 99725, '4': 100000, '5': 0,
    },
    'WV03_20141207_M1BS_10400100053C0600-toa_Gonji_5class.tif': {
        '0': 75000, '1': 75000, '2': 100000,
        '3': 99725, '4': 100000, '5': 0,
    },
    'WV03_20180114_M1BS_1040010037AF5F00-toa_Gonji_6class.tif': {
        '0': 75000, '1': 75000, '2': 100000,
        '3': 99725, '4': 0, '5': 300000,
    },
}

## Define Data Location Paths

In [3]:
# Directory holding the image rasters (*.tif).
images_path: str = (
    '/Users/jacaraba/Desktop/development/ilab/ethiopia-lcluc'
    '/adapt-data/data/images'
)
# Directory holding the label rasters (*.tif).
labels_path: str = (
    '/Users/jacaraba/Desktop/development/ilab/ethiopia-lcluc'
    '/adapt-data/data/labels'
)

In [4]:
# Collect every *.tif in each directory and sort the names; the two lists
# are later paired position-by-position, so presumably the sort is what
# keeps each image aligned with its label.
images_list: list = glob.glob(os.path.join(images_path, '*.tif'))
images_list.sort()
labels_list: list = glob.glob(os.path.join(labels_path, '*.tif'))
labels_list.sort()

## Generate Dataset

In [None]:
# Sample labeled pixels from every image/label pair into one table with a
# column per band plus an integer 'CLASS' column.
# For each image and each class id, up to dataset_metadata[filename][class]
# pixels are drawn at random from the pixels carrying that label, skipping
# pixels whose first band equals nodata_val.
columns = bands + ['CLASS']
list_points = []  # one DataFrame per (image, class) that yielded points

for image_path, label_path in zip(images_list, labels_list):

    # open imagery; the file's base name is the key into dataset_metadata
    # NOTE(review): xr.open_rasterio is deprecated in newer xarray releases
    # in favour of rioxarray.open_rasterio - confirm the pinned version.
    filename = os.path.basename(image_path)
    image = xr.open_rasterio(image_path).values
    label = xr.open_rasterio(label_path).values

    # Some preprocessing: drop singleton dimensions from the label raster
    # and shift labels that start at 1 so that classes start at 0
    label = np.squeeze(label) if len(label.shape) != 2 else label
    label = label - 1 if np.min(label) == 1 else label
    print(image.shape, label.shape, np.unique(label), filename)

    for c in range(max_classes):

        num_points = dataset_metadata[filename][str(c)]

        # we extract all class c points from the imagery
        x_indices, y_indices = np.where(label == c)
        print(f"Class {c}:", x_indices.shape, y_indices.shape)

        # nothing to sample: class absent from this image or quota is zero
        if x_indices.shape[0] == 0 or num_points <= 0:
            continue

        # we make sure values are fully shuffled
        x_indices, y_indices = shuffle(x_indices, y_indices)

        # keep, in shuffled order, only pixels whose first band is valid and
        # cap at the quota; slicing (rather than walking an index until the
        # quota is met) cannot run past the end when too few pixels exist
        valid = image[0, x_indices, y_indices] != nodata_val
        x_selected = x_indices[valid][:num_points]
        y_selected = y_indices[valid][:num_points]

        if x_selected.shape[0] < num_points:
            print(
                f"Class {c}: only {x_selected.shape[0]} of {num_points} "
                "requested points available")
        if x_selected.shape[0] == 0:
            continue

        # one row per point: band values followed by the integer label
        sv = image[:, x_selected, y_selected].T
        lv = label[x_selected, y_selected].astype(int)
        list_points.append(
            pd.DataFrame(np.column_stack([sv, lv]), columns=columns))

# ignore_index gives the combined table a unique running index; fall back to
# an empty table with the right columns when nothing was sampled
# (pd.concat raises on an empty list)
df_points = (
    pd.concat(list_points, ignore_index=True)
    if list_points else pd.DataFrame(columns=columns)
)
print(df_points)

(8, 8069, 2484) (8069, 2484) [ 0  1  3  4 14] WV03_20141024_M1BS_1040010003ACA100-toa_Gonji_5class.tif
Class 0: (15408517,) (15408517,)
Class 1: (4131660,) (4131660,)
Class 2: (0,) (0,)
Class 3: (830,) (830,)
Class 4: (481725,) (481725,)
Class 5: (0,) (0,)
(8, 8069, 8003) (8069, 8003) [ 0  1  2  3  4 14] WV03_20141118_M1BS_1040010004D47B00-toa_Gonji_5class.tif
Class 0: (47905479,) (47905479,)
Class 1: (14533773,) (14533773,)
Class 2: (147723,) (147723,)
Class 3: (206861,) (206861,)
Class 4: (1715572,) (1715572,)


In [None]:
# NOTE: this cell is a bare triple-quoted string, so none of the code inside
# it runs. It appears to be an earlier version of the point extraction kept
# for reference: it samples random pixel positions until each class quota is
# met, and it refers to names (data_df, args, random, logging) that are not
# defined in the cells shown above.
"""
# ----------------------------------------------------------------------------
        # 2. Extract points out of spatial imagery - rasters
        # ----------------------------------------------------------------------------
        # set empty dataframe to store point values
        #list_points = []
        #df_points = pd.DataFrame(columns=args.bands + ['CLASS'])
        #logging.info(f"Generating {data_df['ntiles'].sum()} points dataset.")

        # start iterating over each file
        #for di in data_df.index:

            # get filename for output purposes, in the future, add to column
            #filename = data_df['data'][di].split('/')[-1]
            #logging.info(f'Processing {filename}')

            # read imagery from disk and process both image and mask
            #img = xr.open_rasterio(data_df['data'][di]).values
            #mask = xr.open_rasterio(data_df['label'][di]).values

            # ------------------------------------------------------------------------
            # Unique processing of this project - Start
            # ------------------------------------------------------------------------
            # squeeze mask if needed, start classes fro 0-n_classes
            #mask = np.squeeze(mask) if len(mask.shape) != 2 else mask
            #mask = mask - 1 if np.min(mask) == 1 else mask
            # ------------------------------------------------------------------------
            # Unique processing of this project - Done
            # ------------------------------------------------------------------------

            # crop ROI, from outside to inside based on pixel address
            ymin, ymax = data_df['ymin'][di], data_df['ymax'][di]
            xmin, xmax = data_df['xmin'][di], data_df['xmax'][di]
            img, mask = \
                img[:, ymin:ymax, xmin:xmax], mask[ymin:ymax, xmin:xmax]

            # crop ROI, from outside to inside based on pixel value
            # img = np.clip(img, 0, 10000)

            # get N points from imagery
            points_per_class = data_df['ntiles'][di] // args.n_classes
            #logging.info(f'Generating {points_per_class} points per class.')

            # extract values from imagery, two classes
            for cv in range(args.n_classes):

                logging.info(f'Starting with class: {cv}')
                bbox = img.shape  # size of the image
                counter = 0  # counter for class balancing

                while counter < points_per_class:

                    # get indices and extract spectral and class value
                    y, x = random.randrange(bbox[1]), random.randrange(bbox[2])
                    sv, lv = img[:, y, x], int(mask[y, x])

                    if lv == cv:
                        # trying speed up here - looks like from list is faster
                        list_points.append(
                            pd.DataFrame(
                                [np.append(sv, [lv])],
                                columns=list(df_points.columns))
                        )
                        counter += 1

        df_points = pd.concat(list_points)

        # ----------------------------------------------------------------------------
        # 3. Save file to disk
        # ----------------------------------------------------------------------------
        df_points.to_csv(args.train_csv, index=False)
        logging.info(f'Saved dataset file {args.train_csv}')
"""