# Ethiopia Dataset Gen

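Generate and prepare the Ethiopia training dataset: for each image/label raster pair, sample a configured number of labelled pixels per class and write their band values and class ids to a single CSV file.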

In [6]:
import os
import glob
import xarray as xr
import numpy as np
import matplotlib as plt
import pandas as pd
from sklearn.utils import shuffle

## Define Variables

In [11]:
# Name of the CSV file that receives the sampled points.
output_filename = 'train_data_ethiopia_v2.csv'

# First-band value that marks a pixel as "no data"; such pixels are skipped
# when sampling.
nodata_val = -10001

# Class ids 0 .. max_classes - 1 are sampled from every label raster.
max_classes = 6

# Names of the band columns in the output table; a 'CLASS' column is added
# after them when the table is built.
bands = ['CB', 'B', 'G', 'Y', 'R', 'RE', 'NIR1', 'NIR2']

# Number of points to extract, keyed first by image file name and then by
# class id (as a string).
dataset_metadata = {
    'WV03_20141024_data.tif': {
        '0': 50000, '1': 50000, '2': 0,
        '3': 825, '4': 65000, '5': 0,
    },
    'WV03_20141118_data.tif': {
        '0': 50000, '1': 50000, '2': 65000,
        '3': 96703, '4': 65000, '5': 0,
    },
    'WV03_20141207_data.tif': {
        '0': 50000, '1': 50000, '2': 65000,
        '3': 95703, '4': 65000, '5': 0,
    },
    'WV03_20180114_data.tif': {
        '0': 50000, '1': 50000, '2': 65000,
        '3': 7769, '4': 0, '5': 186796,
    },
}

## Define Data Location Paths

In [12]:
# Directories that are searched for the image rasters and the label rasters.
# Alternative locations that were previously commented out in this cell:
#   images: /att/pubrepo/ILAB/projects/Ethiopia/ethiopia-lcluc/data/images
#   labels: /att/pubrepo/ILAB/projects/Ethiopia/ethiopia-lcluc/data/labels
images_path: str = (
    '/adapt/nobackup/projects/ilab/projects/Ethiopia/LCLUC_Ethiopia/data/images'
)
labels_path: str = (
    '/adapt/nobackup/projects/ilab/projects/Ethiopia/LCLUC_Ethiopia/data/labels'
)

In [13]:
# Gather every '*.tif' file found directly inside each directory. Both lists
# are sorted by path; the sampling loop pairs them up position by position.
images_list: list = sorted(
    glob.iglob(os.path.join(images_path, '*.tif'))
)
labels_list: list = sorted(
    glob.iglob(os.path.join(labels_path, '*.tif'))
)

## Generate Dataset

In [14]:
# Sample labelled pixels from every image/label raster pair and write them to
# one CSV file. Each output row holds the band values of one pixel followed by
# its class id.
#
# For every image and every class id in range(max_classes):
#   1. locate all pixels of that class in the label raster,
#   2. shuffle them,
#   3. drop pixels whose first band equals nodata_val,
#   4. keep at most the number of points configured in dataset_metadata.

# Fixed seed so that re-running the cell draws the same pixels.
random_seed = 42

# Column layout of the output table: one column per band plus the class id.
point_columns = bands + ['CLASS']

# One DataFrame per (image, class) is collected here and concatenated once at
# the end, instead of building one tiny DataFrame per pixel.
list_points = []

for image_path, label_path in zip(images_list, labels_list):

    # open imagery
    # NOTE: xr.open_rasterio is deprecated in newer xarray releases in favour
    # of rioxarray.open_rasterio; it is kept here to avoid a new dependency.
    filename = os.path.basename(image_path)
    image = xr.open_rasterio(image_path).values
    label = xr.open_rasterio(label_path).values

    # Some preprocessing: drop singleton axes from the label raster and shift
    # the class ids down by one when the smallest id is 1.
    label = np.squeeze(label) if len(label.shape) != 2 else label
    label = label - 1 if np.min(label) == 1 else label
    print(image.shape, label.shape, np.unique(label), filename)

    # Label indices are used to index the image, so both rasters must cover
    # the same pixel grid. Fail loudly instead of sampling misaligned pixels.
    if image.shape[1:] != label.shape:
        raise ValueError(
            f'Image and label shapes do not match for {filename}: '
            f'{image.shape} vs {label.shape}'
        )

    for c in range(max_classes):

        num_points = dataset_metadata[filename][str(c)]
        print(f'Class {str(c)} points to extract {str(num_points)}')

        # all pixel positions of class c, in a reproducible random order
        x_indices, y_indices = np.where(label == c)
        x_indices, y_indices = shuffle(
            x_indices, y_indices, random_state=random_seed)
        print(f"Class {c}:", x_indices.shape, y_indices.shape)

        # keep the first num_points shuffled pixels whose first band is valid
        valid = image[0, x_indices, y_indices] != nodata_val
        x_selected = x_indices[valid][:num_points]
        y_selected = y_indices[valid][:num_points]
        selected_points = x_selected.shape[0]

        if selected_points < num_points:
            # previously hidden by a swallowed IndexError
            print(
                f'Class {c}: only {selected_points} valid points available, '
                f'{num_points} requested'
            )

        if selected_points > 0:
            # (bands, n) -> (n, bands): one row per sampled pixel
            band_values = image[:, x_selected, y_selected].T
            # every selected pixel satisfies label == c, so its class id is c
            class_values = np.full((selected_points, 1), c, dtype=int)
            list_points.append(
                pd.DataFrame(
                    np.hstack([band_values, class_values]),
                    columns=point_columns)
            )

        print(selected_points)

# pd.concat raises on an empty list, so fall back to a header-only table
if list_points:
    df_points = pd.concat(list_points, ignore_index=True)
else:
    df_points = pd.DataFrame(columns=point_columns)
df_points.to_csv(output_filename, index=False)

(8, 8069, 2240) (8069, 2240) [ 0  1  3  4 14] WV03_20141024_data.tif
Class 0 points to extract 50000
Class 0: (13875423,) (13875423,)
50000
Class 1 points to extract 50000
Class 1: (3734985,) (3734985,)
50000
Class 2 points to extract 0
Class 2: (0,) (0,)
0
Class 3 points to extract 825
Class 3: (830,) (830,)
825
Class 4 points to extract 65000
Class 4: (444610,) (444610,)
65000
Class 5 points to extract 0
Class 5: (0,) (0,)
0
(8, 8069, 6210) (8069, 6210) [ 0  1  2  3  4 14] WV03_20141118_data.tif
Class 0 points to extract 50000
Class 0: (36815318,) (36815318,)
50000
Class 1 points to extract 50000
Class 1: (11587674,) (11587674,)
50000
Class 2 points to extract 65000
Class 2: (147723,) (147723,)
65000
Class 3 points to extract 96703
Class 3: (206031,) (206031,)
96703
Class 4 points to extract 65000
Class 4: (1300005,) (1300005,)
65000
Class 5 points to extract 0
Class 5: (0,) (0,)
0
(8, 8069, 7108) (8069, 7108) [ 0  1  2  3  4 14] WV03_20141207_data.tif
Class 0 points to extract 50000