In [None]:
# Imports 
import os
from os import listdir
import random
import itertools
from pathlib import Path

In [None]:
# Main parameters
# Root directory of the INRIA Aerial Image Labeling Dataset.
# Set the INRIA_DATASET_ROOT environment variable to use another location; the
# original workstation path is kept as the default so existing setups still work.
inria_dataset_root_dir = Path(
    os.environ.get(
        "INRIA_DATASET_ROOT",
        "/media/dlsupport/DATA1/EOData/INRIA/AerialImageDataset",
    )
)

# Define INRIA data dataframe

The INRIA Aerial Image Labeling Dataset has the following structure : 

<!-- language: lang-none -->

    .
    ├── test
    │   └── images
    └── train
        ├── gt
        └── images
        
Each image has a filename of the form {town_prefix}{i}.tif with i in [1:36]. The ground truth (gt) mask and its image have the same filename.
First we load the information about the train images with the *load_geo_img_dir* utility function of EOTorchLoader.

In [None]:
import pandas as pd
from eotorchloader.dataset.utils import load_geo_img_dir

In [None]:
# Scan the train images directory with load_geo_img_dir; later cells use the
# result as a dataframe and read its "name" and "path" columns.
train_images_dir = inria_dataset_root_dir / "train" / "images"
inria_train_val_df = load_geo_img_dir(train_images_dir)

In [None]:
# Preview the first 10 rows of the loaded frame to check its columns.
inria_train_val_df.head(10)

Then we split the train data into a train and a val dataset. This can be done in multiple ways:

 * set the val images as the first images of each town. For the INRIA dataset, the val images are usually the first 6 images of each town.
 * set val as all the images of one town and train as the images of all the other towns.
 
To use these two splits we add 2 columns to the extracted dataframe :
 
 * a column with the town id/name
 * a column "standard_split" which takes its value in ["train", "val", "test"]


In [None]:
# Number of leading images of each town held out for validation.
N_VAL_IMAGES_PER_TOWN = 6

# First add the town and image-number columns, parsed from names of the form
# {town_prefix}{i}. The pattern is a raw string: backslash escapes such as "\-"
# in a plain literal are invalid Python string escapes (DeprecationWarning,
# SyntaxWarning from Python 3.12). The number group only accepts digits so that
# stray characters never reach the int conversion.
town_and_num = inria_train_val_df["name"].str.extract(r"([a-zA-Z-]+)(\d+)", expand=True)
unparsed_names = inria_train_val_df.loc[town_and_num.isna().any(axis=1), "name"]
if not unparsed_names.empty:
    # Fail early with the offending names instead of a NaN-to-int cast error.
    raise ValueError(
        f"Cannot split these image names into town + number: {unparsed_names.tolist()}"
    )
inria_train_val_df[["town", "num"]] = town_and_num
inria_train_val_df["num"] = inria_train_val_df["num"].astype(int)

# Then add the standard_split column: every image is "train" unless its number
# is among the first N_VAL_IMAGES_PER_TOWN of its town, in which case it is "val".
inria_train_val_df["standard_split"] = "train"
inria_train_val_df.loc[
    inria_train_val_df["num"] <= N_VAL_IMAGES_PER_TOWN, "standard_split"
] = "val"

# Finally rename "path" to "img_path" and add a "msk_path" column holding the
# corresponding mask path. Only the "images" directory that directly contains
# the file is swapped for "gt", so an "images" component higher up in the
# dataset root is left untouched.
inria_train_val_df = inria_train_val_df.rename(columns={"path": "img_path"})
inria_train_val_df["msk_path"] = inria_train_val_df["img_path"].str.replace(
    r"(?<![^\\/])images(?=[\\/][^\\/]*$)", "gt", regex=True
)

In [None]:
# Show the parsed town / number / split assignment of every image,
# followed by the distinct town names that were found.
split_overview_columns = ['name', 'town', 'num', 'standard_split']
split_overview = inria_train_val_df[split_overview_columns]
print(split_overview)
town_names = inria_train_val_df['town'].unique()
print(town_names)

Once we have the lists of images and masks we can initialize a TorchDataset which crops the images.

 * the tile_size is set in pixels
 * by default no transform is applied and the samples are in the form {"image" : np.array, "mask" :np.array } in channel first order (CHW or rasterio like)

In [None]:
# import for use in train code
from eotorchloader.dataset.scene_dataset import LargeImageDataset

In [None]:
# Keep only the rows assigned to the "train" split.
is_train_row = inria_train_val_df["standard_split"] == "train"
inria_train_df = inria_train_val_df[is_train_row]

# Image and mask paths come from the same filtered frame, so they stay row-aligned.
image_files_train = inria_train_df["img_path"].values
mask_files_train = inria_train_df["msk_path"].values
for sample_paths in (image_files_train, mask_files_train):
    print(sample_paths[:5])

# Tiled dataset over the training scenes: 512-pixel tiles, no transform.
# NOTE(review): band lists are presumably 1-based, rasterio-style indices —
# confirm against LargeImageDataset.
train_dataset_tile = LargeImageDataset(
    image_files=image_files_train,
    mask_files=mask_files_train,
    tile_size=512,
    transforms=None,
    image_bands=[1, 2, 3],
    mask_bands=[1],
)

In [None]:
import numpy as np
# Fetch one sample from the tiled dataset and report its structure as a sanity check.
# NOTE(review): index 195 is assumed to be a valid index of train_dataset_tile —
# confirm if the tile size or the train subset changes.
test_idx = 195
test_data = train_dataset_tile[test_idx]
print(f" keys : {test_data.keys()}")
img_shape = test_data['image'].shape
msk_shape = test_data['mask'].shape
# The printed values are array shapes, so the labels say "shape" (they used to say "type").
print(f" image shape : {img_shape}, mask shape : {msk_shape}")

In [None]:
# Value distribution of the sampled mask: a 10-bin histogram,
# then the distinct values present in the tile.
sample_mask = test_data['mask']
mask_histogram = np.histogram(sample_mask, bins=10)
print(mask_histogram)
mask_values = np.unique(sample_mask)
print(mask_values)