<a href="https://colab.research.google.com/github/marlenebauer/Deep_learning/blob/main/Filter_masks_above_median.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook filters out those mask and raster tiles of training_tiles_above_median that are already part of the testing set. Because the data is split into training, validation and testing sets before these tiles are added to the data augmentation, we need to make sure that none of them are already in the testing set.
This keeps the test set a true representation of unseen data, allowing for an accurate evaluation of the model’s performance.


## Set up

In [None]:
# Connect Google Drive (Colab only): the tile folders used below live under /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import the required libraries (nothing is installed here; they must already be available)
import numpy as np
import keras
from keras import layers
from tensorflow.keras import backend as K
import tensorflow as tf
import tensorflow_datasets as tfdatasets
import glob
import random
import os
import shutil

## Load tiles and generate training, validation and testing data

In [None]:
# Set the paths to the folders containing the raster and mask tiles (.npy files)
# NOTE: 'maks_filtered_zero' is the spelling that appears in the recorded mask paths,
# so it is presumably the real folder name on Drive — do not "correct" it here.
raster_folder= '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_zero/rasters_filtered_zero'
mask_folder= '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_zero/maks_filtered_zero'

8260
8260


In [None]:
# Collect every .npy tile path from both folders, each list in sorted order
mask_files = glob.glob(f"{mask_folder}/*.npy")
mask_files.sort()
print(f"Total number of masks: {len(mask_files)}")

raster_files = glob.glob(f"{raster_folder}/*.npy")
raster_files.sort()
print(f"Total number of rasters: {len(raster_files)}")

In [None]:
# Stack the data: one row per tile, column 0 = raster path, column 1 = mask path.
# The lists built above are `raster_files` / `mask_files`; both are sorted, so rows
# only line up if the two folders hold the same number of tiles. Fail early otherwise.
assert len(raster_files) == len(mask_files), (
    f"Raster/mask count mismatch: {len(raster_files)} rasters vs {len(mask_files)} masks"
)
data = np.column_stack((raster_files, mask_files))

In [None]:
# Number of (raster path, mask path) rows
len(data)

8260

In [None]:
# Inspect the first (raster path, mask path) row before shuffling
data[0, ]

array(['/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_zero/rasters_filtered_zero/32562_5513_raster_tile_0_1650.npy',
       '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_zero/maks_filtered_zero/32562_5513_mask_tile_0_1650.npy'],
      dtype='<U132')

In [None]:
# Seed NumPy's global RNG, then shuffle the rows of `data` in place.
# NOTE(review): presumably the same seed and split are used where the model is trained,
# so that `testing` below is the same test set — confirm.
# Caution: np.random.shuffle mutates `data`. Re-running this cell without re-running the
# stacking cell shuffles the already-shuffled array and yields a different order.
np.random.seed(5) # set seed to ensure reproducibility
np.random.shuffle(data)
data[0,]

array(['/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_zero/rasters_filtered_zero/32571_5516_raster_tile_2250_1100.npy',
       '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_zero/maks_filtered_zero/32571_5516_mask_tile_2250_1100.npy'],
      dtype='<U132')

In [None]:
# Split in train, val and test
prop_train = 0.70 # 70% for training
prop_val = 0.15 #15% for validation -> 15% for testing

# compute the split indices (int() truncates towards zero)
train_idx= int(prop_train * len(data))
val_idx= int((prop_train+prop_val)*len(data))

# Split the data: rows [0, train_idx) -> training, [train_idx, val_idx) -> validation,
# [val_idx, end) -> testing
training, validation, testing = np.split(data, [train_idx, val_idx])

# Check training[0] (same row as data[0], since training is the leading slice)
training[0,]

array(['/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_zero/rasters_filtered_zero/32571_5516_raster_tile_2250_1100.npy',
       '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_zero/maks_filtered_zero/32571_5516_mask_tile_2250_1100.npy'],
      dtype='<U132')

In [None]:
# Sizes of the training, validation and testing subsets, one per line
print(len(training), len(validation), len(testing), sep='\n')

5782
1239
1239


In [None]:
# Dataset of (raster path, mask path) string pairs for the training subset
train_ds = tf.data.Dataset.from_tensor_slices((training[:,0], training[:,1]))

In [None]:
# Inspect the type of the dataset object
type(train_ds)

In [None]:
# Number of elements in the training dataset
len(train_ds)

5782

In [None]:
# Function to load .npy files
def load_npy_files(raster_path, mask_path):
    """Load one raster/mask pair from .npy files as float32 arrays.

    Args:
        raster_path: Object whose ``.numpy()`` returns the UTF-8 encoded path of the
            raster file (e.g. the string tensor handed over by ``tf.py_function``).
        mask_path: Same as ``raster_path``, for the mask file.

    Returns:
        Tuple ``(raster, mask)``:
        raster -- the stored array with its axes reordered by ``(1, 2, 0)``
            (presumably channels-first on disk -> channels-last; confirm against the tiles).
        mask -- the stored array with a trailing axis of length 1 appended.
        Both are cast to float32 so they match the dtypes declared by the
        ``tf.py_function`` wrapper instead of relying on implicit conversion.
    """
    raster = np.load(raster_path.numpy().decode('utf-8'))
    raster = np.transpose(raster, (1, 2, 0)).astype(np.float32, copy=False)

    mask = np.load(mask_path.numpy().decode('utf-8'))
    mask = np.expand_dims(mask, axis=-1).astype(np.float32, copy=False)
    return raster, mask

# TensorFlow wrapper for loading .npy files
def load_npy_tf(raster_path, mask_path):
    """Run ``load_npy_files`` through ``tf.py_function`` and attach static shapes.

    Returns the ``(raster, mask)`` tensors, both declared as float32, with their
    shapes set to ``[128, 128, 3]`` and ``[128, 128, 1]`` respectively.
    """
    loaded = tf.py_function(
        func=load_npy_files,
        inp=[raster_path, mask_path],
        Tout=[tf.float32, tf.float32],
    )
    raster, mask = loaded
    # py_function outputs carry no static shape, so declare it explicitly
    raster.set_shape([128, 128, 3])
    mask.set_shape([128, 128, 1])
    return raster, mask

In [None]:
# Build the loading pipeline directly from the `training` path pairs so that this cell is
# idempotent: re-running it never applies `load_npy_tf` to tensors that are already loaded.
train_ds = tf.data.Dataset.from_tensor_slices((training[:, 0], training[:, 1])).map(load_npy_tf)

In [None]:
# Check the shape and dtype of our first (raster, mask) element
# without dumping the full pixel arrays into the notebook output
for element in train_ds.take(1):
    raster_tile, mask_tile = element
    print(f"raster: shape={raster_tile.shape}, dtype={raster_tile.dtype}")
    print(f"mask: shape={mask_tile.shape}, dtype={mask_tile.dtype}")

(<tf.Tensor: shape=(128, 128, 3), dtype=float32, numpy=
array([[[ 83.,  89.,  69.],
        [ 81.,  85.,  67.],
        [ 80.,  85.,  67.],
        ...,
        [138., 126., 134.],
        [139., 127., 136.],
        [135., 125., 134.]],

       [[ 83.,  89.,  70.],
        [ 82.,  87.,  68.],
        [ 79.,  84.,  66.],
        ...,
        [137., 126., 134.],
        [137., 126., 135.],
        [137., 126., 135.]],

       [[ 75.,  82.,  65.],
        [ 79.,  85.,  67.],
        [ 79.,  86.,  67.],
        ...,
        [138., 126., 135.],
        [137., 126., 135.],
        [136., 125., 134.]],

       ...,

       [[123., 113., 126.],
        [159., 145., 154.],
        [171., 154., 158.],
        ...,
        [159., 145., 146.],
        [179., 162., 157.],
        [162., 143., 132.]],

       [[ 79.,  81., 117.],
        [101.,  95., 114.],
        [149., 136., 145.],
        ...,
        [151., 138., 141.],
        [168., 152., 151.],
        [179., 161., 153.]],

       [[ 87.,  

In [None]:
# Number of elements in the training dataset after mapping the loader
len(train_ds)

5782

In [None]:
# Validation and testing pipelines: path pairs -> loaded (raster, mask) tensors
val_ds = tf.data.Dataset.from_tensor_slices(
    (validation[:, 0], validation[:, 1])
).map(load_npy_tf)

test_ds = tf.data.Dataset.from_tensor_slices(
    (testing[:, 0], testing[:, 1])
).map(load_npy_tf)

## Filter out tiles that are already in the testing set

In [None]:
# Filter out those masks/rasters of Training_tiles_filtered_median that are already included in testing.
# This is necessary because we first split the data into training, validation and testing and only then
# add the above-median masks/rasters to our data augmentation.
# Therefore, we have to make sure that the masks/rasters used for the data augmentation are not part of
# the testing dataset.
# Load the high-priority tiles
prio_mask_folder = '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_median/masks_filtered_median'
prio_raster_folder = '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_median/rasters_filtered_median'

prio_raster_paths = sorted(glob.glob(prio_raster_folder+'/*.npy'))
prio_mask_paths = sorted(glob.glob(prio_mask_folder+'/*.npy'))

# Both counts are expected to be equal (one mask per raster)
print(len(prio_raster_paths))
print(len(prio_mask_paths))

4129
4129


In [None]:
# stack the data
# NOTE: disabled — `prio_data` is not used by any of the cells below.
#prio_data = np.column_stack((prio_raster_paths, prio_mask_paths))

In [None]:
# Extract the file names of the masks in the testing dataset (column 1 holds the mask path)
testing_mask_filenames = [os.path.basename(pair[1]) for pair in testing]

# Show the count and a small sample instead of printing every file name
print(len(testing_mask_filenames))
print(testing_mask_filenames[:5])

In [None]:
# Only keep those priority masks whose file name is not already in the testing dataset.
# A set gives constant-time membership tests instead of scanning the whole list for every path.
testing_mask_filename_set = set(testing_mask_filenames)
filtered_prio_mask_paths = [
    path for path in prio_mask_paths
    if os.path.basename(path) not in testing_mask_filename_set
]


In [None]:
# Number of priority masks left after removing those present in testing
print(len(filtered_prio_mask_paths))

3502


In [None]:
# Number of priority masks before filtering, for comparison
print(len(prio_mask_paths))

4129


## Get the matching rasters

In [None]:
# get the matching rasters for the filtered_prio_mask_paths
def extract_common_part(filename):
    """Return the tile key shared by a raster file and its mask file.

    The base name is split on '_'. With at least six pieces, the key is
    ``(first two pieces joined by '_', last two pieces joined by '_')`` with
    '.npy' removed from the final piece, e.g. ``("32562_5513", "1000_1200")``.
    Names with fewer than six pieces yield ``(None, None)``.
    """
    tokens = os.path.basename(filename).split('_')
    if len(tokens) < 6:
        return None, None

    grid_id = '_'.join(tokens[:2])  # "32562_5513"
    tile_offsets = '_'.join((tokens[-2], tokens[-1].replace('.npy', '')))  # "1000_1200"
    return grid_id, tile_offsets

In [None]:
# Extract the tile keys of all priority rasters and of the masks that survived filtering
raster_common_parts = set(extract_common_part(f) for f in prio_raster_paths)
mask_common_parts = set(extract_common_part(f) for f in filtered_prio_mask_paths)

# extract_common_part returns (None, None) for file names it cannot parse. Drop that
# sentinel so unparseable rasters and masks are never treated as a matching pair.
raster_common_parts.discard((None, None))
mask_common_parts.discard((None, None))

In [None]:
# Keep the filtered masks whose tile key also occurs among the rasters
matching_mask_files = list(filter(
    lambda mask_path: extract_common_part(mask_path) in raster_common_parts,
    filtered_prio_mask_paths,
))
print(f"Number of matching masks: {len(matching_mask_files)}")


In [None]:
# Keep the rasters whose tile key also occurs among the filtered masks
matching_raster_files = list(filter(
    lambda raster_path: extract_common_part(raster_path) in mask_common_parts,
    prio_raster_paths,
))
print(f"Number of matching rasters: {len(matching_raster_files)}")

Number of matching rasters: 3502


## Export filtered tiles

In [None]:
# Export the matching rasters
filtered_rasters_folder = '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_median_new/rasters_filtered_median_testing'

# exist_ok=True creates the folder when missing and is a no-op when it already exists
os.makedirs(filtered_rasters_folder, exist_ok=True)

# Copy matching raster files to the new folder, keeping each original file name.
# NOTE: files left in the folder by an earlier run are not removed here.
for raster_file in matching_raster_files:
    # Extract the filename from the full path
    filename = os.path.basename(raster_file)

    # Construct the destination path
    destination_path = os.path.join(filtered_rasters_folder, filename)

    # Copy the raster file to the filtered folder
    shutil.copy(raster_file, destination_path)

print(f"Copied {len(matching_raster_files)} raster files to the 'filtered_rasters' folder.")

Copied 3502 raster files to the 'filtered_rasters' folder.


In [None]:
# Export the matching masks
filtered_masks_folder = '/content/drive/MyDrive/deep_learning_project/Training_tiles_filtered_median_new/masks_filtered_median_testing'

# exist_ok=True creates the folder when missing and is a no-op when it already exists
os.makedirs(filtered_masks_folder, exist_ok=True)

# Copy matching mask files to the new folder, keeping each original file name.
# NOTE: files left in the folder by an earlier run are not removed here.
for mask_file in matching_mask_files:
    # Extract the filename from the full path
    filename = os.path.basename(mask_file)

    # Construct the destination path
    destination_path = os.path.join(filtered_masks_folder, filename)

    # Copy the mask file to the filtered folder
    shutil.copy(mask_file, destination_path)

print(f"Copied {len(matching_mask_files)} masks files to the 'filtered_masks' folder.")

Copied 3502 masks files to the 'filtered_masks' folder.
