# Dataset preparation

In [1]:
import os

Authenticate with Google Cloud

In [None]:
# Shell escape: runs the gcloud CLI's interactive login flow outside the Python kernel.
# NOTE(review): presumably needed so the later gsutil cells can reach the bucket — confirm.
!gcloud auth login

Download the hand-labeled Sentinel-2 scenes (S2Hand) and their label masks (LabelHand)

In [31]:
# Local folders for the hand-labeled scenes and their label masks.
# The same relative paths are reused by the download cell below.
S2Hand_path = "sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand"
LabelHand_path = "sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand"

# Create both folders up front; exist_ok keeps the cell safe to re-run.
for local_dir in (S2Hand_path, LabelHand_path):
    os.makedirs(local_dir, exist_ok=True)

In [None]:
!gsutil -m rsync -r gs://sen1floods11/$S2Hand_path $S2Hand_path
!gsutil -m rsync -r gs://sen1floods11/$LabelHand_path $LabelHand_path

Dataset splits (train / validation / test / Bolivia)

In [13]:
# Local folder that will hold the split CSV files.
splits_path = "sen1floods11/v1.1/splits/flood_handlabeled"

# exist_ok keeps the cell safe to re-run once the folder is there.
os.makedirs(name=splits_path, exist_ok=True)

In [None]:
!gsutil -m rsync -r gs://sen1floods11/$splits_path $splits_path

# Step 1

## Number of images in each split and each region

### By split:

In [25]:
import pandas as pd

# The split CSVs have no header row; each line pairs a scene file with its mask file.
split_columns = ["scene", "mask"]

# Build the four CSV locations in one pass (same order as the names on the left).
bolivia_data_path, train_data_path, val_data_path, test_data_path = (
    os.path.join(splits_path, csv_name)
    for csv_name in (
        "flood_bolivia_data.csv",
        "flood_train_data.csv",
        "flood_valid_data.csv",
        "flood_test_data.csv",
    )
)

# Load every split with identical parsing options.
bolivia_data, train_data, valid_data, test_data = (
    pd.read_csv(csv_path, header=None, names=split_columns)
    for csv_path in (bolivia_data_path, train_data_path, val_data_path, test_data_path)
)

In [29]:
# Preview the first five rows of the Bolivia split to check the scene/mask pairing.
bolivia_data.head(n=5)

Unnamed: 0,scene,mask
0,Bolivia_103757_S1Hand.tif,Bolivia_103757_LabelHand.tif
1,Bolivia_129334_S1Hand.tif,Bolivia_129334_LabelHand.tif
2,Bolivia_195474_S1Hand.tif,Bolivia_195474_LabelHand.tif
3,Bolivia_23014_S1Hand.tif,Bolivia_23014_LabelHand.tif
4,Bolivia_233925_S1Hand.tif,Bolivia_233925_LabelHand.tif


In [28]:
# One row per scene, so the row count of each table is the number of scenes in that split.
num_bolivia, num_train, num_valid, num_test = (
    len(split_table)
    for split_table in (bolivia_data, train_data, valid_data, test_data)
)

# Report the counts in a fixed order with a shared message template.
for split_label, scene_count in (
    ("Bolivia", num_bolivia),
    ("training", num_train),
    ("validation", num_valid),
    ("test", num_test),
):
    print(f"Number of {split_label} scenes: {scene_count}")

Number of Bolivia scenes: 15
Number of training scenes: 252
Number of validation scenes: 89
Number of test scenes: 90


### By region:

List all files in `S2Hand_path`.
Each file follows the naming scheme EVENT_CHIPID_LAYER.tif (e.g. Bolivia_103757_S2Hand.tif), so the EVENT prefix before the first underscore identifies the region.

In [None]:
# Count the .tif scenes per region, keyed by the EVENT prefix of each filename.
region_counts = {}

# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
# makes the dict's insertion order (and the table built from it) reproducible.
for filename in sorted(os.listdir(S2Hand_path)):
    if not filename.endswith(".tif"):
        continue  # ignore anything that is not a scene file
    region_name = filename.split('_')[0]  # EVENT part of EVENT_CHIPID_LAYER.tif
    region_counts[region_name] = region_counts.get(region_name, 0) + 1

In [75]:
# Turn the {region: count} mapping into a two-column table (Region, Count).
# Built as one chain so the frame is never mutated in place.
df = (
    pd.DataFrame.from_dict(region_counts, orient='index', columns=['Count'])
    .rename_axis('Region')
    .reset_index()
)
df

Unnamed: 0,Region,Count
0,Bolivia,15
1,Ghana,53
2,India,68
3,Mekong,30
4,Nigeria,18
5,Pakistan,28
6,Paraguay,67
7,Somalia,26
8,Spain,30
9,Sri-Lanka,42


In [34]:
# Total number of scenes across all regions.
total_number = sum(scene_count for scene_count in region_counts.values())
total_number

446

## Per-channel mean and standard deviation

In [51]:
import rasterio
import numpy as np

# Collect every .tif scene in the folder. os.listdir gives no ordering
# guarantee, so the names are sorted: scenes_list[i] (and axis 0 of the stacked
# array built from it) then refers to the same scene on every machine and run.
image_paths = [
    os.path.join(S2Hand_path, filename)
    for filename in sorted(os.listdir(S2Hand_path))
    if filename.endswith('.tif')
]

scenes_list = []

# Read each scene fully into memory; the context manager closes the file
# handle as soon as its pixels have been read.
for image_path in image_paths:
    with rasterio.open(image_path) as src:
        scenes_list.append(src.read())

In [59]:
# Optional visual sanity check, left disabled: uncomment both lines to display
# the first band of the first loaded scene in grayscale.
# from matplotlib import pyplot as plt

# plt.imshow(scenes_list[0][0, :, :], cmap='gray')

In [54]:
# Convert into (scenes, bands, height, width)
scenes_array = np.array(scenes_list)
scenes_array.shape

(446, 13, 512, 512)

In [74]:
import utils

In [None]:
# Apply the project helper that (per its name) converts digital numbers to reflectance.
# The input is rebuilt from the untouched `scenes_list` instead of being read
# back from `scenes_array`, so re-running this cell can never push
# already-converted values through the conversion a second time.
# NOTE(review): this allocates a temporary raw copy of the stack — confirm memory headroom.
scenes_array = utils.dn2reflectance(np.array(scenes_list))

In [64]:
# Ask the project-local `utils` module for the "sentinel2" wavelength information,
# then derive the band labels from it. These labels become the "Band" column of the
# statistics table below.
# NOTE(review): the exact structure returned by both helpers lives in utils — verify there.
wavelengths = utils.get_satellite_wavelength("sentinel2")
bands_names = utils.get_bands_names(wavelengths)

In [None]:
# Reduce over the scene, height and width axes so one statistic per band remains
# (the band axis is axis 1 of the stacked array).
per_band_axes = (0, 2, 3)
means = scenes_array.mean(axis=per_band_axes)
std_devs = scenes_array.std(axis=per_band_axes)

In [73]:
# One row per band: its label, mean and standard deviation.
stats_columns = {
    "Band": bands_names,
    "Mean": means,
    "STD": std_devs,
}
channel_stats = pd.DataFrame(stats_columns)
channel_stats

Unnamed: 0,Band,Mean,STD
0,B01,0.161226,0.069464
1,B02,0.137989,0.073459
2,B03,0.134443,0.073161
3,B04,0.119516,0.08606
4,B05,0.143917,0.077136
5,B06,0.234425,0.092163
6,B07,0.279647,0.108803
7,B08,0.257841,0.102925
8,B8A,0.302382,0.120511
9,B09,0.047673,0.033169
