# Dataset preparation

In [1]:
import os

Connect to the cloud

In [None]:
# Interactive Google Cloud SDK login; run this before the gsutil download cells below.
!gcloud auth login

Data

In [31]:
# Local directories (relative to the working directory) that will hold the
# hand-labelled Sentinel-2 chips and their label masks.
S2Hand_path = "sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand"
LabelHand_path = "sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand"

# Create both directories up front; exist_ok=True keeps the cell safe to re-run.
for _data_dir in (S2Hand_path, LabelHand_path):
    os.makedirs(_data_dir, exist_ok=True)

In [None]:
!gsutil -m rsync -r gs://sen1floods11/$S2Hand_path $S2Hand_path
!gsutil -m rsync -r gs://sen1floods11/$LabelHand_path $LabelHand_path

Data splits (train / validation / test / Bolivia hold-out)

In [13]:
# Local directory (relative to the working directory) for the split CSV files.
splits_path = "sen1floods11/v1.1/splits/flood_handlabeled"
# exist_ok=True keeps the cell safe to re-run once the directory exists.
os.makedirs(splits_path, exist_ok=True)

In [None]:
!gsutil -m rsync -r gs://sen1floods11/$splits_path $splits_path

# Step 1

## Number of images in each split and each region

### By split:

In [25]:
import pandas as pd

def _read_split(csv_name):
    """Locate one split CSV under ``splits_path`` and load it.

    The split files carry no header row, so the two columns are named
    explicitly: ``scene`` and ``mask``.

    Returns:
        tuple: ``(csv_path, dataframe)`` for the requested file.
    """
    csv_path = os.path.join(splits_path, csv_name)
    return csv_path, pd.read_csv(csv_path, header=None, names=["scene", "mask"])


# One (path, table) pair per split; Bolivia is the held-out region.
bolivia_data_path, bolivia_data = _read_split("flood_bolivia_data.csv")
train_data_path, train_data = _read_split("flood_train_data.csv")
val_data_path, valid_data = _read_split("flood_valid_data.csv")
test_data_path, test_data = _read_split("flood_test_data.csv")

In [29]:
# Peek at the first rows to check that the "scene" / "mask" columns were parsed as expected.
bolivia_data.head()

Unnamed: 0,scene,mask
0,Bolivia_103757_S1Hand.tif,Bolivia_103757_LabelHand.tif
1,Bolivia_129334_S1Hand.tif,Bolivia_129334_LabelHand.tif
2,Bolivia_195474_S1Hand.tif,Bolivia_195474_LabelHand.tif
3,Bolivia_23014_S1Hand.tif,Bolivia_23014_LabelHand.tif
4,Bolivia_233925_S1Hand.tif,Bolivia_233925_LabelHand.tif


In [28]:
# Each row of a split table is one scene, so the row count is the split size.
num_bolivia, num_train, num_valid, num_test = (
    len(split_table)
    for split_table in (bolivia_data, train_data, valid_data, test_data)
)

# Single print call, one line per split.
print(
    f"Number of Bolivia scenes: {num_bolivia}",
    f"Number of training scenes: {num_train}",
    f"Number of validation scenes: {num_valid}",
    f"Number of test scenes: {num_test}",
    sep="\n",
)

Number of Bolivia scenes: 15
Number of training scenes: 252
Number of validation scenes: 89
Number of test scenes: 90


### By region:

List all files in `S2Hand_path` and count them per region.
Each file follows the naming scheme `EVENT_CHIPID_LAYER.tif` (e.g. `Bolivia_103757_S2Hand.tif`), so the `EVENT` prefix identifies the region.

In [None]:
region_counts = {}

# The region is the EVENT prefix, i.e. everything before the first underscore
# of each .tif file name; other files in the folder are ignored.
tif_regions = (
    entry.split('_')[0]
    for entry in os.listdir(S2Hand_path)
    if entry.endswith(".tif")
)

# Tally chips per region, keeping first-seen order of the regions.
for region_name in tif_regions:
    region_counts[region_name] = region_counts.get(region_name, 0) + 1

In [75]:
# Turn the {region: count} mapping into a two-column table (Region, Count).
# Built as one chain so no frame is modified in place.
df = (
    pd.DataFrame.from_dict(region_counts, orient='index', columns=['Count'])
    .rename_axis('Region')
    .reset_index()
)
df

Unnamed: 0,Region,Count
0,Bolivia,15
1,Ghana,53
2,India,68
3,Mekong,30
4,Nigeria,18
5,Pakistan,28
6,Paraguay,67
7,Somalia,26
8,Spain,30
9,Sri-Lanka,42


In [34]:
# Total number of .tif chips found across all regions (sum of the per-region counts).
total_number = sum(region_counts.values())
total_number

446

## Per-channel mean and standard deviation.

In [96]:
import utils

In [79]:
# Presumably loads every chip under S2Hand_path into one array; the helper's second
# return value is discarded here.
# NOTE(review): binding `_` replaces IPython's "last output" variable for the rest of
# the session; a named throwaway would avoid that.
scenes, _ = utils.get_scenes_arr(S2Hand_path)
scenes.shape

In [59]:
# NOTE(review): dead cell, left commented out. It references `scenes_list`, which is not
# defined anywhere in the visible notebook (the loaded array is called `scenes`).
# from matplotlib import pyplot as plt

# plt.imshow(scenes_list[0][0, :, :], cmap='gray')

In [81]:
# Rebinds `scenes` to the converted array, so the values returned by the loading cell
# are no longer reachable under this name.
# NOTE(review): the cell feeds its own output back in when re-run, which presumably applies
# the DN -> reflectance conversion a second time. Re-run the loading cell first, and confirm
# against utils.dn2reflectance.
scenes = utils.dn2reflectance(scenes)

In [82]:
# Reduce over every axis except axis 1, so one mean / std remains per entry of axis 1.
# assumes scenes is laid out as (scene, channel, height, width) — TODO confirm against utils.get_scenes_arr
_non_channel_axes = (0, 2, 3)
means = scenes.mean(axis=_non_channel_axes)
std_devs = scenes.std(axis=_non_channel_axes)

In [83]:
# Fetch the band metadata for Sentinel-2 and derive the labels used in the statistics table.
# NOTE(review): both helpers live in the local `utils` module; presumably `bands_names` has
# one entry per channel of `scenes`, in channel order — confirm.
wavelengths = utils.get_satellite_wavelength("sentinel2")
bands_names = utils.get_bands_names(wavelengths)

In [111]:
# One row per band: its label next to the mean and standard deviation computed above.
channel_stats = pd.DataFrame(
    dict(band=bands_names, mean=means, std=std_devs)
)
channel_stats

Unnamed: 0,band,mean,std
0,B01,0.161226,0.069464
1,B02,0.137989,0.073459
2,B03,0.134443,0.073161
3,B04,0.119516,0.08606
4,B05,0.143917,0.077136
5,B06,0.234425,0.092163
6,B07,0.279647,0.108803
7,B08,0.257841,0.102925
8,B8A,0.302382,0.120511
9,B09,0.047673,0.033169


## Probability of water (based on labels)

### Per image (scene)

In [103]:
import utils

In [104]:
# Same loader as used for the S2Hand chips, pointed at the label masks. The second return
# value is kept this time and becomes the `chip_id` column of the table built below.
scenes_masks, chip_ids = utils.get_scenes_arr(LabelHand_path)
scenes_masks.shape

(446, 1, 512, 512)

In [98]:
# The result is paired with `chip_ids` in the table below, so it holds one value per mask.
# NOTE(review): presumably the fraction of water-labelled pixels in each mask; how no-data
# pixels are treated depends on utils.calc_water_probabilities — confirm.
water_probabilities = utils.calc_water_probabilities(scenes_masks)

In [110]:
# Pair every chip id with its water probability, one row per mask.
water_probabilities_df = pd.DataFrame(
    dict(chip_id=chip_ids, water_probability=water_probabilities)
)
water_probabilities_df

Unnamed: 0,chip_id,water_probability
0,103757,0.40149
1,129334,0.658679
2,195474,0.005935
3,23014,0.03714
4,233925,0.000007
...,...,...
441,908474,0.007044
442,933610,0.01459
443,955053,0.136713
444,986268,0.034686


### Per train/dev/test sets and for the held-out region (Bolivia) 