## EDA + DATA PREPARATION
* In this notebook we study which band combinations are best suited for training an image classifier.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss


import matplotlib.pyplot as plt
import seaborn as sns


from itertools import combinations
import random

from PIL import Image


import warnings
warnings.filterwarnings("ignore")




In [None]:
# Root folder of the competition data (Kaggle input mount).
path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/zindi_data/"
# Tabular train/test tables and the sample submission file.
train = pd.read_csv(path + "Train.csv")
test = pd.read_csv(path + "Test.csv")
submission = pd.read_csv(path + "SampleSubmission.csv")
# Archive of image arrays; later cells index it by location id.
images = np.load(path + "composite_images.npz")
display(train.head(), test.head())

In [None]:
def get_location(value):
  """Return the first two "_"-separated tokens of `value`, re-joined with "_"."""
  tokens = value.split("_")
  return tokens[0] + '_' + tokens[1]

def get_event_id(value):
  """Return the fourth "_"-separated token of `value` (position 3)."""
  tokens = value.split("_")
  return tokens[3]
# Add the location key and the per-location event token to both tables.
for df in (train, test):
  df['location_id'] = df['event_id'].apply(get_location)
  df['event'] = df['event_id'].apply(get_event_id)

# Sanity checks: distinct locations per table, their overlap, and archive size.
train_locations = set(train['location_id'])
test_locations = set(test['location_id'])
print(len(train_locations), len(test_locations))
print(len(train_locations.intersection(test_locations)))
print(len(images))
display(train.head(), test.head())

In [None]:
# Number of event rows recorded for each training location.
train.groupby(['location_id'])['event_id'].count()

* each location (image) has 730 events
* no locations are shared between the train and test sets (the two sets are disjoint)
* The numpy file contains 898 images covering both train and test

* The images are annual cloud-free composite images from Sentinel-2 satellite imagery. They are of size 128x128 and contain the following 6 channels:

      Sentinel-2 B2 (Blue)
      Sentinel-2 B3 (Green)
      Sentinel-2 B4 (Red)
      Sentinel-2 B8 (NIR)
      Sentinel-2 B11 (SWIR)
      Slope (derived from NASA SRTM)

* the images are essentially static for any event/location pair over the study period.
  * the images only serve as spatial representations of the environment for that location over the 730 day period
  * they reflect static or semi-static environmental conditions (e.g. land use, vegetation, water bodies, topography) that could influence flood occurrence
  * so the images cannot provide temporal insights but what we can do is extract spatial features such as NDVI, NDWI, NDBI, Topographic features like slope and elevation changes from the slope channel
  * combine the spatial features with temporal precipitation data to enrich the dataset by treating the spatial features as fixed covariates that describe each location.
    * Areas with high NDWI might flood more frequently under heavy precipitation
    * Locations with high slope values might experience flash floods after intense rainfall

  * Image processing:
    * Use pretrained models to extract image embeddings or use PCA for dimensionality reduction
    * create a binary classifier where the label is 1 for images where a flood has occurred in any of the 730 events and 0 if no flood has occurred, to create a soft flag for flood-prone locations. Even if not perfect, the predictions can serve as a proxy for environmental vulnerability to floods
    * The image classifier naturally reduces the extreme imbalance in the dataset by focusing on binary flood/non-flood classification
  
  * clustering locations:
    * group events/locations based on spatial features (e.g NDVI, NDWI) to identify patterns in flood susceptibility
  * correlating spatial features with precipitation thresholds:
    * study how spatial features interact with the specific precipitation thresholds that lead to floods


### Visualize Images

In [None]:
# Band labels, in the order used to index the last axis of each image array.
BAND_NAMES = ('B2', 'B3', 'B4', 'B8', 'B11', 'SLOPE')
# (height, width, channels) of a single image.
IMG_DIM = (128, 128, len(BAND_NAMES))
H, W, NUM_CHANNELS = IMG_DIM
print(IMG_DIM)

In [None]:
# Key of the sample image to visualise. Despite the name, this has the same
# two-token form as the `location_id` column and is used to index `images`.
event_id = 'id_rhg5w8vmv3ny'

import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations

def calculate_ndvi(red_band, nir_band):
    """Calculate the Normalized Difference Vegetation Index (NDVI).

    NDVI = (NIR - Red) / (NIR + Red)

    Parameters
    ----------
    red_band, nir_band : array-like
        Red and near-infrared values; they must be broadcastable against
        each other.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Floating-point NDVI. Elements whose denominator is zero come out as
        NaN (0/0) or +/-inf (x/0) without emitting a warning.
    """
    red = np.asarray(red_band)
    nir = np.asarray(nir_band)
    # Promote to a float dtype *before* subtracting: on unsigned-integer
    # inputs `nir - red` would otherwise wrap around whenever red > nir.
    # Float inputs keep their precision (float32 stays float32).
    float_dtype = np.result_type(red.dtype, nir.dtype, np.float32)
    red = red.astype(float_dtype, copy=False)
    nir = nir.astype(float_dtype, copy=False)
    with np.errstate(divide='ignore', invalid='ignore'):
        return (nir - red) / (nir + red)

def calculate_evi(blue, red, nir):
    """Calculate the Enhanced Vegetation Index (EVI).

    EVI = G * (NIR - Red) / (NIR + C1 * Red - C2 * Blue + L)
    with G = 2.5, C1 = 6, C2 = 7.5 and L = 1.

    Parameters
    ----------
    blue, red, nir : array-like
        Blue, red and near-infrared values, broadcastable against each other.
        NOTE(review): the additive term L = 1 only carries weight when the
        inputs are reflectances on a 0-1 scale -- confirm the scaling of the
        stored bands.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Floating-point EVI; zero denominators yield NaN/inf without warnings.
    """
    G = 2.5
    C1 = 6
    C2 = 7.5
    L = 1
    blue = np.asarray(blue)
    red = np.asarray(red)
    nir = np.asarray(nir)
    # Promote to float first: with unsigned-integer bands `nir - red` wraps
    # around when red > nir, and `C1 * red` can overflow the integer range.
    float_dtype = np.result_type(blue.dtype, red.dtype, nir.dtype, np.float32)
    blue = blue.astype(float_dtype, copy=False)
    red = red.astype(float_dtype, copy=False)
    nir = nir.astype(float_dtype, copy=False)
    with np.errstate(divide='ignore', invalid='ignore'):
        return G * ((nir - red) / (nir + C1 * red - C2 * blue + L))

def calculate_nbr(nir, swir):
    """Calculate the Normalized Burn Ratio (NBR).

    NBR = (NIR - SWIR) / (NIR + SWIR)

    Parameters
    ----------
    nir, swir : array-like
        Near-infrared and short-wave-infrared values, broadcastable against
        each other.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Floating-point NBR; zero denominators yield NaN/inf without warnings.
    """
    nir = np.asarray(nir)
    swir = np.asarray(swir)
    # Promote to float before subtracting so unsigned-integer inputs cannot
    # wrap around when swir > nir; float inputs keep their precision.
    float_dtype = np.result_type(nir.dtype, swir.dtype, np.float32)
    nir = nir.astype(float_dtype, copy=False)
    swir = swir.astype(float_dtype, copy=False)
    with np.errstate(divide='ignore', invalid='ignore'):
        return (nir - swir) / (nir + swir)

def create_comprehensive_band_visualization(images, event_id, band_names):
    """Plot composites, spectral indices and every 3-band combination of one image.

    Parameters
    ----------
    images : mapping
        Maps an image key to an array whose last axis is ordered like
        `band_names`.
    event_id : str
        Key of the image to visualise.
    band_names : sequence of str
        Band labels in channel order; must contain 'B2', 'B3', 'B4', 'B8',
        'B11' and 'SLOPE'.

    Returns
    -------
    list of tuple
        Every 3-band combination of the six bands (the candidate CNN inputs).
    """
    def _rescale(stack):
        # Joint min-max scaling to [0, 1]; a constant image maps to zeros
        # instead of dividing by zero.
        lo, hi = stack.min(), stack.max()
        if hi == lo:
            return np.zeros(stack.shape, dtype=float)
        return (stack - lo) / (hi - lo)

    # Fetch the array once: when `images` is a lazily loaded .npz archive,
    # every lookup reads the array from disk again.
    image = images[event_id]
    bands = {name: image[..., idx] for idx, name in enumerate(band_names)}

    # Named 3-channel composites, rescaled for display.
    composites = {
        'RGB': _rescale(np.stack([bands['B4'], bands['B3'], bands['B2']], axis=-1)),
        'False Color Vegetation': _rescale(np.stack([bands['B8'], bands['B4'], bands['B3']], axis=-1)),
        'Urban': _rescale(np.stack([bands['B11'], bands['B8'], bands['B4']], axis=-1)),
        'Moisture Stress': _rescale(np.stack([bands['B11'], bands['B8'], bands['B2']], axis=-1)),
    }

    # Single-channel indices.
    indices = {
        'NDVI': calculate_ndvi(bands['B4'], bands['B8']),
        'EVI': calculate_evi(bands['B2'], bands['B4'], bands['B8']),
        'NBR': calculate_nbr(bands['B8'], bands['B11']),
        'Slope': bands['SLOPE']
    }

    # All potential 3-band combinations for CNN exploration.
    band_list = ['B2', 'B3', 'B4', 'B8', 'B11', 'SLOPE']
    potential_combinations = list(combinations(band_list, 3))

    # Collect every panel as (title, image, colormap); colormap is None for
    # 3-channel images.
    panels = [(name, composite, None) for name, composite in composites.items()]
    panels += [(name, index, 'viridis') for name, index in indices.items()]
    # Every combination gets a panel (the grid below is sized for all of them).
    for combo in potential_combinations:
        stacked = np.stack([bands[combo[0]], bands[combo[1]], bands[combo[2]]], axis=-1)
        panels.append((f'{combo[0]}/{combo[1]}/{combo[2]}', _rescale(stacked), None))

    n_cols = 6
    rows = (len(panels) + n_cols - 1) // n_cols  # ceil division

    _, axes = plt.subplots(
        nrows=rows,
        ncols=n_cols,
        figsize=(20, rows * 3.5),
        facecolor='white'
    )
    axes = axes.flatten()

    for ax, (title, picture, cmap) in zip(axes, panels):
        ax.imshow(picture, cmap=cmap)
        ax.set_title(title)
        ax.axis('off')

    # Hide the unused cells of the last row so no empty frames are drawn.
    for ax in axes[len(panels):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    # Return potential combinations for CNN
    return potential_combinations

# Generate visualizations and get potential combinations
# Draw the overview figure for the sample image and keep the returned list
# of 3-band combinations.
potential_cnn_combinations = create_comprehensive_band_visualization(images, event_id, BAND_NAMES)

# List every candidate combination, one per line.
print("Potential 3-band combinations for CNN:")
for combo in potential_cnn_combinations:
    print(combo)


### Visualize for Flood and Non-Flood images
* Visualize if flood and non flood images have subtle differences visible to the human eye

In [None]:
# One row per training location; its label is the maximum label over all of
# that location's events.
train_grouped = train.groupby('location_id')['label'].max().reset_index()
# One row per test location with the number of events it has.
test_grouped = (
    test.groupby('location_id')['event_id']
    .count()
    .reset_index(name='event_id_counts')
)
display(train_grouped.head(), test_grouped.head())

In [None]:
# Class balance at location level (label is the per-location maximum).
train_grouped['label'].value_counts()

In [None]:
def create_comprehensive_band_visualization(images, event_id, band_names):
    """Build the display images (composites, indices, 3-band stacks) for one image.

    Parameters
    ----------
    images : mapping
        Maps an image key to an array whose last axis is ordered like
        `band_names`.
    event_id : str
        Key of the image to process.
    band_names : sequence of str
        Band labels in channel order; must contain 'B2', 'B3', 'B4', 'B8',
        'B11' and 'SLOPE'.

    Returns
    -------
    dict
        Ordered mapping of title -> array: the four named composites, the
        four indices, then the first six 3-band combinations. Three-channel
        entries are min-max scaled to [0, 1].
    """
    def _rescale(stack):
        # Joint min-max scaling to [0, 1]; a constant image maps to zeros
        # instead of dividing by zero.
        lo, hi = stack.min(), stack.max()
        if hi == lo:
            return np.zeros(stack.shape, dtype=float)
        return (stack - lo) / (hi - lo)

    def _stack(*names):
        # Stack the named bands along a new trailing channel axis.
        return np.stack([bands[name] for name in names], axis=-1)

    # Fetch the array once: when `images` is a lazily loaded .npz archive,
    # every lookup reads the array from disk again.
    image = images[event_id]
    bands = {name: image[..., idx] for idx, name in enumerate(band_names)}

    all_images = {
        # Named composites, rescaled for display.
        'RGB': _rescale(_stack('B4', 'B3', 'B2')),
        'False Color Vegetation': _rescale(_stack('B8', 'B4', 'B3')),
        'Urban': _rescale(_stack('B11', 'B8', 'B4')),
        'Moisture Stress': _rescale(_stack('B11', 'B8', 'B2')),
        # Single-channel indices.
        'NDVI': calculate_ndvi(bands['B4'], bands['B8']),
        'EVI': calculate_evi(bands['B2'], bands['B4'], bands['B8']),
        'NBR': calculate_nbr(bands['B8'], bands['B11']),
        'Slope': bands['SLOPE'],
    }

    # Additional 3-band combinations, limited to the first six to keep the
    # comparison figure manageable.
    band_list = ['B2', 'B3', 'B4', 'B8', 'B11', 'SLOPE']
    potential_combinations = list(combinations(band_list, 3))
    for combo in potential_combinations[:6]:
        combo_name = f'{combo[0]}/{combo[1]}/{combo[2]}'
        all_images[combo_name] = _rescale(_stack(*combo))

    return all_images

def plot_comprehensive_image_comparison(images, flood_image, non_flood_image, band_names):
    """Show every visualisation of a flood image next to the same one for a non-flood image.

    One figure row per visualisation; the left column is the flood image and
    the right column the non-flood image.
    """
    def _draw(ax, picture, title):
        # 3-D arrays are shown as colour images, 2-D arrays through viridis.
        if picture.ndim == 3:
            ax.imshow(picture)
        else:
            ax.imshow(picture, cmap='viridis')
        ax.set_title(title)
        ax.axis('off')

    flood_visualizations = create_comprehensive_band_visualization(images, flood_image, band_names)
    non_flood_visualizations = create_comprehensive_band_visualization(images, non_flood_image, band_names)

    # The flood dictionary dictates the row order for both columns.
    viz_names = list(flood_visualizations.keys())
    num_rows = len(viz_names)

    fig, axes = plt.subplots(
        nrows=num_rows,
        ncols=2,
        figsize=(20, num_rows * 5),
        facecolor='white'
    )
    fig.suptitle('Flood vs Non-Flood Image Comparison', fontsize=16)

    for row, name in enumerate(viz_names):
        _draw(axes[row, 0], flood_visualizations[name], f"Flood: {name}")
        _draw(axes[row, 1], non_flood_visualizations[name], f"Non-Flood: {name}")

    plt.tight_layout()
    plt.show()

def plot_flood_and_non_flood_images(train_df, images, band_names):
    """Pick one random flood location and one random non-flood location and compare them.

    `train_df` needs a 'label' column (1 = flood, 0 = no flood) and a
    'location_id' column whose values are keys of `images`.
    """
    labels = train_df['label']
    flood_images = train_df.loc[labels == 1, 'location_id'].values
    non_flood_images = train_df.loc[labels == 0, 'location_id'].values

    # Draw the flood sample first, then the non-flood one.
    random_flood_image = random.choice(flood_images)
    random_non_flood_image = random.choice(non_flood_images)

    print(f"Random flood image: {random_flood_image}")
    print(f"Random non-flood image: {random_non_flood_image}")

    plot_comprehensive_image_comparison(
        images, random_flood_image, random_non_flood_image, band_names
    )

# Compare a randomly chosen flood location with a randomly chosen non-flood one.
plot_flood_and_non_flood_images(train_grouped, images, BAND_NAMES)

In [None]:


# Calculate custom ratio-based bands using available bands
def calculate_custom_ratio_bands(bands):
    """Compute normalised-difference ratios (a - b) / (a + b) for fixed band pairs.

    Parameters
    ----------
    bands : mapping
        Maps the band names 'B2', 'B3', 'B4', 'B8', 'B11' and 'SLOPE' to
        arrays that broadcast against each other.

    Returns
    -------
    dict
        Keys 'A1'..'A3', 'C1'..'C3', 'D1'..'D3', 'E1'..'E3', 'G1'..'G3' (in
        that order), each a floating-point array. Elements with a zero
        denominator come out as NaN/inf without emitting a warning.

    Notes
    -----
    'G1' and 'G3' use the same band pairs as 'A1' and 'A3'; they are kept so
    each model's triple is complete.
    """
    # Output key -> (a, b) band pair, grouped per candidate "model".
    ratio_pairs = {
        # Model A
        'A1': ('B2', 'B3'), 'A2': ('B4', 'B8'), 'A3': ('B11', 'SLOPE'),
        # Model C
        'C1': ('B2', 'B4'), 'C2': ('B3', 'B8'), 'C3': ('B4', 'B11'),
        # Model D
        'D1': ('B2', 'B8'), 'D2': ('B3', 'B11'), 'D3': ('B8', 'SLOPE'),
        # Model E
        'E1': ('B2', 'B11'), 'E2': ('B4', 'SLOPE'), 'E3': ('B8', 'B11'),
        # Model G
        'G1': ('B2', 'B3'), 'G2': ('B3', 'B11'), 'G3': ('B11', 'SLOPE'),
    }

    def _as_float(values):
        # Promote to float so that `a - b` cannot wrap around on
        # unsigned-integer bands; float inputs keep their precision.
        values = np.asarray(values)
        return values.astype(np.result_type(values.dtype, np.float32), copy=False)

    custom_bands = {}
    with np.errstate(divide='ignore', invalid='ignore'):
        for key, (first, second) in ratio_pairs.items():
            a = _as_float(bands[first])
            b = _as_float(bands[second])
            custom_bands[key] = (a - b) / (a + b)

    return custom_bands

# Function to create a comprehensive band visualization
def create_comprehensive_band_visualization(images, event_id, band_names):
    """Build every display image for one location, including the custom ratio bands.

    Parameters
    ----------
    images : mapping
        Maps an image key to an array whose last axis is ordered like
        `band_names`.
    event_id : str
        Key of the image to process.
    band_names : sequence of str
        Band labels in channel order; must contain 'B2', 'B3', 'B4', 'B8',
        'B11' and 'SLOPE'.

    Returns
    -------
    dict
        Ordered mapping of title -> array: the four named composites, the
        four indices, the custom ratio bands, then all 3-band combinations.
        Three-channel entries are min-max scaled to [0, 1].
    """
    def _rescale(stack):
        # Joint min-max scaling to [0, 1]; a constant image maps to zeros
        # instead of dividing by zero.
        lo, hi = stack.min(), stack.max()
        if hi == lo:
            return np.zeros(stack.shape, dtype=float)
        return (stack - lo) / (hi - lo)

    def _stack(*names):
        # Stack the named bands along a new trailing channel axis.
        return np.stack([bands[name] for name in names], axis=-1)

    # Fetch the array once: when `images` is a lazily loaded .npz archive,
    # every lookup reads the array from disk again.
    image = images[event_id]
    bands = {name: image[..., idx] for idx, name in enumerate(band_names)}

    all_images = {
        # Named composites, rescaled for display.
        'RGB': _rescale(_stack('B4', 'B3', 'B2')),
        'False Color Vegetation': _rescale(_stack('B8', 'B4', 'B3')),
        'Urban': _rescale(_stack('B11', 'B8', 'B4')),
        'Moisture Stress': _rescale(_stack('B11', 'B8', 'B2')),
        # Single-channel indices.
        'NDVI': calculate_ndvi(bands['B4'], bands['B8']),
        'EVI': calculate_evi(bands['B2'], bands['B4'], bands['B8']),
        'NBR': calculate_nbr(bands['B8'], bands['B11']),
        'Slope': bands['SLOPE'],
    }

    # Custom normalised-difference ratio bands (single channel each).
    all_images.update(calculate_custom_ratio_bands(bands))

    # Every 3-band combination of the six bands.
    band_list = ['B2', 'B3', 'B4', 'B8', 'B11', 'SLOPE']
    for combo in combinations(band_list, 3):
        combo_name = f'{combo[0]}/{combo[1]}/{combo[2]}'
        all_images[combo_name] = _rescale(_stack(*combo))

    return all_images

def plot_comprehensive_image_comparison(images, flood_image, non_flood_image, band_names):
    """Plot a two-column grid comparing a flood image with a non-flood image.

    Each row holds one visualisation type; column 0 is the flood image and
    column 1 the non-flood image.
    """
    flood_visualizations = create_comprehensive_band_visualization(images, flood_image, band_names)
    non_flood_visualizations = create_comprehensive_band_visualization(images, non_flood_image, band_names)

    # Column order: flood on the left, non-flood on the right.
    columns = (
        ("Flood", flood_visualizations),
        ("Non-Flood", non_flood_visualizations),
    )

    # Row order follows the flood dictionary.
    viz_names = list(flood_visualizations.keys())
    num_rows = len(viz_names)
    num_cols = len(columns)

    fig, axes = plt.subplots(
        nrows=num_rows,
        ncols=num_cols,
        figsize=(20, num_rows * 5),
        facecolor='white'
    )
    fig.suptitle('Flood vs Non-Flood Image Comparison', fontsize=16)

    for row, name in enumerate(viz_names):
        for col, (label, visualizations) in enumerate(columns):
            picture = visualizations[name]
            ax = axes[row, col]
            # Multi-channel arrays render as colour images; single-channel
            # arrays go through the viridis colormap.
            if picture.ndim == 3:
                ax.imshow(picture)
            else:
                ax.imshow(picture, cmap='viridis')
            ax.set_title(f"{label}: {name}")
            ax.axis('off')

    plt.tight_layout()
    plt.show()

def plot_flood_and_non_flood_images(train_df, images, band_names, seed=None):
    """Compare one randomly chosen flood location with one non-flood location.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Needs a 'label' column (1 = flood, 0 = no flood) and a 'location_id'
        column whose values are keys of `images`.
    images : mapping
        Maps a location id to its image array.
    band_names : sequence of str
        Band labels in channel order.
    seed : int, optional
        When given, the two locations are drawn from a private generator
        seeded with this value, so the same pair is shown on every run.
        When omitted, the module-level `random` state is used.

    Raises
    ------
    IndexError
        If either class has no locations to choose from.
    """
    # A private generator keeps a seeded draw from disturbing global state.
    rng = random if seed is None else random.Random(seed)

    # Plain lists: sampling needs only len() and indexing, and the chosen
    # ids come back as ordinary Python objects.
    flood_images = train_df[train_df['label'] == 1]['location_id'].tolist()
    non_flood_images = train_df[train_df['label'] == 0]['location_id'].tolist()

    # Randomly choose one image from each group
    random_flood_image = rng.choice(flood_images)
    random_non_flood_image = rng.choice(non_flood_images)

    print(f"Random flood image: {random_flood_image}, Non-flood image: {random_non_flood_image}")

    plot_comprehensive_image_comparison(images, random_flood_image, random_non_flood_image, band_names)

# Compare a random flood / non-flood pair across all visualisations,
# including the custom ratio bands.
plot_flood_and_non_flood_images(train_grouped, images, BAND_NAMES)


### Save all these possible combinations in image folders for further experimentation
* Previously we saved all of them, but in the end we only used the Moisture Stress composite, so this updated code saves only Moisture Stress and ignores everything else
* Feel free to study the code and uncomment if you want all the possible combinations

In [None]:
import os
import numpy as np
from PIL import Image
from itertools import combinations
from tqdm import tqdm

# Create directories for each valid combination
def create_directories(parent_dir, combinations):
    """Ensure `parent_dir/<name>` exists as a directory for every name given.

    Parameters
    ----------
    parent_dir : str
        Folder under which the sub-folders are created.
    combinations : iterable of str
        Sub-folder names.

    Raises
    ------
    FileExistsError
        If a target path already exists but is not a directory.
    """
    for combo in combinations:
        # exist_ok avoids the check-then-create race and makes re-runs a no-op.
        os.makedirs(os.path.join(parent_dir, combo), exist_ok=True)

# Function to check that every band of a combination is available
def is_valid_combination(bands, combination):
    """Return True when each entry of `combination` is contained in `bands`.

    Only membership is tested; the number of entries is not checked.
    """
    for band in combination:
        if band not in bands:
            return False
    return True

# Save valid 3-band combinations (composite images)
def save_valid_3band_combinations(images, train_df, band_names, parent_dir, image_format='png'):
    """Write one 384x384 composite image per location and composite type.

    Files are saved as `parent_dir/<composite name>/<location_id>.<image_format>`.
    Files that already exist are left untouched, and a location whose files
    all exist is skipped without loading its array.

    Parameters
    ----------
    images : mapping
        Maps a location id to an array whose last axis is ordered like
        `band_names`.
    train_df : pandas.DataFrame
        Any frame with a 'location_id' column (one output per row); only
        that column is read.
    band_names : sequence of str
        Band labels in channel order.
    parent_dir : str
        Output root folder.
    image_format : str, default 'png'
        File extension of the saved images.
    """
    # Define possible band combinations (3-band)
    composites = {
        # 'RGB': ['B4', 'B3', 'B2'],
        # 'False Color Vegetation': ['B8', 'B4', 'B3'],
        # 'Urban': ['B11', 'B8', 'B4'],
        'Moisture Stress': ['B11', 'B8', 'B2'],
        # You can add more combinations as needed
    }

    # Additional 3-band combinations from the available bands
    # band_list = ['B2', 'B3', 'B4', 'B8', 'B11', 'SLOPE']
    # additional_combinations = list(combinations(band_list, 3))

    # # Add additional combinations to the composites dictionary
    # for comb in additional_combinations:
    #     comb_name = '_'.join(comb)
    #     composites[comb_name] = list(comb)

    # Create directories for the combinations
    create_directories(parent_dir, composites.keys())
    print(composites.keys())

    # The channel lookup is the same for every location, so build it once.
    band_indices = {name: band_names.index(name) for name in band_names}

    for location_id in tqdm(train_df['location_id'], total=len(train_df), desc="Saving Images", unit="image"):
        # Work out which outputs are still missing before doing any work.
        pending = {}
        for composite_name, band_keys in composites.items():
            save_path = os.path.join(parent_dir, composite_name, f'{location_id}.{image_format}')
            if not os.path.exists(save_path):
                pending[composite_name] = (band_keys, save_path)
        if not pending:
            continue

        # Fetch the array once: when `images` is a lazily loaded .npz
        # archive, every lookup reads the array from disk again.
        image = images[location_id]
        bands = {name: image[..., idx] for name, idx in band_indices.items()}

        for composite_name, (band_keys, save_path) in pending.items():
            if not is_valid_combination(bands, band_keys):
                continue

            composite_image = np.stack([bands[band] for band in band_keys], axis=-1)

            # Only 3-channel stacks can be written as colour images.
            if composite_image.shape[-1] != 3:
                continue

            # Joint min-max scaling to [0, 255]; a constant image becomes
            # all zeros instead of dividing by zero.
            lo = composite_image.min()
            hi = composite_image.max()
            if hi == lo:
                composite_image = np.zeros(composite_image.shape, dtype=np.uint8)
            else:
                composite_image = (composite_image - lo) / (hi - lo) * 255
                composite_image = composite_image.astype(np.uint8)

            # Resize the image to 384x384 and save it.
            pil_img = Image.fromarray(composite_image)
            pil_img = pil_img.resize((384, 384), Image.Resampling.LANCZOS)
            pil_img.save(save_path)

    print(f"Images have been saved successfully to {parent_dir}")

# Export the composite PNGs for every training location (one row per
# location in train_grouped) into the Kaggle working directory.
save_valid_3band_combinations(images, train_grouped, BAND_NAMES, '/kaggle/working/', image_format='png')


In [None]:
# Same export for the test locations; only the 'location_id' column is read,
# so the missing label column does not matter.
save_valid_3band_combinations(images, test_grouped, BAND_NAMES, '/kaggle/working/', image_format='png')


In [None]:
# Number of files now present in the Moisture Stress output folder
# (train and test exports share this folder).
len(os.listdir('/kaggle/working/Moisture Stress'))