# 01 - Exploratory Data Analysis of RAW input data

In [1]:
# Import necessary libraries
import rasterio
from rasterio.enums import Resampling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from skimage.feature import graycomatrix, graycoprops
from rasterio.warp import reproject, calculate_default_transform
from typing import Tuple, Dict, Any

# Set plotting style. `set_theme` is seaborn's preferred entry point; `sns.set`
# is only an alias for it and may be removed in a future release.
sns.set_theme(style="whitegrid")

In [None]:
# Utility functions

def plot_image(image: np.ndarray, title: str, cmap: str = 'viridis') -> None:
    """Display one image array with a colorbar and no axis decorations.

    Args:
        image (np.ndarray): Array handed to ``imshow``.
        title (str): Text shown above the axes.
        cmap (str): Name of the colormap used to render the values.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    rendered = ax.imshow(image, cmap=cmap)
    # The colorbar is tied to the image just drawn and takes its space from `ax`.
    fig.colorbar(rendered, ax=ax)
    ax.set_title(title)
    ax.axis('off')
    plt.show()

def plot_histogram(image: np.ndarray, title: str) -> None:
    """Draw a 50-bin histogram over every value of an image array.

    Args:
        image (np.ndarray): Array whose values are binned (flattened first).
        title (str): Text shown above the histogram.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    pixel_values = image.ravel()
    ax.hist(pixel_values, bins=50, color='c', edgecolor='k', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Pixel Values')
    ax.set_ylabel('Frequency')
    plt.show()

def summarize_image(image: np.ndarray, name: str) -> None:
    """Print the shape, extrema, mean and standard deviation of an array.

    The report goes to standard output and ends with a blank line so that
    consecutive reports are visually separated.

    Args:
        image (np.ndarray): Array to describe.
        name (str): Label used in the report header.
    """
    print(f"Summary statistics for {name}:")
    print(f" - Shape: {image.shape}")

    # Each statistic is computed right before its line is printed, so the
    # report is emitted top to bottom as the values become available.
    lowest = np.min(image)
    print(f" - Min value: {lowest}")
    highest = np.max(image)
    print(f" - Max value: {highest}")
    average = np.mean(image)
    print(f" - Mean value: {average}")
    spread = np.std(image)
    print(f" - Standard deviation: {spread}\n")

def resample_image(src: rasterio.io.DatasetReader, target_transform: rasterio.Affine, 
                   target_shape: Tuple[int, int]) -> np.ndarray:
    """Read every band of an open dataset, resampled to a target shape.

    The dataset is read with bilinear resampling into an array of
    ``(src.count, rows, cols)`` and singleton axes are then dropped.

    Args:
        src (rasterio.io.DatasetReader): Open source dataset.
        target_transform (rasterio.Affine): Accepted for interface
            compatibility; the body does not use it.
        target_shape (Tuple[int, int]): Output ``(rows, cols)``.

    Returns:
        np.ndarray: The resampled data with length-one axes squeezed out.
    """
    rows, cols = target_shape[0], target_shape[1]
    requested_shape = (src.count, rows, cols)
    bands = src.read(out_shape=requested_shape, resampling=Resampling.bilinear)
    return bands.squeeze()


def reproject_resample(src_path: str, target_crs: str,
                       target_transform: rasterio.Affine, target_shape: Tuple[int, int],
                       resampling: Resampling = Resampling.bilinear) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Warp a raster file onto a target grid (CRS, transform and shape).

    Every band of the source is placed on the grid described by
    ``target_crs``, ``target_transform`` and ``target_shape``; the returned
    metadata describes that same grid, so it always matches the returned
    array.

    Args:
        src_path (str): Path to the source image.
        target_crs (str): Target coordinate reference system.
        target_transform (rasterio.Affine): Affine transform of the target grid.
        target_shape (Tuple[int, int]): Target ``(rows, cols)``.
        resampling (Resampling): Resampling method. Defaults to bilinear;
            pass ``Resampling.nearest`` for categorical rasters such as
            class masks, which should not be interpolated.

    Returns:
        Tuple[np.ndarray, Dict[str, Any]]: The data on the target grid (with
        length-one axes squeezed out, so a single-band file yields a 2-D
        array) and the source metadata updated with the target CRS,
        transform, width and height.
    """
    rows, cols = target_shape[0], target_shape[1]

    with rasterio.open(src_path) as src:
        meta = src.meta.copy()
        already_on_grid = (
            src.crs == target_crs
            and src.transform == target_transform
            and (src.height, src.width) == (rows, cols)
        )

        if already_on_grid:
            # Nothing to warp: the file is pixel-for-pixel on the target grid.
            data = src.read()
        elif not src.crs and not target_crs:
            # No CRS on either side, so a georeferenced warp is impossible;
            # fall back to stretching the full extent onto the target shape.
            data = src.read(out_shape=(src.count, rows, cols), resampling=resampling)
        else:
            # Warp all bands, not just the first, so no band of the output
            # buffer is left unwritten.
            data = np.zeros((src.count, rows, cols), dtype=src.dtypes[0])
            reproject(
                source=rasterio.band(src, list(src.indexes)),
                destination=data,
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=target_transform,
                dst_crs=target_crs,
                resampling=resampling,
            )

    # Describe the grid the data was actually written to.
    meta.update({
        'crs': target_crs,
        'transform': target_transform,
        'width': cols,
        'height': rows,
    })
    return data.squeeze(), meta

def plot_pca(vv_band: np.ndarray, hand_resampled: np.ndarray, mask_resampled: np.ndarray, sample_size: int = 100000) -> None:
    """Run a 2-component PCA on (VV, HAND) pixel pairs and scatter-plot it.

    A random subset of pixels is projected onto the two principal components
    and coloured by the corresponding mask value. Pixels whose VV or HAND
    value is NaN or infinite are left out, and the sample is capped at the
    number of usable pixels, so small or partially empty rasters still work.

    Args:
        vv_band (np.ndarray): VV Band image array.
        hand_resampled (np.ndarray): HAND array with the same number of
            pixels as ``vv_band``.
        mask_resampled (np.ndarray): Mask array with the same number of
            pixels as ``vv_band``; used only for point colours.
        sample_size (int): Maximum number of pixels to sample for PCA.

    Raises:
        ValueError: If fewer than two usable pixels can be sampled.
    """
    # Fixed seed so repeated calls draw the same pixels.
    np.random.seed(42)

    combined = np.vstack((vv_band.flatten(), hand_resampled.flatten())).T
    mask_flat = mask_resampled.flatten()

    # PCA cannot be fitted on NaN/inf, so sample only from fully finite rows.
    usable = np.flatnonzero(np.isfinite(combined).all(axis=1))

    # Sampling without replacement cannot draw more rows than exist.
    draw = min(sample_size, usable.size)
    if draw < 2:
        raise ValueError(
            "plot_pca needs at least two sampled pixels with finite VV and HAND values"
        )

    indices = usable[np.random.choice(usable.size, draw, replace=False)]
    combined_sampled = combined[indices, :]
    mask_sampled = mask_flat[indices]

    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(combined_sampled)

    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1], c=mask_sampled, cmap='coolwarm', alpha=0.5)
    plt.title('PCA of VV Band and HAND')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.colorbar(scatter, label='Binary Mask')
    plt.show()

def _format_angle(angle: float) -> str:
    """Render an angle in radians as a short multiple of pi when it is one (e.g. ``3pi/4``)."""
    from fractions import Fraction

    angle = float(angle)
    ratio = Fraction(angle / np.pi).limit_denominator(12)
    if not np.isclose(float(ratio) * np.pi, angle):
        # Not a simple fraction of pi: fall back to a compact decimal.
        return f"{angle:.3g}"
    if ratio == 0:
        return "0"
    sign = "-" if ratio < 0 else ""
    numerator = abs(ratio.numerator)
    multiple = f"{sign}{'' if numerator == 1 else numerator}pi"
    return multiple if ratio.denominator == 1 else f"{multiple}/{ratio.denominator}"


def plot_texture_features(image: np.ndarray, distances: list[int], angles: list[float]) -> None:
    """Plot GLCM texture properties for every distance/angle combination.

    The image is rescaled to 0-255 and cast to ``uint8`` before the grey-level
    co-occurrence matrix is built. If the image contains negative values, its
    minimum is shifted to 0 first so they do not wrap around in the cast;
    non-negative images are scaled by their maximum only. Six properties are
    drawn as bar charts in a 2x3 grid, one bar per (distance, angle) pair.

    Args:
        image (np.ndarray): Image array to analyze.
        distances (list[int]): List of distances for GLCM.
        angles (list[float]): List of angles (radians) for GLCM.
    """
    peak = float(image.max())
    floor = min(float(image.min()), 0.0)
    span = peak - floor
    if span > 0:
        shifted = image - floor if floor < 0 else image
        image_uint8 = (shifted / span * 255).astype('uint8')
    else:
        # All-zero or constant-negative image: the scaling range is empty, so
        # use a flat image instead of dividing by zero.
        image_uint8 = np.zeros(image.shape, dtype='uint8')

    glcm = graycomatrix(image_uint8, distances=distances, angles=angles, symmetric=True, normed=True)

    property_names = {
        'Contrast': 'contrast',
        'Dissimilarity': 'dissimilarity',
        'Homogeneity': 'homogeneity',
        'Energy': 'energy',
        'Correlation': 'correlation',
        'ASM': 'ASM',
    }
    features = {
        label: graycoprops(glcm, prop).flatten()
        for label, prop in property_names.items()
    }

    # graycoprops returns one row per distance and one column per angle, so a
    # row-major flatten walks distances in the outer loop and angles in the inner.
    combination_labels = [
        f"d={distance}, a={_format_angle(angle)}"
        for distance in distances
        for angle in angles
    ]

    plt.figure(figsize=(15, 10))
    for i, (feature, values) in enumerate(features.items()):
        plt.subplot(2, 3, i + 1)
        sns.barplot(x=combination_labels, y=values)
        plt.title(feature)
        plt.xlabel('Parameter Combination')
        plt.ylabel('Value')

    plt.tight_layout()
    plt.show()

def plot_overlay(image: np.ndarray, mask: np.ndarray, title: str, alpha: float = 0.5) -> None:
    """Draw a semi-transparent mask layer on top of a greyscale image.

    Each layer is stretched to an extent derived from its own shape
    (``[0, cols, 0, rows]``).

    Args:
        image (np.ndarray): Base image, rendered with the 'gray' colormap.
        mask (np.ndarray): Layer drawn on top with the 'cool' colormap.
        title (str): Text shown above the axes.
        alpha (float): Opacity of the mask layer.
    """
    image_rows, image_cols = image.shape[0], image.shape[1]
    mask_rows, mask_cols = mask.shape[0], mask.shape[1]

    _, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(image, cmap='gray', extent=[0, image_cols, 0, image_rows])
    ax.imshow(mask, cmap='cool', alpha=alpha, extent=[0, mask_cols, 0, mask_rows])
    ax.set_title(title)
    ax.axis('off')
    plt.show()

In [None]:
# Example RAW file paths. They are relative, so they resolve against the
# current working directory.

vv_band_path = "../data/input_scene_1/vv/vv.tif"  # Sentinel-1 VV band raster
hand_path = "../data/input_scene_1/hand/hand.tif"  # HAND raster
mask_path = "../data/input_scene_1/mask/corrected/mask.tif"  # binary mask raster

In [None]:
# Sentinel-1 VV band: first band plus the dataset's transform and CRS
with rasterio.open(vv_band_path) as src:
    vv_band, vv_transform, vv_crs = src.read(1), src.transform, src.crs
vv_shape = vv_band.shape

# HAND raster: first band plus transform and CRS
with rasterio.open(hand_path) as src:
    hand, hand_transform, hand_crs = src.read(1), src.transform, src.crs

# Binary mask: first band plus transform and CRS
with rasterio.open(mask_path) as src:
    binary_mask, mask_transform, mask_crs = src.read(1), src.transform, src.crs

In [None]:
# Print summary statistics for each raw raster
for raster, label in (
    (vv_band, "Sentinel-1 VV Band"),
    (hand, "HAND"),
    (binary_mask, "Binary Mask"),
):
    summarize_image(raster, label)

In [None]:
# Display each raw raster; the VV band keeps plot_image's default colormap
for raster, heading, options in (
    (vv_band, 'Sentinel-1 VV Band Image', {}),
    (hand, 'HAND Image', {'cmap': 'terrain'}),
    (binary_mask, 'Binary Mask Image', {'cmap': 'gray'}),
):
    plot_image(raster, heading, **options)

In [None]:
# Histogram of the values in each raw raster
for heading, raster in {
    'Histogram of Sentinel-1 VV Band Image': vv_band,
    'Histogram of HAND Image': hand,
    'Histogram of Binary Mask Image': binary_mask,
}.items():
    plot_histogram(raster, heading)

In [None]:
# The VV band image supplies the target grid: its transform and its array shape
target_transform, target_shape = vv_transform, vv_band.shape

In [None]:
# Bring the HAND raster and the binary mask onto the VV band's CRS, transform
# and shape; the returned metadata is not needed here
vv_grid = (vv_crs, vv_transform, vv_shape)
hand_resampled, _ = reproject_resample(hand_path, *vv_grid)
mask_resampled, _ = reproject_resample(mask_path, *vv_grid)

In [None]:
# Print summary statistics for the VV band and the two resampled rasters
for raster, label in (
    (vv_band, "Sentinel-1 VV Band"),
    (hand_resampled, "HAND (Resampled)"),
    (mask_resampled, "Binary Mask (Resampled)"),
):
    summarize_image(raster, label)

In [None]:
# Display the VV band next to the resampled rasters; the VV band keeps
# plot_image's default colormap
for raster, heading, options in (
    (vv_band, 'Sentinel-1 VV Band Image', {}),
    (hand_resampled, 'HAND Image (Resampled)', {'cmap': 'terrain'}),
    (mask_resampled, 'Binary Mask Image (Resampled)', {'cmap': 'gray'}),
):
    plot_image(raster, heading, **options)

In [None]:
# Histogram of the values in the VV band and in each resampled raster
for heading, raster in {
    'Histogram of Sentinel-1 VV Band Image': vv_band,
    'Histogram of HAND Image (Resampled)': hand_resampled,
    'Histogram of Binary Mask Image (Resampled)': mask_resampled,
}.items():
    plot_histogram(raster, heading)

In [None]:
# Correlation coefficient between the flattened VV band and resampled HAND
# arrays; entry [0, 1] of the 2x2 matrix is the cross term
corr_matrix = np.corrcoef(vv_band.ravel(), hand_resampled.ravel())
corr = corr_matrix[0, 1]
print(f"Correlation between VV Band and HAND: {corr:.2f}")

In [None]:
# Split VV pixels by mask value (1 = water, 0 = non-water) and draw at most
# `sample_size` of each, without replacement, from a fixed seed
np.random.seed(42)
sample_size = 100000

is_water = mask_resampled == 1
is_non_water = mask_resampled == 0
water_pixels_vv = vv_band[is_water]
non_water_pixels_vv = vv_band[is_non_water]

# Never ask for more pixels than a class actually has
sample_size_water = min(water_pixels_vv.size, sample_size)
sample_size_non_water = min(non_water_pixels_vv.size, sample_size)

# Water is drawn first, then non-water, so the random stream is consumed in
# the same order on every run
water_pixels_vv_sampled = np.random.choice(
    water_pixels_vv, sample_size_water, replace=False
)
non_water_pixels_vv_sampled = np.random.choice(
    non_water_pixels_vv, sample_size_non_water, replace=False
)

In [None]:
# Overlay the density histograms (with KDE curves) of the sampled VV values
# for water and non-water pixels on a single set of axes
_, ax = plt.subplots(figsize=(12, 6))
for pixels, colour, legend_label in (
    (water_pixels_vv_sampled, 'b', 'Water Pixels'),
    (non_water_pixels_vv_sampled, 'r', 'Non-Water Pixels'),
):
    sns.histplot(pixels, color=colour, label=legend_label, kde=True, stat="density", ax=ax)
ax.set_title('Distribution of VV Band Pixel Values for Water and Non-Water Areas')
ax.set_xlabel('Pixel Value')
ax.set_ylabel('Density')
ax.legend()
plt.show()