In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Log in to Kaggle first; the download in the next cell needs access to private data sources.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition data through kagglehub and keep the value the call returns.
# NOTE(review): that value is not referenced by the later cells, which use a
# hard-coded /kaggle/input/... path instead — confirm both point at the same data.
track1_india_al_impact_gen_ai_hackathon_path = kagglehub.competition_download('track1-india-al-impact-gen-ai-hackathon')

print('Data source import complete.')


In [None]:
import warnings
# Ignore specific RuntimeWarning from rioxarray
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Install required libraries
!pip install --quiet rasterio terratorch

# Import all necessary modules
import os
import rasterio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightning.pytorch as pl
from terratorch.datamodules import GenericNonGeoSegmentationDataModule
from terratorch.tasks import SemanticSegmentationTask

# Define the base path to the dataset
# We use the 'archive' path as it's cleaner
BASE_DIR = "/kaggle/input/track1-india-al-impact-gen-ai-hackathon/archive"

print(f"Using base directory: {BASE_DIR}")

In [None]:
!pip install --quiet lightning

In [None]:
import os
import rasterio # Import rasterio
import numpy as np # Import numpy
import matplotlib.pyplot as plt # Import matplotlib.pyplot

# --- Locate the dataset ---
# Competition data is expected under /kaggle/input/<competition-name>/, with the
# split files (train.txt, ...) and the train/ folder inside the 'archive'
# sub-directory.
# IMPORTANT: Please verify this path in your Kaggle notebook's input data section
# and adjust SPLIT_DIR if the train.txt file is located elsewhere.
KAGGLE_INPUT_BASE = "/kaggle/input/track1-india-al-impact-gen-ai-hackathon"
SPLIT_DIR = os.path.join(KAGGLE_INPUT_BASE, 'archive')
BASE_DIR = SPLIT_DIR  # images and labels are read relative to the same folder

# --- Load the training image IDs (one per line) ---
# Blank lines are skipped so they cannot turn into empty IDs.
train_split_path = os.path.join(SPLIT_DIR, "train.txt")
with open(train_split_path, "r") as f:
    train_ids = [line.strip() for line in f if line.strip()]

if not train_ids:
    raise ValueError(f"No image IDs found in {train_split_path}")

print(f"Found {len(train_ids)} training image IDs.")
print(f"First 3 IDs: {train_ids[:3]}")

# --- Visualize a Sample ---
sample_id = train_ids[0]

input_path = os.path.join(BASE_DIR, "train/inputs", f"{sample_id}_input.tif")
label_path = os.path.join(BASE_DIR, "train/labels", f"{sample_id}_label_c6.tif")

print(f"\nVisualizing input: {input_path}")
print(f"Visualizing label: {label_path}")

# Load the input raster. Bands 1-3 are shown as a colour composite when the
# file has at least three bands (treated as Red/Green/Blue here — TODO confirm
# the band order); otherwise band 1 is shown in grayscale.
with rasterio.open(input_path) as src:
    band_indexes = [1, 2, 3] if src.count >= 3 else [1]
    # rasterio returns (bands, rows, cols); matplotlib wants (rows, cols, bands).
    image = src.read(band_indexes).transpose(1, 2, 0).astype(np.float32)

# Min-max scale to [0, 1] for display. A constant image has zero range and
# would otherwise divide by zero, so it is shown as all zeros instead.
value_min, value_max = image.min(), image.max()
if value_max > value_min:
    image = (image - value_min) / (value_max - value_min)
else:
    image = np.zeros_like(image)

with rasterio.open(label_path) as src:
    label = src.read(1)

# Display the images side-by-side
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
# A single-band image has a trailing axis of length 1: drop it and use a gray colormap.
ax[0].imshow(image.squeeze(), cmap='gray' if image.shape[-1] == 1 else None)
ax[0].set_title(f"Input: {sample_id}")
ax[0].axis('off')

ax[1].imshow(label, cmap='jet')
ax[1].set_title(f"Label Mask: {sample_id}")
ax[1].axis('off')

plt.show()

In [None]:
# Per-band normalization statistics, computed ahead of time (12 values each,
# one per band of the dataset).
MEANS = [43.37, 38.76, 37.58, 39.39, 42.61, 54.78, 63.25, 59.99, 13.36, 69.21, 48.32, 69.70]
STDS = [3.33, 4.16, 5.43, 9.23, 8.01, 6.74, 8.07, 7.84, 2.56, 16.96, 15.58, 9.25]

# Gather the constructor arguments in one place: loader settings, data/label
# folders per split, split files, file patterns and normalization statistics.
datamodule_config = {
    "batch_size": 8,
    "num_workers": 2,
    "num_classes": 6,
    "train_data_root": os.path.join(BASE_DIR, 'train/inputs'),
    "train_label_data_root": os.path.join(BASE_DIR, 'train/labels'),
    "val_data_root": os.path.join(BASE_DIR, 'val/inputs'),
    "val_label_data_root": os.path.join(BASE_DIR, 'val/labels'),
    "test_data_root": os.path.join(BASE_DIR, 'test/inputs'),
    "predict_data_root": os.path.join(BASE_DIR, 'test/inputs'),
    "train_split": os.path.join(BASE_DIR, "train.txt"),
    "val_split": os.path.join(BASE_DIR, "val.txt"),
    "test_split": os.path.join(BASE_DIR, "test.txt"),
    "img_grep": '*_input.tif',
    "label_grep": '*_label_c6.tif',
    "means": MEANS,
    "stds": STDS,
}
datamodule = GenericNonGeoSegmentationDataModule(**datamodule_config)

# Build the train/validation datasets and report how many samples each holds.
datamodule.setup("fit")
print(f"Train set size: {len(datamodule.train_dataset)}")
print(f"Validation set size: {len(datamodule.val_dataset)}")

In [None]:
# --- Updated Datamodule and Trainer Configuration for Memory Management ---
import lightning.pytorch as pl
from terratorch.datamodules import GenericNonGeoSegmentationDataModule
from terratorch.tasks import SemanticSegmentationTask
import os
import torch
import torch.nn as nn
from torchmetrics.classification import MulticlassJaccardIndex # Import MulticlassJaccardIndex
# Removed problematic Dice import and torchvision transforms
from lightning.pytorch.callbacks import LearningRateMonitor, EarlyStopping, ModelCheckpoint


# --- Paths ---
# Re-defined here so this cell can be run on its own.
# IMPORTANT: Verify this path in your Kaggle notebook's input data section.
KAGGLE_INPUT_BASE = "/kaggle/input/track1-india-al-impact-gen-ai-hackathon"
# Data and split files are assumed to live under 'archive', as in the visualization cell.
BASE_DIR = os.path.join(KAGGLE_INPUT_BASE, 'archive')

# --- Reproducibility ---
# Seed Python, NumPy and torch (and the dataloader workers) so runs are repeatable.
SEED = 42
pl.seed_everything(SEED, workers=True)

# --- Data statistics and model configuration ---
# Per-band normalization statistics (one value per input band).
MEANS = [43.37, 38.76, 37.58, 39.39, 42.61, 54.78, 63.25, 59.99, 13.36, 69.21, 48.32, 69.70]
STDS = [3.33, 4.16, 5.43, 9.23, 8.01, 6.74, 8.07, 7.84, 2.56, 16.96, 15.58, 9.25]
# One loss weight per class (6 classes).
CLASS_WEIGHTS = [0.4761, 0.1702, 0.0334, 0.2674, 0.0204, 0.0326]
model_args = {
    "backbone": "prithvi_eo_v2_300_tl",
    "backbone_pretrained": True,
    "backbone_bands": list(range(1, 13)),
    "decoder": "UperNetDecoder",
    "num_classes": 6,
}

# Small batch size so the model fits in GPU memory.
BATCH_SIZE = 2

# No data augmentation / extra transforms for any stage.
train_transform = None
val_test_transform = None
predict_transform = None

# --- Datamodule ---
# The labelled validation folders are reused as the "test" set, so the test
# split file must be val.txt as well: test.txt lists IDs that belong to
# test/inputs, not to val/inputs.
datamodule = GenericNonGeoSegmentationDataModule(
    batch_size=BATCH_SIZE,
    num_workers=2,
    num_classes=6,
    train_data_root=os.path.join(BASE_DIR, 'train/inputs'),
    train_label_data_root=os.path.join(BASE_DIR, 'train/labels'),
    val_data_root=os.path.join(BASE_DIR, 'val/inputs'),
    val_label_data_root=os.path.join(BASE_DIR, 'val/labels'),
    test_data_root=os.path.join(BASE_DIR, 'val/inputs'),
    test_label_data_root=os.path.join(BASE_DIR, 'val/labels'),
    predict_data_root=os.path.join(BASE_DIR, 'test/inputs'),
    train_split=os.path.join(BASE_DIR, "train.txt"),
    val_split=os.path.join(BASE_DIR, "val.txt"),
    test_split=os.path.join(BASE_DIR, "val.txt"),
    img_grep='*_input.tif',
    label_grep='*_label_c6.tif',
    means=MEANS,
    stds=STDS,
    train_transform=train_transform,
    val_transform=val_test_transform,
    test_transform=val_test_transform,
    predict_transform=predict_transform
)
datamodule.setup("fit")

# --- Model: class-weighted cross-entropy, full fine-tuning of the backbone ---
model = SemanticSegmentationTask(
    model_args=model_args,
    model_factory="EncoderDecoderFactory",
    class_weights=CLASS_WEIGHTS,
    loss="ce",  # cross-entropy, selected by its string key
    lr=1e-5,    # raised from the earlier 5e-7
    optimizer="AdamW",
    freeze_backbone=False,
    ignore_index=-1
)

# --- Trainer ---
trainer = pl.Trainer(
    max_epochs=100,
    accelerator="gpu",
    devices=1,
    logger=pl.loggers.TensorBoardLogger(save_dir="logs/", name="semantic_segmentation"),
    callbacks=[
        # Stop once the validation loss has not improved for 10 epochs.
        EarlyStopping(monitor="val/loss", mode="min", patience=10),
        # Keep the checkpoint with the best validation Jaccard index (plus the last one).
        ModelCheckpoint(
            dirpath="checkpoints/",
            monitor="val/Multiclass_Jaccard_Index",
            mode="max",
            save_top_k=1,
            # The metric name contains '/', which would otherwise be inserted
            # into the file name and create sub-directories. Disable automatic
            # name insertion and label the values explicitly.
            filename="best-model-epoch={epoch:02d}-jaccard={val/Multiclass_Jaccard_Index:.4f}",
            auto_insert_metric_name=False,
            save_last=True
        ),
        # Log the learning rate at every step (useful for debugging).
        LearningRateMonitor(logging_interval='step'),
    ],
    log_every_n_steps=10,
    gradient_clip_val=1.0
)

print("Starting model training with updated configuration...")
trainer.fit(model, datamodule=datamodule)
print("Training finished!")

In [None]:
import torchmetrics.classification

# Debugging aid: print every name exposed by torchmetrics.classification.
print(dir(torchmetrics.classification))

In [None]:
# --- Validation Step ---
# Run the validation loop with the trained model and show what the trainer returns.
print("Starting validation...")
validation_results = trainer.validate(model, datamodule=datamodule)
print("Validation results:", validation_results)

# List each metric of the first result entry with four decimals.
if validation_results:
    print("\nDetailed Validation Metrics:")
    first_validation_entry = validation_results[0]
    for metric_name in first_validation_entry:
        metric_value = first_validation_entry[metric_name]
        print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# --- Test Step ---
# Run the test loop with the trained model and show what the trainer returns.
print("Starting testing...")
test_results = trainer.test(model, datamodule=datamodule)
print("Test results:", test_results)

# List each metric of the first result entry with four decimals.
if test_results:
    print("\nDetailed Test Metrics:")
    first_test_entry = test_results[0]
    for metric_name in first_test_entry:
        metric_value = first_test_entry[metric_name]
        print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# --- Debug: Check Prediction Output Structure ---
import torch

print("Debugging prediction output structure...")

# Prepare the prediction stage and pull a single batch from its dataloader.
datamodule.setup("predict")
predict_dataloader = datamodule.predict_dataloader()
test_batch = next(iter(predict_dataloader))

# Batches are dictionaries; the model is called with the image tensor only.
test_batch_image = test_batch['image']

# One forward pass in evaluation mode, without tracking gradients.
with torch.no_grad():
    model.eval()
    output = model(test_batch_image)

print(f"Output type: {type(output)}")

# Recursively check for tensors, including nested dictionaries and lists
def inspect_structure(obj, depth=0, path=""):
    """Print a tree view of a nested structure of tensors and containers.

    Tensors are reported with shape and dtype, lists/tuples and dicts are
    reported with their size and then descended into, and anything else is
    reported by its type only.

    Args:
        obj: The object to describe.
        depth (int): Current nesting level; controls the indentation
            (two spaces per level).
        path (str): Expression-like label for ``obj`` (e.g. ``"output[0]['x']"``),
            extended automatically while recursing.
    """
    indent = "  " * depth
    if isinstance(obj, torch.Tensor):
        print(f"{indent}{path}: Tensor(shape={obj.shape}, dtype={obj.dtype})")
    elif isinstance(obj, (list, tuple)):
        # __name__ (double underscores) is the attribute that holds the class name.
        print(f"{indent}{path}: {type(obj).__name__} with {len(obj)} elements")
        for i, item in enumerate(obj):
            inspect_structure(item, depth + 1, f"{path}[{i}]")
    elif isinstance(obj, dict):
        print(f"{indent}{path}: dict with {len(obj)} keys")
        for key, value in obj.items():
            inspect_structure(value, depth + 1, f"{path}['{key}']")
    else:
        print(f"{indent}{path}: {type(obj)}")

# Print the structure of the model output captured above.
# NOTE(review): inspect_structure only descends into tensors, lists, tuples and
# dicts; if `output` is none of those, only its type is printed.
inspect_structure(output, path="output")

In [None]:
import torch
import matplotlib.pyplot as plt

# Function to get prediction
def predict(model, dataloader, device):
    """Run the model over a dataloader of ``(images, targets)`` pairs.

    First draft of the prediction helper; it is redefined further down in the
    notebook. It expects the model to return a logits tensor whose class
    scores are on dimension 1.

    Args:
        model: Callable module with an ``eval()`` method.
        dataloader: Iterable yielding ``(images, targets)`` tensor pairs.
        device: Device the images are moved to before the forward pass.

    Returns:
        tuple: ``(imgs, masks, preds)`` — three lists with one CPU tensor per
        batch (inputs, targets and arg-max class indices respectively).
    """
    model.eval()
    collected_images, collected_targets, collected_preds = [], [], []
    with torch.no_grad():
        for images, targets in dataloader:
            images = images.to(device)
            logits = model(images)
            # Highest-scoring class per pixel.
            class_indices = torch.argmax(logits, dim=1)

            collected_preds.append(class_indices.cpu())
            collected_images.append(images.cpu())
            collected_targets.append(targets.cpu())
    return collected_images, collected_targets, collected_preds


In [None]:
# Function to visualize input, ground truth and prediction
def visualize_predictions(imgs, masks, preds, num_samples=3):
    """Show input, ground truth and prediction side by side for a few samples.

    Works with both layouts produced by the ``predict`` helpers in this
    notebook: lists of per-batch tensors (the first sample of batch ``i`` is
    shown) and single concatenated tensors (sample ``i`` is shown).

    Args:
        imgs: Indexable of image tensors, each ``(C, H, W)`` or ``(B, C, H, W)``.
        masks: Indexable of ground-truth masks, each ``(H, W)`` or ``(B, H, W)``.
        preds: Indexable of predicted masks, same layout as ``masks``.
        num_samples (int): Maximum number of figures to draw; clamped to the
            number of available entries.
    """
    def _single(entry, sample_ndim):
        # A stored batch has one more dimension than a single sample: take its first item.
        return entry[0] if entry.ndim > sample_ndim else entry

    for i in range(min(num_samples, len(imgs))):
        image = _single(imgs[i], 3)
        truth = _single(masks[i], 2)
        predicted = _single(preds[i], 2)

        if image.shape[0] >= 3:
            # imshow only accepts 3 (or 4) colour channels, so multi-band inputs
            # are reduced to their first three bands and min-max scaled to [0, 1].
            # NOTE(review): assumes the first three bands form a sensible colour
            # composite — confirm the band order of the dataset.
            rgb = image[:3].float()
            lo, hi = rgb.min(), rgb.max()
            rgb = (rgb - lo) / (hi - lo) if hi > lo else torch.zeros_like(rgb)
            display_image = rgb.permute(1, 2, 0).numpy()
        else:
            display_image = image[0].numpy()

        # Share one colour scale between both masks so a class gets the same
        # colour in the ground truth and in the prediction.
        vmin = min(float(truth.min()), float(predicted.min()))
        vmax = max(float(truth.max()), float(predicted.max()))

        fig, axes = plt.subplots(1, 3, figsize=(12, 4))

        axes[0].imshow(display_image, cmap="gray")  # cmap is ignored for RGB data
        axes[0].set_title("Input Image")

        axes[1].imshow(truth, cmap="tab20", vmin=vmin, vmax=vmax)
        axes[1].set_title("Ground Truth")

        axes[2].imshow(predicted, cmap="tab20", vmin=vmin, vmax=vmax)
        axes[2].set_title("Prediction")

        for axis in axes:
            axis.axis("off")

        plt.show()


# Task
Generate a `submission.csv` file containing predictions on the test dataset using the trained model.

## Make predictions

### Subtask:
Use the trained model to generate predictions on the test dataset.


**Reasoning**:
Use the trained model to generate predictions on the test dataset by calling the previously defined `predict` function.



In [None]:
import torch
import matplotlib.pyplot as plt

# Function to get prediction - UPDATED to handle dictionary output
def predict(model, dataloader, device):
    """Run the model over a dataloader of dictionary batches.

    Second draft of the prediction helper; it is redefined once more further
    down in the notebook. It expects the model to return a logits tensor whose
    class scores are on dimension 1.

    Args:
        model: Callable module with an ``eval()`` method.
        dataloader: Iterable yielding dicts with the keys ``'image'``,
            ``'mask'`` and ``'filename'``.
        device: Device the tensors are moved to before the forward pass.

    Returns:
        tuple: ``(imgs, masks, preds, filenames)`` — the first three are
        tensors concatenated along dimension 0 (on the CPU), the last is the
        flat list of file names in the same order.
    """
    model.eval()
    image_chunks, mask_chunks, pred_chunks = [], [], []
    filenames = []
    with torch.no_grad():
        for batch in dataloader:
            images = batch['image'].to(device)
            targets = batch['mask'].to(device)
            filenames.extend(batch['filename'])

            logits = model(images)
            # Highest-scoring class per pixel.
            class_indices = torch.argmax(logits, dim=1)

            pred_chunks.append(class_indices.cpu())
            image_chunks.append(images.cpu())
            mask_chunks.append(targets.cpu())

    # Merge the per-batch tensors into one tensor per output.
    preds = torch.cat(pred_chunks, dim=0)
    imgs = torch.cat(image_chunks, dim=0)
    masks = torch.cat(mask_chunks, dim=0)

    return imgs, masks, preds, filenames

**Reasoning**:
The previous attempt to access `.logits` or `.pred` attributes failed, and the printed attributes of the `ModelOutput` object include `output`. This suggests that the actual tensor containing the raw logits is likely stored in the `.output` attribute. Let's update the `predict` function to access `outputs.output` to retrieve the tensor before applying `argmax`.



In [None]:
import torch
import matplotlib.pyplot as plt

# Function to get prediction - UPDATED to access .output attribute
def predict(model, dataloader, device):
    """Run the model over a dataloader of dictionary batches and collect results.

    The model may return either a plain logits tensor or a wrapper object that
    exposes the logits through an ``.output`` attribute (as the task model in
    this notebook does). Class scores are expected on dimension 1.

    Args:
        model: Callable module with an ``eval()`` method.
        dataloader: Iterable yielding dicts with the keys ``'image'`` and
            ``'filename'``, and optionally ``'mask'``.
        device: Device the images are moved to before the forward pass.

    Returns:
        tuple: ``(imgs, masks, preds, filenames)``. ``imgs``, ``masks`` and
        ``preds`` are CPU tensors concatenated along dimension 0; ``filenames``
        is the flat list of file names in the same order. ``masks`` is an
        empty tensor when the batches carry no ``'mask'`` entry, and all three
        tensors are empty when the dataloader yields nothing.

    Raises:
        TypeError: If the model output is neither a tensor nor an object whose
            ``.output`` attribute is a tensor.
    """
    model.eval()
    preds, imgs, masks, filenames = [], [], [], []
    with torch.no_grad():
        for batch in dataloader:
            images = batch['image'].to(device)
            filenames.extend(batch['filename'])

            outputs = model(images)
            # Unwrap the logits: plain tensors pass through, wrapper objects
            # are expected to carry them in `.output`.
            logits = outputs if isinstance(outputs, torch.Tensor) else getattr(outputs, "output", None)
            if not isinstance(logits, torch.Tensor):
                raise TypeError(
                    f"Cannot extract logits from model output of type {type(outputs).__name__}; "
                    f"available attributes: {dir(outputs)}"
                )

            preds.append(torch.argmax(logits, dim=1).cpu())
            imgs.append(images.cpu())
            # Masks are only stored, never fed to the model, so they do not
            # need a round trip through `device`.
            if 'mask' in batch:
                masks.append(batch['mask'].cpu())

    # torch.cat rejects an empty list, so fall back to empty tensors.
    def _concat(chunks, dtype=None):
        return torch.cat(chunks, dim=0) if chunks else torch.empty(0, dtype=dtype)

    return _concat(imgs), _concat(masks), _concat(preds, dtype=torch.long), filenames

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# The model has to live on the same device as the batches it receives.
model.to(device)

# Prepare the prediction stage and fetch its dataloader.
datamodule.setup("predict")
predict_dataloader = datamodule.predict_dataloader()

# Run inference over the whole prediction set.
imgs, masks, preds, filenames = predict(model, predict_dataloader, device)

print(f"Generated predictions for {len(preds)} images.")
# Report the result shapes when anything was predicted.
if len(preds) > 0:
    for label, tensor in (("images", imgs), ("masks", masks), ("predictions", preds)):
        print(f"Shape of {label} tensor: {tensor.shape}")
    print(f"Number of filenames: {len(filenames)}")


## Format predictions

### Subtask:
Process the model's output to extract the predicted class labels for each pixel and format them according to the submission requirements (typically a run-length encoding or similar format for segmentation masks).


**Reasoning**:
Define the RLE function and iterate through the predicted masks to generate RLE strings and store them with their filenames.



In [None]:
# Function to perform Run-Length Encoding (RLE)
def rle_encode(mask):
    """
    Run-length encode every non-background class of a 2D label mask.

    The mask is flattened row by row. For each class value other than 0, the
    pixels of that class are encoded as space-separated ``start length`` pairs,
    where ``start`` is the 1-based index of the first pixel of a run in the
    flattened mask.

    Args:
        mask (np.ndarray): A 2D numpy array of class indices.

    Returns:
        dict: Maps each non-zero class value present in ``mask`` to its RLE
        string, in ascending class order. Empty if the mask contains only 0.
    """
    flat_labels = np.asarray(mask).flatten()
    rle_strings = {}

    for class_id in np.unique(flat_labels):
        if class_id == 0:  # background is not encoded
            continue

        # Pad the per-class binary mask with a zero on both sides so that runs
        # touching either end still produce a rising and a falling edge. The
        # edges then always come in (start, end) pairs.
        binary = (flat_labels == class_id).astype(np.uint8)
        padded = np.concatenate([[0], binary, [0]])
        edges = np.where(padded[1:] != padded[:-1])[0] + 1  # 1-based positions
        starts = edges[::2]
        lengths = edges[1::2] - starts

        # .item() turns the numpy scalar into the matching plain Python value.
        rle_strings[class_id.item()] = " ".join(
            f"{int(start)} {int(length)}" for start, length in zip(starts, lengths)
        )

    return rle_strings


# Build one record per predicted mask: the image ID plus its per-class RLE strings.
rle_data = []
for i, predicted_mask_tensor in enumerate(preds):
    # File names look like '<...>/<image id>_input.tif'; keep only the image ID.
    image_id_full = filenames[i]
    file_name = os.path.basename(image_id_full)
    image_id = file_name.replace('_input.tif', '')

    # Drop any singleton dimensions and hand the mask to numpy.
    predicted_mask_np = predicted_mask_tensor.squeeze().numpy()

    # Class ID -> RLE string for this mask. The final submission layout is
    # derived from these records in a later step.
    rle_strings_by_class = rle_encode(predicted_mask_np)
    record = {'ImageId': image_id, 'RLEs': rle_strings_by_class}
    rle_data.append(record)

print(f"Generated RLE data for {len(rle_data)} images.")
# Show the first record as a sample when there is one.
if len(rle_data) > 0:
    print("\nSample RLE data for the first image:")
    print(rle_data[0])

## Create submission file

### Subtask:
Generate a CSV file containing the image IDs from the test set and their corresponding formatted predictions.


**Reasoning**:
Create a list of rows for the submission DataFrame by iterating through the generated RLE data and formatting it as required for the submission file. Then create a pandas DataFrame from this list and display its head.



In [None]:
# Flatten the per-image records into one row per (image, class) pair that
# actually has an RLE string.
submission_rows = [
    {
        'ImageId': image_data['ImageId'],
        'ClassId': class_id,
        'EncodedPixels': rle_string
    }
    for image_data in rle_data
    for class_id, rle_string in image_data['RLEs'].items()
]

# Turn the rows into a DataFrame and preview it.
submission_df = pd.DataFrame(submission_rows)

print("Submission DataFrame Head:")
display(submission_df.head())

**Reasoning**:
The submission DataFrame is empty, which indicates that the `rle_data` list might be empty or the `RLEs` dictionary within each entry is empty. This could be due to the model predicting only the background class (class 0), which is typically not encoded in RLE for submission. To address this, I need to ensure that even if no foreground classes are predicted, an entry for the image with an empty 'EncodedPixels' is included for each expected class ID (1 to 5, as class 0 is background). I will modify the loop to iterate through the expected class IDs (1 to 5) for each image and add a row with an empty RLE string if no RLE was generated for that class.



In [None]:
# Column layout of the submission table.
SUBMISSION_COLUMNS = ['ImageId', 'ClassId', 'EncodedPixels']

# Expected class IDs for the submission (1 to 5, excluding background 0)
expected_class_ids = list(range(1, 6))

# One row per image and expected class; classes without predicted pixels get
# an empty RLE string so every (image, class) pair is present.
submission_rows = [
    {
        'ImageId': image_data['ImageId'],
        'ClassId': class_id,
        'EncodedPixels': image_data['RLEs'].get(class_id, "")
    }
    for image_data in rle_data
    for class_id in expected_class_ids
]

# Passing the columns explicitly keeps the frame well-formed (and the column
# lookups below valid) even when there are no rows at all.
submission_df = pd.DataFrame(submission_rows, columns=SUBMISSION_COLUMNS)

# Print the head of the created DataFrame
print("Submission DataFrame Head:")
display(submission_df.head())

# Print the shape of the DataFrame to confirm the number of rows
print(f"\nSubmission DataFrame shape: {submission_df.shape}")

# Check for images with no predictions for any class
images_with_predictions = submission_df[submission_df['EncodedPixels'] != '']['ImageId'].unique()
all_image_ids = [data['ImageId'] for data in rle_data]
# A set makes each membership test O(1) instead of scanning the array per image.
predicted_image_ids = set(images_with_predictions)
images_with_no_foreground_predictions = [img_id for img_id in all_image_ids if img_id not in predicted_image_ids]

print(f"\nTotal unique images in test set: {len(all_image_ids)}")
print(f"Images with at least one foreground prediction: {len(images_with_predictions)}")
print(f"Images with no foreground predictions: {len(images_with_no_foreground_predictions)}")

## Save submission file

### Subtask:
Save the generated CSV file to the desired location.


**Reasoning**:
Save the generated submission DataFrame to a CSV file named 'submission.csv' without including the index.



In [None]:
# Write the submission table, without its index column, to the Kaggle working directory.
submission_path = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_path, index=False)

print(f"Submission file '{submission_path}' created successfully.")

## Summary:

### Data Analysis Key Findings

*   The `predict_dataloader` provides image, mask, and filename information within a dictionary structure.
*   The trained model outputs a `ModelOutput` object, requiring access to the `.output` attribute to retrieve the raw logits tensor for prediction.
*   Predictions were successfully generated for all images in the test set, resulting in a tensor of predicted class indices for each pixel.
*   Run-Length Encoding (RLE) was applied to the predicted masks for each foreground class (classes 1 through 5), generating RLE strings in the 'start length' format.
*   A submission DataFrame was created with columns 'ImageId', 'ClassId', and 'EncodedPixels', ensuring an entry for every image and every expected foreground class (1-5). An empty string is used for 'EncodedPixels' if no foreground pixels were predicted for a specific class in an image.
*   In this specific execution, no foreground predictions were made for any image, resulting in all 'EncodedPixels' being empty strings in the submission DataFrame.

### Insights or Next Steps

*   The current model training might require further tuning or a different architecture as it appears to predict only the background class for the test set, resulting in empty RLE strings for all foreground classes.
*   The generated `submission.csv` file, although correctly formatted, contains only empty predictions for foreground classes, indicating that the model's performance on the test set is currently very low for detecting non-background regions.


# Task
Analyze the file "/kaggle/working/submission.csv" to determine the percentage of pixels predicted for each of the following crop types: Gram, Maize, Mustard, Sugarcane, Wheat, and Other Crop.

## Load the submission data

### Subtask:
Load the `submission.csv` file into a pandas DataFrame.


**Reasoning**:
Load the submission.csv file into a pandas DataFrame and display the head.



In [None]:
# Load the submission.csv file from the Kaggle working directory
# (this rebinds submission_df to the version read back from disk).
submission_df = pd.read_csv('/kaggle/working/submission.csv')
# Preview the first rows as a quick sanity check.
display(submission_df.head())

## Process rle and calculate pixel counts

### Subtask:
Iterate through the DataFrame, decode the RLE strings for each predicted class in each image, and calculate the total number of pixels predicted for each crop type across all images.


**Reasoning**:
Define the RLE decode function and initialize the pixel counts dictionary. Then iterate through the submission DataFrame, decode the RLE for each entry, and accumulate pixel counts for each class. I will assume the image dimensions are 256x256 based on the earlier visualization of a sample image.



In [None]:
import numpy as np

# Define the rle_decode function
def rle_decode(rle_string, height, width):
    """
    Decodes an RLE encoded string into a binary mask.

    Missing encodings are accepted: ``None``, a float NaN (which is how pandas
    reads an empty CSV cell) and empty/whitespace-only strings all produce an
    all-zero mask instead of raising.

    Args:
        rle_string (str | None | float): The RLE encoded string in
                          'start length' format, space separated, with
                          1-based start positions into the flattened mask.
        height (int): The height of the mask.
        width (int): The width of the mask.

    Returns:
        np.ndarray: A 2D uint8 numpy array (height, width) representing the
            decoded mask. The flat buffer is reshaped with NumPy's default
            (row-major) order.

    Raises:
        ValueError: If the encoding contains a non-integer token or an odd
            number of values (a start without a matching length).
    """
    mask = np.zeros(height * width, dtype=np.uint8)

    # NaN is the only float that is not equal to itself; treat it like "no pixels".
    is_nan = isinstance(rle_string, float) and rle_string != rle_string
    if rle_string is None or is_nan:
        return mask.reshape(height, width)

    rle_parts = [int(token) for token in str(rle_string).split()]
    if len(rle_parts) % 2:
        raise ValueError(
            f"RLE string must contain 'start length' pairs, got {len(rle_parts)} values"
        )

    # Even positions are starts, odd positions are the matching run lengths.
    for start, length in zip(rle_parts[0::2], rle_parts[1::2]):
        begin = start - 1  # RLE is 1-based, numpy is 0-based
        mask[begin:begin + length] = 1

    return mask.reshape(height, width)

# Initialize a dictionary to store pixel counts for each crop type (ClassId 1-5)
crop_pixel_counts = {class_id: 0 for class_id in range(1, 6)}

# Assumed image dimensions based on sample visualization
# (not verified against the image files in this cell)
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256

# Walk only the two columns that are needed; this avoids building a full
# Series object per row the way DataFrame.iterrows() does.
for class_id, rle_string in zip(submission_df['ClassId'], submission_df['EncodedPixels']):
    # Skip any class that is not one of the tracked crop types (1-5)
    if class_id not in crop_pixel_counts:
        continue

    # Empty CSV cells are read back as NaN; decode them as an empty encoding
    decoded_mask = rle_decode(str(rle_string) if pd.notna(rle_string) else "",
                              IMAGE_HEIGHT, IMAGE_WIDTH)

    # int() keeps the totals as plain Python integers so the dictionary prints
    # as simple numbers rather than as NumPy scalar reprs
    crop_pixel_counts[class_id] += int(decoded_mask.sum())

print("Total pixel counts for each crop type:")
print(crop_pixel_counts)

## Get total image dimensions

### Subtask:
Obtain the dimensions of the images to calculate the total number of pixels in each image.


**Reasoning**:
Define variables IMAGE_HEIGHT and IMAGE_WIDTH based on the known dimensions of the input images.



In [None]:
# Input image size in pixels, used by the pixel-total calculations
IMAGE_HEIGHT, IMAGE_WIDTH = 256, 256

print(f"Image height: {IMAGE_HEIGHT}, Image width: {IMAGE_WIDTH}")

## Calculate total pixels across all images

### Subtask:
Sum the total number of pixels for all images in the test set.


**Reasoning**:
Calculate the total number of pixels across all images and store it in `total_pixels_all_images`, then print the result to complete the subtask.



In [None]:
# Number of distinct images referenced by the submission
num_unique_images = submission_df['ImageId'].nunique()

# Pixels in one image, then scaled up to every image in the submission
total_pixels_per_image = IMAGE_HEIGHT * IMAGE_WIDTH
total_pixels_all_images = total_pixels_per_image * num_unique_images

print(f"Total number of pixels across all images: {total_pixels_all_images}")

## Calculate percentage for each crop type

### Subtask:
Divide the total pixel count for each crop type by the total number of pixels across all images and multiply by 100 to get the percentage.


**Reasoning**:
Iterate through the crop_pixel_counts dictionary and calculate the percentage for each class by dividing by the total number of pixels across all images. Store the results in a new dictionary and print it.



In [None]:
# Share of all pixels (in percent) predicted for each tracked class id
crop_percentages = {
    class_id: (pixel_count / total_pixels_all_images) * 100
    for class_id, pixel_count in crop_pixel_counts.items()
}

# Show the raw percentage mapping
print("Percentage of pixels predicted for each crop type:")
print(crop_percentages)

## Display the results

### Subtask:
Present the calculated percentages for each crop type.


**Reasoning**:
Define a dictionary mapping class IDs to crop names, print a header, and then iterate through the `crop_percentages` dictionary to print the percentage for each crop type using the defined mapping.



In [None]:
# Class ids 1..5 map, in order, to these crop type names
class_id_to_crop = dict(
    enumerate(('Gram', 'Maize', 'Mustard', 'Sugarcane', 'Wheat'), start=1)
)

# Header for the per-crop report
print("Percentage of pixels predicted for each crop type:")

# One line per class, falling back to a placeholder name for unmapped ids
for class_id, percentage in crop_percentages.items():
    if class_id in class_id_to_crop:
        crop_name = class_id_to_crop[class_id]
    else:
        crop_name = f"Unknown Class {class_id}"
    print(f"{crop_name}: {percentage:.2f}%")

## Summary:

### Data Analysis Key Findings

*   The total number of pixels across all images in the submission dataset is 8,257,536.
*   Based on the provided `submission.csv` and the implemented RLE decoding logic, the total number of pixels predicted for each of the specified crop types (Gram, Maize, Mustard, Sugarcane, and Wheat) is 0.
*   Consequently, the percentage of pixels predicted for Gram, Maize, Mustard, Sugarcane, and Wheat is 0.00% for each crop type.

### Insights or Next Steps

*   Investigate the `EncodedPixels` column in the `submission.csv` file to understand why the RLE decoding resulted in zero predicted pixels for the specified crop types. This could involve checking for empty or invalid RLE strings.


# Task
Improve the semantic segmentation model to correctly predict crop types and increase accuracy, addressing the current output of all zeros for all crop types.

## Review model architecture and parameters

### Subtask:
Examine the current model architecture, hyperparameters (like learning rate, optimizer, etc.), and loss function to ensure they are suitable for this semantic segmentation task and dataset.


**Reasoning**:
Review the model architecture, hyperparameters, and loss function based on the provided code and identify potential areas for improvement.



In [None]:
# Review of Model Configuration:

# 1. Model Architecture:
# Backbone: 'prithvi_eo_v2_300_tl' - This seems to be a large, potentially powerful pre-trained backbone.
# Decoder: 'UperNetDecoder' - UPerNet is a common and effective decoder for semantic segmentation.
# This combination is generally suitable for semantic segmentation of complex imagery like satellite data.
# Potential improvement: While the architecture is good, its size might be an issue on limited hardware,
# or it might require extensive fine-tuning.

# 2. Hyperparameters and Loss Function:
# Loss: 'dice' - Dice loss is good for handling class imbalance, which is common in segmentation.
# Potential improvement: Combining Dice loss with Cross-Entropy loss can sometimes lead to better results.
# This can help with both class imbalance and capturing boundary information.

# Learning Rate (lr): 5e-7 - This is a very low learning rate. While it might prevent instability with a large pre-trained model,
# it could also slow down convergence significantly, especially if the backbone is not fully frozen.
# Potential improvement: A slightly higher learning rate might be beneficial, potentially with a learning rate scheduler.

# Optimizer: 'AdamW' - AdamW is a good choice for many deep learning tasks, including segmentation.

# freeze_backbone: False - The backbone is not frozen, which is good for fine-tuning on the specific dataset.
# However, with a very low learning rate and a large model, training might still be slow.

# ignore_index: -1 - This seems appropriate if -1 is used to denote pixels that should be ignored (e.g., NoData).

# 3. Identified Potential Areas for Improvement:
# - Consider combining Dice loss with Cross-Entropy loss.
# - Experiment with a slightly higher learning rate, potentially with a scheduler.
# - Given the observation of all zero predictions, the current training might not be converging effectively. This could be due to the low learning rate or other training configuration issues not directly related to the model architecture itself (e.g., insufficient training epochs, data loading problems, etc.). However, focusing on the model config as per the subtask: the low learning rate is a primary suspect for slow or ineffective learning.

# Emit the review summary as a single newline-joined message
print("\n".join([
    "Review of the current model configuration completed.",
    "\nPotential areas for improvement:",
    "- Consider combining Dice loss with Cross-Entropy loss.",
    "- Experiment with a slightly higher learning rate and potentially a learning rate scheduler.",
    "- Ensure the training runs for enough epochs for the model to learn effectively at the chosen learning rate.",
]))

**Reasoning**:
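The review identified potential improvements in the loss function and learning rate. The next step is to modify the `SemanticSegmentationTask` to use a combined loss function and adjust the learning rate based on the identified potential improvements, then instantiate the model with the updated configuration. The cell below first defines a `predict` helper that runs a model over a dataloader and collects the images, masks, predicted class indices, and filenames.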



In [None]:
import torch
import matplotlib.pyplot as plt

# Function to get predictions - reads dictionary batches and unwraps wrapped model outputs
def predict(model, dataloader, device):
    """
    Run a model over a dataloader and collect inputs, targets and predictions.

    Args:
        model: A callable torch module. Its forward result is either a logits
            tensor or an object exposing the logits tensor as ``.output``.
        dataloader: An iterable of dictionary batches providing the keys
            'image', 'mask' and 'filename'.
        device: The device the image and mask tensors are moved to before the
            forward pass. The model itself is not moved by this function.

    Returns:
        tuple: ``(imgs, masks, preds, filenames)`` where the first three are
            CPU tensors concatenated along dimension 0 over all batches,
            ``preds`` holds the argmax over dimension 1 of the logits, and
            ``filenames`` is a flat list in iteration order.
    """
    model.eval()
    preds, imgs, masks, filenames = [], [], [], []
    with torch.no_grad():
        for batch in dataloader:
            images = batch['image'].to(device)
            targets = batch['mask'].to(device)
            filenames.extend(batch['filename'])

            outputs = model(images)
            # Some model wrappers return a container object rather than a bare
            # tensor (NOTE(review): terratorch tasks appear to do this - confirm);
            # argmax needs the logits tensor itself, so unwrap `.output` if present.
            if torch.is_tensor(outputs):
                logits = outputs
            else:
                logits = getattr(outputs, 'output', outputs)
            predictions = torch.argmax(logits, dim=1)  # convert to class indices

            # Move everything to the CPU so results do not accumulate on the device
            preds.append(predictions.cpu())
            imgs.append(images.cpu())
            masks.append(targets.cpu())

    # Concatenate the per-batch tensors into single tensors
    preds = torch.cat(preds, dim=0)
    imgs = torch.cat(imgs, dim=0)
    masks = torch.cat(masks, dim=0)

    return imgs, masks, preds, filenames

In [None]:
!pip install --upgrade torchmetrics