# Final EDA Notebook

This notebook provides a comprehensive exploratory data analysis (EDA) of the food classification dataset. The goal is to understand the data distribution, label structure, image properties, and to visualize both ground truth and model predictions. This EDA supports model development and helps identify potential issues or opportunities for improvement.

In [None]:
# Standard library imports
import os
import random
import re
from statistics import multimode

# Allow duplicate OpenMP libraries (fixes some multi-threading issues on some systems).
# Set this BEFORE importing numpy/torch: the OpenMP runtime reads the variable when it
# initialises, which can happen while those packages are being imported, so setting it
# after the imports may be too late to have any effect.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from tqdm import tqdm

## 1. Environment Setup

- **Device selection**: Automatically uses GPU if available, otherwise falls back to CPU.
- **Directory paths**: Set up paths for training images, test images, label file, and category names.


In [None]:
# Report the number of visible CUDA devices, then choose the compute device:
# GPU when CUDA is available, CPU otherwise.
print(f"CUDA Devices: {torch.cuda.device_count()}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
print(f"Using device: {device}")

In [None]:
# Folders holding the training and test images
images_train_dir = "images_train"
images_test_dir = "images_test"

# CSV files (despite the *_dir suffix these two are file paths, not directories):
# the label table read into labels_df and the category table read into category_df
labels_dir = "train_onehot.csv"
label_names_dir = "categories_new.csv"

## 2. Data Loading 

- **DataFrames**: Load label and category data for analysis.

In [None]:
# Read the label table first, then the category table
labels_df, category_df = (pd.read_csv(csv_path) for csv_path in (labels_dir, label_names_dir))

# Quick sanity check: overall shape plus a preview of the first rows
print("Dataset shape:", labels_df.shape)
print("First rows:\n")
labels_df.head()

## 3. Label Analysis

- **Label matrix**: Analyze the distribution of labels per image and images per label.
- **Visualization**: Plot the number of labels per image and the frequency of each label in the dataset.
- **Insights**: Identify class imbalance and multi-label characteristics.

In [None]:
# Label matrix: every column after the first one, one row per image
label_matrix = labels_df.iloc[:, 1:].values
print("Label stats:")

# How many labels does each image carry?
labels_per_image = label_matrix.sum(axis=1)
print(f"Labels per image: mean={labels_per_image.mean():.2f}, min={labels_per_image.min()}, max={labels_per_image.max()}")

# Tabulate "number of labels -> number of images" and add a running percentage
labels_per_image_counts = pd.Series(labels_per_image).value_counts().sort_index()
label_count_df = pd.DataFrame({
    'Number of Labels': labels_per_image_counts.index,
    'Image Count': labels_per_image_counts.values,
}).assign(**{
    'Cumulative %': lambda counts: counts['Image Count'].cumsum() / counts['Image Count'].sum() * 100
})

# Bar chart of the same distribution
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=label_count_df, x='Number of Labels', y='Image Count', ax=ax)
ax.set_title("Distribution of Labels per Image")
ax.set_xlabel("Number of Labels")
ax.set_ylabel("Number of Images")
plt.show()

print("\nImages by number of labels")
label_count_df

In [None]:
# Column sums of the label matrix: how many images carry each label
images_per_label = label_matrix.sum(axis=0)
print(f"Images per label: mean={images_per_label.mean():.2f}, min={images_per_label.min()}, max={images_per_label.max()}")

# Pair every category name with its image count, most frequent label first
label_freq_df = (
    pd.DataFrame({'Label': category_df['name'], 'Image Count': images_per_label})
    .sort_values(by='Image Count', ascending=False)
    .reset_index(drop=True)
)
label_freq_df


## 4. Image Size Exploration

- **Image dimensions**: Analyze the width and height of all training images.
- **Visualization**: Plot the distribution of image sizes to inform preprocessing and augmentation choices.

In [None]:
# Filenames live in the first column of the label table
img_filenames = labels_df.iloc[:, 0].values

widths = []
heights = []

# Record the pixel size of every training image; each file is closed again on leaving the with-block
for filename in tqdm(img_filenames):
    image_path = os.path.join(images_train_dir, filename)
    with Image.open(image_path) as img:
        widths.append(img.width)
        heights.append(img.height)

# Summary statistics per dimension
print("Image dimension stats:")
print(f"Width: mean={np.mean(widths):.2f}, median: {np.median(widths)}, min={np.min(widths)}, max={np.max(widths)}, mode={multimode(widths)}")
print(f"Height: mean={np.mean(heights):.2f}, median: {np.median(heights)}, min={np.min(heights)}, max={np.max(heights)}, mode={multimode(heights)}")

# Overlaid histograms of widths and heights
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(widths, color="blue", label="Widths", ax=ax)
sns.histplot(heights, color="orange", label="Heights", ax=ax)
ax.legend()
ax.set_title("Image Dimensions (Pre-Resize)")
plt.show()


## 5. Dataset and Dataloader Construction

- **Custom Dataset**: Defines a PyTorch Dataset for both training and test images, supporting flexible transforms.
- **Dataloaders**: Efficiently load images in batches for analysis and visualization.

In [None]:
image_dim_px = 224  # side length, in pixels, used when resizing images

class FoodDataset(Dataset):
    """Image dataset that serves either labelled training images or unlabelled test images.

    Args:
        img_dir: Directory containing the image files.
        labels_csv: Optional path to a CSV whose first column holds image filenames and
            whose remaining columns hold the labels. When given, the dataset is in
            training mode; when None, it is in test mode and lists ``img_dir`` instead.
        transform: Optional callable applied to each RGB PIL image before it is returned.

    Items:
        Training mode: ``(image, label)`` where ``label`` is a float tensor built from
        the image's label row.
        Test mode: ``(image, filename)``.
    """

    def __init__(self, img_dir, labels_csv = None, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.is_training = labels_csv is not None # If labels_csv is provided, it's a training dataset. Else, it's a test dataset.

        if self.is_training:
            # Load training data
            self.labels_df = pd.read_csv(labels_csv)
            self.filenames = self.labels_df.iloc[:, 0].values  # image filenames
            self.labels = self.labels_df.iloc[:, 1:].values.astype('float')  # one-hot labels
        else:
            # Keep only regular, non-hidden files: entries such as ".ipynb_checkpoints"
            # (a directory Jupyter creates) or ".DS_Store" would otherwise be treated
            # as images and make Image.open fail in __getitem__.
            self.filenames = sorted(
                name for name in os.listdir(img_dir)
                if not name.startswith('.') and os.path.isfile(os.path.join(img_dir, name))
            )
            self.labels = None  # No labels for the test set

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB, apply the transform, and pair it with its label or filename."""
        # Find the image file path and open it; the with-block releases the file handle
        # as soon as the pixel data has been converted.
        img_path = os.path.join(self.img_dir, self.filenames[idx])
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        if self.transform:
            image = self.transform(image)

        if self.is_training:
            # For training data, return the image and its label
            label = torch.tensor(self.labels[idx])
            return image, label
        else:
            # For test data, return the image and its filename
            return image, self.filenames[idx]

# Resize every image to a square of image_dim_px pixels, then convert it to a tensor
transform = transforms.Compose(
    [transforms.Resize((image_dim_px, image_dim_px)), transforms.ToTensor()]
)

# Labelled training set and unlabelled test set share the same transform
train_dataset = FoodDataset(img_dir=images_train_dir, labels_csv=labels_dir, transform=transform)
test_dataset = FoodDataset(img_dir=images_test_dir, transform=transform)

# Batched loaders over both datasets (default DataLoader settings apart from the batch size)
batch_size = 64
train_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=batch_size) for dataset in (train_dataset, test_dataset)
)

## 6. Visualization Utilities

- **plot_with_labels**: Visualize random samples from the dataset, showing ground truth or predicted labels.
- **Usage**: Supports both training and test sets, and can display model predictions from submission files.

In [None]:
def plot_with_labels(dataset, indices, submission_df=None):
    """Plot the dataset items at ``indices`` side by side, titled with their label names.

    Args:
        dataset: Indexable dataset. Items must be ``(image, label)`` pairs when
            ``submission_df`` is None, or ``(image, filename)`` pairs otherwise. The image
            must support ``permute``/``numpy`` (a CHW tensor).
        indices: Sequence of dataset indices to draw; one subplot per index.
        submission_df: Optional DataFrame whose first column holds filenames and whose
            remaining columns hold per-category predictions. When given, titles show the
            categories predicted as 1 for each image's filename.

    Label names are looked up by position in the module-level ``category_df['name']``.
    Returns None; with no indices nothing is drawn.
    """
    num_samples = len(indices)
    if num_samples == 0:
        # Nothing to draw: avoid asking matplotlib for a zero-width figure / 0-column grid.
        return

    plt.figure(figsize=(3 * num_samples, 4))  # dynamically scale width

    # If using submission_df, build a filename-to-row lookup
    if submission_df is not None:
        filename_to_row = {fname: i for i, fname in enumerate(submission_df.iloc[:, 0].values)}

    for i, idx in enumerate(indices):
        image, label_or_filename = dataset[idx]
        image_np = image.permute(1, 2, 0).numpy()  # CHW -> HWC for plotting

        plt.subplot(1, num_samples, i + 1)
        plt.imshow(image_np)
        plt.axis('off')

        if submission_df is not None:
            # Test set: label_or_filename is the filename
            filename = label_or_filename
            row_idx = filename_to_row.get(filename, None)
            if row_idx is not None:
                label_row = submission_df.iloc[row_idx, 1:].values.astype(float)
                label_indices = np.where(label_row == 1)[0]
                label_names = [category_df['name'].iloc[j] for j in label_indices]
                if not label_names:
                    label_names = ["(No label assigned)"]
            else:
                # The image's filename does not appear in the submission file
                label_names = ["Not found"]
        else:
            # Train set: label_or_filename is a tensor
            label_indices = np.where(label_or_filename.numpy() == 1)[0]
            label_names = category_df['name'].iloc[label_indices].tolist()
        # Make title string: one label name per line
        title = "\n".join(label_names)
        plt.title(title, fontsize=9)

    plt.tight_layout()
    plt.show()


# Draw a handful of distinct training images at random and show them with their labels
num_samples = 7
random_indices = np.random.choice(len(train_dataset), size=num_samples, replace=False)
plot_with_labels(dataset=train_dataset, indices=random_indices)

## 7. Post-Submission Analysis

- **Prediction visualization**: Plot random test images with predicted labels from a submission file.
- **Unlabeled images**: Count and report the number of test images with no labels assigned in a submission.

In [None]:
# Post-submission exploration: show random test images with the labels a submission file assigns them

# Predictions to visualise
submission_df = pd.read_csv("ensemble_submission_nik_v11_1.csv")

# Pick distinct test-set positions at random and plot them with their predicted labels
num_samples = 7
random_indices = random.sample(range(len(test_dataset)), k=num_samples)
plot_with_labels(dataset=test_dataset, indices=random_indices, submission_df=submission_df)

In [None]:
# Post-submission exploration: how many images did the submission leave without any label?

submission_df = pd.read_csv("ensemble_submission_nik_v11_1.csv")

# Sum the prediction columns per row; a row sum of 0 means no label was assigned
labels_per_prediction = submission_df.iloc[:, 1:].sum(axis=1)
no_label_count = labels_per_prediction.value_counts().get(0, 0)
print(f"Number of images with no labels assigned in submission: {no_label_count}/{len(submission_df)} ({no_label_count / len(submission_df) * 100:.2f}%)")


## 8. Normalization Metrics

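- **Mean and std calculation**: Compute per-channel mean and standard deviation over the training set. The reported std is the average of the per-image standard deviations, which approximates (but is not identical to) the standard deviation pooled over all pixels.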
- **Purpose**: Used for normalization in model training and inference.

In [None]:
def get_mean_std(dataloader):
    """Compute per-channel mean and std of the images yielded by ``dataloader``.

    Each batch is flattened to (B, C, H*W); the per-image channel means and stds are summed
    over all batches and divided by the number of images. The std returned is therefore the
    average of per-image stds, not the std pooled over every pixel in the dataset.

    Args:
        dataloader: Iterable yielding ``(images, _)`` batches; ``images`` is assumed to be a
            float tensor shaped (B, C, H, W).

    Returns:
        ``(mean, std)``: two CPU tensors with one value per channel.

    Raises:
        ValueError: If the dataloader yields no images.
    """
    mean = 0.0
    std = 0.0
    total_images = 0

    for images, _ in tqdm(dataloader):
        # Tensor.to() returns a new tensor instead of moving the batch in place, so the
        # result has to be re-bound for the computation to actually run on `device`.
        images = images.to(device)
        batch_samples = images.size(0)
        images = images.view(batch_samples, images.size(1), -1)  # (B, C, H*W)
        mean += images.mean(2).sum(0)
        std += images.std(2).sum(0)
        total_images += batch_samples

    if total_images == 0:
        raise ValueError("dataloader yielded no images; cannot compute mean/std")

    mean /= total_images
    std /= total_images
    # Hand the statistics back on the CPU regardless of where they were accumulated
    return mean.cpu(), std.cpu()

# Normalisation statistics of the training images, one value per channel
mean, std = get_mean_std(train_dataloader)
for stat_label, stat_value in (("Mean:", mean), ("Std:", std)):
    print(stat_label, stat_value)

## 9. Model / Ensemble Disagreement Analysis

- **Crosstabulation**: Compare multiple submission files to analyze pairwise model or ensemble disagreements.

In [None]:
# Crosstabulate pairwise disagreements between submission files

# Create list of submission files here
appends = ['13_0_1', '15_0_1', '16_0_1', '17_0_1','18', '19', '20']
submission_files = ['submission_nik_v' + append + '.csv' for append in appends]
submission_files.append('submission_billy_swin_60f1.csv')

# Read all submissions into a list of DataFrames
submissions = [pd.read_csv(f) for f in submission_files]

# Ensure all have the same columns and the same image order as the first submission.
# np.array_equal is used instead of `(a == b).all()` because it also copes with arrays of
# different lengths (it simply returns False) rather than failing inside the comparison.
reference = submissions[0]
for fname, df in zip(submission_files[1:], submissions[1:]):
    assert list(df.columns) == list(reference.columns), f"Columns do not match in {fname}!"
    # The filename column is read by position, like everywhere else in this notebook
    assert np.array_equal(df.iloc[:, 0].values, reference.iloc[:, 0].values), "Filenames do not match!"

# Stack label predictions (excluding filename column)
preds = [df.iloc[:, 1:].values for df in submissions]
preds = np.stack(preds, axis=0)  # shape: (num_models, num_images, num_labels)

num_models = len(submissions)
disagreement_matrix = np.zeros((num_models, num_models), dtype=int)

# Disagreement is symmetric and a model never disagrees with itself, so each unordered pair
# is computed once and mirrored; the diagonal stays 0.
for i in range(num_models):
    for j in range(i + 1, num_models):
        # Count number of images where any label prediction differs between model i and model j
        num_differing = np.sum(np.any(preds[i] != preds[j], axis=1))
        disagreement_matrix[i, j] = num_differing
        disagreement_matrix[j, i] = num_differing

# Create a DataFrame for pretty display
# Each file is labelled with the first run of two digits in its name
# (e.g. "13" for submission_nik_v13_0_1.csv, "60" for submission_billy_swin_60f1.csv).
labels = []
for fname in submission_files:
    match = re.search(r'(\d{2})', fname)
    if match:
        labels.append(match.group(1))
    else:
        labels.append(fname)  # fallback to filename if no 2-digit number found

crosstab_df = pd.DataFrame(disagreement_matrix, index=labels, columns=labels)
print("Pairwise number of images with any label disagreement between models:")
display(crosstab_df)