In [None]:
# ================================================================
# SNAPSHELF - FOOD IMAGE CLASSIFICATION MODEL COMPARISON
# ================================================================
# Project:      SnapShelf - Smart Food Inventory Management
# Module:       MOD002691 - Final Project (BSc Software Engineering)
# Institution:  Anglia Ruskin University, Cambridge
# Author:       Oriol Morros Vilaseca (SID: 2270056)
# Supervisor:   Mr Vitaliy Milke
# Date:         January 2026
# ================================================================
#
# PURPOSE:
# This notebook prepares a unified dataset for comparing three
# image classification approaches for food recognition:
#   1. Custom CNN (trained from scratch)
#   2. EfficientNetB0 (transfer learning)
#   3. YOLOv8 (object detection adapted for classification)
#
# RESEARCH QUESTION:
# How do different image classification approaches compare in
# accuracy, inference time, and computational efficiency for
# household food item recognition?
#
# DATASET SOURCES:
#   - Kaggle: moltean/fruits (Fruits-360)
#   - Kaggle: sshikamaru/fruit-recognition
#   - Kaggle: utkarshsaxenadn/fruits-classification
#
# OUTPUT:
# A merged dataset with 70/15/15 train/validation/test split
# across 14 food categories, ready for model training.
#
# ================================================================

import sys
import platform
from datetime import datetime

# Run banner: notebook title followed by the environment details
# (timestamp, interpreter version, OS platform) of this execution.
print(
    "=" * 60,
    "SNAPSHELF - DATASET PREPARATION",
    "=" * 60,
    f"Execution Date:  {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Python Version:  {sys.version.split()[0]}",
    f"Platform:        {platform.platform()}",
    "=" * 60,
    sep="\n",
)

SNAPSHELF - DATASET PREPARATION
Execution Date:  2026-01-03 15:59:35
Python Version:  3.12.12
Platform:        Linux-6.6.105+-x86_64-with-glibc2.35


In [None]:
# ================================================================
# CELL 2: MOUNT GOOGLE DRIVE & IMPORT LIBRARIES
# ================================================================
# Google Drive is used for:
#   - Loading source datasets (input)
#   - Saving processed dataset (output)
#   - Persistent storage between Colab sessions
#
# ================================================================

from google.colab import drive
import os
import shutil
import random
import numpy as np
from pathlib import Path
from collections import defaultdict
import matplotlib.pyplot as plt

# Mount Google Drive at /content/drive. The source archives (DRIVE_BASE) and
# the final zip (DRIVE_ZIP_PATH) defined in later cells both live under this
# mount point, so this must run before any cell that touches those paths.
drive.mount('/content/drive')

# Set random seed for reproducibility.
# Both Python's `random` module and NumPy's global RNG are seeded; the split
# cell passes SEED on to create_stratified_split as well.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Printed unconditionally once drive.mount() has returned.
print("Google Drive mounted successfully.")
print(f"Random seed set to: {SEED}")

Mounted at /content/drive
Google Drive mounted successfully.
Random seed set to: 42


In [None]:
# ================================================================
# CELL 3: PROJECT CONFIGURATION
# ================================================================
# Central configuration for all paths and parameters.
# Modify SOURCE_DATASETS if your zip files are in a different location.
# ================================================================

# -----------------------------
# SOURCE DATASET PATHS (Input)
# -----------------------------
# These are the 3 Kaggle datasets stored in your Google Drive
# Folder on the mounted Drive that holds the three source archives.
DRIVE_BASE = "/content/drive/MyDrive/smartfoodai_datasets"

# Maps a short dataset label (used only in log output) to its zip path.
# The trailing comments record which Kaggle dataset each archive is meant to
# contain, per the original author's notes.
SOURCE_DATASETS = {
    "fruits_360": f"{DRIVE_BASE}/archive (3).zip",        # moltean/fruits
    "fruit_recognition": f"{DRIVE_BASE}/archive (1).zip", # sshikamaru/fruit-recognition
    "fruits_classification": f"{DRIVE_BASE}/archive (2).zip"  # utkarshsaxenadn/fruits-classification
}

# -----------------------------
# OUTPUT PATHS
# -----------------------------
# Where the processed dataset will be saved
# NOTE(review): in the cells reviewed, only EXTRACTION_DIR is used after this
# cell. OUTPUT_BASE is echoed below, while MERGED_DIR / TRAIN_DIR / VAL_DIR /
# TEST_DIR are not referenced again: the copy cell writes to
# /content/snapshelf_dataset and the zip cell saves under
# MyDrive/snapshelf_datasets (plural). Confirm whether these constants are
# still needed or should point at the real output location.
OUTPUT_BASE = "/content/drive/MyDrive/snapshelf_dataset"
EXTRACTION_DIR = "/content/datasets_raw"      # Temporary extraction (local)
MERGED_DIR = "/content/merged_dataset"        # Temporary merged (local)

# Final output directories (saved to Drive)
TRAIN_DIR = f"{OUTPUT_BASE}/train"
VAL_DIR = f"{OUTPUT_BASE}/val"
TEST_DIR = f"{OUTPUT_BASE}/test"

# -----------------------------
# DATASET CONFIGURATION
# -----------------------------
# 14 target food categories for classification.
# Key   = output class name (also used as the output folder name and as the
#         filename prefix when images are copied).
# Value = keywords that matches_target_class() looks for in the normalised
#         (lower-cased, "_" / "-" replaced by spaces) source folder name.
# Dict order matters: scan_for_images() assigns a folder to the FIRST class
# whose keywords match.
TARGET_CLASSES = {
    "apple": ["apple"],
    "banana": ["banana"],
    "orange": ["orange"],
    "lemon": ["lemon"],
    "strawberry": ["strawberry"],
    "grape": ["grape"],
    "peach": ["peach"],
    "tomato": ["tomato"],
    "potato": ["potato"],
    "onion": ["onion"],
    "carrot": ["carrot"],
    "bell_pepper_red": ["pepper red", "bell pepper red", "red pepper"],
    "bell_pepper_green": ["pepper green", "bell pepper green", "green pepper"],
    "cucumber": ["cucumber"]
}

# Data split ratios (must sum to 1.0)
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Image settings
# Compared against the lower-cased file extension, so ".JPG" is accepted too.
VALID_EXTENSIONS = {".jpg", ".jpeg", ".png"}

# -----------------------------
# VERIFY CONFIGURATION
# -----------------------------
print("=" * 60)
print("PROJECT CONFIGURATION")
print("=" * 60)

# Existence check is informational only: a missing archive is flagged in the
# output but does not stop the cell.
print(f"\n[SOURCE DATASETS]")
for name, path in SOURCE_DATASETS.items():
    exists = "✓" if os.path.exists(path) else "✗ NOT FOUND"
    print(f"  {name}: {exists}")

print(f"\n[OUTPUT LOCATION]")
print(f"  {OUTPUT_BASE}")

print(f"\n[TARGET CLASSES] ({len(TARGET_CLASSES)} categories)")
for cls in TARGET_CLASSES.keys():
    print(f"  • {cls}")

print(f"\n[DATA SPLIT]")
print(f"  Train:      {TRAIN_RATIO*100:.0f}%")
print(f"  Validation: {VAL_RATIO*100:.0f}%")
print(f"  Test:       {TEST_RATIO*100:.0f}%")

# Validate split ratios (0.001 tolerance absorbs floating-point rounding).
# Note that `assert` statements are skipped when Python runs with -O.
assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 0.001, \
    "ERROR: Split ratios must sum to 1.0"
print("\nConfiguration validated successfully.")

PROJECT CONFIGURATION

[SOURCE DATASETS]
  fruits_360: ✓
  fruit_recognition: ✓
  fruits_classification: ✓

[OUTPUT LOCATION]
  /content/drive/MyDrive/snapshelf_dataset

[TARGET CLASSES] (14 categories)
  • apple
  • banana
  • orange
  • lemon
  • strawberry
  • grape
  • peach
  • tomato
  • potato
  • onion
  • carrot
  • bell_pepper_red
  • bell_pepper_green
  • cucumber

[DATA SPLIT]
  Train:      70%
  Validation: 15%
  Test:       15%

Configuration validated successfully.


In [None]:
# ================================================================
# CELL 4: EXTRACT SOURCE DATASETS
# ================================================================
# Extracts all zip files to a temporary local directory.
# Local storage is faster than working directly from Drive.
# ================================================================

import zipfile

def extract_dataset(zip_path, extract_to, dataset_name):
    """
    Unpack one zip archive into a directory and log the outcome.

    A single progress line is printed: the dataset name first, then a
    tick on success or a cross followed by the error text on failure.

    Args:
        zip_path: Location of the zip archive to unpack
        extract_to: Directory that receives the archive contents
        dataset_name: Label used only in the printed progress line

    Returns:
        bool: True when the archive was unpacked, False when any
        exception occurred (the exception is reported, not re-raised)
    """
    extracted = False
    try:
        print(f"  Extracting {dataset_name}...", end=" ")
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(extract_to)
        print("✓")
        extracted = True
    except Exception as err:
        # Missing file, corrupt archive, permission problem, ...: report only
        print(f"✗ Error: {err}")
    return extracted

# -----------------------------
# EXTRACT ALL DATASETS
# -----------------------------
print("=" * 60, "EXTRACTING SOURCE DATASETS", "=" * 60, sep="\n")

# Start from an empty extraction directory so stale files from a previous
# run cannot leak into this one.
if os.path.exists(EXTRACTION_DIR):
    shutil.rmtree(EXTRACTION_DIR)
os.makedirs(EXTRACTION_DIR, exist_ok=True)

print(f"\nExtraction directory: {EXTRACTION_DIR}\n")

# Unpack every configured archive into the same directory and count how
# many succeeded (extract_dataset returns a bool).
success_count = 0
for name, zip_path in SOURCE_DATASETS.items():
    success_count += bool(extract_dataset(zip_path, EXTRACTION_DIR, name))

print("\n" + "=" * 60)
print(f"Extraction complete: {success_count}/{len(SOURCE_DATASETS)} datasets")

# -----------------------------
# SHOW EXTRACTED STRUCTURE
# -----------------------------
# Top-level listing only: directories show their immediate entry count.
print(f"\n[EXTRACTED CONTENTS]")
for item in sorted(os.listdir(EXTRACTION_DIR)):
    item_path = os.path.join(EXTRACTION_DIR, item)
    if not os.path.isdir(item_path):
        print(f"   {item}")
        continue
    subcount = len(os.listdir(item_path))
    print(f"   {item}/ ({subcount} items)")

EXTRACTING SOURCE DATASETS

Extraction directory: /content/datasets_raw

  Extracting fruits_360... ✓
  Extracting fruit_recognition... ✓
  Extracting fruits_classification... ✓

Extraction complete: 3/3 datasets

[EXTRACTED CONTENTS]
   Fruits Classification/ (4 items)
   fruits-360_100x100/ (1 items)
   fruits-360_3-body-problem/ (1 items)
   fruits-360_dataset_meta/ (1 items)
   fruits-360_multi/ (3 items)
   fruits-360_original-size/ (1 items)
   sampleSubmission.csv
   test/ (1 items)
   train/ (1 items)


In [None]:
# ================================================================
# CELL 5: SCAN & MAP SOURCE IMAGES TO TARGET CLASSES
# ================================================================
# Walks through all extracted directories and identifies images
# that match our 14 target food categories.
# Creates a mapping of source paths to target class names.
# ================================================================

def matches_target_class(folder_name, keywords):
    """
    Check if a folder name matches any of the target class keywords.

    The folder name is lower-cased and "_" / "-" are replaced by spaces.
    A keyword then has to appear as a whole word (or whole phrase); a
    simple plural ("s", "es", or "y" -> "ies") is tolerated. Plain
    substring matching is deliberately avoided because it files unrelated
    folders under the wrong class: "apple" is a substring of "pineapple"
    and "grape" is a substring of "grapefruit".

    Args:
        folder_name: Name of the folder to check
        keywords: List of keywords that indicate this class

    Returns:
        bool: True if folder matches the class
    """
    import re  # local import so this cell does not depend on the import cell

    name_normalized = folder_name.lower().replace("_", " ").replace("-", " ")
    for keyword in keywords:
        # Normalise the keyword the same way as the folder name
        kw = keyword.lower().replace("_", " ").replace("-", " ").strip()
        if not kw:
            continue  # an empty keyword would otherwise match every folder

        if kw.endswith("y"):
            # strawberry -> strawberry | strawberries
            body = re.escape(kw[:-1]) + r"(?:y|ies)"
        else:
            # apple -> apple | apples ; tomato -> tomato | tomatoes
            body = re.escape(kw) + r"(?:s|es)?"

        # No letter may touch the keyword on either side; digits and spaces
        # are fine, so "apple 6" and "apple6" still match.
        pattern = r"(?<![a-z])" + body + r"(?![a-z])"
        if re.search(pattern, name_normalized):
            return True
    return False

def scan_for_images(base_path, target_classes, valid_extensions):
    """
    Recursively scan directories and map images to target classes.

    Only the name of the folder that directly contains a file is tested
    against the class keywords. A folder is assigned to the first class
    (in `target_classes` order) whose keywords match.

    The walk is performed in sorted order. os.walk otherwise yields
    directories and files in an arbitrary, filesystem-dependent order,
    which would make the returned lists (and any seeded shuffle applied
    to them later) differ between runs or machines.

    Args:
        base_path: Root directory to scan
        target_classes: Dict mapping class names to keyword lists
        valid_extensions: Set of valid image file extensions
            (lower-case, including the leading dot)

    Returns:
        Dict mapping class names to lists of image paths. This is a
        defaultdict(list); classes without any image have no key until
        they are first indexed.
    """
    class_images = defaultdict(list)

    for root, dirs, files in os.walk(base_path):
        # In-place sort: with the default top-down walk this also fixes
        # the order in which sub-directories are visited.
        dirs.sort()

        folder_name = os.path.basename(root)

        # First matching class wins, mirroring target_classes order
        matched_class = next(
            (
                class_name
                for class_name, keywords in target_classes.items()
                if matches_target_class(folder_name, keywords)
            ),
            None,
        )
        if matched_class is None:
            continue

        # Collect all valid images from this folder, in a stable order
        for filename in sorted(files):
            ext = os.path.splitext(filename)[1].lower()
            if ext in valid_extensions:
                class_images[matched_class].append(os.path.join(root, filename))

    return class_images

# -----------------------------
# SCAN ALL EXTRACTED DATASETS
# -----------------------------
print("=" * 60, "SCANNING FOR TARGET CLASS IMAGES", "=" * 60, sep="\n")

print(f"\nScanning: {EXTRACTION_DIR}")
print(f"Looking for {len(TARGET_CLASSES)} target classes...")
print()

class_images = scan_for_images(EXTRACTION_DIR, TARGET_CLASSES, VALID_EXTENSIONS)

# -----------------------------
# REPORT FINDINGS
# -----------------------------
# One table row per target class, in alphabetical order.
print(f"{'Class':<20} {'Images Found':>12}")
print("-" * 34)

total_images = 0
classes_found = 0

for class_name in sorted(TARGET_CLASSES):
    # Indexing the defaultdict (rather than .get) also registers classes
    # that have no images, so later cells see every target class.
    count = len(class_images[class_name])
    total_images += count
    classes_found += int(count > 0)
    status = " [!] NO IMAGES" if count == 0 else ""
    print(f"{class_name:<20} {count:>12,}{status}")

print("-" * 34)
print(f"{'TOTAL':<20} {total_images:>12,}")

print(f"\n[SUMMARY]")
print(f"  Classes with images: {classes_found}/{len(TARGET_CLASSES)}")
print(f"  Total images found:  {total_images:,}")

# Warn if any class has no images
missing_classes = [c for c in TARGET_CLASSES if not class_images[c]]
if missing_classes:
    print(f"\n[WARNING] No images found for: {', '.join(missing_classes)}")
    print("  Check TARGET_CLASSES keywords or dataset contents.")

SCANNING FOR TARGET CLASS IMAGES

Scanning: /content/datasets_raw
Looking for 14 target classes...

Class                Images Found
----------------------------------
apple                      69,389
banana                      5,699
bell_pepper_green           1,036
bell_pepper_red             1,554
carrot                        402
cucumber                   15,773
grape                       8,861
lemon                       1,804
onion                       7,000
orange                      3,260
peach                      10,996
potato                      2,854
strawberry                  4,132
tomato                     34,606
----------------------------------
TOTAL                     167,366

[SUMMARY]
  Classes with images: 14/14
  Total images found:  167,366


In [None]:
# ================================================================
# CELL 6: CREATE STRATIFIED TRAIN/VAL/TEST SPLIT
# ================================================================
# Splits images into 70% train, 15% validation, 15% test.
# Uses stratified sampling to maintain class proportions.
# Shuffles with fixed seed for reproducibility.
# ================================================================

def create_stratified_split(class_images, train_ratio, val_ratio, test_ratio, seed=42):
    """
    Create stratified train/val/test split for each class.

    Every class is split independently, so each split keeps roughly the
    same class proportions as the full dataset. Split sizes are truncated
    with int(); images left over by the truncation go to the test split.

    The result depends only on the *set* of paths per class and on `seed`:
    classes are processed in sorted name order and each path list is
    sorted before shuffling. Without this, the outcome would change with
    the order in which the paths happened to be collected from disk, even
    with a fixed seed.

    Args:
        class_images: Dict mapping class names to image path lists
        train_ratio: Proportion for training set
        val_ratio: Proportion for validation set
        test_ratio: Proportion for test set
        seed: Random seed for reproducibility

    Returns:
        Tuple of (train_dict, val_dict, test_dict), each a
        defaultdict(list) keyed by class name

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1.0
            (within 0.001)
    """
    # test_ratio is implied by the other two when slicing, so check it
    # explicitly here instead of silently ignoring an inconsistent value.
    if min(train_ratio, val_ratio, test_ratio) < 0:
        raise ValueError("Split ratios must be non-negative")
    if abs(train_ratio + val_ratio + test_ratio - 1.0) >= 0.001:
        raise ValueError(
            "Split ratios must sum to 1.0 "
            f"(got {train_ratio} + {val_ratio} + {test_ratio})"
        )

    # Re-seeds the module-level RNG, as callers of this function expect
    random.seed(seed)

    train_split = defaultdict(list)
    val_split = defaultdict(list)
    test_split = defaultdict(list)

    # Sorted class order keeps RNG consumption independent of dict order
    for class_name in sorted(class_images):
        # Sort first (returns a new list, the input is left untouched),
        # then shuffle: the shuffle result no longer depends on input order.
        shuffled = sorted(class_images[class_name])
        random.shuffle(shuffled)

        # Calculate split indices
        n = len(shuffled)
        train_end = int(n * train_ratio)
        val_end = train_end + int(n * val_ratio)

        # Split
        train_split[class_name] = shuffled[:train_end]
        val_split[class_name] = shuffled[train_end:val_end]
        test_split[class_name] = shuffled[val_end:]

    return train_split, val_split, test_split

# -----------------------------
# PERFORM SPLIT
# -----------------------------
print("=" * 60, "CREATING TRAIN/VAL/TEST SPLIT", "=" * 60, sep="\n")

print(f"\nSplit ratios: {TRAIN_RATIO*100:.0f}% / {VAL_RATIO*100:.0f}% / {TEST_RATIO*100:.0f}%")
print(f"Random seed:  {SEED}")
print()

train_split, val_split, test_split = create_stratified_split(
    class_images, TRAIN_RATIO, VAL_RATIO, TEST_RATIO, seed=SEED
)

# -----------------------------
# REPORT SPLIT DISTRIBUTION
# -----------------------------
# Per-class row counts, accumulated into overall totals as we go.
print(f"{'Class':<20} {'Train':>8} {'Val':>8} {'Test':>8} {'Total':>8}")
print("-" * 56)

total_train, total_val, total_test = 0, 0, 0

for class_name in sorted(TARGET_CLASSES):
    n_train, n_val, n_test = (
        len(part[class_name]) for part in (train_split, val_split, test_split)
    )
    n_total = n_train + n_val + n_test

    total_train, total_val, total_test = (
        total_train + n_train,
        total_val + n_val,
        total_test + n_test,
    )

    print(f"{class_name:<20} {n_train:>8,} {n_val:>8,} {n_test:>8,} {n_total:>8,}")

print("-" * 56)
total_all = total_train + total_val + total_test
print(f"{'TOTAL':<20} {total_train:>8,} {total_val:>8,} {total_test:>8,} {total_all:>8,}")

# Achieved proportions; they differ slightly from the targets because the
# per-class split sizes are truncated to whole images.
print(f"\n[ACTUAL PERCENTAGES]")
print(f"  Train:      {total_train/total_all*100:.1f}%")
print(f"  Validation: {total_val/total_all*100:.1f}%")
print(f"  Test:       {total_test/total_all*100:.1f}%")

print(f"\nStratified split created successfully.")

CREATING TRAIN/VAL/TEST SPLIT

Split ratios: 70% / 15% / 15%
Random seed:  42

Class                   Train      Val     Test    Total
--------------------------------------------------------
apple                  48,572   10,408   10,409   69,389
banana                  3,989      854      856    5,699
bell_pepper_green         725      155      156    1,036
bell_pepper_red         1,087      233      234    1,554
carrot                    281       60       61      402
cucumber               11,041    2,365    2,367   15,773
grape                   6,202    1,329    1,330    8,861
lemon                   1,262      270      272    1,804
onion                   4,900    1,050    1,050    7,000
orange                  2,282      489      489    3,260
peach                   7,697    1,649    1,650   10,996
potato                  1,997      428      429    2,854
strawberry              2,892      619      621    4,132
tomato                 24,224    5,190    5,192   34,606
---------

In [None]:
# ================================================================
# CELL 7: COPY IMAGES TO LOCAL DIRECTORY STRUCTURE
# ================================================================
# Copies images to LOCAL Colab storage (fast), not Drive (slow).
# Final step will zip and upload to Drive.
#
# Directory structure:
#   /content/snapshelf_dataset/
#       train/class_name/
#       val/class_name/
#       test/class_name/
# ================================================================

def copy_split_to_directory(split_dict, output_dir, split_name):
    """
    Materialise one split on disk as <output_dir>/<class_name>/<files>.

    Each image is copied under a generated name of the form
    "<class_name>_<index>.<ext>" (index zero-padded to five digits,
    extension lower-cased), so equally named source files from different
    datasets cannot overwrite one another. A failed copy is reported and
    skipped; its index is not reused.

    Args:
        split_dict: Dict mapping class names to image path lists
        output_dir: Destination directory
        split_name: Name of split for logging (train/val/test);
            currently not used inside the function

    Returns:
        int: Total number of images copied
    """
    copied = 0

    for label, sources in split_dict.items():
        # The class folder is created even when the class has no images
        target_dir = os.path.join(output_dir, label)
        os.makedirs(target_dir, exist_ok=True)

        position = 0
        for source in sources:
            _, suffix = os.path.splitext(source)
            destination = os.path.join(
                target_dir, f"{label}_{position:05d}{suffix.lower()}"
            )
            position += 1

            try:
                shutil.copy2(source, destination)
            except Exception as err:
                print(f"  [ERROR] Failed to copy {source}: {err}")
            else:
                copied += 1

    return copied

# -----------------------------
# USE LOCAL STORAGE (FAST)
# -----------------------------
# Local (non-Drive) root of the final directory tree, one folder per split
LOCAL_OUTPUT = "/content/snapshelf_dataset"
LOCAL_TRAIN = f"{LOCAL_OUTPUT}/train"
LOCAL_VAL = f"{LOCAL_OUTPUT}/val"
LOCAL_TEST = f"{LOCAL_OUTPUT}/test"

print("=" * 60, "COPYING IMAGES TO LOCAL DIRECTORY", "=" * 60, sep="\n")

print(f"\nDestination: {LOCAL_OUTPUT} (local Colab storage)")

# Always rebuild from scratch so files from an earlier run cannot linger
if os.path.exists(LOCAL_OUTPUT):
    shutil.rmtree(LOCAL_OUTPUT)
os.makedirs(LOCAL_OUTPUT, exist_ok=True)

# -----------------------------
# COPY EACH SPLIT
# -----------------------------
# (label, class -> paths mapping, destination folder) for every split
splits = [
    ("train", train_split, LOCAL_TRAIN),
    ("val", val_split, LOCAL_VAL),
    ("test", test_split, LOCAL_TEST),
]

print(f"\nCopying images...\n")

copy_results = {}

for split_name, split_data, split_dir in splits:
    # flush=True so the label shows up before the long-running copy starts
    print(f"  [{split_name.upper()}] Copying...", end=" ", flush=True)
    count = copy_results[split_name] = copy_split_to_directory(
        split_data, split_dir, split_name
    )
    print(f"{count:,} images done.")

# -----------------------------
# VERIFY RESULTS
# -----------------------------
print("\n" + "=" * 60, "COPY COMPLETE", "=" * 60, sep="\n")

print(f"\n{'Split':<12} {'Images':>12}")
print("-" * 26)
for split_name, count in copy_results.items():
    print(f"{split_name:<12} {count:>12,}")
print("-" * 26)
print(f"{'TOTAL':<12} {sum(copy_results.values()):>12,}")

print(f"\nDataset ready at: {LOCAL_OUTPUT}")

COPYING IMAGES TO LOCAL DIRECTORY

Destination: /content/snapshelf_dataset (local Colab storage)

Copying images...

  [TRAIN] Copying... 117,151 images done.
  [VAL] Copying... 25,099 images done.
  [TEST] Copying... 25,116 images done.

COPY COMPLETE

Split              Images
--------------------------
train             117,151
val                25,099
test               25,116
--------------------------
TOTAL             167,366

Dataset ready at: /content/snapshelf_dataset


In [None]:
# ================================================================
# CELL 8: CREATE ZIP ARCHIVE & SAVE TO GOOGLE DRIVE
# ================================================================
# Compresses the dataset into a single zip file for:
#   - Fast transfer to Google Drive
#   - Easy download/backup
#   - Quick loading in future notebooks
# ================================================================

import zipfile
from datetime import datetime

# -----------------------------
# CONFIGURATION
# -----------------------------
ZIP_FILENAME = "snapshelf_dataset_14classes_70-15-15.zip"
LOCAL_ZIP_PATH = f"/content/{ZIP_FILENAME}"
DRIVE_ZIP_PATH = f"/content/drive/MyDrive/snapshelf_datasets/{ZIP_FILENAME}"

# Ensure Drive destination folder exists
os.makedirs(os.path.dirname(DRIVE_ZIP_PATH), exist_ok=True)

# -----------------------------
# CREATE ZIP ARCHIVE
# -----------------------------
print("=" * 60)
print("CREATING ZIP ARCHIVE")
print("=" * 60)

print(f"\nSource:      {LOCAL_OUTPUT}")
print(f"Destination: {LOCAL_ZIP_PATH}")
print(f"\nCompressing (this may take a few minutes)...")

start_time = datetime.now()
file_count = 0

# The archive is built on local disk first and copied to Drive in one go.
# NOTE(review): the dataset only holds .jpg/.jpeg/.png files, which are
# already compressed; zipfile.ZIP_STORED would probably be faster for a
# similar size. Left as ZIP_DEFLATED to keep the archive unchanged.
with zipfile.ZipFile(LOCAL_ZIP_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(LOCAL_OUTPUT):
        for file in files:
            file_path = os.path.join(root, file)
            # Store paths relative to the dataset root (train/..., val/..., test/...)
            arcname = os.path.relpath(file_path, LOCAL_OUTPUT)
            zipf.write(file_path, arcname)
            file_count += 1

            # Progress indicator every 25000 files
            if file_count % 25000 == 0:
                print(f"  Compressed {file_count:,} files...")

# total_seconds() gives the full duration; timedelta.seconds is only the
# seconds component of the delta and drops whole days.
elapsed = int((datetime.now() - start_time).total_seconds())
local_zip_bytes = os.path.getsize(LOCAL_ZIP_PATH)
zip_size_mb = local_zip_bytes / (1024 * 1024)

print(f"\n  Compression complete.")
print(f"  Files:    {file_count:,}")
print(f"  Size:     {zip_size_mb:.1f} MB")
print(f"  Time:     {elapsed} seconds")

# -----------------------------
# COPY TO GOOGLE DRIVE
# -----------------------------
print(f"\n{'=' * 60}")
print("SAVING TO GOOGLE DRIVE")
print("=" * 60)

print(f"\nCopying to: {DRIVE_ZIP_PATH}")
print("Please wait...")

shutil.copy2(LOCAL_ZIP_PATH, DRIVE_ZIP_PATH)

# Verify: the file must exist AND have the same size as the local archive.
# An existence check alone cannot fail once copy2 has returned, and it would
# not notice a truncated upload.
if os.path.exists(DRIVE_ZIP_PATH) and os.path.getsize(DRIVE_ZIP_PATH) == local_zip_bytes:
    drive_size_mb = os.path.getsize(DRIVE_ZIP_PATH) / (1024 * 1024)
    print(f"\n[OK] Successfully saved to Google Drive")
    print(f"     File: {ZIP_FILENAME}")
    print(f"     Size: {drive_size_mb:.1f} MB")
else:
    print(f"\n[ERROR] Failed to save to Google Drive")

print(f"\n{'=' * 60}")
print("DATASET PREPARATION COMPLETE")
print("=" * 60)

CREATING ZIP ARCHIVE

Source:      /content/snapshelf_dataset
Destination: /content/snapshelf_dataset_14classes_70-15-15.zip

Compressing (this may take a few minutes)...
  Compressed 25,000 files...
  Compressed 50,000 files...
  Compressed 75,000 files...
  Compressed 100,000 files...
  Compressed 125,000 files...
  Compressed 150,000 files...

  Compression complete.
  Files:    167,366
  Size:     2445.3 MB
  Time:     167 seconds

SAVING TO GOOGLE DRIVE

Copying to: /content/drive/MyDrive/snapshelf_datasets/snapshelf_dataset_14classes_70-15-15.zip
Please wait...

[OK] Successfully saved to Google Drive
     File: snapshelf_dataset_14classes_70-15-15.zip
     Size: 2445.3 MB

DATASET PREPARATION COMPLETE


In [None]:
# ================================================================
# CELL 9: GENERATE DATASET SUMMARY REPORT
# ================================================================
# Creates a summary of the dataset for documentation purposes.
# This information will be useful for your dissertation.
# ================================================================

from datetime import datetime

# -----------------------------
# CALCULATE STATISTICS
# -----------------------------
# Per-class image counts for each split, in alphabetical class order
class_stats = []
for class_name in sorted(TARGET_CLASSES.keys()):
    n_train = len(train_split[class_name])
    n_val = len(val_split[class_name])
    n_test = len(test_split[class_name])
    n_total = n_train + n_val + n_test
    class_stats.append({
        "class": class_name,
        "train": n_train,
        "val": n_val,
        "test": n_test,
        "total": n_total
    })

total_train = sum(c["train"] for c in class_stats)
total_val = sum(c["val"] for c in class_stats)
total_test = sum(c["test"] for c in class_stats)
total_all = total_train + total_val + total_test

# Share of each split in percent; reported as 0.0 when no images were found
# at all, instead of failing with ZeroDivisionError.
pct_train = total_train / total_all * 100 if total_all else 0.0
pct_val = total_val / total_all * 100 if total_all else 0.0
pct_test = total_test / total_all * 100 if total_all else 0.0

# -----------------------------
# PRINT REPORT
# -----------------------------
print("=" * 70)
print("SNAPSHELF DATASET - SUMMARY REPORT")
print("=" * 70)

# The split ratio line is derived from the configured constants so the
# report cannot drift from the ratios that were actually applied.
# DRIVE_ZIP_PATH and zip_size_mb come from the archive cell, which must have
# been run before this one.
print(f"""
DATASET INFORMATION
-------------------
Created:        {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Source:         3 Kaggle datasets (Fruits-360, Fruit Recognition,
                Fruits Classification)
Classes:        {len(TARGET_CLASSES)}
Total Images:   {total_all:,}
Split Ratio:    {TRAIN_RATIO*100:.0f}% / {VAL_RATIO*100:.0f}% / {TEST_RATIO*100:.0f}% (Train / Validation / Test)
Random Seed:    {SEED}

FILE LOCATION
-------------
Google Drive:   {DRIVE_ZIP_PATH}
Archive Size:   {zip_size_mb:.1f} MB

SPLIT DISTRIBUTION
------------------
Training Set:   {total_train:,} images ({pct_train:.1f}%)
Validation Set: {total_val:,} images ({pct_val:.1f}%)
Test Set:       {total_test:,} images ({pct_test:.1f}%)
""")

print("CLASS DISTRIBUTION")
print("-" * 70)
print(f"{'Class':<20} {'Train':>10} {'Val':>10} {'Test':>10} {'Total':>10}")
print("-" * 70)

for stat in class_stats:
    print(f"{stat['class']:<20} {stat['train']:>10,} {stat['val']:>10,} {stat['test']:>10,} {stat['total']:>10,}")

print("-" * 70)
print(f"{'TOTAL':<20} {total_train:>10,} {total_val:>10,} {total_test:>10,} {total_all:>10,}")

# -----------------------------
# CLASS IMBALANCE NOTE
# -----------------------------
max_class = max(class_stats, key=lambda x: x["total"])
min_class = min(class_stats, key=lambda x: x["total"])
# A target class without any images is possible (the scan cell only warns
# about it); report the ratio as infinite rather than dividing by zero.
if min_class["total"]:
    imbalance_ratio = max_class["total"] / min_class["total"]
else:
    imbalance_ratio = float("inf")

print(f"""
CLASS IMBALANCE NOTE
--------------------
Largest class:  {max_class['class']} ({max_class['total']:,} images)
Smallest class: {min_class['class']} ({min_class['total']:,} images)
Imbalance ratio: {imbalance_ratio:.1f}:1

This imbalance should be noted in the dissertation methodology section.
Consider using class weights or stratified sampling during training.
""")

print("=" * 70)
print("END OF REPORT")
print("=" * 70)

SNAPSHELF DATASET - SUMMARY REPORT

DATASET INFORMATION
-------------------
Created:        2026-01-03 16:38:04
Source:         3 Kaggle datasets (Fruits-360, Fruit Recognition, 
                Fruits Classification)
Classes:        14
Total Images:   167,366
Split Ratio:    70% / 15% / 15% (Train / Validation / Test)
Random Seed:    42

FILE LOCATION
-------------
Google Drive:   /content/drive/MyDrive/snapshelf_datasets/snapshelf_dataset_14classes_70-15-15.zip
Archive Size:   2445.3 MB

SPLIT DISTRIBUTION
------------------
Training Set:   117,151 images (70.0%)
Validation Set: 25,099 images (15.0%)
Test Set:       25,116 images (15.0%)

CLASS DISTRIBUTION
----------------------------------------------------------------------
Class                     Train        Val       Test      Total
----------------------------------------------------------------------
apple                    48,572     10,408     10,409     69,389
banana                    3,989        854        856      5