In [6]:
from datasets import load_dataset
# Import necessary libraries
from collections import Counter

# Fetch the mini-ImageNet dataset from the Hugging Face Hub
dataset = load_dataset(path="timm/mini-imagenet")

# Show the returned object so the available splits and their sizes are visible
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 5000
    })
})


In [7]:
# Bind each split of the loaded dataset to its own name for the analyses below
train_dataset, val_dataset, test_dataset = (
    dataset['train'],
    dataset['validation'],
    dataset['test'],
)

# Class-balance report for a single dataset split
def eda_on_split(split_name, split_dataset):
    """Print how many classes a split contains and how many samples each has.

    Args:
        split_name: Label for the split; upper-cased in the printed header.
        split_dataset: Object whose ``'label'`` entry is an iterable of
            hashable, mutually sortable class labels.

    Returns:
        None. The report is written to stdout.
    """
    per_class = Counter(split_dataset['label'])

    # Header: split name, number of distinct labels, then the table title
    print(
        f"\n{split_name.upper()} SET:",
        f"Total number of classes: {len(per_class)}",
        "Number of samples per class:",
        sep="\n",
    )

    # One row per class, in ascending label order
    for cls in sorted(per_class):
        print(f"Class {cls}: {per_class[cls]} samples")

# Print the class-balance report for the train, validation and test splits
eda_on_split(split_name="train", split_dataset=train_dataset)
eda_on_split(split_name="validation", split_dataset=val_dataset)
eda_on_split(split_name="test", split_dataset=test_dataset)


TRAIN SET:
Total number of classes: 100
Number of samples per class:
Class 0: 500 samples
Class 1: 500 samples
Class 2: 500 samples
Class 3: 500 samples
Class 4: 500 samples
Class 5: 500 samples
Class 6: 500 samples
Class 7: 500 samples
Class 8: 500 samples
Class 9: 500 samples
Class 10: 500 samples
Class 11: 500 samples
Class 12: 500 samples
Class 13: 500 samples
Class 14: 500 samples
Class 15: 500 samples
Class 16: 500 samples
Class 17: 500 samples
Class 18: 500 samples
Class 19: 500 samples
Class 20: 500 samples
Class 21: 500 samples
Class 22: 500 samples
Class 23: 500 samples
Class 24: 500 samples
Class 25: 500 samples
Class 26: 500 samples
Class 27: 500 samples
Class 28: 500 samples
Class 29: 500 samples
Class 30: 500 samples
Class 31: 500 samples
Class 32: 500 samples
Class 33: 500 samples
Class 34: 500 samples
Class 35: 500 samples
Class 36: 500 samples
Class 37: 500 samples
Class 38: 500 samples
Class 39: 500 samples
Class 40: 500 samples
Class 41: 500 samples
Class 42: 500 sa

In [8]:
# Analyze image size distribution
from collections import defaultdict
import numpy as np

def analyze_image_sizes(split_name, split_dataset, max_samples=100):
    """Report the distribution of image dimensions in a sample of a split.

    Only the first ``max_samples`` records are inspected, so every statement
    printed here (including the "uniform" verdict) describes that sample and
    not necessarily the whole split.

    Args:
        split_name: Label for the split; upper-cased in the printed header.
        split_dataset: Iterable of records, each a mapping whose ``'image'``
            entry exposes a ``size`` attribute of the form (width, height).
        max_samples: Maximum number of leading records to inspect. Defaults
            to 100, the limit that was previously hard-coded.

    Returns:
        None. The report is written to stdout.
    """
    image_sizes = defaultdict(int)

    print(f"\n{split_name.upper()} SET - IMAGE SIZE DISTRIBUTION:")

    # Inspect only the leading records to keep the scan fast
    for i, sample in enumerate(split_dataset):
        if i >= max_samples:
            break
        size = sample['image'].size  # PIL Image.size returns (width, height)
        image_sizes[size] += 1

    print(f"Image sizes found in first {max_samples} samples:")
    for size, count in sorted(image_sizes.items()):
        print(f"  {size}: {count} images")

    if not image_sizes:
        # Empty split or non-positive limit: nothing was inspected, so
        # neither "uniform" nor "varying" would be a truthful verdict.
        print("⚠ No images were sampled")
    elif len(image_sizes) == 1:
        size = next(iter(image_sizes))
        print(f"✓ All images are uniform: {size[0]}×{size[1]} pixels")
    else:
        print("⚠ Images have varying sizes")

# Print the image-size report for the train, validation and test splits
analyze_image_sizes(split_name="train", split_dataset=train_dataset)
analyze_image_sizes(split_name="validation", split_dataset=val_dataset)
analyze_image_sizes(split_name="test", split_dataset=test_dataset)


TRAIN SET - IMAGE SIZE DISTRIBUTION:
Image sizes found in first 100 samples:
  (307, 299): 1 images
  (333, 500): 1 images
  (344, 500): 1 images
  (357, 500): 1 images
  (358, 500): 1 images
  (375, 500): 3 images
  (378, 500): 1 images
  (381, 500): 1 images
  (382, 500): 1 images
  (388, 500): 1 images
  (399, 500): 2 images
  (400, 500): 1 images
  (407, 500): 1 images
  (425, 500): 1 images
  (426, 500): 1 images
  (428, 500): 1 images
  (438, 500): 1 images
  (459, 500): 1 images
  (460, 500): 1 images
  (464, 500): 1 images
  (469, 500): 1 images
  (475, 500): 1 images
  (476, 500): 1 images
  (498, 500): 1 images
  (499, 500): 1 images
  (500, 312): 1 images
  (500, 333): 3 images
  (500, 334): 2 images
  (500, 348): 1 images
  (500, 354): 1 images
  (500, 356): 1 images
  (500, 357): 1 images
  (500, 358): 1 images
  (500, 359): 1 images
  (500, 361): 1 images
  (500, 363): 1 images
  (500, 367): 1 images
  (500, 368): 1 images
  (500, 370): 4 images
  (500, 372): 1 images
  