# Dataset Exploration for Sim2Real Component Classification

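This notebook explores the provided dataset, which contains both simulated (sim) and real images. It inspects the per-class image counts, plots the class distributions, shows sample images for each class, and prints an overall summary for each domain.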

In [ ]:
# Imports
import os
from PIL import Image
import matplotlib.pyplot as plt
from collections import Counter

import pandas as pd
import numpy as np

# Dataset roots, given relative to the notebook's working directory.
# The cells below read each class from its own subfolder under these roots.
sim_path = 'data/sim/train'   # root of the simulated training images
real_path = 'data/real/train' # root of the real training images

## 1. Inspect Classes and Counts

In [ ]:
def get_class_counts(path):
    """Count the image files in each class folder under ``path``.

    Args:
        path: Dataset root whose immediate subdirectories are the classes.

    Returns:
        dict mapping class name to its number of files, with keys inserted
        in sorted order. Non-directory entries under ``path`` and hidden
        entries (names starting with ".") are ignored.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    counts = {}
    for c in sorted(os.listdir(path)):
        c_path = os.path.join(path, c)
        # Stray files next to the class folders (README, .DS_Store, label
        # files) are not classes; listing them would raise
        # NotADirectoryError.
        if c.startswith('.') or not os.path.isdir(c_path):
            continue
        # Count regular, non-hidden files only so nested folders and OS
        # metadata files do not inflate the image count.
        counts[c] = sum(
            1
            for f in os.listdir(c_path)
            if not f.startswith('.')
            and os.path.isfile(os.path.join(c_path, f))
        )
    return counts

# Tally images per class for both domains; the two dicts are reused by the
# plotting cell further down.
sim_counts = get_class_counts(sim_path)
real_counts = get_class_counts(real_path)

# sep="\n" puts each heading and its dict on separate lines.
print("Simulated Dataset Class Counts:", sim_counts, sep="\n")
print("\nReal Dataset Class Counts:", real_counts, sep="\n")

## 2. Plot Class Distributions

In [ ]:
def plot_class_distribution(counts, title):
    """Draw a bar chart with one bar per class and display it.

    Args:
        counts: Mapping of class name to number of images.
        title: Heading shown above the chart.
    """
    class_names, image_totals = counts.keys(), counts.values()

    plt.figure(figsize=(10, 5))
    plt.bar(class_names, image_totals, color='skyblue')
    # Tilt the class names so neighbouring labels do not overlap.
    plt.xticks(rotation=45)
    plt.ylabel('Number of Images')
    plt.title(title)
    plt.show()

# One chart per domain, simulated first and then real.
distribution_charts = (
    (sim_counts, 'Simulated Dataset Class Distribution'),
    (real_counts, 'Real Dataset Class Distribution'),
)
for domain_counts, chart_title in distribution_charts:
    plot_class_distribution(domain_counts, chart_title)

## 3. Show Sample Images per Class

In [ ]:
def show_samples(path, n=3):
    """Display up to ``n`` sample images for every class folder under ``path``.

    The figure has one row per class and ``n`` columns. The class name is
    used as the title of the centre-most image in its row, so every
    non-empty class is labelled regardless of ``n`` or how many images the
    class actually contains.

    Args:
        path: Dataset root whose immediate subdirectories are the classes.
        n: Maximum number of images to show per class.
    """
    # Only non-hidden directories are classes; stray files next to the class
    # folders would otherwise raise NotADirectoryError when listed.
    classes = sorted(
        c
        for c in os.listdir(path)
        if not c.startswith('.') and os.path.isdir(os.path.join(path, c))
    )
    if not classes or n < 1:
        # A grid with zero rows or zero columns cannot be laid out.
        return

    plt.figure(figsize=(n * 3, len(classes) * 3))
    for i, c in enumerate(classes):
        c_path = os.path.join(path, c)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # displayed samples reproducible. Hidden files (e.g. .DS_Store) and
        # nested folders are skipped because they are not images.
        imgs = sorted(
            f
            for f in os.listdir(c_path)
            if not f.startswith('.')
            and os.path.isfile(os.path.join(c_path, f))
        )[:n]
        for j, img_file in enumerate(imgs):
            img_path = os.path.join(c_path, img_file)
            plt.subplot(len(classes), n, i * n + j + 1)
            # The context manager closes the underlying file; convert()
            # returns a separate, fully loaded image that is safe to draw.
            with Image.open(img_path) as img:
                plt.imshow(img.convert('RGB'))
            plt.axis('off')
            # Label the centre-most image that is actually present, so rows
            # with a single image (or n=1) still get a title.
            if j == len(imgs) // 2:
                plt.title(c)
    plt.show()

# Show three samples per class for each domain, simulated first.
sample_runs = (
    ('Sample images from simulated dataset:', sim_path),
    ('Sample images from real dataset:', real_path),
)
for banner, dataset_root in sample_runs:
    print(banner)
    show_samples(dataset_root, n=3)

## 4. Compute Dataset Summary

In [ ]:
def dataset_summary(path):
    """Summarise a dataset root as a class count and a total image count.

    Args:
        path: Dataset root whose immediate subdirectories are the classes.

    Returns:
        dict with ``'num_classes'`` (number of class folders) and
        ``'num_images'`` (total files across those folders). Non-directory
        entries under ``path`` and hidden entries (names starting with ".")
        are ignored.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    num_classes = 0
    total_images = 0
    for c in os.listdir(path):
        c_path = os.path.join(path, c)
        # Stray files next to the class folders are not classes; listing
        # them would raise NotADirectoryError.
        if c.startswith('.') or not os.path.isdir(c_path):
            continue
        num_classes += 1
        # Count regular, non-hidden files only so nested folders and OS
        # metadata files do not inflate the total.
        total_images += sum(
            1
            for f in os.listdir(c_path)
            if not f.startswith('.')
            and os.path.isfile(os.path.join(c_path, f))
        )
    return {'num_classes': num_classes, 'num_images': total_images}

# Summarise and report one domain at a time, simulated first.
sim_summary = dataset_summary(sim_path)
print('Simulated Dataset Summary:', sim_summary)

real_summary = dataset_summary(real_path)
print('Real Dataset Summary:', real_summary)