# Visual Dataset Analysis

This notebook imports and analyzes the `train`, `test`, and `valid` datasets located on the external drive `T7-apps`.

In [9]:
import os
from pathlib import Path
import pandas as pd

# Root folder of the dataset on the external drive
dataset_root = Path("/Volumes/T7-apps/visual_dataset")

# Map each split name to its folder; note the "val" split lives in the "valid" folder
paths = {
    split: dataset_root / folder
    for split, folder in (("train", "train"), ("test", "test"), ("val", "valid"))
}

# Report whether each split folder is reachable (e.g. the drive is mounted)
for name, path in paths.items():
    print(
        f"[OK] {name} path found: {path}"
        if path.exists()
        else f"[ERROR] {name} path NOT found: {path}"
    )

[OK] train path found: /Volumes/T7-apps/visual_dataset/train
[OK] test path found: /Volumes/T7-apps/visual_dataset/test
[OK] val path found: /Volumes/T7-apps/visual_dataset/valid


In [10]:
# Verify the class distribution

def get_dataset_stats(split_paths):
    """Count the files in every class directory of every dataset split.

    Args:
        split_paths: Mapping of split name to the ``Path`` of that split's
            directory. Every non-hidden subdirectory of a split is treated
            as one class.

    Returns:
        A ``pandas.DataFrame`` with the columns ``split``, ``class`` and
        ``count``, one row per (split, class) pair, with classes in sorted
        order within each split. Splits whose path is missing or is not a
        directory are skipped. The three columns are present even when no
        rows were collected, so downstream ``groupby("split")`` calls do not
        fail on an empty result.
    """
    columns = ["split", "class", "count"]
    records = []
    for split_name, split_path in split_paths.items():
        # is_dir() is False for missing paths too, and also guards against a
        # regular file, on which iterdir() would raise.
        if not split_path.is_dir():
            continue

        # Classes are subdirectories; dot-directories are ignored
        class_names = sorted(
            entry.name
            for entry in split_path.iterdir()
            if entry.is_dir() and not entry.name.startswith('.')
        )

        for class_name in class_names:
            class_dir = split_path / class_name
            # Count every non-hidden regular file in the class directory
            example_count = sum(
                1
                for entry in class_dir.iterdir()
                if entry.is_file() and not entry.name.startswith('.')
            )
            records.append({
                "split": split_name,
                "class": class_name,
                "count": example_count
            })
    return pd.DataFrame(records, columns=columns)

# Per-class file counts for every split
df_stats = get_dataset_stats(paths)

# One row per split: how many classes it has and how the per-class counts are distributed
counts_by_split = df_stats.groupby("split")["count"]
summary = counts_by_split.agg(
    num_classes="count",
    total_examples="sum",
    min_per_class="min",
    max_per_class="max",
    mean_per_class="mean",
)

print("Dataset split statistics:")
display(summary)

# Training classes ordered from fewest to most examples (up to 100 rows)
print("\nRAREST classes in the TRAINING set:")
train_stats = df_stats.loc[df_stats["split"] == "train"]
display(train_stats.nsmallest(n=100, columns="count"))

Dataset split statistics:


Unnamed: 0_level_0,num_classes,total_examples,min_per_class,max_per_class,mean_per_class
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
test,100,500,5,5,5.0
train,100,13493,59,191,134.93
val,100,500,5,5,5.0



RAREST classes in the TRAINING set:


Unnamed: 0,split,class,count
77,train,sky surfing,59
91,train,ultimate,97
99,train,wingsuit flying,98
3,train,arm wrestling,99
44,train,horseshoe pitching,102
...,...,...,...
41,train,hockey,172
7,train,baseball,174
58,train,nascar racing,189
32,train,formula 1 racing,190


In [None]:
# Data Augmentation for 'sky surfing' class
# NOTE: this cell writes new image files into the training directory on disk.
# Re-running it only tops the class up to `target_count`; it never overwrites
# previously generated files.
import random
from PIL import Image, ImageEnhance

# Target directory for 'sky surfing'
target_dir = paths["train"] / "sky surfing"
# Number of images the class should contain after augmentation
target_count = 100

if not target_dir.exists():
    print(f"Error: Directory {target_dir} does not exist.")
else:
    # Get existing images
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    existing_images = [
        f for f in target_dir.iterdir()
        if f.is_file() and f.suffix.lower() in image_suffixes and not f.name.startswith('.')
    ]
    current_count = len(existing_images)
    print(f"Current count: {current_count}")

    if current_count >= target_count:
        print("Target count already reached.")
    elif not existing_images:
        # Nothing to sample from: random.choice would raise on an empty list
        print(f"Error: No source images found in {target_dir}.")
    else:
        needed = target_count - current_count
        print(f"Need to generate {needed} images.")

        # Filter (avoid re-augmenting augments if possible)
        original_images = [f for f in existing_images if not f.name.startswith('aug_')]
        if not original_images:
            original_images = existing_images

        generated_count = 0
        name_index = 0
        for _ in range(needed):
            # Chosen outside the try block so the error message below can always name it
            src_path = random.choice(original_images)
            try:
                # The context manager closes the file handle; convert() returns a
                # loaded copy, so the image stays usable after the file is closed.
                # Converting to RGB *before* the transformations lets palette-mode
                # images go through the brightness step and makes the result
                # saveable as JPEG.
                with Image.open(src_path) as src_img:
                    img = src_img.convert('RGB')

                # Transformations
                # 1. Flip
                if random.random() > 0.5:
                    img = img.transpose(Image.FLIP_LEFT_RIGHT)

                # 2. Brightness
                img = ImageEnhance.Brightness(img).enhance(random.uniform(0.8, 1.2))

                # Save under a name that is not taken yet, so a re-run never
                # overwrites an earlier augmented file
                dst_path = target_dir / f"aug_{name_index}_{src_path.name}"
                while dst_path.exists():
                    name_index += 1
                    dst_path = target_dir / f"aug_{name_index}_{src_path.name}"
                img.save(dst_path)
                name_index += 1
                generated_count += 1
            except Exception as e:
                # Best effort: report the failing source image and keep going
                print(f"Error processing {src_path}: {e}")

        print(f"Successfully generated {generated_count} images.")

        # Verify (counts every non-hidden file in the directory)
        final_count = len([f for f in target_dir.iterdir() if f.is_file() and not f.name.startswith('.')])
        print(f"New count: {final_count}")

Current count: 59
Need to generate 41 images.
Successfully generated 41 images.
New count: 100


In [13]:
# Recompute the per-class counts from disk so the augmented files are included
df_stats_updated = get_dataset_stats(paths)

# Keep only the training-split row for the augmented class
sky_surfing_stats = df_stats_updated.loc[
    lambda stats: (stats['split'] == 'train') & (stats['class'] == 'sky surfing')
]

print("Updated stats for 'sky surfing':")
display(sky_surfing_stats)

Updated stats for 'sky surfing':


Unnamed: 0,split,class,count
77,train,sky surfing,100
