# **Visual Information Processing and Management**
---
---

Università degli Studi Milano Bicocca \
CdLM Informatica — A.A. 2025/2026

---
---

### **Componenti del gruppo:**
— Oleksandra Golub (856706) \
— Andrea Spagnolo (879254)


# **Visual Dataset Analysis**

This notebook imports and analyzes the `train`, `test`, and `valid` dataset splits located on the external drive `T7-apps`, then balances the training classes by generating augmented images offline.

## **Librerie**


In [8]:
!pip install pandas pathlib numpy Pillow

import os, random
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image, ImageEnhance, ImageOps, ImageFilter



In [9]:
# Root folder of the dataset on the external drive
dataset_root = Path("/Volumes/T7-apps/visual_dataset")

# Map each split label to its folder (the "val" split lives in the "valid" folder)
paths = {
    label: dataset_root / folder
    for label, folder in (("train", "train"), ("test", "test"), ("val", "valid"))
}

# Report, one line per split, whether its folder is reachable
for name, path in paths.items():
    print(
        f"[OK] {name} path found: {path}"
        if path.exists()
        else f"[ERROR] {name} path NOT found: {path}"
    )

[OK] train path found: /Volumes/T7-apps/visual_dataset/train
[OK] test path found: /Volumes/T7-apps/visual_dataset/test
[OK] val path found: /Volumes/T7-apps/visual_dataset/valid


In [10]:
# Verify the class distribution in each split

def get_dataset_stats(split_paths):
    """
    Count the files of every class folder in each dataset split.

    Args:
        split_paths: Mapping of split name to a ``pathlib.Path`` pointing at
            the split directory. Every immediate, non-hidden subdirectory is
            treated as one class.

    Returns:
        A ``pandas.DataFrame`` with one row per (split, class) pair and the
        columns ``split``, ``class`` and ``count``. Classes are listed in
        name order within each split. Entries of ``split_paths`` that are not
        existing directories are skipped. The three columns are always
        present, even when no row was collected, so callers can group or
        filter on them without a ``KeyError``.
    """
    columns = ["split", "class", "count"]
    rows = []
    for split_name, split_path in split_paths.items():
        # Skip missing paths and paths that are not directories
        # (iterdir() would raise on a plain file).
        if not split_path.is_dir():
            continue

        # Classes are the non-hidden subdirectories, in name order
        class_names = sorted(
            d.name for d in split_path.iterdir()
            if d.is_dir() and not d.name.startswith('.')
        )

        for cls in class_names:
            cls_path = split_path / cls
            # Count regular files only; dotfiles such as .DS_Store are ignored
            example_count = sum(
                1 for f in cls_path.iterdir()
                if f.is_file() and not f.name.startswith('.')
            )
            rows.append({
                "split": split_name,
                "class": cls,
                "count": example_count,
            })

    # Explicit columns keep the schema stable for an empty result
    return pd.DataFrame(rows, columns=columns)

df_stats = get_dataset_stats(paths)

# Per-split overview: number of classes and the spread of the class sizes
summary = df_stats.groupby("split")["count"].agg(
    num_classes="count",
    total_examples="sum",
    min_per_class="min",
    max_per_class="max",
    mean_per_class="mean",
)

print("Dataset split statistics:")
display(summary)

# Training classes ordered from the smallest upwards (at most 100 rows)
print("\nRAREST classes in the TRAINING set:")
train_stats = df_stats.loc[df_stats["split"] == "train"]
display(train_stats.nsmallest(100, columns="count"))

Dataset split statistics:


Unnamed: 0_level_0,num_classes,total_examples,min_per_class,max_per_class,mean_per_class
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
test,100,500,5,5,5.0
train,100,13493,59,191,134.93
val,100,500,5,5,5.0



RAREST classes in the TRAINING set:


Unnamed: 0,split,class,count
77,train,sky surfing,59
91,train,ultimate,97
99,train,wingsuit flying,98
3,train,arm wrestling,99
44,train,horseshoe pitching,102
...,...,...,...
41,train,hockey,172
7,train,baseball,174
58,train,nascar racing,189
32,train,formula 1 racing,190


In [11]:
# 1. Target count: the size of the largest training class.
# Derived from the per-class statistics already stored in `df_stats`.
train_stats = df_stats.loc[df_stats["split"] == "train"]
max_count = train_stats["count"].max()
print(f"Target count per class (Majority Class): {max_count}")

# 2. Define a Stronger Augmentation Pipeline (SOTA for offline generation)
def strong_augment(img):
    """
    Return a randomly perturbed variant of ``img``.

    The steps run in a fixed order, each driven by the ``random`` module:
    an optional horizontal mirror, a rotation between -15 and +15 degrees,
    optional brightness, contrast and sharpness adjustments, and an
    occasional light Gaussian blur. The result of the last applied step is
    returned.
    """
    # Mirror left-right when the draw exceeds 0.5
    mirror = random.random() > 0.5
    if mirror:
        img = img.transpose(Image.FLIP_LEFT_RIGHT)

    # The rotation is always applied; expand=False is passed so the canvas is not enlarged
    img = img.rotate(random.uniform(-15, 15), resample=Image.BICUBIC, expand=False)

    # (enhancer class, lowest factor, highest factor), applied in this order
    adjustments = (
        (ImageEnhance.Brightness, 0.7, 1.3),
        (ImageEnhance.Contrast, 0.7, 1.3),
        (ImageEnhance.Sharpness, 0.5, 2.0),
    )
    for enhancer_cls, low, high in adjustments:
        # Each adjustment is applied only when its own draw exceeds 0.3
        if random.random() > 0.3:
            img = enhancer_cls(img).enhance(random.uniform(low, high))

    # Soft blur (radius up to 1.0), only when the draw exceeds 0.8
    if random.random() > 0.8:
        blur_radius = random.uniform(0, 1.0)
        img = img.filter(ImageFilter.GaussianBlur(radius=blur_radius))

    return img

# 3. Main loop: bring every training class up to `max_count` images
train_path = paths["train"]
classes = [d for d in train_path.iterdir() if d.is_dir() and not d.name.startswith('.')]

print(f"Starting augmentation for {len(classes)} classes...")

# File extensions treated as images when counting and sampling
image_suffixes = {'.jpg', '.jpeg', '.png'}
total_generated = 0

for class_dir in classes:
    # Collect the non-hidden image files currently in the class folder
    images = [
        f for f in class_dir.iterdir()
        if f.is_file() and f.suffix.lower() in image_suffixes and not f.name.startswith('.')
    ]
    current_count = len(images)

    # Skip if we already have enough
    if current_count >= max_count:
        continue

    # Nothing to sample from: random.choice would raise on every iteration
    if not images:
        print(f"[{class_dir.name}] Skipped: no source images found")
        continue

    needed = max_count - current_count
    print(f"[{class_dir.name}] Generating {needed} images (Current: {current_count} -> Target: {max_count})")

    # Prefer original images as sources so augments are not re-augmented;
    # fall back to everything when only augments exist.
    source_images = [f for f in images if not f.name.startswith('aug_')] or images

    for i in range(needed):
        # Chosen outside the try so the error message below always refers to
        # the image that was actually being processed.
        src_path = random.choice(source_images)
        try:
            with Image.open(src_path) as img:
                # Convert to RGB to ensure consistency
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                aug_img = strong_augment(img)

                # Pick a file name that is not taken yet, so a re-run never
                # overwrites an augment written by an earlier run.
                dst_path = class_dir / f"aug_{i}_{random.randint(1000,9999)}_{src_path.name}"
                while dst_path.exists():
                    dst_path = class_dir / f"aug_{i}_{random.randint(1000,9999)}_{src_path.name}"

                aug_img.save(dst_path)
                total_generated += 1

        except Exception as e:
            # Best effort: report the failing image and keep going
            print(f"  [Error] Failed to process {src_path}: {e}")

print(f"\nDONE! Total images generated: {total_generated}")

# 4. Verification: recompute the statistics from disk
print("Verifying new counts...")
final_stats = get_dataset_stats(paths)
display(final_stats[final_stats['split'] == 'train'].nsmallest(10, "count"))

Target count per class (Majority Class): 191
Starting augmentation for 100 classes...
[lacrosse] Generating 40 images (Current: 151 -> Target: 191)
[shuffleboard] Generating 61 images (Current: 130 -> Target: 191)
[arm wrestling] Generating 92 images (Current: 99 -> Target: 191)
[golf] Generating 34 images (Current: 157 -> Target: 191)
[surfing] Generating 49 images (Current: 142 -> Target: 191)
[snow boarding] Generating 74 images (Current: 117 -> Target: 191)
[parallel bar] Generating 60 images (Current: 131 -> Target: 191)
[pole climbing] Generating 73 images (Current: 118 -> Target: 191)
[giant slalom] Generating 41 images (Current: 150 -> Target: 191)
[bobsled] Generating 53 images (Current: 138 -> Target: 191)
[baton twirling] Generating 83 images (Current: 108 -> Target: 191)
[weightlifting] Generating 57 images (Current: 134 -> Target: 191)
[hydroplane racing] Generating 83 images (Current: 108 -> Target: 191)
[barell racing] Generating 68 images (Current: 123 -> Target: 191)
[

Unnamed: 0,split,class,count
0,train,air hockey,191
1,train,ampute football,191
2,train,archery,191
3,train,arm wrestling,191
4,train,axe throwing,191
5,train,balance beam,191
6,train,barell racing,191
7,train,baseball,191
8,train,basketball,191
9,train,baton twirling,191
