# 🧠 Tumor Classifier — Exploratory Data Analysis
Run this after downloading your dataset to understand the data before training.

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from collections import Counter

# Root of the dataset; the cells below read DATA_DIR/<split>/<class>/ folders.
DATA_DIR = Path('../data/processed')

# Class folder names. Their order here is also the column order used when the
# later cells tabulate counts and lay out the sample-image grid.
CLASSES = [
    'glioma',
    'meningioma',
    'pituitary',
    'no_tumor',
]

print('Ready!')

In [None]:
# ── Class Distribution ─────────────────────────────────────
# Count the .jpg/.png files in every DATA_DIR/<split>/<class> folder, print the
# table, and chart it so class imbalance is visible before training.
import pandas as pd

counts = {split: {} for split in ['train', 'val', 'test']}
missing_dirs = []

for split in counts:
    for cls in CLASSES:
        path = DATA_DIR / split / cls
        if not path.is_dir():
            # Record an explicit 0 instead of skipping the class: a skipped
            # entry becomes NaN (or a missing column) in the table, and if
            # every folder is absent the DataFrame is empty and cannot be
            # plotted at all.
            counts[split][cls] = 0
            missing_dirs.append(path)
            continue
        # Count lazily rather than materialising two lists of paths.
        counts[split][cls] = sum(
            1 for pattern in ('*.jpg', '*.png') for _ in path.glob(pattern)
        )

if missing_dirs:
    print('Warning: missing class directories (counted as 0):')
    for missing in missing_dirs:
        print(f'  {missing}')

# Transpose so rows are splits and columns are classes.
df = pd.DataFrame(counts).T
print(df)

df.plot(kind='bar', figsize=(10, 5), title='Class Distribution by Split')
plt.xticks(rotation=0)
plt.ylabel('Number of Images')
plt.tight_layout()
plt.show()

In [None]:
# ── Sample Images ──────────────────────────────────────────
# Show the first few training images of every class: one column per class,
# one row per sample, each titled with the class name and image size.
n_samples = 2
fig, axes = plt.subplots(
    n_samples,
    len(CLASSES),
    figsize=(4 * len(CLASSES), 4 * n_samples),
    squeeze=False,  # keep `axes` 2-D even if there is a single class
)

# Hide every frame up front so panels for classes with fewer than
# `n_samples` images stay blank instead of showing empty ticked axes.
for ax in axes.flat:
    ax.axis('off')

for i, cls in enumerate(CLASSES):
    cls_path = DATA_DIR / 'train' / cls
    # Match the same extensions the class-distribution cell counts, and sort
    # so the preview is reproducible (glob order is filesystem-dependent).
    images = sorted(
        p for pattern in ('*.jpg', '*.png') for p in cls_path.glob(pattern)
    )[:n_samples]

    for j, img_path in enumerate(images):
        # convert() returns a fully loaded copy, so the file can be closed.
        with Image.open(img_path) as raw:
            img = raw.convert('RGB')
        ax = axes[j][i]
        ax.imshow(img)
        ax.set_title(f'{cls}\n{img.size}', fontsize=10, fontweight='bold')

plt.suptitle('Sample Images per Class', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# ── Image Size Distribution ────────────────────────────────
# Collect the pixel width/height of every training image and plot histograms,
# to see how much the images vary in size.
widths, heights = [], []

for cls in CLASSES:
    cls_dir = DATA_DIR / 'train' / cls
    # Same extensions as the class-distribution cell, so the two agree.
    for pattern in ('*.jpg', '*.png'):
        for img_path in cls_dir.glob(pattern):
            # `size` is read inside the `with` so the file is closed promptly.
            with Image.open(img_path) as img:
                w, h = img.size
            widths.append(w)
            heights.append(h)

if not widths:
    # min()/max() raise ValueError on an empty list; report the real problem.
    print(f'No .jpg/.png training images found under {DATA_DIR / "train"}')
else:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].hist(widths, bins=20, color='steelblue', edgecolor='white')
    axes[0].set_title('Image Width Distribution')
    axes[1].hist(heights, bins=20, color='coral', edgecolor='white')
    axes[1].set_title('Image Height Distribution')
    plt.tight_layout()
    plt.show()

    print(f'Width:  min={min(widths)}, max={max(widths)}, mean={np.mean(widths):.0f}')
    print(f'Height: min={min(heights)}, max={max(heights)}, mean={np.mean(heights):.0f}')

In [None]:
# ── Verify Augmentations Look Good ─────────────────────────
# Apply the training transform several times to one glioma image and show the
# results next to the original, to eyeball the augmentation pipeline.
import sys

import numpy as np

# Make the project root importable; skip if it is already on the path so
# re-running the cell does not keep prepending duplicates.
if '..' not in sys.path:
    sys.path.insert(0, '..')
from src.data.dataset import get_transforms

transform = get_transforms('train', image_size=224)

# Pick one image (sorted, so the same sample is used on every run).
glioma_dir = DATA_DIR / 'train' / 'glioma'
candidates = sorted(
    p for pattern in ('*.jpg', '*.png') for p in glioma_dir.glob(pattern)
)
if not candidates:
    # Fail with the actual cause rather than a bare IndexError.
    raise FileNotFoundError(f'No .jpg/.png images found in {glioma_dir}')
sample_path = candidates[0]
with Image.open(sample_path) as raw:
    img = np.array(raw.convert('RGB'))

# Per-channel statistics used to undo normalisation for display.
# NOTE(review): assumes get_transforms normalises with these same values and
# returns a channel-first tensor under the 'image' key — confirm in
# src/data/dataset.py.
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
panels = axes.ravel()

panels[0].imshow(img)
panels[0].set_title('Original', fontweight='bold')
panels[0].axis('off')

# Fill the remaining panels, each with an independent random augmentation.
for i, ax in enumerate(panels[1:], start=1):
    aug = transform(image=img)['image'].numpy().transpose(1, 2, 0)
    # Denormalize and clamp into the [0, 1] range imshow expects for floats.
    aug = np.clip(std * aug + mean, 0, 1)
    ax.imshow(aug)
    ax.set_title(f'Aug {i}')
    ax.axis('off')

plt.suptitle('Augmentation Preview', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()