# Cats vs Dogs Dataset Exploration

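This notebook explores the Cats vs Dogs dataset: it counts the images in each split, visualizes sample images, analyzes the image size distribution, previews data augmentation, and (after training) runs the trained model on a test image.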

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image
import sys

sys.path.append('..')

%matplotlib inline

## 1. Dataset Overview

In [None]:
# Tally the cat and dog JPEGs found in every processed split
data_dir = Path('../data/processed')

for split in ('train', 'val', 'test'):
    split_dir = data_dir / split
    if not split_dir.exists():
        # Splits that are not on disk are simply left out of the report
        continue

    cat_count = sum(1 for _ in (split_dir / 'cat').glob('*.jpg'))
    dog_count = sum(1 for _ in (split_dir / 'dog').glob('*.jpg'))
    print(f"{split.capitalize()}: {cat_count} cats, {dog_count} dogs, Total: {cat_count + dog_count}")

## 2. Visualize Sample Images

In [None]:
def show_samples(data_dir, split='train', n_samples=8):
    """Display a two-row grid of sample images from one dataset split.

    Cats are drawn in the top row and dogs in the bottom row, with
    ``n_samples // 2`` columns. Each panel is titled with the class label
    and the image's ``width x height`` in pixels. Panels for which no image
    is available are left blank.

    Args:
        data_dir: Root directory containing one sub-directory per split,
            each holding ``cat`` and ``dog`` sub-directories of ``*.jpg``.
        split: Name of the split sub-directory to sample from.
        n_samples: Total number of panels; half are used for each class.

    Raises:
        ValueError: If ``n_samples`` is smaller than 2, because the grid
            would have no columns.
    """
    per_class = n_samples // 2
    if per_class < 1:
        raise ValueError(f"n_samples must be at least 2, got {n_samples}")

    split_dir = Path(data_dir) / split

    # squeeze=False keeps `axes` two-dimensional even with a single column
    _, axes = plt.subplots(2, per_class, figsize=(15, 6), squeeze=False)

    # Hide every panel up front so slots without an image stay blank
    for ax in axes.flat:
        ax.axis('off')

    for row, (class_dir, label) in enumerate((('cat', 'Cat'), ('dog', 'Dog'))):
        # glob order is arbitrary; sorting makes the preview reproducible
        image_paths = sorted((split_dir / class_dir).glob('*.jpg'))[:per_class]
        for ax, img_path in zip(axes[row], image_paths):
            # The context manager closes the file handle after drawing
            with Image.open(img_path) as img:
                ax.imshow(img)
                ax.set_title(f"{label}\n{img.size[0]}x{img.size[1]}")

    plt.tight_layout()
    plt.show()

# Preview up to four cats and four dogs from the training split
show_samples(data_dir='../data/processed', split='train', n_samples=8)

## 3. Image Size Distribution

In [None]:
def analyze_image_sizes(data_dir, split='train', max_images=100):
    """Plot and summarize the width and height distribution of a split.

    Up to ``max_images // 2`` ``*.jpg`` files are inspected from each of
    the ``cat`` and ``dog`` sub-directories. Two histograms (widths and
    heights) are shown, and the mean and standard deviation of each
    dimension are printed. If no images are found, a short notice is
    printed and nothing is plotted.

    Args:
        data_dir: Root directory containing one sub-directory per split.
        split: Name of the split sub-directory to analyze.
        max_images: Upper bound on the total number of images inspected;
            half of the budget goes to each class.
    """
    split_dir = Path(data_dir) / split
    per_class = max_images // 2

    widths = []
    heights = []
    for class_name in ('cat', 'dog'):
        # glob order is arbitrary; sorting makes the sample reproducible
        image_paths = sorted((split_dir / class_name).glob('*.jpg'))[:per_class]
        for img_path in image_paths:
            # Only the size is needed, so release the file handle right away
            with Image.open(img_path) as img:
                width, height = img.size
            widths.append(width)
            heights.append(height)

    if not widths:
        # Avoid empty plots and NaN statistics when the split has no images
        print(f"No images found in {split_dir}")
        return

    _, axes = plt.subplots(1, 2, figsize=(12, 4))

    # color=None lets matplotlib pick its default color for the first panel
    panels = (
        (widths, 'Width', None),
        (heights, 'Height', 'orange'),
    )
    for ax, (values, name, color) in zip(axes, panels):
        ax.hist(values, bins=30, alpha=0.7, label=name, color=color)
        ax.set_xlabel(f'{name} (pixels)')
        ax.set_ylabel('Frequency')
        ax.set_title(f'Image {name} Distribution')
        ax.legend()

    plt.tight_layout()
    plt.show()

    print(f"Width - Mean: {np.mean(widths):.0f}, Std: {np.std(widths):.0f}")
    print(f"Height - Mean: {np.mean(heights):.0f}, Std: {np.std(heights):.0f}")

# Summarize image dimensions over up to 200 training images (100 per class)
analyze_image_sizes(data_dir='../data/processed', split='train', max_images=200)

## 4. Data Augmentation Preview

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

def show_augmentations(image_path, n_augmentations=6):
    """Show randomly augmented versions of a single image.

    The image is loaded at 224x224 and passed through an
    ``ImageDataGenerator`` configured with rotation, shift, shear, zoom and
    horizontal-flip augmentation. The results are drawn in a grid of up to
    three columns with as many rows as ``n_augmentations`` requires.

    Args:
        image_path: Path of the image to augment.
        n_augmentations: Number of augmented variants to display.

    Raises:
        ValueError: If ``n_augmentations`` is smaller than 1.
    """
    if n_augmentations < 1:
        raise ValueError(
            f"n_augmentations must be at least 1, got {n_augmentations}"
        )

    # Load image and add a leading batch axis of size 1 for flow()
    img = load_img(image_path, target_size=(224, 224))
    img_batch = np.expand_dims(img_to_array(img), axis=0)

    # Create augmentation generator
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )

    # Size the grid from the request (the default of 6 gives 2x3 at 12x8 in)
    n_cols = min(n_augmentations, 3)
    n_rows = -(-n_augmentations // n_cols)  # ceiling division
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False
    )
    panels = axes.flatten()

    # Hide every panel up front so surplus slots in the last row stay blank
    for ax in panels:
        ax.axis('off')

    # flow() does not stop on its own; putting range() first bounds the zip
    # so no extra batch is generated once enough panels are filled
    batches = datagen.flow(img_batch, batch_size=1)
    for i, batch in zip(range(n_augmentations), batches):
        panels[i].imshow(batch[0].astype('uint8'))
        panels[i].set_title(f'Augmentation {i+1}')

    plt.tight_layout()
    plt.show()

# Show augmentations for a sample image
# glob order is arbitrary, so take the alphabetically first file to keep the
# preview reproducible, and fail with a clear message if the folder is empty
sample_image = min(Path('../data/processed/train/cat').glob('*.jpg'), default=None)
if sample_image is None:
    raise FileNotFoundError("No .jpg images found in ../data/processed/train/cat")
show_augmentations(sample_image, n_augmentations=6)

## 5. Model Predictions (After Training)

In [None]:
# This cell can be used after training the model
import tensorflow as tf

# Restore the trained classifier from its saved .h5 file
model = tf.keras.models.load_model(filepath='../models/cats_dogs_classifier.h5')

def predict_and_show(image_path, model):
    """Classify one image with ``model`` and display it with the verdict.

    The image is loaded at 224x224, its pixel array is divided by 255 and
    passed to ``model.predict`` as a batch of one. The first value of the
    first output row is used as the score: above 0.5 the image is labelled
    'Dog', otherwise 'Cat'. The title also shows the larger of ``score``
    and ``1 - score`` as the confidence.

    Args:
        image_path: Path of the image to classify.
        model: Object exposing a ``predict`` method that accepts the batch.
    """
    picture = load_img(image_path, target_size=(224, 224))

    # Divide by 255 and add the leading batch axis
    scaled = img_to_array(picture) / 255.0
    batch = np.expand_dims(scaled, axis=0)

    score = model.predict(batch)[0][0]

    if score > 0.5:
        verdict = 'Dog'
    else:
        verdict = 'Cat'
    certainty = max(score, 1 - score)

    # Draw the resized image with the predicted label as its title
    plt.figure(figsize=(6, 6))
    plt.imshow(picture)
    plt.title(f"Prediction: {verdict}\nConfidence: {certainty:.2%}")
    plt.axis('off')
    plt.show()

# Test on a sample image
# glob order is arbitrary, so take the alphabetically first file to keep the
# result reproducible, and fail with a clear message if the folder is empty
test_image = min(Path('../data/processed/test/cat').glob('*.jpg'), default=None)
if test_image is None:
    raise FileNotFoundError("No .jpg images found in ../data/processed/test/cat")
predict_and_show(test_image, model)