# Rock-Paper-Scissors CNN Project
## 2. Data Preprocessing and Augmentation

This notebook handles data preprocessing, normalization, augmentation, and train/validation/test splitting.


In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from pathlib import Path
import shutil
import yaml
import warnings
warnings.filterwarnings('ignore')

# Add src to path for imports
sys.path.append('../src')

from data.data_loader import RockPaperScissorsDataLoader
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf

# Reproducibility: seed NumPy and TensorFlow with the same fixed value
# before any later cell draws random numbers.
np.random.seed(42)
tf.random.set_seed(42)

# Plot appearance. The style sheet is applied first and the palette second,
# so the "husl" palette is what ends up as the active colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation banner with the versions of the main numerical libraries
print(
    "\n".join(
        [
            "✅ All libraries imported successfully!",
            f"TensorFlow version: {tf.__version__}",
            f"NumPy version: {np.__version__}",
            f"Pandas version: {pd.__version__}",
        ]
    )
)


### Configuration and Setup

Let's load the configuration and set up the data preprocessing pipeline.


In [None]:
# Load configuration
# The encoding is passed explicitly so that parsing the YAML file does not
# depend on the platform's default locale encoding.
config_path = '../config/config.yaml'
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# Extract the configuration sections used by the rest of the notebook.
# A missing section raises KeyError here, close to the cause.
data_config = config['data']
augmentation_config = config['augmentation']
classes = config['classes']

# Echo the values that drive splitting and batching so they are visible in
# the notebook output.
# NOTE(review): the import cell seeds NumPy/TensorFlow with a literal 42 rather
# than data_config['random_seed'] - confirm the two are meant to agree.
print("CONFIGURATION LOADED")
print("="*50)
print(f"Image size: {data_config['image_size']}")
print(f"Batch size: {data_config['batch_size']}")
print(f"Validation split: {data_config['validation_split']}")
print(f"Test split: {data_config['test_split']}")
print(f"Random seed: {data_config['random_seed']}")
print(f"Classes: {classes}")
print("="*50)

# Initialize data loader (it receives the config path, not the parsed dict)
loader = RockPaperScissorsDataLoader(config_path)
print("✅ Data loader initialized successfully!")


### Dataset Loading and Information

First, let's load the dataset and examine its structure.


In [None]:
# Ask the loader what it found; the result is iterated below as a mapping
# whose entries each carry a 'count' value.
dataset_info = loader.load_dataset_info()

# Print one line per class and accumulate the overall number of images
print("DATASET SUMMARY")
print("=" * 50)
total_images = 0
for class_name, info in dataset_info.items():
    if class_name == 'total':
        # The 'total' key is excluded from the per-class listing
        continue
    total_images += info['count']
    print(f"{class_name.upper()}: {info['count']:,} images")

print(f"\nTOTAL: {total_images:,} images")
print("=" * 50)

# Later cells check `total_images` before doing any work, so report clearly
# whether anything was found.
if total_images != 0:
    print("✅ Dataset loaded successfully!")
else:
    print(
        "\n".join(
            [
                "❌ No data found! Please ensure the dataset is in the correct location.",
                "Expected structure:",
                "data/raw/",
                "├── rock/",
                "├── paper/",
                "└── scissors/",
            ]
        )
    )


### Data Splitting

Now let's split the dataset into training, validation, and test sets following proper ML practices.


In [None]:
# Split dataset into train/validation/test sets
if total_images > 0:
    print("SPLITTING DATASET...")
    print("="*50)

    # split_info is iterated below as {class_name: {'train': ..., 'val': ...,
    # 'test': ...}}; split_dirs is reused by the generator cell, which indexes
    # it with 'train' / 'val' / 'test'.
    split_info, split_dirs = loader.split_dataset(dataset_info)

    # Display split results
    print("\nSPLIT RESULTS:")
    print("-" * 30)
    total_train = 0
    total_val = 0
    total_test = 0

    for class_name, splits in split_info.items():
        train_count = len(splits['train'])
        val_count = len(splits['val'])
        test_count = len(splits['test'])

        total_train += train_count
        total_val += val_count
        total_test += test_count

        print(f"{class_name.upper()}:")
        print(f"  Train: {train_count:,} images")
        print(f"  Val:   {val_count:,} images")
        print(f"  Test:  {test_count:,} images")
        print()

    # Computed once and reused as the denominator for every percentage
    grand_total = total_train + total_val + total_test

    if grand_total == 0:
        # An empty split would make the percentage lines divide by zero and
        # give the pie chart nothing to draw, so stop here with a clear message.
        print("❌ Split produced no images - nothing to save or plot")
    else:
        print("TOTAL SPLIT:")
        print(f"  Train: {total_train:,} images ({total_train/grand_total*100:.1f}%)")
        print(f"  Val:   {total_val:,} images ({total_val/grand_total*100:.1f}%)")
        print(f"  Test:  {total_test:,} images ({total_test/grand_total*100:.1f}%)")

        # Save split information
        loader.save_split_info(split_info)
        print("\n✅ Dataset split completed and saved!")

        # Visualize the split: absolute counts on the left, shares on the right
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        split_counts = [total_train, total_val, total_test]
        split_labels = ['Train', 'Validation', 'Test']
        colors = ['#2E8B57', '#4169E1', '#DC143C']

        # Bar plot of split
        bars = ax1.bar(split_labels, split_counts, color=colors)
        ax1.set_title('Dataset Split Distribution', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Number of Images')

        # Value labels above the bars. bar_label pads in points, so the gap
        # looks the same whether the dataset has dozens or thousands of images.
        ax1.bar_label(
            bars,
            labels=[f'{count:,}' for count in split_counts],
            padding=3,
            fontweight='bold',
        )

        # Pie chart of split
        ax2.pie(split_counts, labels=split_labels, autopct='%1.1f%%',
                colors=colors, startangle=90)
        ax2.set_title('Dataset Split Percentages', fontsize=14, fontweight='bold')

        plt.tight_layout()
        plt.show()

else:
    print("❌ Cannot split dataset - no data available")


### Data Augmentation

Let's set up data augmentation to improve model generalization and reduce overfitting. Augmentation is applied to the training data only; validation and test images are only rescaled.


In [None]:
# Set up data augmentation
print("DATA AUGMENTATION CONFIGURATION")
print("="*50)

# Training data generator: random geometric transforms taken from the
# 'augmentation' config section, followed by rescaling to [0, 1].
train_datagen = ImageDataGenerator(
    rotation_range=augmentation_config['rotation_range'],
    width_shift_range=augmentation_config['width_shift_range'],
    height_shift_range=augmentation_config['height_shift_range'],
    horizontal_flip=augmentation_config['horizontal_flip'],
    zoom_range=augmentation_config['zoom_range'],
    fill_mode=augmentation_config['fill_mode'],
    rescale=1./255  # Normalize pixel values to [0,1]
)

# Validation and test data generators (no augmentation, only normalization)
val_test_datagen = ImageDataGenerator(rescale=1./255)

# zoom_range may be a single float or a [lower, upper] pair; format either
# form instead of assuming a float.
zoom_cfg = augmentation_config['zoom_range']
if isinstance(zoom_cfg, (list, tuple)):
    zoom_text = f"[{zoom_cfg[0]}, {zoom_cfg[1]}]"
else:
    zoom_text = f"±{zoom_cfg*100:.1f}%"

print("Training Data Augmentation:")
print(f"  Rotation range: ±{augmentation_config['rotation_range']}°")
print(f"  Width shift: ±{augmentation_config['width_shift_range']*100:.1f}%")
print(f"  Height shift: ±{augmentation_config['height_shift_range']*100:.1f}%")
print(f"  Horizontal flip: {augmentation_config['horizontal_flip']}")
print(f"  Zoom range: {zoom_text}")
print(f"  Fill mode: {augmentation_config['fill_mode']}")
print(f"  Rescaling: 1/255 (normalization)")
print("\nValidation/Test Data:")
print(f"  Rescaling: 1/255 (normalization only)")
print("="*50)

# Visualize augmentation effects
if total_images > 0:
    print("\nVISUALIZING DATA AUGMENTATION...")

    # Take the first PNG (in sorted order, so the choice is repeatable) of the
    # first configured class from the processed training directory.
    sample_class = classes[0]
    sample_path = Path(f"../data/processed/train/{sample_class}")
    if sample_path.exists():
        sample_images = sorted(sample_path.glob('*.png'))
        if sample_images:
            # Load the image as 3-channel RGB and close the file right away.
            # The pixels stay in the raw 0-255 range because train_datagen
            # applies the 1/255 rescale itself; feeding it already-normalized
            # data would scale twice and produce almost-black previews.
            with Image.open(sample_images[0]) as original_img:
                raw_pixels = np.array(original_img.convert('RGB'), dtype=np.float32)
            original_array = raw_pixels / 255.0  # display copy in [0, 1]

            # 2x4 grid: slot 0 holds the original, slots 1-7 the augmented
            # samples, so every cell of the grid is used.
            fig, axes = plt.subplots(2, 4, figsize=(16, 8))
            fig.suptitle(f'Data Augmentation Examples - {sample_class.capitalize()}',
                        fontsize=16, fontweight='bold')
            grid = axes.flatten()

            # Original image
            grid[0].imshow(original_array)
            grid[0].set_title('Original', fontweight='bold')
            grid[0].axis('off')

            # Draw seven augmented samples from one iterator over a
            # single-image batch; each draw applies a fresh random transform.
            aug_iter = train_datagen.flow(raw_pixels[np.newaxis, ...], batch_size=1)
            augmented_images = [next(aug_iter)[0] for _ in range(len(grid) - 1)]

            # Every sample is a random combination of all configured
            # transforms, so the panels are numbered rather than labelled with
            # one transform name each.
            for idx, (ax, img) in enumerate(zip(grid[1:], augmented_images), start=1):
                ax.imshow(img)
                ax.set_title(f'Augmented {idx}', fontweight='bold')
                ax.axis('off')

            plt.tight_layout()
            plt.show()

            print("✅ Data augmentation visualization completed!")
        else:
            print("⚠️ No sample images found for visualization")
    else:
        print("⚠️ Sample directory not found for visualization")


### Data Generators Setup

Now let's create the data generators for training, validation, and testing.


In [None]:
# Create data generators
if total_images > 0:
    print("CREATING DATA GENERATORS...")
    print("="*50)

    # Build the three generators from the directories produced by the split
    # cell (split_dirs is indexed by 'train' / 'val' / 'test').
    train_generator, val_generator, test_generator = loader.create_data_generators(
        str(split_dirs['train']),
        str(split_dirs['val']),
        str(split_dirs['test'])
    )

    print("✅ Data generators created successfully!")
    print(f"Training batches: {len(train_generator)}")
    print(f"Validation batches: {len(val_generator)}")
    print(f"Test batches: {len(test_generator)}")

    # Display generator information
    print("\nGENERATOR INFORMATION:")
    print("-" * 30)
    print(f"Batch size: {data_config['batch_size']}")
    print(f"Image size: {data_config['image_size']}")
    print(f"Number of classes: {len(classes)}")
    print(f"Class indices: {train_generator.class_indices}")

    # Test the generators
    print("\nTESTING GENERATORS...")
    print("-" * 30)

    # Pull one batch from the training generator and report its basic properties
    train_batch_x, train_batch_y = next(train_generator)
    print(f"Training batch shape: {train_batch_x.shape}")
    print(f"Training labels shape: {train_batch_y.shape}")
    print(f"Training batch data type: {train_batch_x.dtype}")
    print(f"Training batch value range: [{train_batch_x.min():.3f}, {train_batch_x.max():.3f}]")

    # Pull one batch from the validation generator
    val_batch_x, val_batch_y = next(val_generator)
    print(f"Validation batch shape: {val_batch_x.shape}")
    print(f"Validation labels shape: {val_batch_y.shape}")

    # Visualize a batch of training images
    print("\nVISUALIZING TRAINING BATCH...")
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    fig.suptitle('Sample Training Batch (with Augmentation)', fontsize=16, fontweight='bold')

    # class_indices maps class name -> index; invert it so the label lookup
    # uses the index values rather than dict ordering.
    index_to_class = {idx: name for name, idx in train_generator.class_indices.items()}

    # A batch can hold fewer images than the grid has cells (small batch size
    # or small dataset), so show only what is actually available.
    preview_count = min(axes.size, len(train_batch_x))

    for i, ax in enumerate(axes.flat):
        if i < preview_count:
            ax.imshow(train_batch_x[i])

            # NOTE(review): argmax assumes one-hot (categorical) labels -
            # confirm against the class_mode used by the data loader.
            class_idx = int(np.argmax(train_batch_y[i]))
            class_name = index_to_class[class_idx]
            ax.set_title(f'{class_name.capitalize()}', fontweight='bold')

        # Hide axis decorations for filled and unused cells alike
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    print("✅ Data generators are working correctly!")

else:
    print("❌ Cannot create data generators - no data available")


### Summary and Next Steps

Let's summarize what we've accomplished in the data preprocessing phase and prepare for model development.

**Data Preprocessing Summary:**
1. **Dataset Loading**: Successfully loaded and analyzed the Rock-Paper-Scissors dataset
2. **Data Splitting**: Properly split data into train/validation/test sets (70/20/10 target; see the split cell output above for the actual percentages)
3. **Data Augmentation**: Implemented comprehensive augmentation strategies
4. **Data Generators**: Created efficient data generators for training

**Key Preprocessing Steps Completed:**
- ✅ **Image Resizing**: Images resized to 224x224 pixels
- ✅ **Normalization**: Pixel values normalized to [0,1] range
- ✅ **Data Augmentation**: Rotation, shift, zoom, and flip augmentations
- ✅ **Train/Val/Test Split**: Proper splitting with no data leakage
- ✅ **Data Generators**: Efficient batch loading with augmentation

**Project Requirements Addressed:**
- ✅ **Data Preprocessing**: Image resizing and normalization implemented
- ✅ **Data Augmentation**: Comprehensive augmentation techniques applied
- ✅ **Data Splitting**: Proper train/validation/test split (the test set is held out and no information from it is used)
- ✅ **Sound Methodology**: Following ML best practices for data handling

**Next Steps:**
- Model architecture design (Simple, Medium, Complex CNNs)
- Model training with proper callbacks
- Hyperparameter tuning
- Model evaluation and analysis
