# 📊 Data Preparation - AI-Generated Image Detection

**Project:** Truth in Pixels - Detecting AI-Generated Images Beyond Faces  
**Team Member 1:** Data Preparation Specialist  
**Objective:** Explore, preprocess, and prepare the dataset for training

## 🎯 Notebook Overview

This notebook handles all aspects of data preparation for our AI-generated image detection project:

1. **Environment Setup** - Configure libraries and reproducibility
2. **Dataset Loading** - Download and explore the Hugging Face dataset
3. **Exploratory Data Analysis** - Understand data distribution and characteristics  
4. **Data Preprocessing** - Implement transforms and augmentation strategies
5. **Dataset Splitting** - Create train/validation/test splits
6. **Data Quality Assessment** - Validate data integrity and class balance

---

In [None]:
# Environment Setup and Library Imports - Cross Platform Compatible
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Detect platform and setup accordingly
def detect_platform():
    """Report the runtime environment as ``"colab"`` or ``"local"``.

    The check is whether the ``google.colab`` package can be imported;
    if the import fails the notebook is treated as running locally.
    """
    try:
        import google.colab  # noqa: F401 -- imported only as a probe
    except ImportError:
        environment = "local"
    else:
        environment = "colab"
    return environment

# Resolve the runtime environment once; later code branches on this value.
platform = detect_platform()
print(f"üîç Running on: {platform}")

# Add project root to path
if platform == "colab":
    # For Colab - the repository is expected at this fixed location.
    project_root = "/content/AAI-521-Computer-Vision-Image-Classification-Project"
    if not os.path.exists(project_root):
        # Best effort: report the problem and let the notebook continue.
        print("‚ùå Repository not found. Please run the Colab setup notebook first!")
        print("üìñ Open: notebooks/00_colab_setup.ipynb")
    else:
        os.chdir(project_root)
        # Guarded like the local branch so that re-running this cell does
        # not stack duplicate entries at the front of sys.path.
        if project_root not in sys.path:
            sys.path.insert(0, project_root)
else:
    # For local - the project root is the parent of the working directory.
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
    if project_root not in sys.path:
        sys.path.insert(0, project_root)

print(f"üìÅ Project root: {project_root}")

# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from typing import Dict, List, Tuple, Optional

# PyTorch and computer vision
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
from torchvision.utils import make_grid

# Image processing
from PIL import Image, ImageStat
import cv2

# Dataset handling
from datasets import load_dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Progress bars and utilities
from tqdm.auto import tqdm
import time
from collections import Counter

# Set random seeds for reproducibility
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator, and
    PyTorch (CPU plus the current and all CUDA devices), then puts cuDNN
    into deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to each generator. Defaults to 42.
    """
    # Function-scope import: the file does not import ``random`` at the top.
    import random

    # The stdlib generator is seeded too; otherwise any code that draws
    # from ``random`` stays non-deterministic between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# Device configuration - optimized for Mac M4 and Colab
def get_device():
    """Pick the compute device, preferring CUDA, then MPS, then CPU.

    Prints a short description of the chosen device. When falling back
    to CPU on Colab it also prints a hint about enabling a GPU runtime.

    Returns:
        The selected ``torch.device``.
    """
    if torch.cuda.is_available():
        selected = torch.device('cuda')
        gpu_name = torch.cuda.get_device_name(0)
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"üöÄ Using CUDA: {gpu_name}")
        print(f"   Memory: {total_gb:.1f} GB")
        return selected

    # torch.backends may lack an 'mps' attribute, so probe for it first.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        selected = torch.device('mps')
        print(f"üçé Using MPS: Apple Silicon acceleration")
        return selected

    selected = torch.device('cpu')
    print(f"üíª Using CPU")
    # `platform` is the module-level environment string set during setup.
    if platform == "colab":
        print("üí° Enable GPU in Runtime > Change runtime type")
    return selected

# Resolve the compute device once for the rest of the notebook.
device = get_device()

# Plot appearance: matplotlib style sheet first, then the seaborn palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Colab-only tweaks
if platform == "colab":
    # Suppress wandb warnings
    os.environ.update(WANDB_SILENT='true')
    # NOTE(review): this turns cuDNN autotuning back on after set_seed()
    # disabled it, trading reproducibility for speed -- confirm intended.
    torch.backends.cudnn.benchmark = True

# Environment summary, one item per line.
print(
    f"‚úÖ Environment setup complete for {platform}!",
    f"üéØ Device: {device}",
    f"üêç Python: {sys.version.split()[0]}",
    f"üî• PyTorch: {torch.__version__}",
    sep="\n",
)

## 🔍 Dataset Loading and Exploration

### Dataset Information
- **Source:** Hugging Face - AI-Generated-vs-Real-Images-Datasets
- **URL:** https://huggingface.co/datasets/Hemg/AI-Generated-vs-Real-Images-Datasets
- **Task:** Binary classification (Real vs AI-generated)
- **Content:** Diverse images including people, objects, scenery

In [None]:
# Data directory layout: one root with three sub-directories.
# NOTE(review): the root is relative to the current working directory.
# The Colab setup branch chdir()s into the project root, so there this
# path points beside the repository rather than inside it -- confirm.
DATA_DIR = Path("../data")
RAW_DIR, PROCESSED_DIR, SPLITS_DIR = (
    DATA_DIR / subdir for subdir in ("raw", "processed", "splits")
)

# Make sure every directory exists (no error if already present).
for dir_path in (DATA_DIR, RAW_DIR, PROCESSED_DIR, SPLITS_DIR):
    dir_path.mkdir(parents=True, exist_ok=True)

# Show the layout: the root path, then each sub-directory by name.
print(f"Data directory structure:")
print(f"üìÅ {DATA_DIR}")
for dir_path in (RAW_DIR, PROCESSED_DIR, SPLITS_DIR):
    print(f"  üìÅ {dir_path.name}/")

# Dataset configuration
DATASET_NAME = "Hemg/AI-Generated-vs-Real-Images-Datasets"
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32