Project: Develop an explainable model that detects deepfake videos with the highest accuracy we can achieve. We will build a few candidate models, compare them, and then pick one as our "final model".

Business Value: Flagging misinformation and protecting digital identity.

In [2]:
# # Deep Fake Detection Project
# ## Complete Pipeline: Data Analysis → Feature Engineering → Model Training → Hyperparameter Tuning
#
# **Dataset**: Google Drive - Celeb-Real, Celeb-Fake, and Testing folders
# - **Celeb-Real**: Real/Original videos (Label: 0, "Celeb-Real")
# - **Celeb-Fake**: Fake/Manipulated videos (Label: 1, "Fake")
# - **Testing**: Test videos for evaluation
#
# **Dataset Link**: https://drive.google.com/drive/folders/1nBKjUpi2wQyMfWDuNsreqY11DVZrbk7x
#
# **Objective**: Detect original vs AI-generated images and videos
#
# **Approach**:
# - Comprehensive EDA
# - Feature engineering (spatial, frequency, texture features)
# - Multiple CNN architectures + Transfer Learning
# - Hyperparameter optimization
# - Model evaluation and comparison
#
%pip install optuna
%pip install gdown
# Note: torchcodec is not needed - we use OpenCV for video processing
# %%
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision
from torchvision import transforms, models
import timm
# torchcodec not needed - using OpenCV for video processing instead
# import torchcodec  # Optional: requires FFmpeg installation

# Computer Vision
import cv2
from PIL import Image
from skimage import feature, filters
from skimage.feature import local_binary_pattern

# ML & Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report,
    roc_curve, auc, roc_auc_score
)
from sklearn.preprocessing import StandardScaler

# HuggingFace
from datasets import load_dataset

# Hyperparameter Tuning
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

# Utilities
from tqdm.auto import tqdm
import time
from datetime import datetime
import json
import joblib

# Set style
# Global plotting defaults for every figure drawn in this notebook:
# a matplotlib style sheet plus seaborn's "husl" colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Reproducibility: a single seed shared by NumPy and PyTorch
SEED = 42
_HAS_CUDA = torch.cuda.is_available()  # probe once, reuse below

np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device and report it
device = torch.device('cuda') if _HAS_CUDA else torch.device('cpu')
print(f"Using device: {device}")

if not _HAS_CUDA:
    print("Using CPU - training will be slower")
else:
    # Seed the GPU generator and ask cuDNN for deterministic kernels
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1e9:.2f} GB")


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Using device: cpu
Using CPU - training will be slower


In [None]:
# ## 2. Data Loading from Google Drive Folders
# Dataset Structure:
# - celeb-real/: Real videos (label: 0, "Celeb-Real")
# - celeb-fake/: Fake videos (label: 1, "Fake")
# - testing/: Test videos (for evaluation)

# Banner marking the start of this cell's output
print("="*80)
print("LOADING DATASET FROM GOOGLE DRIVE FOLDERS")
print("="*80)

# Install gdown for downloading from Google Drive
%pip install gdown

import gdown
from pathlib import Path
import glob

# Dataset location: a "data" folder inside the current working directory.
# Path.cwd() is absolute, so DATA_DIR is absolute as well.
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT.joinpath("data")
DATA_DIR.mkdir(exist_ok=True)  # no-op when the folder is already there

for _caption, _value in (
    ("Project root", PROJECT_ROOT),
    ("Data directory", DATA_DIR),
    ("Data directory exists", DATA_DIR.exists()),
):
    print(f"{_caption}: {_value}")

# ID of the shared Google Drive folder (taken from the share link)
GOOGLE_DRIVE_FOLDER_ID = "1nBKjUpi2wQyMfWDuNsreqY11DVZrbk7x"

# Try to find folders with case-insensitive matching
def find_folder(base_dir, possible_names):
    """Locate a direct sub-directory of ``base_dir`` matching a candidate name.

    Matching runs in two passes:

    1. exact (case-sensitive) match, trying ``possible_names`` in order;
    2. case-insensitive match, again in the order of ``possible_names`` so
       an earlier candidate always wins over a later one.

    Args:
        base_dir: Directory to search (``str`` or ``Path``). Only its direct
            children are considered.
        possible_names: Candidate folder names, most preferred first.

    Returns:
        ``Path`` of the matching directory, or ``None`` when ``base_dir`` is
        not an existing directory or no child directory matches.
    """
    base_dir = Path(base_dir)
    if not base_dir.is_dir():
        return None

    # Pass 1: exact names, in priority order
    for name in possible_names:
        candidate = base_dir / name
        if candidate.is_dir():
            return candidate

    # Pass 2: case-insensitive. Index child directories by lower-cased name
    # once; sorting first keeps the result deterministic when two siblings
    # differ only by case (iterdir() order depends on the filesystem).
    dirs_by_lower_name = {}
    for child in sorted(base_dir.iterdir()):
        if child.is_dir():
            dirs_by_lower_name.setdefault(child.name.lower(), child)

    for name in possible_names:
        match = dirs_by_lower_name.get(name.lower())
        if match is not None:
            return match

    return None

# Candidate directory names for each part of the dataset.
# The actual names on disk may vary (Celeb-real, Celeb-synthesis, etc.),
# so several spellings are handed to find_folder for each part.
_REAL_DIR_NAMES = [
    "Celeb-Real", "celeb-real", "Celeb-real", "Celeb_Real", "celeb_real",
    "CelebReal", "Celeb-Real", "real", "Real",
]
_FAKE_DIR_NAMES = [
    "Celeb-Fake", "celeb-fake", "Celeb-fake", "Celeb_Fake", "celeb_fake",
    "CelebFake", "Celeb-synthesis", "celeb-synthesis", "Celeb-Synthesis",
    "synthesis", "Synthesis", "fake", "Fake",
]
_TEST_DIR_NAMES = ["Testing", "testing", "Test", "test"]

# Each lookup yields a Path, or None when no matching folder exists yet
CELEB_REAL_FOLDER = find_folder(DATA_DIR, _REAL_DIR_NAMES)
CELEB_FAKE_FOLDER = find_folder(DATA_DIR, _FAKE_DIR_NAMES)
TESTING_FOLDER = find_folder(DATA_DIR, _TEST_DIR_NAMES)

def download_from_google_drive(folder_id, output_dir):
    """Download a shared Google Drive folder into ``output_dir`` using gdown.

    Args:
        folder_id: ID of the shared Google Drive folder.
        output_dir: Destination directory (``str`` or ``Path``); created,
            including parents, if it does not exist.

    Returns:
        ``True`` when gdown reports at least one downloaded file, ``False``
        when gdown raises or reports that nothing was retrieved. On failure,
        manual-download instructions are printed.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading from Google Drive folder: {folder_id}")
    print(f"Output directory: {output_dir}")

    url = f"https://drive.google.com/drive/u/0/folders/{folder_id}"

    error = None
    try:
        downloaded = gdown.download_folder(url, output=str(output_dir), quiet=False, use_cookies=False)
    except Exception as e:
        # Best effort: any failure falls back to the manual instructions
        error = e
    else:
        # gdown can signal failure by returning None instead of raising,
        # so a missing/empty result must not be reported as success
        if not downloaded:
            error = "gdown did not retrieve any files"

    if error is None:
        print(f"‚úÖ Downloaded to {output_dir}")
        return True

    print(f"‚ö†Ô∏è  Download error: {error}")
    print("Please download manually from:")
    print(f"https://drive.google.com/drive/folders/{folder_id}")
    print(f"Extract to: {output_dir}")
    return False

def load_videos_from_folder(folder_path, label, label_name):
    """Collect one metadata record per video file found under a folder.

    The folder is searched recursively. Extensions are matched
    case-insensitively (``clip.MP4`` counts as a video) and the result is
    sorted by path, so the same folder always yields the same list in the
    same order regardless of filesystem enumeration order. Hidden files and
    anything inside hidden directories (names starting with ``.``) are
    skipped.

    Args:
        folder_path: Folder to scan (``str`` or ``Path``).
        label: Label stored on every record (may be ``None``).
        label_name: Human-readable label stored on every record.

    Returns:
        List of dicts with keys ``'video_path'`` (``str``), ``'label'``,
        ``'label_name'`` and ``'folder'`` (the name of ``folder_path``).
        Empty when the folder does not exist or contains no videos.
    """
    folder_path = Path(folder_path)
    if not folder_path.is_dir():
        return []

    # Supported video formats (compared against the lower-cased suffix)
    video_suffixes = {'.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm'}

    def _is_hidden(path):
        # True when the file or any folder between it and folder_path is hidden
        return any(part.startswith('.') for part in path.relative_to(folder_path).parts)

    video_files = sorted(
        str(path)
        for path in folder_path.rglob('*')
        if path.suffix.lower() in video_suffixes
        and path.is_file()
        and not _is_hidden(path)
    )

    return [
        {
            'video_path': video_path,
            'label': label,
            'label_name': label_name,
            'folder': folder_path.name,
        }
        for video_path in video_files
    ]

# Report what is currently on disk before deciding whether to download
print("\n" + "=" * 80)
print("CHECKING FOR LOCAL DATASET FOLDERS")
print("=" * 80)

if not DATA_DIR.exists():
    print(f"\n‚ö†Ô∏è  Data directory does not exist: {DATA_DIR}")
else:
    print(f"\nüìÅ Contents of data directory ({DATA_DIR}):")
    items = list(DATA_DIR.iterdir())
    if not items:
        print("   (empty)")
    for item in sorted(items):
        # One marker for directories, another for plain files
        item_type = "üìÅ" if item.is_dir() else "üìÑ"
        print(f"   {item_type} {item.name}")

# Show the folder paths resolved earlier (None means nothing matched)
print("\n[INFO] Looking for dataset folders...")
print(f"Celeb-Real folder: {CELEB_REAL_FOLDER}")
print(f"Celeb-Fake folder: {CELEB_FAKE_FOLDER}")
print(f"Testing folder: {TESTING_FOLDER}")

# Check if folders exist locally; download them when either training folder
# is missing.
if CELEB_REAL_FOLDER is None or CELEB_FAKE_FOLDER is None:
    print("\n[INFO] Dataset folders not found locally.")
    print("Attempting to download from Google Drive...")
    print("Note: If download fails, please download manually and place folders in 'data/' directory")

    # Try to download (this might not work for large folders, manual download recommended)
    download_success = download_from_google_drive(GOOGLE_DRIVE_FOLDER_ID, DATA_DIR)

    if download_success:
        # The folder variables were resolved before the download and are
        # still None, so the loading step below would report the data as
        # missing. Resolve them again now that the files are on disk, using
        # the same candidate names as the initial lookup.
        CELEB_REAL_FOLDER = find_folder(DATA_DIR, [
            "Celeb-Real", "celeb-real", "Celeb-real", "Celeb_Real", "celeb_real",
            "CelebReal", "Celeb-Real", "real", "Real"
        ])
        CELEB_FAKE_FOLDER = find_folder(DATA_DIR, [
            "Celeb-Fake", "celeb-fake", "Celeb-fake", "Celeb_Fake", "celeb_fake",
            "CelebFake", "Celeb-synthesis", "celeb-synthesis", "Celeb-Synthesis",
            "synthesis", "Synthesis", "fake", "Fake"
        ])
        TESTING_FOLDER = find_folder(DATA_DIR, ["Testing", "testing", "Test", "test"])

        print("\n[INFO] Dataset folders after download:")
        print(f"Celeb-Real folder: {CELEB_REAL_FOLDER}")
        print(f"Celeb-Fake folder: {CELEB_FAKE_FOLDER}")
        print(f"Testing folder: {TESTING_FOLDER}")

    # Fall back to manual instructions when the download failed or did not
    # produce both training folders
    if not download_success or CELEB_REAL_FOLDER is None or CELEB_FAKE_FOLDER is None:
        print("\n" + "="*80)
        print("MANUAL DOWNLOAD REQUIRED")
        print("="*80)
        print("Please follow these steps:")
        print(f"1. Open: https://drive.google.com/drive/folders/{GOOGLE_DRIVE_FOLDER_ID}")
        print("2. Download the three folders: Celeb-Real, Celeb-Fake, Testing")
        print(f"3. Extract them to: {DATA_DIR.absolute()}")
        print("4. Ensure folder structure:")
        print(f"   {DATA_DIR}/Celeb-Real/")
        print(f"   {DATA_DIR}/Celeb-Fake/")
        print(f"   {DATA_DIR}/Testing/")
        print("\nThen re-run this cell.")
else:
    print("‚úÖ Dataset folders found locally!")

# Build the training / test record lists from whichever folders were found
print("\n" + "=" * 80)
print("LOADING VIDEOS FROM FOLDERS")
print("=" * 80)

# Training data: real videos get label 0, fake videos get label 1
train_data_list = []

if not (CELEB_REAL_FOLDER and CELEB_REAL_FOLDER.exists()):
    print("   ‚ö†Ô∏è  Celeb-Real folder not found or empty")
    print("   Expected folder names: Celeb-Real, celeb-real, Celeb-real, etc.")
else:
    real_videos = load_videos_from_folder(CELEB_REAL_FOLDER, label=0, label_name="Celeb-Real")
    train_data_list += real_videos
    print(f"   ‚úÖ Loaded {len(real_videos)} videos from {CELEB_REAL_FOLDER.name} folder (Label: 0, 'Celeb-Real')")

if not (CELEB_FAKE_FOLDER and CELEB_FAKE_FOLDER.exists()):
    print("   ‚ö†Ô∏è  Celeb-Fake folder not found or empty")
    print("   Expected folder names: Celeb-Fake, Celeb-synthesis, celeb-fake, etc.")
else:
    fake_videos = load_videos_from_folder(CELEB_FAKE_FOLDER, label=1, label_name="Fake")
    train_data_list += fake_videos
    print(f"   ‚úÖ Loaded {len(fake_videos)} videos from {CELEB_FAKE_FOLDER.name} folder (Label: 1, 'Fake')")

# Test data: videos from the Testing folder carry no label (label=None)
test_data_list = []
if TESTING_FOLDER and TESTING_FOLDER.exists():
    test_data_list = load_videos_from_folder(TESTING_FOLDER, label=None, label_name="Testing")
    print(f"   Loaded {len(test_data_list)} videos from Testing folder")
else:
    print("   ‚ö†Ô∏è  Testing folder not found or empty (optional for training)")

# One DataFrame row per video, for the analysis cells that follow
train_df = pd.DataFrame(train_data_list)
test_df = pd.DataFrame(test_data_list)

print(f"\n‚úÖ Training videos loaded: {len(train_df)}")
print(f"‚úÖ Test videos loaded: {len(test_df)}")

if len(train_df) == 0:
    print("\n‚ö†Ô∏è  No training videos found!")
    print("Please ensure the dataset folders are downloaded and placed in the 'data/' directory.")
else:
    # Per-class counts, reused for the printout and both charts
    label_counts = train_df['label_name'].value_counts()

    print("\n" + "=" * 80)
    print("LABEL DISTRIBUTION ANALYSIS (Training Data)")
    print("=" * 80)
    print(f"\nTotal training samples: {len(train_df)}")
    print("\nLabel Distribution:")
    print(label_counts)
    print("\nLabel Percentages:")
    print(train_df['label_name'].value_counts(normalize=True) * 100)

    # Training needs both classes; warn when only one is present
    unique_labels = train_df['label'].unique()
    print(f"\nUnique labels found: {unique_labels}")

    if len(unique_labels) != 1:
        print("\n‚úì Both classes found in the dataset!")
        print("   The dataset contains both Celeb-Real and Fake videos.")
    else:
        print("\n‚ö†Ô∏è  WARNING: Only one class found in the dataset!")
        print(f"   All samples are labeled as: {train_df['label_name'].iloc[0]}")

    # Print the first ten records as a quick sanity check
    print("\n" + "=" * 80)
    print("SAMPLE PATHS (First 10):")
    print("=" * 80)
    _preview = train_df.head(10)
    for idx, _name, _path in zip(_preview.index, _preview['label_name'], _preview['video_path']):
        print(f"Sample {idx}: {_name}")
        print(f"  Path: {_path}")

    # Label distribution figure: bar chart (counts) beside pie chart (shares)
    colors = ['#2ecc71', '#e74c3c']
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    _count_ax, _share_ax = axes

    _count_ax.bar(label_counts.index, label_counts.values, color=colors)
    _count_ax.set_title('Label Distribution (Count)', fontsize=14, fontweight='bold')
    _count_ax.set_xlabel('Label', fontsize=12)
    _count_ax.set_ylabel('Count', fontsize=12)
    _count_ax.grid(True, alpha=0.3)

    _share_ax.pie(
        label_counts.values,
        labels=label_counts.index,
        autopct='%1.1f%%',
        colors=colors[:len(label_counts)],
        startangle=90,
    )
    _share_ax.set_title('Label Distribution (Percentage)', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.savefig('label_distribution_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\n‚úì Training dataset loaded: {len(train_df)} videos")
    print(f"‚úì Test dataset loaded: {len(test_df)} videos")


LOADING DATASET FROM GOOGLE DRIVE FOLDERS
Note: you may need to restart the kernel to use updated packages.

[INFO] Checking for dataset folders...
Celeb-Real folder: data/Celeb-Real
Celeb-Fake folder: data/Celeb-Fake
Testing folder: data/Testing

[INFO] Dataset folders not found locally.
Attempting to download from Google Drive...
Note: If download fails, please download manually and place folders in 'data/' directory
Downloading from Google Drive folder: 1nBKjUpi2wQyMfWDuNsreqY11DVZrbk7x
Output directory: data


Retrieving folder contents


Retrieving folder 18qhZEXhRWb0NmUHoYQyADJvai9sS3xTi Celeb-Fake
Processing file 1-oESGLoEYUhsqerwvocu3o4eE2WK0PjQ id2_id0_0000.mp4
Processing file 1BPgDGKNxhsvNM26yThzdbJnBMP0lWi8M id2_id0_0003.mp4
Processing file 1t5QnYJ7HXTaYv_gvNFu1RFlOclrVVDr4 id2_id0_0004.mp4
Processing file 1Oz8qm_50xnGfO8F0VLoBoOTt0WinOKP0 id2_id3_0003.mp4
Processing file 1g0KCsT11YKvGbAJGdWHHFlQc9qWdVNEb id2_id3_0007.mp4
Processing file 1h2q59nSIHypEIxDnTlA6Iui0gDbtS6id id2_id4_0009.mp4
Processing file 16NNAKxLpviTm8zMRo-KaL6Dxvpr6loUG id2_id6_0008.mp4
Processing file 1rGcakJoSKJT_CfC5tT9GQ8vzdC4XkBjO id2_id9_0005.mp4
Processing file 1_C5BJHbh-Dz3XlPGArYA5ZgCHCnUMfOE id2_id16_0001.mp4
Processing file 1J6XN307RMtUGsQqgUfXflWib4eyJ9ghD id2_id17_0000.mp4
Processing file 1-9Gq18b5mMZ5ei_biBeoW-qZ-PxeviWa id2_id17_0005.mp4
Processing file 1a3ObMXQpTadQta9JtnfYd0KvcTbFJ_XC id2_id20_0004.mp4
Processing file 1uXUGKYwCRPp9cXlZBqMqUxvFf8veKetk id2_id21_0001.mp4
Processing file 14s1g3gTtKw3ibsNor230xvlF8QvOS03P id2_id23_00

In [None]:
# Structure overview of the loaded dataset: columns, formats, sample names
print("=" * 80)
print("DATASET STRUCTURE ANALYSIS")
print("=" * 80)

_train_ready = 'train_df' in locals() and len(train_df) > 0

if not _train_ready:
    print("‚ö†Ô∏è  Dataset not loaded. Please run the previous cell first.")
    print("Make sure the dataset folders are downloaded and placed in the 'data/' directory.")
else:
    # Number of test videos, or 0 when the test DataFrame was never created
    _n_test = len(test_df) if 'test_df' in locals() else 0

    print("\n‚úÖ Training Dataset Structure:")
    print(f"   Total videos: {len(train_df)}")
    print(f"   Columns: {train_df.columns.tolist()}")
    print("\n   Label distribution:")
    print(train_df['label_name'].value_counts())

    # Lower-cased file suffix per video; stored as a new 'extension' column
    print("\nüìπ Video File Formats:")
    train_df['extension'] = [Path(p).suffix.lower() for p in train_df['video_path']]
    print(train_df['extension'].value_counts())

    # Up to three file names per class
    print("\nüìÅ Sample Video Paths by Label:")
    for label_name in train_df['label_name'].unique():
        print(f"\n   {label_name} videos:")
        _of_class = train_df.loc[train_df['label_name'] == label_name, 'video_path']
        for path in _of_class.head(3):
            print(f"     - {Path(path).name}")

    if _n_test > 0:
        print("\n‚úÖ Test Dataset Structure:")
        print(f"   Total test videos: {len(test_df)}")
        print(f"   Test videos are in: {TESTING_FOLDER}")

    print("\nüìä Dataset Summary:")
    print(f"   Training: {len(train_df)} videos")
    print(f"   Testing: {_n_test} videos")
    print(f"   Total: {len(train_df) + _n_test} videos")


In [None]:
# Helper function to extract frame from video file
def extract_frame_from_video(video_path, frame_idx=0):
    """Read a single frame from a video file and return it as an RGB array.

    Args:
        video_path: Path of the video file (``str`` or ``Path``).
        frame_idx: Zero-based index of the frame to read. Values past the
            reported end of the video are clamped to the last frame and
            negative values are clamped to 0.

    Returns:
        The frame converted from OpenCV's BGR order to RGB, or ``None`` when
        the video cannot be opened, the frame cannot be read, or any error
        occurs (the error is printed, never raised).
    """
    cap = None
    try:
        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened():
            print(f"Error: Could not open video {video_path}")
            return None

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Clamp into the valid range. The upper bound is only applied when
        # a positive frame count is reported; otherwise "total_frames - 1"
        # would turn the request into a negative seek position.
        if total_frames > 0:
            frame_idx = min(frame_idx, total_frames - 1)
        frame_idx = max(frame_idx, 0)

        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if not ret:
            return None

        # Convert BGR to RGB
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error extracting frame from {video_path}: {e}")
        return None
    finally:
        # Release the capture on every path, including errors
        if cap is not None:
            cap.release()

# Smoke test: pull the first frame of the first training video and show it
if not ('train_df' in locals() and len(train_df) > 0):
    print("‚ö†Ô∏è  No training data available. Please load dataset first.")
else:
    print("Testing video frame extraction...")
    _first_row = train_df.iloc[0]
    sample_video = _first_row['video_path']
    print(f"Sample video: {sample_video}")

    frame = extract_frame_from_video(sample_video, frame_idx=0)
    if frame is None:
        print("‚ö†Ô∏è  Could not extract frame. Check video file format.")
    else:
        print("‚úÖ Frame extracted successfully!")
        print(f"   Frame shape: {frame.shape}")

        # Show the frame and save a copy next to the notebook
        plt.figure(figsize=(8, 6))
        plt.imshow(frame)
        plt.title(f"Sample Frame from: {Path(sample_video).name}\nLabel: {_first_row['label_name']}")
        plt.axis('off')
        plt.tight_layout()
        plt.savefig('sample_video_frame.png', dpi=300, bbox_inches='tight')
        plt.show()


In [None]:
# PyTorch Dataset Class for Video Loading
class VideoDataset(Dataset):
    """
    Map-style PyTorch dataset that decodes a fixed number of frames per video.

    Each row of the backing DataFrame describes one video file. Labels are
    read from the DataFrame's 'label' column; in this notebook they are
    assigned from the folder a video was found in:
    - Celeb-Real folder -> Label 0 ("Celeb-Real")
    - Celeb-Fake folder -> Label 1 ("Fake")
    Rows without a label (``None``/NaN, e.g. videos from the Testing folder)
    are encoded as ``UNLABELED`` so they can still be indexed and batched.
    """

    # Label value returned for rows that have no ground-truth label
    UNLABELED = -1
    # Size of the black placeholder frame used when no frame could be decoded
    FALLBACK_SIZE = (224, 224)

    def __init__(self, dataframe, transform=None, num_frames=16, frame_interval=1):
        """
        Args:
            dataframe: DataFrame with columns ['video_path', 'label', 'label_name', 'folder']
            transform: Optional callable applied to every frame (a PIL image).
                When omitted, frames are converted with ``transforms.ToTensor()``
                so they can still be stacked into one tensor.
            num_frames: Number of frames returned for each video (>= 1)
            frame_interval: Sampling factor (>= 1); see ``_frame_indices``

        Raises:
            ValueError: if ``num_frames`` or ``frame_interval`` is below 1.
        """
        if num_frames < 1 or frame_interval < 1:
            raise ValueError("num_frames and frame_interval must both be >= 1")

        self.dataframe = dataframe.reset_index(drop=True)
        self.transform = transform
        self.num_frames = num_frames
        self.frame_interval = frame_interval

    def __len__(self):
        """Number of videos in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Decode video ``idx`` and return its frames together with metadata.

        Returns:
            dict with 'frames' (tensor stacked along a new first dimension,
            one entry per frame), 'label' (``torch.long`` scalar,
            ``UNLABELED`` when the row has no label), 'label_name',
            'folder' and 'video_path'.
        """
        row = self.dataframe.iloc[idx]
        video_path = row['video_path']

        # Extract frames from video
        frames = self.extract_frames(video_path)

        # NOTE(review): the transform is called once per frame, so random
        # augmentations draw new parameters for every frame of the same
        # video -- confirm this is intended.
        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        else:
            # torch.stack needs tensors, not PIL images
            to_tensor = transforms.ToTensor()
            frames = [to_tensor(frame) for frame in frames]

        # Stack frames: [num_frames, C, H, W]
        frames_tensor = torch.stack(frames)

        return {
            'frames': frames_tensor,
            'label': self._encode_label(row['label']),
            'label_name': row['label_name'],
            'folder': row['folder'],
            'video_path': video_path
        }

    @classmethod
    def _encode_label(cls, label):
        """Turn a DataFrame label into a long tensor (missing -> UNLABELED)."""
        # "label != label" is only true for NaN
        if label is None or (isinstance(label, float) and label != label):
            label = cls.UNLABELED
        return torch.tensor(int(label), dtype=torch.long)

    def _frame_indices(self, total_frames):
        """Return the frame positions to decode for a video of given length.

        When the video has fewer than ``num_frames * frame_interval`` frames,
        every ``frame_interval``-th frame is selected (the caller pads the
        result). Otherwise ``num_frames`` positions are taken from the start
        at a stride of ``total_frames // (num_frames * frame_interval)``.
        """
        # NOTE(review): with frame_interval > 1 the second branch covers only
        # roughly the first 1/frame_interval of the video -- confirm intent.
        span = self.num_frames * self.frame_interval
        if total_frames < span:
            return list(range(0, total_frames, self.frame_interval))
        step = max(1, total_frames // span)
        return [i * step for i in range(self.num_frames)]

    def _blank_frame(self):
        """Black RGB placeholder used when no frame could be decoded."""
        return Image.new('RGB', self.FALLBACK_SIZE, (0, 0, 0))

    def extract_frames(self, video_path):
        """Decode exactly ``num_frames`` frames from a video as PIL images.

        Frames that fail to decode are replaced by the previous good frame
        (or a black frame if none exists yet); short videos are padded by
        repeating the last frame.

        Raises:
            ValueError: if the video cannot be opened.
        """
        cap = cv2.VideoCapture(str(video_path))
        frames = []
        try:
            if not cap.isOpened():
                raise ValueError(f"Could not open video: {video_path}")

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            for frame_idx in self._frame_indices(total_frames):
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cap.read()
                if ret:
                    # Convert BGR to RGB, then to PIL Image for transforms
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(Image.fromarray(frame_rgb))
                elif frames:
                    # If frame read fails, use last successful frame
                    frames.append(frames[-1])
                else:
                    # Create black frame as fallback
                    frames.append(self._blank_frame())
        finally:
            # Release the capture even when opening or decoding fails
            cap.release()

        # Pad or trim to exact number of frames
        missing = self.num_frames - len(frames)
        if missing > 0:
            filler = frames[-1] if frames else self._blank_frame()
            frames.extend([filler] * missing)

        return frames[:self.num_frames]

# Per-frame preprocessing pipelines.
# Both end with the same tensor conversion + normalization (ImageNet stats);
# only the training pipeline adds random augmentation in between.
_FRAME_SIZE = (224, 224)
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

train_transform = transforms.Compose([
    transforms.Resize(_FRAME_SIZE),
    # augmentation: mirror, small rotation, brightness/contrast jitter
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# Validation / test: no randomness
val_transform = transforms.Compose([
    transforms.Resize(_FRAME_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# Build train / validation / test datasets once training data is loaded
if not ('train_df' in locals() and len(train_df) > 0):
    print("‚ö†Ô∏è  No training data available. Please load dataset first (run Cell 2).")
else:
    print("Creating PyTorch datasets...")

    # 80/20 split of the labeled videos, keeping the class ratio in both parts
    train_split_df, val_split_df = train_test_split(
        train_df,
        stratify=train_df['label'],
        test_size=0.2,
        random_state=SEED,
    )

    # Augmentation only for training; validation and test share val_transform
    train_dataset = VideoDataset(train_split_df, transform=train_transform, num_frames=16)
    val_dataset = VideoDataset(val_split_df, transform=val_transform, num_frames=16)

    # The test dataset is optional
    _has_test = 'test_df' in locals() and len(test_df) > 0
    if not _has_test:
        test_dataset = None
        print("‚ö†Ô∏è  No test dataset available")
    else:
        test_dataset = VideoDataset(test_df, transform=val_transform, num_frames=16)
        print(f"‚úÖ Test dataset created: {len(test_dataset)} videos")

    print(f"‚úÖ Training dataset: {len(train_dataset)} videos")
    print(f"‚úÖ Validation dataset: {len(val_dataset)} videos")
    print("\nDataset splits:")
    print(f"  Train: {len(train_split_df)} videos")
    print(f"  Validation: {len(val_split_df)} videos")
    print(f"  Test: {len(test_df) if 'test_df' in locals() else 0} videos")

    # Decode one sample end-to-end to confirm the pipeline works
    print("\nTesting dataset loading...")
    sample = train_dataset[0]
    print("‚úÖ Sample loaded successfully!")
    print(f"   Frames shape: {sample['frames'].shape}")
    print(f"   Label: {sample['label'].item()} ({sample['label_name']})")
    print(f"   Folder: {sample['folder']}")
