In [1]:
# -*- coding: utf-8 -*-
"""model_v8 best.ipynb - UPDATED VERSION

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hLEuxwylY6keQUVYxoMtu24PYsojRtiD

labelling upd
pos encoder upd
lr dec
dropout inv


if random.random() > 0.5:
    frames = frames[::-1]  # Reverse sequence

FEATURE_DIM = 256
lr inc

fut:
decrase lr
inc epoch

hand tailed labelas

smooth labels can be used

lstm exper
min exp

2x more data.
"""

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import json
import cv2
import numpy as np
import random
from pathlib import Path
import os

import torch.optim as optim
import torchvision.models as models
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
from tqdm import tqdm

from typing import List, Dict, Tuple, Optional, Union

# Global parameters
VIDEOS_DIR = '/home/kaan-eren/projects/V4_BITIRME'
MAX_FRAMES = 30  # Frame window size
FRAME_SIZE = (224, 224)  # Standard size for most pre-trained models
# UPDATED: Reduced batch size for stability with larger model
BATCH_SIZE = 4  # Reduced from 4 for better gradient stability
NUM_WORKERS = 1
# UPDATED: Increased epochs with early stopping
NUM_EPOCHS = 5  # Increased from 5
# UPDATED: Lower learning rate to prevent overfitting
LEARNING_RATE = 3e-5  # Reduced from 3e-5
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# UPDATED: Reduced feature dimension for better regularization
FEATURE_DIM = 256  # Reduced from 1024
NUM_HEADS = 4  # Number of attention heads
# UPDATED: Increased transformer layers
NUM_LAYERS = 3 # Increased from 3
# UPDATED: Increased dropout for better regularization
DROPOUT = 0.4  # Increased from 0.3

# Transformations for video frames
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(FRAME_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

class VideoDataset(Dataset):
    """
    Dataset class to handle video data for effect detection model

    This dataset processes an entire video file and extracts frames at regular intervals,
    creating a dataset of consecutive frame windows for training the effect detection model.
    """

    def __init__(
        self,
        json_data: List[Dict],
        videos_dir: str,
        max_frames: int = MAX_FRAMES,
        frame_size: Tuple[int, int] = FRAME_SIZE,
        transform=None,
        mode: str = 'train',
        # UPDATED: Reduced window stride for more training data
        window_stride: int = 10  # Reduced from 15 for more overlapping windows
    ):
        """
        Initialize the dataset

        Args:
            json_data: List of dictionaries containing video data (for reference only)
            videos_dir: Directory containing the video files
            max_frames: Number of frames in each sliding window
            frame_size: Size to resize frames to
            transform: Additional transforms to apply to frames
            mode: Dataset mode ('train', 'val', or 'test') for file mapping
            window_stride: Number of frames to advance for each new window
        """
        self.json_data = json_data
        self.videos_dir = videos_dir
        self.max_frames = max_frames
        self.frame_size = frame_size
        self.mode = mode
        self.window_stride = window_stride

        # Create default transform if none provided
        if transform is None:
            self.transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])
        else:
            self.transform = transform

        # Generate samples based on the video file
        self.video_path, self.effect_map = self._get_video_info()
        # UPDATED: Use improved sample generation
        self.samples = self._generate_improved_samples()  # Changed method name

    def _get_video_info(self) -> Tuple[str, Dict[int, int]]:
        """
        Get video path and create a mapping of frame numbers to effects

        Returns:
            Tuple of (video_path, effect_map)
        """
        # Map the filename to the correct video file based on mode
        if self.mode == 'train':
            video_filename = 'train_set.mp4'
        elif self.mode == 'val':
            video_filename = 'val_set.mp4'
        elif self.mode == 'test':
            video_filename = 'test_set.mp4'
        else:
            # Use the filename from the first JSON entry as fallback
            video_filename = self.json_data[0]["filename"]

        video_path = os.path.join(self.videos_dir, video_filename)

        # Create a mapping of frame numbers to effect labels
        effect_map = {}

        # Go through all segments in JSON data and mark the effect labels
        for video_data in self.json_data:
            for segment in video_data["segments"]:
                start_frame = segment["start_frame"]
                end_frame = segment["end_frame"]
                effect = segment["effect"]

                # Mark every frame in this segment with the effect label
                for frame_idx in range(start_frame, end_frame):
                    effect_map[frame_idx] = effect

        return video_path, effect_map

    # UPDATED: Improved sample generation method
    def _generate_improved_samples(self) -> List[Dict]:
        """
        Generate a list of sliding windows from the video with better labeling strategy

        Returns:
            List of dictionaries containing window info
        """
        # Open video to get properties
        cap = cv2.VideoCapture(self.video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {self.video_path}")

        # Get video total frame count
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        cap.release()

        samples = []
        positive_samples = []
        negative_samples = []

        # Create sliding windows over the entire video
        for start_idx in range(0, total_frames - self.max_frames + 1, self.window_stride):
            end_idx = start_idx + self.max_frames

            # UPDATED: Better labeling strategy using ratio instead of simple majority
            frame_labels = [self.effect_map.get(i, 0) for i in range(start_idx, end_idx)]
            effect_ratio = sum(frame_labels) / len(frame_labels)
            
            # UPDATED: Use threshold-based approach for more balanced dataset
            effect = 1 if effect_ratio > 0.4 else 0  # Changed from majority vote (0.5) to 0.4

            sample_info = {
                "start_frame": start_idx,
                "end_frame": end_idx,
                "effect": effect,
                # UPDATED: Store effect ratio for analysis
                "effect_ratio": effect_ratio
            }

            # UPDATED: Separate positive and negative samples for balancing
            if effect == 1:
                positive_samples.append(sample_info)
            else:
                negative_samples.append(sample_info)

        # UPDATED: Balance the dataset during training
        if self.mode == 'train':
            min_samples = min(len(positive_samples), len(negative_samples))
            if min_samples > 0:
                # Randomly sample to balance classes
                positive_samples = random.sample(positive_samples, min_samples)
                negative_samples = random.sample(negative_samples, min_samples)
                print(f"Balanced training set: {len(positive_samples)} positive, {len(negative_samples)} negative")

        samples = positive_samples + negative_samples
        random.shuffle(samples)

        return samples

    def _extract_frames(self, start_frame: int, end_frame: int) -> np.ndarray:
        """
        Extract frames from a video segment

        Args:
            start_frame: Start frame index
            end_frame: End frame index

        Returns:
            Numpy array of frames
        """
        # Open video file
        cap = cv2.VideoCapture(self.video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {self.video_path}")

        # Extract consecutive frames (we want exactly max_frames)
        frames = []
        for frame_idx in range(start_frame, end_frame):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                break

            # Resize frame
            frame = cv2.resize(frame, self.frame_size)

            # Convert from BGR to RGB
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            frames.append(frame)

        cap.release()

        # If no frames were extracted or fewer than expected
        if len(frames) == 0:
            raise ValueError(f"No frames extracted from {self.video_path}")
        elif len(frames) < self.max_frames:
            # Pad with the last frame
            last_frame = frames[-1]
            frames.extend([last_frame] * (self.max_frames - len(frames)))

        return np.array(frames)

    def __len__(self) -> int:
        """Return the number of samples (windows) in the dataset"""
        return len(self.samples)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        """
        Get a sample from the dataset

        Args:
            idx: Index of the sample

        Returns:
            Tuple of (frames, label)
        """
        sample = self.samples[idx]

        try:
            # Extract frames
            frames = self._extract_frames(
                sample["start_frame"],
                sample["end_frame"]
            )

            # UPDATED: Reduced temporal augmentation probability for stability
            if self.mode == 'train' and random.random() > 0.7:  # Changed from 0.5 to 0.7
                frames = frames[::-1]

            # Apply transforms to each frame
            transformed_frames = []
            for frame in frames:
                transformed_frame = self.transform(frame)
                transformed_frames.append(transformed_frame)

            # Stack frames
            frames_tensor = torch.stack(transformed_frames)

            # Get label
            label = sample["effect"]

            return frames_tensor, label

        except Exception as e:
            print(f"Error processing sample {idx}: {e}")
            # Return a default/placeholder
            # Create a tensor of zeros with the expected shape
            empty_frames = torch.zeros(self.max_frames, 3, self.frame_size[0], self.frame_size[1])
            # Return default label as 0
            return empty_frames, 0

def create_dataloaders(
    train_json_path: str,
    val_json_path: str,
    test_json_path: str,
    videos_dir: str = VIDEOS_DIR,
    batch_size: int = BATCH_SIZE,
    max_frames: int = MAX_FRAMES,
    frame_size: Tuple[int, int] = FRAME_SIZE,
    num_workers: int = NUM_WORKERS,
    # UPDATED: Reduced default window stride
    window_stride: int = 10  # Reduced from 30
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    """
    Create train, validation, and test dataloaders from separate JSON files

    Args:
        train_json_path: Path to training data JSON
        val_json_path: Path to validation data JSON
        test_json_path: Path to test data JSON
        videos_dir: Directory containing the video files
        batch_size: Batch size for dataloaders
        max_frames: Maximum number of frames in each sliding window
        frame_size: Size to resize frames to
        num_workers: Number of worker processes for dataloaders
        window_stride: Number of frames to advance for each new window

    Returns:
        Tuple of (train_loader, val_loader, test_loader)
    """
    # Load JSON data
    with open(train_json_path, 'r') as f:
        train_data = json.load(f)

    with open(val_json_path, 'r') as f:
        val_data = json.load(f)

    with open(test_json_path, 'r') as f:
        test_data = json.load(f)

    # Create datasets
    train_dataset = VideoDataset(
        json_data=train_data,
        videos_dir=videos_dir,
        max_frames=max_frames,
        frame_size=frame_size,
        mode='train',
        window_stride=window_stride
    )

    val_dataset = VideoDataset(
        json_data=val_data,
        videos_dir=videos_dir,
        max_frames=max_frames,
        frame_size=frame_size,
        mode='val',
        window_stride=window_stride
    )

    test_dataset = VideoDataset(
        json_data=test_data,
        videos_dir=videos_dir,
        max_frames=max_frames,
        frame_size=frame_size,
        mode='test',
        window_stride=window_stride
    )

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )

    return train_loader, val_loader, test_loader

# Usage example:
if __name__ == "__main__":
    # Create dataloaders from separate JSON files
    train_loader, val_loader, test_loader = create_dataloaders(
        train_json_path='train_set.json',
        val_json_path='val_set.json',
        test_json_path='test_set.json',
        videos_dir=VIDEOS_DIR
    )

    # Print dataset statistics
    print(f"Train dataset size: {len(train_loader.dataset)}")
    print(f"Validation dataset size: {len(val_loader.dataset)}")
    print(f"Test dataset size: {len(test_loader.dataset)}")

    # Test a batch
    for frames, labels in train_loader:
        print(f"Batch shape: {frames.shape}")  # Should be [batch_size, max_frames, 3, height, width]
        print(f"Labels: {labels}")
        break

class FrameFeatureExtractor(nn.Module):
    # UPDATED: Enhanced feature extractor with better regularization
    def __init__(self, pretrained=True, dropout=DROPOUT):
        super(FrameFeatureExtractor, self).__init__()
        # UPDATED: Use EfficientNet-B0 for better performance
        base_model = models.efficientnet_b0(weights='DEFAULT' if pretrained else None)
        # Remove the classifier layer
        self.features = nn.Sequential(*list(base_model.children())[:-1])
        self.pool = nn.AdaptiveAvgPool2d(1)
        
        # UPDATED: Enhanced feature projection with batch norm and dropout
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            # UPDATED: EfficientNet-B1 outputs 1280 features
            nn.Linear(1280, 512),  # Changed from direct mapping to FEATURE_DIM
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, FEATURE_DIM)
        )

    def forward(self, x):
        # x shape: (batch_size * num_frames, channels, height, width)
        x = self.features(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.fc(x)
        return x

class TransformerClassifier(nn.Module):
    def __init__(self):
        super(TransformerClassifier, self).__init__()
        # UPDATED: Use enhanced feature extractor
        self.feature_extractor = FrameFeatureExtractor(dropout=DROPOUT)

        # UPDATED: Improved positional encoding initialization
        self.pos_encoder = nn.Parameter(torch.randn(1, MAX_FRAMES, FEATURE_DIM) * 0.02)
        # UPDATED: Add layer normalization and dropout for positional encoding
        self.pos_dropout = nn.Dropout(DROPOUT)
        self.layer_norm = nn.LayerNorm(FEATURE_DIM)

        # UPDATED: Enhanced transformer encoder with GELU activation
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=FEATURE_DIM,
            nhead=NUM_HEADS,
            dim_feedforward=FEATURE_DIM * 4,
            dropout=DROPOUT,
            batch_first=True,
            activation='gelu'  # UPDATED: GELU instead of ReLU
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=NUM_LAYERS)

        # UPDATED: Enhanced classification head with batch normalization
        self.classifier = nn.Sequential(
            nn.Linear(FEATURE_DIM, 512),
            nn.BatchNorm1d(512),  # UPDATED: Added batch norm
            nn.ReLU(),
            nn.Dropout(DROPOUT),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),  # UPDATED: Added batch norm
            nn.ReLU(),
            nn.Dropout(DROPOUT),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(DROPOUT),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # x shape: (batch_size, num_frames, channels, height, width)
        batch_size, num_frames, channels, height, width = x.shape

        # Reshape for feature extraction
        x = x.view(batch_size * num_frames, channels, height, width)

        # Extract features
        features = self.feature_extractor(x)

        # Reshape back to (batch_size, num_frames, feature_dim)
        features = features.view(batch_size, num_frames, -1)

        # UPDATED: Apply layer normalization before adding positional encoding
        features = self.layer_norm(features + self.pos_encoder)
        features = self.pos_dropout(features)

        # Apply transformer encoder
        transformer_output = self.transformer_encoder(features)

        # UPDATED: Use attention-weighted pooling instead of mean pooling
        attention_weights = torch.softmax(
            torch.mean(transformer_output, dim=-1), dim=1
        ).unsqueeze(-1)
        pooled_output = torch.sum(transformer_output * attention_weights, dim=1)

        # Classification
        logits = self.classifier(pooled_output)

        return logits.squeeze(-1)

# UPDATED: Enhanced loss function - Focal Loss for better class balance
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        
    def forward(self, inputs, targets):
        bce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * bce_loss
        return focal_loss.mean()

def train_epoch(model, dataloader, criterion, optimizer, device, scaler):
    model.train()
    epoch_loss = 0
    all_preds = []
    all_labels = []

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        frames, labels = batch
        frames = frames.to(device)
        labels = labels.float().to(device)

        optimizer.zero_grad()
        
        # UPDATED: Use new autocast format to avoid deprecation warning
        with torch.amp.autocast('cuda'):
            outputs = model(frames)
            loss = criterion(outputs, labels)

        # UPDATED: Use gradient scaler for mixed precision training
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()

        # Calculate predictions
        preds = (torch.sigmoid(outputs) > 0.5).float().cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({"batch_loss": f"{loss.item():.4f}"})

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)

    return epoch_loss / len(dataloader), accuracy, precision, recall, f1


def validate(model, dataloader, criterion, device):
    model.eval()
    val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            frames, labels = batch
            frames = frames.to(device)
            labels = labels.float().to(device)

            outputs = model(frames)
            loss = criterion(outputs, labels)

            val_loss += loss.item()

            # Calculate predictions
            preds = (torch.sigmoid(outputs) > 0.5).float().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)

    return val_loss / len(dataloader), accuracy, precision, recall, f1

def plot_metrics(train_values, val_values, metric_name):
    """
    Plot the training and validation metrics
    """
    plt.figure(figsize=(10, 6))
    plt.plot(train_values, label=f'Train {metric_name}')
    plt.plot(val_values, label=f'Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.title(f'Training and Validation {metric_name}')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'{metric_name.lower().replace(" ", "_")}_plot.png')
    plt.close()

def train_model():
    # Create data loaders
    print(f"Using device: {DEVICE}")
    print("Preparing data...")

    train_loader, val_loader, test_loader = create_dataloaders(
        train_json_path='train_set.json',
        val_json_path='val_set.json',
        test_json_path='test_set.json',
        videos_dir=VIDEOS_DIR,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS
    )

    print("Data loaded!")
    print(f"Train batches: {len(train_loader)}")
    print(f"Val batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")

    # Create model
    print("Initializing model...")
    model = TransformerClassifier().to(DEVICE)

    # Print model summary
    print(f"Model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters")

    # UPDATED: Use Focal Loss instead of BCE
    criterion = FocalLoss(alpha=1, gamma=2)
    
    # UPDATED: Enhanced optimizer with better parameters
    optimizer = optim.AdamW(
        model.parameters(), 
        lr=LEARNING_RATE, 
        weight_decay=1e-3,  # Increased from 1e-4
        betas=(0.9, 0.999)
    )

    # UPDATED: Use Cosine Annealing scheduler instead of ReduceLROnPlateau
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, 
        T_max=NUM_EPOCHS,
        eta_min=1e-6
    )

    # UPDATED: Add gradient scaler for mixed precision training
    scaler = torch.cuda.amp.GradScaler()

    # Training loop
    print("Starting training...")

    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []
    train_f1s, val_f1s = [], []

    best_val_f1 = 0
    best_model_path = 'best_effect_detection_model.pth'
    
    # UPDATED: Add early stopping
    patience = 5
    patience_counter = 0

    for epoch in range(NUM_EPOCHS):
        print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")

        # UPDATED: Train with gradient scaler
        train_loss, train_acc, train_prec, train_rec, train_f1 = train_epoch(
            model, train_loader, criterion, optimizer, DEVICE, scaler
        )

        # Validate
        val_loss, val_acc, val_prec, val_rec, val_f1 = validate(
            model, val_loader, criterion, DEVICE
        )

        # UPDATED: Use cosine annealing scheduler
        scheduler.step()

        # Save metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)
        train_f1s.append(train_f1)
        val_f1s.append(val_f1)

        # UPDATED: Print current learning rate
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, Prec: {train_prec:.4f}, Rec: {train_rec:.4f}, F1: {train_f1:.4f}")
        print(f"Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, Prec: {val_prec:.4f}, Rec: {val_rec:.4f}, F1: {val_f1:.4f}")
        print(f"LR: {current_lr:.6f}")

        # UPDATED: Enhanced model saving with early stopping
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved with validation F1: {val_f1:.4f}")
            patience_counter = 0
        else:
            patience_counter += 1
            
        # UPDATED: Early stopping
        if patience_counter >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

    print("\nTraining completed!")

    # Plot metrics
    print("Plotting metrics...")
    plot_metrics(train_losses, val_losses, 'Loss')
    plot_metrics(train_accuracies, val_accuracies, 'Accuracy')
    plot_metrics(train_f1s, val_f1s, 'F1 Score')

    # Evaluate on test set
    print("\nEvaluating on test set...")
    # UPDATED: Use weights_only=True to avoid warning
    model.load_state_dict(torch.load(best_model_path, weights_only=True))
    test_loss, test_acc, test_prec, test_rec, test_f1 = validate(
        model, test_loader, criterion, DEVICE
    )

    print(f"Test Results - Loss: {test_loss:.4f}, Acc: {test_acc:.4f}, Prec: {test_prec:.4f}, Rec: {test_rec:.4f}, F1: {test_f1:.4f}")

    return model

# Evaluation on single sequences
def evaluate_sequence(model, video_path, start_frame, num_frames, transform):
    """
    Evaluate a single video sequence for effect detection
    """
    model.eval()
    cap = cv2.VideoCapture(video_path)

    frames = []
    for i in range(num_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame + i)
        ret, frame = cap.read()
        if not ret:
            break

        # Apply transform
        frame = cv2.resize(frame, FRAME_SIZE)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        if transform:
            frame = transform(frame)
        else:
            # Convert to tensor manually
            frame = torch.FloatTensor(frame / 255.0).permute(2, 0, 1)

        frames.append(frame)

    cap.release()

    # Stack frames
    if len(frames) < num_frames:
        # Pad with zeros if not enough frames
        for _ in range(num_frames - len(frames)):
            frames.append(torch.zeros_like(frames[0]))

    frames_tensor = torch.stack(frames).unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        outputs = model(frames_tensor.to(DEVICE))
        probability = torch.sigmoid(outputs).item()
        prediction = 1 if probability > 0.5 else 0

    return prediction, probability

if __name__ == "__main__":
    # Train the model
    model = train_model()

    # Save the final model
    torch.save(model.state_dict(), 'final_effect_detection_model.pth')
    print("Final model saved!")

# Check total GPU memory
print(f"GPU Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# Check current memory usage
print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
print(f"GPU Memory Reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

# Check free memory
free_memory, total_memory = torch.cuda.mem_get_info(0)
print(f"GPU Memory Free: {free_memory / 1024**3:.2f} GB")

Balanced training set: 1406 positive, 1406 negative
Train dataset size: 2812
Validation dataset size: 819
Test dataset size: 819
Batch shape: torch.Size([4, 30, 3, 224, 224])
Labels: tensor([0, 0, 1, 1])


  scaler = torch.cuda.amp.GradScaler()


Using device: cuda
Preparing data...
Balanced training set: 1406 positive, 1406 negative
Data loaded!
Train batches: 703
Val batches: 205
Test batches: 205
Initializing model...
Model initialized with 7,454,205 trainable parameters
Starting training...

Epoch 1/5


Training: 100%|████████████| 703/703 [08:16<00:00,  1.42it/s, batch_loss=0.1862]
Validation: 100%|█████████████████████████████| 205/205 [02:33<00:00,  1.34it/s]


Train - Loss: 0.1744, Acc: 0.5661, Prec: 0.5819, Rec: 0.4701, F1: 0.5201
Val   - Loss: 0.1466, Acc: 0.6642, Prec: 0.7289, Rec: 0.8359, F1: 0.7788
LR: 0.000027
New best model saved with validation F1: 0.7788

Epoch 2/5


Training:  22%|██▌         | 153/703 [01:47<06:25,  1.43it/s, batch_loss=0.1452]


KeyboardInterrupt: 