In [1]:
"""

labelling 
pos encoder
lr
dropout


if random.random() > 0.5:
    frames = frames[::-1]  # Reverse sequence

FEATURE_DIM = 256 

better feature
label smooth// basic

"""

'\n\nlabelling \npos encoder\nlr\ndropout\n\n\nif random.random() > 0.5:\n    frames = frames[::-1]  # Reverse sequence\n\nFEATURE_DIM = 256 \n\nbetter feature\nlabel smooth// basic\n\n'

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import json
import cv2
import numpy as np
import random
from pathlib import Path
import os

import torch.optim as optim
import torchvision.models as models
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
from tqdm import tqdm

from typing import List, Dict, Tuple, Optional, Union

In [3]:
# Global parameters
VIDEOS_DIR = '/home/kaan-eren/projects/V4_BITIRME'
MAX_FRAMES = 30  # Frame window size
FRAME_SIZE = (224, 224)  # Standard size for most pre-trained models
BATCH_SIZE = 4  # Adjust based on VRAM
NUM_WORKERS = 1  
NUM_EPOCHS = 5
LEARNING_RATE = 3e-5
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
FEATURE_DIM = 256  # Dimensionality of frame features
NUM_HEADS = 4  # Number of attention heads
NUM_LAYERS = 3  # Number of transformer layers
DROPOUT = 0.3

In [4]:
# Transformations for video frames
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(FRAME_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

In [5]:
class VideoDataset(Dataset):
    """
    Dataset class to handle video data for effect detection model
    
    This dataset processes an entire video file and extracts frames at regular intervals,
    creating a dataset of consecutive frame windows for training the effect detection model.
    """
    
    def __init__(
        self, 
        json_data: List[Dict], 
        videos_dir: str,
        max_frames: int = MAX_FRAMES,
        frame_size: Tuple[int, int] = FRAME_SIZE,
        transform=None,
        mode: str = 'train',
        window_stride: int = 15  # How many frames to advance for each new window
    ):
        """
        Initialize the dataset
        
        Args:
            json_data: List of dictionaries containing video data (for reference only)
            videos_dir: Directory containing the video files
            max_frames: Number of frames in each sliding window
            frame_size: Size to resize frames to
            transform: Additional transforms to apply to frames
            mode: Dataset mode ('train', 'val', or 'test') for file mapping
            window_stride: Number of frames to advance for each new window
        """
        self.json_data = json_data
        self.videos_dir = videos_dir
        self.max_frames = max_frames
        self.frame_size = frame_size
        self.mode = mode
        self.window_stride = window_stride
        
        # Create default transform if none provided
        if transform is None:
            self.transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])
        else:
            self.transform = transform
        
        # Generate samples based on the video file
        self.video_path, self.effect_map = self._get_video_info()
        self.samples = self._generate_samples()
        
    def _get_video_info(self) -> Tuple[str, Dict[int, int]]:
        """
        Get video path and create a mapping of frame numbers to effects
        
        Returns:
            Tuple of (video_path, effect_map)
        """
        # Map the filename to the correct video file based on mode
        if self.mode == 'train':
            video_filename = 'train_set.mp4'
        elif self.mode == 'val':
            video_filename = 'val_set.mp4'
        elif self.mode == 'test':
            video_filename = 'test_set.mp4'
        else:
            # Use the filename from the first JSON entry as fallback
            video_filename = self.json_data[0]["filename"]
            
        video_path = os.path.join(self.videos_dir, video_filename)
        
        # Create a mapping of frame numbers to effect labels
        effect_map = {}
        
        # Go through all segments in JSON data and mark the effect labels
        for video_data in self.json_data:
            for segment in video_data["segments"]:
                start_frame = segment["start_frame"]
                end_frame = segment["end_frame"]
                effect = segment["effect"]
                
                # Mark every frame in this segment with the effect label
                for frame_idx in range(start_frame, end_frame):
                    effect_map[frame_idx] = effect
        
        return video_path, effect_map
        
    def _generate_samples(self) -> List[Dict]:
        """
        Generate a list of sliding windows from the video with proportional label smoothing
        """
        # Open video to get properties
        cap = cv2.VideoCapture(self.video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {self.video_path}")
    
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        cap.release()
    
        samples = []
    
        for start_idx in range(0, total_frames - self.max_frames + 1, self.window_stride):
            end_idx = start_idx + self.max_frames
    
            # Count frames with effects in this window
            effect_frames = sum(1 for i in range(start_idx, end_idx) 
                              if self.effect_map.get(i, 0) == 1)
            
            # Calculate proportion of effect frames
            effect_proportion = effect_frames / self.max_frames
            
            # Custom label smoothing mapping
            label_mapping = {
                0: 0.0,    # No effects
                1: 0.10,   # 1 frame
                2: 0.15,   # 2 frames  
                3: 0.30,   # 3 frames
                4: 0.65,   # 4 frames
                5: 0.85,   # 5 frames
            }
            
            # For 6 or more frames, label as 1.0
            if effect_frames >= 6:
                effect_label = 1.0
            else:
                effect_label = label_mapping.get(effect_frames, 0.0)
    
            sample_info = {
                "start_frame": start_idx,
                "end_frame": end_idx, 
                "effect": effect_label,
                "effect_frames_count": effect_frames,
                "effect_proportion": effect_proportion
            }
    
            samples.append(sample_info)
    
        return samples
    
    def _extract_frames(self, start_frame: int, end_frame: int) -> np.ndarray:
        """
        Extract frames from a video segment
        
        Args:
            start_frame: Start frame index
            end_frame: End frame index
            
        Returns:
            Numpy array of frames
        """
        # Open video file
        cap = cv2.VideoCapture(self.video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {self.video_path}")
        
        # Extract consecutive frames (we want exactly max_frames)
        frames = []
        for frame_idx in range(start_frame, end_frame):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                break
                
            # Resize frame
            frame = cv2.resize(frame, self.frame_size)
            
            # Convert from BGR to RGB
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            frames.append(frame)
        
        cap.release()
        
        # If no frames were extracted or fewer than expected
        if len(frames) == 0:
            raise ValueError(f"No frames extracted from {self.video_path}")
        elif len(frames) < self.max_frames:
            # Pad with the last frame
            last_frame = frames[-1]
            frames.extend([last_frame] * (self.max_frames - len(frames)))
        
        return np.array(frames)
    
    def __len__(self) -> int:
        """Return the number of samples (windows) in the dataset"""
        return len(self.samples)
    
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        """
        Get a sample from the dataset
        
        Args:
            idx: Index of the sample
            
        Returns:
            Tuple of (frames, label)
        """
        sample = self.samples[idx]
        
        try:
            # Extract frames
            frames = self._extract_frames(
                sample["start_frame"],
                sample["end_frame"]
            )

            if random.random() > 0.5:
                frames = frames[::-1]
            
            # Apply transforms to each frame
            transformed_frames = []
            for frame in frames:
                transformed_frame = self.transform(frame)
                transformed_frames.append(transformed_frame)
            
            # Stack frames
            frames_tensor = torch.stack(transformed_frames)
            
            # Get label
            label = sample["effect"]
            
            return frames_tensor, label
            
        except Exception as e:
            print(f"Error processing sample {idx}: {e}")
            # Return a default/placeholder
            # Create a tensor of zeros with the expected shape
            empty_frames = torch.zeros(self.max_frames, 3, self.frame_size[0], self.frame_size[1])
            # Return default label as 0
            return empty_frames, 0

In [6]:
def create_dataloaders(
    train_json_path: str,
    val_json_path: str,
    test_json_path: str,
    videos_dir: str = VIDEOS_DIR,
    batch_size: int = BATCH_SIZE,
    max_frames: int = MAX_FRAMES,
    frame_size: Tuple[int, int] = FRAME_SIZE,
    num_workers: int = NUM_WORKERS,
    window_stride: int = 30  # Added parameter
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    """
    Create train, validation, and test dataloaders from separate JSON files
    
    Args:
        train_json_path: Path to training data JSON
        val_json_path: Path to validation data JSON
        test_json_path: Path to test data JSON
        videos_dir: Directory containing the video files
        batch_size: Batch size for dataloaders
        max_frames: Maximum number of frames in each sliding window
        frame_size: Size to resize frames to
        num_workers: Number of worker processes for dataloaders
        window_stride: Number of frames to advance for each new window

    Returns:
        Tuple of (train_loader, val_loader, test_loader)
    """
    # Load JSON data
    with open(train_json_path, 'r') as f:
        train_data = json.load(f)
    
    with open(val_json_path, 'r') as f:
        val_data = json.load(f)
    
    with open(test_json_path, 'r') as f:
        test_data = json.load(f)
    
    # Create datasets
    train_dataset = VideoDataset(
        json_data=train_data,
        videos_dir=videos_dir,
        max_frames=max_frames,
        frame_size=frame_size,
        mode='train',
        window_stride=window_stride  # Added parameter
    )
    
    val_dataset = VideoDataset(
        json_data=val_data,
        videos_dir=videos_dir,
        max_frames=max_frames,
        frame_size=frame_size,
        mode='val',
        window_stride=window_stride  # Added parameter
    )
    
    test_dataset = VideoDataset(
        json_data=test_data,
        videos_dir=videos_dir,
        max_frames=max_frames,
        frame_size=frame_size,
        mode='test',
        window_stride=window_stride  # Added parameter
    )
    
    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )
    
    return train_loader, val_loader, test_loader

In [7]:
# Usage example:
if __name__ == "__main__":
    # Create dataloaders from separate JSON files
    train_loader, val_loader, test_loader = create_dataloaders(
        train_json_path='train_set.json',
        val_json_path='val_set.json',
        test_json_path='test_set.json',
        videos_dir=VIDEOS_DIR
    )
    
    # Print dataset statistics
    print(f"Train dataset size: {len(train_loader.dataset)}")
    print(f"Validation dataset size: {len(val_loader.dataset)}")
    print(f"Test dataset size: {len(test_loader.dataset)}")
    
    # Test a batch
    for frames, labels in train_loader:
        print(f"Batch shape: {frames.shape}")  # Should be [batch_size, max_frames, 3, height, width]
        print(f"Labels: {labels}")
        break

Train dataset size: 1391
Validation dataset size: 285
Test dataset size: 294
Batch shape: torch.Size([4, 30, 3, 224, 224])
Labels: tensor([1., 0., 0., 0.], dtype=torch.float64)


In [8]:
class FrameFeatureExtractor(nn.Module):
    def __init__(self, pretrained=True):
        super(FrameFeatureExtractor, self).__init__()
        base_model = models.efficientnet_b0(weights='DEFAULT' if pretrained else None)
        
        # Extract multiple layers for multi-scale features
        self.backbone = base_model.features
        
        # Get features from different scales
        self.early_features = nn.Sequential(*list(self.backbone.children())[:3])
        self.mid_features = nn.Sequential(*list(self.backbone.children())[3:5])     
        self.late_features = nn.Sequential(*list(self.backbone.children())[5:])
        
        # Auto-detect channel dimensions by running a dummy forward pass
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, 224, 224)
            early_out = self.early_features(dummy_input)
            mid_out = self.mid_features(early_out)
            late_out = self.late_features(mid_out)
            
            early_channels = early_out.shape[1]
            mid_channels = mid_out.shape[1] 
            late_channels = late_out.shape[1]
            
            print(f"Detected channels - Early: {early_channels}, Mid: {mid_channels}, Late: {late_channels}")
        
        # Multiple pooling strategies for each scale
        self.early_pools = nn.ModuleList([
            nn.AdaptiveAvgPool2d((4, 4)),
            nn.AdaptiveMaxPool2d((4, 4))
        ])
        self.mid_pools = nn.ModuleList([
            nn.AdaptiveAvgPool2d((2, 2)),
            nn.AdaptiveMaxPool2d((2, 2))
        ])
        self.late_pools = nn.ModuleList([
            nn.AdaptiveAvgPool2d(1),
            nn.AdaptiveMaxPool2d(1)
        ])
        
        # Feature extractors with correct dimensions
        self.early_fc = nn.Sequential(
            nn.Linear(early_channels * 4 * 4 * 2, 256),  # *2 for avg+max pooling
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 256)
        )
        self.mid_fc = nn.Sequential(
            nn.Linear(mid_channels * 2 * 2 * 2, 256),
            nn.ReLU(), 
            nn.Dropout(0.4),
            nn.Linear(256, 256)
        )
        self.late_fc = nn.Sequential(
            nn.Linear(late_channels * 2, 256),  # *2 for avg+max pooling
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 256)
        )
        
        # Scale attention with dropout
        self.scale_attention = nn.Sequential(
            nn.Linear(768, 256),  # 256 * 3 scales = 768
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 3),
            nn.Softmax(dim=1)
        )
        self.final_fc = nn.Sequential(
            nn.LayerNorm(256),           # Change to 256
            nn.Linear(256, FEATURE_DIM), # Change to 256
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.LayerNorm(FEATURE_DIM),
            nn.Linear(FEATURE_DIM, FEATURE_DIM),
            nn.Dropout(0.2)
        )
    def forward(self, x):
        batch_size = x.size(0)
        
        # Extract multi-scale features
        early = self.early_features(x)
        mid = self.mid_features(early)
        late = self.late_features(mid)
        
        # Multiple pooling for each scale
        early_pools = [pool(early).view(batch_size, -1) for pool in self.early_pools]
        mid_pools = [pool(mid).view(batch_size, -1) for pool in self.mid_pools]
        late_pools = [pool(late).view(batch_size, -1) for pool in self.late_pools]
        
        # Concatenate pooling results
        early_concat = torch.cat(early_pools, dim=1)
        mid_concat = torch.cat(mid_pools, dim=1)
        late_concat = torch.cat(late_pools, dim=1)
        
        # Project to same dimension
        early_feat = self.early_fc(early_concat)
        mid_feat = self.mid_fc(mid_concat)
        late_feat = self.late_fc(late_concat)
        
        # Concatenate all scales
        multi_scale = torch.cat([early_feat, mid_feat, late_feat], dim=1)  # [batch, 768]
        
        # Apply attention weighting
        attention_weights = self.scale_attention(multi_scale)
        attended_features = (
            attention_weights[:, 0:1] * early_feat +
            attention_weights[:, 1:2] * mid_feat + 
            attention_weights[:, 2:3] * late_feat
        )
        
        # Use ONLY attended features (don't concatenate with original)
        output = self.final_fc(attended_features)  # attended_features is [batch, 256]
        
        return output

In [9]:
class TransformerClassifier(nn.Module):
    def __init__(self):
        super(TransformerClassifier, self).__init__()
        self.feature_extractor = FrameFeatureExtractor()

        # Positional encoding
        self.pos_encoder = nn.Parameter(torch.randn(1, MAX_FRAMES, FEATURE_DIM) * 0.02)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=FEATURE_DIM,
            nhead=NUM_HEADS,
            dim_feedforward=FEATURE_DIM * 4,
            dropout=DROPOUT,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=NUM_LAYERS)

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(FEATURE_DIM, 256),
            nn.ReLU(),
            nn.Dropout(DROPOUT),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(DROPOUT),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # x shape: (batch_size, num_frames, channels, height, width)
        batch_size, num_frames, channels, height, width = x.shape

        # Reshape for feature extraction
        x = x.view(batch_size * num_frames, channels, height, width)

        # Extract features
        features = self.feature_extractor(x)

        # Reshape back to (batch_size, num_frames, feature_dim)
        features = features.view(batch_size, num_frames, -1)

        # Add positional encoding
        features = features + self.pos_encoder

        # Apply transformer encoder
        transformer_output = self.transformer_encoder(features)

        # Global pooling over sequence dimension
        pooled_output = torch.mean(transformer_output, dim=1)

        # Classification
        logits = self.classifier(pooled_output)

        return logits.squeeze(-1)

In [10]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    epoch_loss = 0
    all_preds = []
    all_labels = []

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        frames, labels = batch
        frames = frames.to(device)
        labels = labels.float().to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            outputs = model(frames)
            loss = criterion(outputs, labels)

        loss.backward()
        
        for param in model.parameters():
            if param.grad is not None:
                param.grad += torch.randn_like(param.grad) * 0.01
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  
        optimizer.step()

        epoch_loss += loss.item()

        # Calculate predictions
        preds = (torch.sigmoid(outputs) > 0.4).float().cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({"batch_loss": f"{loss.item():.4f}"})

    # Calculate metrics
    all_preds = [1 if pred >= 0.4 else 0 for pred in all_preds]
    all_labels = [1 if label >= 0.4 else 0 for label in all_labels]
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)

    return epoch_loss / len(dataloader), accuracy, precision, recall, f1


def validate(model, dataloader, criterion, device):
    model.eval()
    val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            frames, labels = batch
            frames = frames.to(device)
            labels = labels.float().to(device)

            outputs = model(frames)
            loss = criterion(outputs, labels)

            val_loss += loss.item()

            # Calculate predictions
            preds = (torch.sigmoid(outputs) > 0.4).float().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    all_preds = [1 if pred >= 0.4 else 0 for pred in all_preds]
    all_labels = [1 if label >= 0.4 else 0 for label in all_labels]
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)

    return val_loss / len(dataloader), accuracy, precision, recall, f1

In [11]:
def plot_metrics(train_values, val_values, metric_name):
    """
    Plot the training and validation metrics
    """
    plt.figure(figsize=(10, 6))
    plt.plot(train_values, label=f'Train {metric_name}')
    plt.plot(val_values, label=f'Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.title(f'Training and Validation {metric_name}')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'{metric_name.lower().replace(" ", "_")}_plot.png')
    plt.close()

In [12]:
def train_model():
    # Create data loaders
    print(f"Using device: {DEVICE}")
    print("Preparing data...")

    train_loader, val_loader, test_loader = create_dataloaders(
        train_json_path='train_set.json',
        val_json_path='val_set.json',
        test_json_path='test_set.json',
        videos_dir=VIDEOS_DIR,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS
    )

    print("Data loaded!")
    print(f"Train batches: {len(train_loader)}")
    print(f"Val batches: {len(val_loader)}")
    print(f"Test batches: {len(test_loader)}")

    # Create model
    print("Initializing model...")
    model = TransformerClassifier().to(DEVICE)

    # Print model summary
    print(f"Model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters")

    # Loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()  # Add label smoothing
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2, verbose=True
    )

    # Training loop
    print("Starting training...")

    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []
    train_f1s, val_f1s = [], []

    best_val_f1 = 0
    best_model_path = 'best_effect_detection_model.pth'

    for epoch in range(NUM_EPOCHS):
        print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")

        # Train
        train_loss, train_acc, train_prec, train_rec, train_f1 = train_epoch(
            model, train_loader, criterion, optimizer, DEVICE
        )

        # Validate
        val_loss, val_acc, val_prec, val_rec, val_f1 = validate(
            model, val_loader, criterion, DEVICE
        )

        # Update learning rate
        scheduler.step(val_loss)

        # Save metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)
        train_f1s.append(train_f1)
        val_f1s.append(val_f1)

        # Print metrics
        print(f"Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, Prec: {train_prec:.4f}, Rec: {train_rec:.4f}, F1: {train_f1:.4f}")
        print(f"Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, Prec: {val_prec:.4f}, Rec: {val_rec:.4f}, F1: {val_f1:.4f}")

        # Save best model (using F1 score as the metric)
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved with validation F1: {val_f1:.4f}")

    print("\nTraining completed!")

    # Plot metrics
    print("Plotting metrics...")
    plot_metrics(train_losses, val_losses, 'Loss')
    plot_metrics(train_accuracies, val_accuracies, 'Accuracy')
    plot_metrics(train_f1s, val_f1s, 'F1 Score')

    # Evaluate on test set
    print("\nEvaluating on test set...")
    model.load_state_dict(torch.load(best_model_path))
    test_loss, test_acc, test_prec, test_rec, test_f1 = validate(
        model, test_loader, criterion, DEVICE
    )

    print(f"Test Results - Loss: {test_loss:.4f}, Acc: {test_acc:.4f}, Prec: {test_prec:.4f}, Rec: {test_rec:.4f}, F1: {test_f1:.4f}")

    return model


In [13]:
# Evaluation on single sequences
def evaluate_sequence(model, video_path, start_frame, num_frames, transform):
    """
    Evaluate a single video sequence for effect detection
    """
    model.eval()
    cap = cv2.VideoCapture(video_path)

    frames = []
    for i in range(num_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame + i)
        ret, frame = cap.read()
        if not ret:
            break

        # Apply transform
        frame = cv2.resize(frame, FRAME_SIZE)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        if transform:
            frame = transform(frame)
        else:
            # Convert to tensor manually
            frame = torch.FloatTensor(frame / 255.0).permute(2, 0, 1)

        frames.append(frame)

    cap.release()

    # Stack frames
    if len(frames) < num_frames:
        # Pad with zeros if not enough frames
        for _ in range(num_frames - len(frames)):
            frames.append(torch.zeros_like(frames[0]))

    frames_tensor = torch.stack(frames).unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        outputs = model(frames_tensor.to(DEVICE))
        probability = torch.sigmoid(outputs).item()
        prediction = 1 if probability > 0.4 else 0

    return prediction, probability

In [14]:
if __name__ == "__main__":
    # Train the model
    model = train_model()

    # Save the final model
    torch.save(model.state_dict(), 'final_effect_detection_model.pth')
    print("Final model saved!")



Using device: cuda
Preparing data...
Data loaded!
Train batches: 348
Val batches: 72
Test batches: 74
Initializing model...
Detected channels - Early: 24, Mid: 80, Late: 1280
Model initialized with 8,011,008 trainable parameters
Starting training...

Epoch 1/5


  with torch.cuda.amp.autocast():
Training: 100%|████████████| 348/348 [04:01<00:00,  1.44it/s, batch_loss=0.4795]
Validation: 100%|███████████████████████████████| 72/72 [00:52<00:00,  1.38it/s]


Train - Loss: 0.6666, Acc: 0.6312, Prec: 0.6312, Rec: 1.0000, F1: 0.7739
Val   - Loss: 0.6465, Acc: 0.6526, Prec: 0.6526, Rec: 1.0000, F1: 0.7898
New best model saved with validation F1: 0.7898

Epoch 2/5


  with torch.cuda.amp.autocast():
Training: 100%|████████████| 348/348 [04:01<00:00,  1.44it/s, batch_loss=0.7130]
Validation: 100%|███████████████████████████████| 72/72 [00:52<00:00,  1.37it/s]


Train - Loss: 0.6521, Acc: 0.6312, Prec: 0.6312, Rec: 1.0000, F1: 0.7739
Val   - Loss: 0.6201, Acc: 0.6526, Prec: 0.6526, Rec: 1.0000, F1: 0.7898

Epoch 3/5


  with torch.cuda.amp.autocast():
Training: 100%|████████████| 348/348 [04:06<00:00,  1.41it/s, batch_loss=0.5393]
Validation: 100%|███████████████████████████████| 72/72 [00:52<00:00,  1.38it/s]


Train - Loss: 0.6306, Acc: 0.6312, Prec: 0.6312, Rec: 1.0000, F1: 0.7739
Val   - Loss: 0.5541, Acc: 0.6526, Prec: 0.6526, Rec: 1.0000, F1: 0.7898

Epoch 4/5


  with torch.cuda.amp.autocast():
Training:  26%|███▍         | 92/348 [01:05<03:03,  1.40it/s, batch_loss=0.5456]


KeyboardInterrupt: 

In [None]:
# Check total GPU memory
print(f"GPU Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# Check current memory usage
print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
print(f"GPU Memory Reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

# Check free memory
free_memory, total_memory = torch.cuda.mem_get_info(0)
print(f"GPU Memory Free: {free_memory / 1024**3:.2f} GB")