In [58]:
import os
# Ask PyTorch to fall back to the CPU for operators that the MPS backend does not support.
# NOTE(review): presumably this has to be set before `import torch` runs (next cell) — confirm.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import cv2
from torchvision import transforms
from collections import deque

In [60]:
# Define the 3D CNN model for collision prediction
class CollisionPredictor3DCNN(nn.Module):
    """3D CNN that maps a video clip (plus optional time features) to a collision probability.

    Four ``Conv3d -> BatchNorm3d -> ReLU -> MaxPool3d`` blocks feed a stack of
    fully connected layers.  The first two blocks pool only spatially; the last
    two also halve the temporal axis.  A small branch embeds two time features
    and is concatenated with the video features before the final layers.

    Args:
        input_channels: Number of channels per frame.
        clip_len: Number of frames per clip.
        height: Frame height in pixels.
        width: Frame width in pixels.

    Raises:
        ValueError: If the clip is so small that the pooling stack would reduce
            one of its dimensions to zero (``clip_len < 4`` or a spatial side
            ``< 16``).
    """

    def __init__(self, input_channels=3, clip_len=16, height=112, width=112):
        super().__init__()

        self.input_channels = input_channels
        self.clip_len = clip_len
        self.height = height
        self.width = width

        print(f"Model initialized with input_channels={input_channels}, clip_len={clip_len}, height={height}, width={width}")

        # Work out the feature-map size first so that an unusable configuration
        # fails with a clear message instead of silently building Linear(0, ...).
        c_len, h, w = self._calculate_conv_output_size()
        if min(c_len, h, w) < 1:
            raise ValueError(
                f"Input too small for the pooling stack: clip_len={clip_len}, height={height}, "
                f"width={width} reduces to ({c_len}, {h}, {w}); need clip_len >= 4 and "
                f"height/width >= 16"
            )

        # Convolutional trunk.  Blocks are created in the same order and with the
        # same sub-module layout as before, so state_dict keys are unchanged.
        self.conv1 = self._conv_block(input_channels, 64, pool=(1, 2, 2))
        self.conv2 = self._conv_block(64, 128, pool=(1, 2, 2))
        self.conv3 = self._conv_block(128, 256, pool=(2, 2, 2))
        self.conv4 = self._conv_block(256, 512, pool=(2, 2, 2))

        # Size of the flattened conv4 output
        self.flat_size = 512 * c_len * h * w
        print(f"Calculated flat size: {self.flat_size} (512 * {c_len} * {h} * {w})")

        # Fully connected layers
        self.fc1 = nn.Sequential(
            nn.Linear(self.flat_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.5)
        )

        # Additional input for time features (time_of_event, time_of_alert)
        self.time_fc = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU()
        )

        self.fc2 = nn.Sequential(
            nn.Linear(1024 + 64, 512),  # Combine video features with time features
            nn.ReLU(),
            nn.Dropout(0.5)
        )

        # Output layer (collision probability)
        self.output = nn.Sequential(
            nn.Linear(512, 1),
            nn.Sigmoid()
        )

    @staticmethod
    def _conv_block(in_channels, out_channels, pool):
        """Build one Conv3d(3x3x3, same padding) -> BatchNorm3d -> ReLU -> MaxPool3d block."""
        return nn.Sequential(
            nn.Conv3d(in_channels, out_channels, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.BatchNorm3d(out_channels),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=pool, stride=pool)
        )

    def _calculate_conv_output_size(self):
        """Return ``(frames, height, width)`` of the conv4 output for the configured input size.

        The convolutions preserve size (kernel 3, padding 1); every pooling
        step floor-divides the pooled axes by 2.
        """
        # Temporal axis is pooled only in conv3 and conv4
        c_len = self.clip_len // 2 // 2
        # Spatial axes are pooled in all four blocks
        h = self.height // 2 // 2 // 2 // 2
        w = self.width // 2 // 2 // 2 // 2
        return c_len, h, w

    def forward(self, x, time_features=None):
        """Compute collision probabilities.

        Args:
            x: Clip batch, expected as ``[batch, channels, clip_len, height, width]``.
                A batch laid out as ``[batch, height, clip_len, channels, width]``
                is detected and permuted into the expected order.
            time_features: Optional ``[batch, 2]`` tensor.  When omitted, the
                time-feature embedding is replaced by zeros.

        Returns:
            Tensor of shape ``[batch, 1]`` with values in ``[0, 1]`` (sigmoid output).
        """
        # Repair the one known wrong layout.  The check is tied to the configured
        # channel count rather than a fixed "> 5" threshold, so correctly laid
        # out inputs with more than 5 channels are no longer permuted by mistake.
        if x.shape[1] != self.input_channels and x.shape[3] == self.input_channels:
            x = x.permute(0, 3, 2, 1, 4)
            print(f"Permuted input shape: {x.shape}")

        # Apply convolutional blocks
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)

        # Flatten everything except the batch dimension
        x = torch.flatten(x, start_dim=1)

        # Apply first fully connected layer
        x = self.fc1(x)

        if time_features is not None:
            time_feats = self.time_fc(time_features)
        else:
            # No time features: pad with zeros of the embedding width, matching
            # the dtype and device of the video features.
            time_feats = x.new_zeros(x.size(0), self.time_fc[0].out_features)

        # Concatenate video features with the (possibly zero) time embedding
        x = torch.cat((x, time_feats), dim=1)

        # Apply second fully connected layer
        x = self.fc2(x)

        # Output layer
        x = self.output(x)

        return x

In [61]:
# Custom dataset class for video collision data
class VideoCollisionDataset(Dataset):
    """Dataset yielding fixed-length video clips with time features and a binary target.

    Each item is ``(clip, time_features, target)``:

    * ``clip`` - float tensor of shape ``(C, T, H, W)`` with ``T == clip_len``.
    * ``time_features`` - float tensor ``[time_of_event, time_of_alert]``, with
      ``-1`` standing in for a missing value.
    * ``target`` - float tensor of shape ``(1,)``.

    Args:
        video_paths: Sequence of video file paths.
        time_of_events: Per-video event time, or ``None``/NaN when there is none.
        time_of_alerts: Per-video alert time, or ``None``/NaN when there is none.
        targets: Per-video label (``1`` = collision).
        clip_len: Number of frames per clip.
        spatial_size: Size passed to ``cv2.resize`` for every frame.
            NOTE(review): ``cv2.resize`` takes ``(width, height)`` while the model
            is built with ``height=spatial_size[0]`` - only equivalent for square
            sizes; confirm before using a non-square size.
        transform: Optional per-frame callable mapping an ``(H, W, C)`` array to
            a ``(C, H, W)`` tensor.

    NOTE(review): in the train.csv preview, ``time_of_event`` is missing exactly
    on the rows with ``target == 0``, so feeding it to the model as a feature
    likely leaks the label - confirm this is intended.
    """

    def __init__(self, video_paths, time_of_events, time_of_alerts, targets,
                 clip_len=16, spatial_size=(112, 112), transform=None):
        self.video_paths = video_paths
        self.time_of_events = time_of_events
        self.time_of_alerts = time_of_alerts
        self.targets = targets
        self.clip_len = clip_len
        self.spatial_size = spatial_size
        self.transform = transform
        # Frame rate of every video decoded by load_video, keyed by path; used to
        # convert event times (seconds) into frame indices.
        self._fps_by_path = {}

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Load video ``idx`` and return ``(clip, time_features, target)`` tensors."""
        video = self.load_video(self.video_paths[idx])
        clip = self.extract_clip(video, idx)
        time_features = self.prepare_time_features(idx)

        if self.transform:
            # Per-frame transform gives (C, H, W) tensors; stack to (T, C, H, W)
            # and move channels first to get (C, T, H, W) for the 3D CNN.
            clip = torch.stack(self._transform_frames(clip)).permute(1, 0, 2, 3)
        else:
            # No transform: convert (T, H, W, C) to (C, T, H, W) by hand
            clip = torch.from_numpy(np.transpose(clip, (3, 0, 1, 2))).float()

        target = torch.tensor([self.targets[idx]], dtype=torch.float)
        time_features = torch.from_numpy(time_features).float()

        return clip, time_features, target

    def _transform_frames(self, frames):
        """Apply ``self.transform`` to every frame with identical random parameters.

        Random augmentations must be the same for all frames of one clip,
        otherwise e.g. a random flip mirrors some frames and not others.  The
        torch RNG state is rewound before each frame so every frame draws the
        same parameters; after the loop the state has advanced by one frame's
        worth of draws, so the next clip gets fresh parameters.  This assumes
        the transform takes its randomness from torch's global CPU RNG, as
        torchvision transforms do.

        Returns:
            List of transformed frames, in order.
        """
        rng_state = torch.get_rng_state()
        transformed = []
        for frame in frames:
            torch.set_rng_state(rng_state)
            transformed.append(self.transform(frame))
        return transformed

    def load_video(self, video_path):
        """Decode every frame of ``video_path``, resized and converted to RGB.

        The video's frame rate, when the container reports one, is remembered
        for :meth:`extract_clip`.

        Returns:
            ``uint8`` array of shape ``(num_frames, H, W, 3)``.

        Raises:
            ValueError: If the file cannot be opened or contains no frames.
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                raise ValueError(f"Could not open video file: {video_path}")

            fps = cap.get(cv2.CAP_PROP_FPS)

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame = cv2.resize(frame, self.spatial_size)
                # OpenCV decodes to BGR; the transforms expect RGB
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
        finally:
            # Release the decoder even when resizing/decoding raises
            cap.release()

        if not frames:
            raise ValueError(f"No frames could be read from video: {video_path}")

        # A frame rate of 0 (or NaN) means the backend could not determine it
        if fps and fps > 0:
            vars(self).setdefault("_fps_by_path", {})[video_path] = float(fps)

        return np.array(frames)

    def _fps_for(self, idx):
        """Return the remembered frame rate of video ``idx`` or ``None`` if unknown."""
        return getattr(self, "_fps_by_path", {}).get(self.video_paths[idx])

    @staticmethod
    def _is_missing(value):
        """Return True for ``None`` and NaN."""
        if value is None:
            return True
        try:
            return bool(np.isnan(value))
        except TypeError:
            return False

    def _pad_clip(self, clip):
        """Append all-zero frames so that the clip has exactly ``clip_len`` frames."""
        missing = self.clip_len - clip.shape[0]
        if missing <= 0:
            return clip
        padding = np.zeros((missing, *clip.shape[1:]), dtype=clip.dtype)
        return np.concatenate([clip, padding], axis=0)

    def extract_clip(self, video, idx):
        """Cut a ``clip_len``-frame clip out of ``video`` for sample ``idx``.

        Positive samples with a known event time get a clip centred on the
        event (shifted to stay inside the video); all other samples get a
        randomly positioned clip drawn with ``np.random``.  Videos shorter than
        ``clip_len`` are zero-padded at the end.
        """
        total_frames = video.shape[0]
        event_time = self.time_of_events[idx]

        if self.targets[idx] == 1 and not self._is_missing(event_time):
            center_frame = self.time_to_frame(event_time, total_frames, fps=self._fps_for(idx))
            start_frame = max(0, center_frame - self.clip_len // 2)
            end_frame = min(total_frames, start_frame + self.clip_len)

            # Window ran past the end of the video: pull the start back
            if end_frame - start_frame < self.clip_len:
                start_frame = max(0, end_frame - self.clip_len)

            clip = video[start_frame:end_frame]
        elif total_frames > self.clip_len:
            start_frame = np.random.randint(0, total_frames - self.clip_len + 1)
            clip = video[start_frame:start_frame + self.clip_len]
        else:
            clip = video

        return self._pad_clip(clip)

    def prepare_time_features(self, idx):
        """Return ``[time_of_event, time_of_alert]`` as float32, using ``-1`` for missing values.

        The values are passed through as given; no normalisation is applied.
        """
        time_event = -1 if self._is_missing(self.time_of_events[idx]) else self.time_of_events[idx]
        time_alert = -1 if self._is_missing(self.time_of_alerts[idx]) else self.time_of_alerts[idx]
        return np.array([time_event, time_alert], dtype=np.float32)

    def apply_transforms(self, clip):
        """Transform every frame of ``clip`` and return a ``(T, C, H, W)`` numpy array.

        Array frames whose last axis is not 3 are assumed to be ``(C, H, W)``
        and are rearranged to ``(H, W, C)`` first.  Without a transform the
        clip is returned unchanged.
        """
        if not self.transform:
            return clip

        frames = [
            np.transpose(frame, (1, 2, 0))
            if isinstance(frame, np.ndarray) and frame.shape[2] != 3
            else frame
            for frame in clip
        ]
        return torch.stack(self._transform_frames(frames)).numpy()

    def time_to_frame(self, time, total_frames, fps=None):
        """Convert a time into a frame index inside ``[0, total_frames - 1]``.

        Args:
            time: Event time.  Interpreted as seconds when ``fps`` is given
                (the train.csv preview shows values such as 19.5, i.e. not a
                0-1 fraction); otherwise as a fraction of the video length.
            total_frames: Number of frames in the video.
            fps: Frame rate of the video, or ``None`` if unknown.

        Returns:
            The frame index, clamped to the valid range.
        """
        if fps is not None and fps > 0:
            frame = int(round(time * fps))
        else:
            # Legacy interpretation: time normalized between 0 and 1
            frame = int(time * (total_frames - 1))
        return min(max(frame, 0), max(total_frames - 1, 0))

In [62]:
# Transforms for video frames
def get_transforms(mode='train'):
    """Build the per-frame preprocessing pipeline.

    Args:
        mode: ``'train'`` adds random augmentation (horizontal flip and colour
            jitter); any other value yields the plain evaluation pipeline.

    Returns:
        A ``transforms.Compose`` that converts a numpy frame to a PIL image,
        optionally augments it, converts it to a tensor and normalises it with
        the fixed per-channel mean/std below.
    """
    augmentations = []
    if mode == 'train':
        augmentations = [
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        ]

    pipeline = [
        # Frames arrive as numpy arrays, so convert them to PIL images first
        transforms.ToPILImage(),
        *augmentations,
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(pipeline)

In [63]:
def get_device():
    """Pick the device to run on.

    Preference order: first CUDA GPU, then Apple MPS, then CPU.

    Returns:
        torch.device: ``cuda:0``, ``mps`` or ``cpu``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda:0")

    # torch.backends.mps does not exist on PyTorch builds without the MPS
    # backend; look it up defensively instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")

In [64]:
# Function to train the model with debug prints
def train_model(model, train_loader, val_loader, num_epochs=20, learning_rate=0.001):
    # Check if GPU is available
    device = get_device()
    model = model.to(device)
    
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)
    
    best_val_loss = float('inf')
    best_model_state = None
    
    # Debug: Print model architecture
    print(f"Model architecture:\n{model}")
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0.0
        
        # Debug: Print batch information for first epoch
        for batch_idx, (clips, time_features, targets) in enumerate(train_loader):
            # Debug info for first batch
            if epoch == 0 and batch_idx == 0:
                print(f"Input shapes - clips: {clips.shape}, time_features: {time_features.shape}, targets: {targets.shape}")
            
            # Check if tensor dimensions need to be fixed
            if clips.dim() == 5 and clips.shape[1] > 5:  # If second dim is too large, likely wrong order
                # Permute to the correct order [B, C, T, H, W]
                clips = clips.permute(0, 3, 2, 1, 4)
                print(f"Corrected clips shape: {clips.shape}")
            
            clips = clips.to(device)
            time_features = time_features.to(device)
            targets = targets.to(device)
            
            try:
                # Forward pass with error handling
                outputs = model(clips, time_features)
                loss = criterion(outputs, targets)
                
                # Backward pass and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                train_loss += loss.detach().item() * clips.size(0)
                
                # Debug info for first few batches
                if epoch == 0 and batch_idx < 3:
                    print(f"Batch {batch_idx} processed successfully. Loss: {loss.detach().item():.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {batch_idx}:")
                print(f"Clips shape: {clips.shape}")
                print(f"Time features shape: {time_features.shape}")
                print(f"Targets shape: {targets.shape}")
                print(f"Error: {str(e)}")
                raise  # Re-raise the exception to stop training
        
        train_loss = train_loss / len(train_loader.dataset)
        
        # Validation
        model.eval()
        val_loss = 0.0
        all_outputs = []
        all_targets = []
        
        with torch.no_grad():
            for clips, time_features, targets in val_loader:
                # Fix tensor dimensions if needed
                if clips.dim() == 5 and clips.shape[1] > 5:
                    clips = clips.permute(0, 3, 2, 1, 4)
                
                clips = clips.to(device)
                time_features = time_features.to(device)
                targets = targets.to(device)
                
                try:
                    outputs = model(clips, time_features)
                    loss = criterion(outputs, targets)
                    
                    val_loss += loss.item() * clips.size(0)
                    
                    all_outputs.extend(outputs.cpu().numpy())
                    all_targets.extend(targets.cpu().numpy())
                except RuntimeError as e:
                    print(f"Error during validation:")
                    print(f"Clips shape: {clips.shape}")
                    print(f"Error: {str(e)}")
                    raise
        
        val_loss = val_loss / len(val_loader.dataset)
        
        # Calculate metrics
        all_outputs = np.array(all_outputs).flatten()
        all_targets = np.array(all_targets).flatten()
        
        binary_preds = (all_outputs > 0.5).astype(int)
        accuracy = np.mean(binary_preds == all_targets)
        
        # Update learning rate based on validation loss
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}')
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict().copy()
            print(f"Saved new best model with validation loss: {best_val_loss:.4f}")
    
    # Load best model state
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    
    return model


In [65]:
# Detector class for real-time (frame-by-frame) inference
class CollisionDetector:
    """Sliding-window collision detector built on a trained ``CollisionPredictor3DCNN``.

    Frames are pushed one at a time; once ``clip_len`` frames have been seen,
    every new frame yields a probability for the most recent ``clip_len`` frames.

    Args:
        model_path: Path of a ``state_dict`` checkpoint for ``CollisionPredictor3DCNN``.
        clip_len: Number of frames per clip (must match the trained model).
        spatial_size: Size passed to ``cv2.resize`` for every frame.
            NOTE(review): ``cv2.resize`` takes ``(width, height)`` while the model
            is built with ``height=spatial_size[0]`` - only equivalent for square
            sizes; confirm before using a non-square size.
    """

    def __init__(self, model_path, clip_len=16, spatial_size=(112, 112)):
        self.device = get_device()
        self.clip_len = clip_len
        self.spatial_size = spatial_size

        # Initialize model
        self.model = CollisionPredictor3DCNN(
            input_channels=3,
            clip_len=clip_len,
            height=spatial_size[0],
            width=spatial_size[1]
        ).to(self.device)

        # Load model weights.
        # NOTE(review): torch.load unpickles the file, which can execute arbitrary
        # code - only load checkpoints from a trusted source (consider
        # weights_only=True where the installed PyTorch supports it).
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.eval()

        # Rolling window holding the latest `clip_len` preprocessed frames
        self.buffer = deque(maxlen=clip_len)

        # Deterministic preprocessing (no augmentation at inference time)
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def process_frame(self, frame):
        """Push one BGR frame and return the collision probability for the current window.

        Args:
            frame: BGR image as produced by ``cv2.VideoCapture.read``.

        Returns:
            ``None`` while fewer than ``clip_len`` frames have been buffered,
            otherwise the model output as a Python float.
        """
        frame = cv2.resize(frame, self.spatial_size)
        # OpenCV delivers BGR; the preprocessing expects RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Keep the preprocessed (C, H, W) tensor; the deque drops the oldest frame itself
        self.buffer.append(self.transform(frame))

        if len(self.buffer) < self.clip_len:
            return None

        # Stack along a new time axis: T x (C, H, W) -> (C, T, H, W), then add the batch axis
        clip = torch.stack(tuple(self.buffer), dim=1).float().unsqueeze(0).to(self.device)

        # No time information exists for live frames, so pass the "missing"
        # marker (-1, -1) for both time features.
        # NOTE(review): if the model learned to rely on the time features, this
        # placeholder may dominate the prediction - confirm on validation data.
        time_features = torch.tensor([[-1.0, -1.0]], dtype=torch.float32, device=self.device)

        with torch.no_grad():
            collision_prob = self.model(clip, time_features)

        return collision_prob.item()

    def predict_from_video(self, video_path, threshold=0.5):
        """Run the detector over a whole video file.

        The frame buffer is reset first, so earlier calls do not influence the
        result.

        Args:
            video_path: Path of the video to analyse.
            threshold: Only probabilities strictly above this value are reported.

        Returns:
            List of ``(frame_index, probability)`` pairs, where ``frame_index``
            is the 0-based index of the last frame of the window that produced
            the probability.

        Raises:
            ValueError: If the video cannot be opened.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        # Reset buffer
        self.buffer = deque(maxlen=self.clip_len)

        results = []
        frame_idx = 0

        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                prob = self.process_frame(frame)

                if prob is not None and prob > threshold:
                    results.append((frame_idx, prob))

                frame_idx += 1
        finally:
            # Release the capture even if preprocessing or the model raises
            cap.release()

        return results

In [66]:
# Set random seed for reproducibility
# (seeds torch's global RNG and NumPy's global RNG; the dataset draws random clip
# positions with np.random)
torch.manual_seed(42)
np.random.seed(42)
    
# Parameters
clip_len = 16  # frames per clip fed to the 3D CNN
spatial_size = (112, 112)  # frame size used for cv2.resize and for the model's height/width
batch_size = 16  # clips per DataLoader batch
num_epochs = 20  # passed to train_model
learning_rate = 0.001  # initial Adam learning rate passed to train_model

In [67]:
# Check if GPU is available
# get_device() prefers CUDA, then MPS, then falls back to the CPU
device = get_device()
print(f"Using device: {device}")

Using device: mps


In [68]:
# Set the data directory
# Root folder that contains `nexar-collision-prediction/`; the CSV and video paths
# in the following cells are built relative to it.
data_dir = "./"

In [69]:
# Load the training metadata: one row per video with its id, the event/alert times
# (empty for rows without an event) and the binary target.
train_df = pd.read_csv(f'{data_dir}/nexar-collision-prediction/train.csv')

In [70]:
# Preview the first rows to check the columns and the missing-value pattern
train_df.head()

Unnamed: 0,id,time_of_event,time_of_alert,target
0,1924,,,0
1,822,19.5,18.633,1
2,1429,,,0
3,208,19.8,19.233,1
4,1904,,,0


In [71]:
# Build the parallel per-video lists consumed by VideoCollisionDataset.
# Columns are read directly instead of via iterrows(): iterrows() converts every
# row to a single dtype (floats here), which is why ids needed an int() cast and
# targets came out as floats.
video_folder = f'{data_dir}/nexar-collision-prediction/train'

# Video files are named after the id, zero-padded to 5 digits (e.g. 822 -> 00822.mp4)
video_paths = [f"{video_folder}/{int(video_id):05d}.mp4" for video_id in train_df['id']]

# Missing times (NaN in the CSV) become None; present ones become plain floats
time_of_events = [None if pd.isna(value) else float(value) for value in train_df['time_of_event']]
time_of_alerts = [None if pd.isna(value) else float(value) for value in train_df['time_of_alert']]

# Binary labels as plain ints (1 = collision)
targets = [int(label) for label in train_df['target']]

# All four lists must stay aligned row by row
assert len(video_paths) == len(time_of_events) == len(time_of_alerts) == len(targets)

In [72]:
# Create datasets
# Split data into train and validation sets (80/20 split)
# The split is positional: the first 80% of the rows are used for training and the
# rest for validation (no shuffling or stratification happens here).
split_idx = int(0.8 * len(video_paths))
    
# Per-frame preprocessing: 'train' adds random flip / colour jitter, 'val' does not
train_transform = get_transforms(mode='train')
val_transform = get_transforms(mode='val')

In [73]:
# Training set: rows before split_idx, with the augmenting transform
train_dataset = VideoCollisionDataset(
    video_paths=video_paths[:split_idx],
    time_of_events=time_of_events[:split_idx],
    time_of_alerts=time_of_alerts[:split_idx],
    targets=targets[:split_idx],
    clip_len=clip_len,
    spatial_size=spatial_size,
    transform=train_transform
)
    
# Validation set: rows from split_idx onwards, with the non-augmenting transform
val_dataset = VideoCollisionDataset(
    video_paths=video_paths[split_idx:],
    time_of_events=time_of_events[split_idx:],
    time_of_alerts=time_of_alerts[split_idx:],
    targets=targets[split_idx:],
    clip_len=clip_len,
    spatial_size=spatial_size,
    transform=val_transform
)

In [74]:
# Create data loaders
# num_workers=0 loads (and decodes) every video in the main process.
# Training batches are reshuffled each epoch; validation keeps the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=False
)
    
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=False
)

In [75]:
# Initialize model
# The clip geometry must match what the datasets produce (clip_len frames of
# spatial_size); the model is moved to the device chosen above.
model = CollisionPredictor3DCNN(
    input_channels=3,
    clip_len=clip_len,
    height=spatial_size[0],
    width=spatial_size[1]
).to(device)

Model initialized with input_channels=3, clip_len=16, height=112, width=112
Calculated flat size: 100352 (512 * 4 * 7 * 7)


In [76]:
# Train model
# train_model returns the model with the best-validation-loss weights loaded.
# NOTE(review): no cell visible here saves the trained weights, while
# CollisionDetector loads them from a `model_path` — confirm a torch.save step
# exists elsewhere.
model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
    learning_rate=learning_rate
)

Model architecture:
CollisionPredictor3DCNN(
  (conv1): Sequential(
    (0): Conv3d(3, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (

KeyboardInterrupt: 