In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torchvision import models
from torch.utils.data import DataLoader
from torchvision import transforms
import cv2

# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [7]:
class RNN_Model(nn.Module):
    """Clip classifier: frozen ResNet-18 frame encoder -> LSTM -> linear head.

    Every frame of a clip is encoded independently by a pretrained ResNet-18
    with its final child module removed. The per-frame feature vectors are fed
    to an LSTM as a sequence, and the LSTM output at the last timestep is
    mapped to ``num_classes`` logits by a linear layer.

    The ResNet backbone is used purely as a fixed feature extractor: it is
    always run without gradient tracking, its parameters are frozen, and it is
    kept in eval mode even while the rest of the model is in train mode.

    Args:
        num_classes: Number of output logits produced by the linear head.
        hidden_size: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, num_classes, hidden_size, num_layers):
        super(RNN_Model, self).__init__()
        # Load the pretrained ResNet-18 and drop its last child module so the
        # backbone emits pooled features instead of class scores.
        backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        self.resnet = nn.Sequential(*(list(backbone.children())[:-1]))

        # forward() only ever runs the backbone under torch.no_grad(), so it
        # can never receive gradients. Make that explicit by freezing it, and
        # put it in eval mode so BatchNorm uses its pretrained running
        # statistics rather than per-batch statistics.
        for param in self.resnet.parameters():
            param.requires_grad_(False)
        self.resnet.eval()

        # RNN (LSTM) layer; input_size must equal the flattened per-frame
        # feature size produced by the backbone.
        self.lstm = nn.LSTM(input_size=512, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)

        # Classification layer
        self.fc = nn.Linear(hidden_size, num_classes)

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen backbone in eval mode.

        Without this override, ``model.train()`` would flip the backbone's
        BatchNorm layers back to training behaviour, and their running
        statistics would drift on every forward pass even though the backbone
        weights themselves are never updated.
        """
        super().train(mode)
        self.resnet.eval()
        return self

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: Tensor of shape ``[batch, time, channels, height, width]``.

        Returns:
            Tensor of shape ``[batch, num_classes]`` holding raw logits
            (no sigmoid/softmax applied).

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                "expected input of shape [batch, time, channels, height, width], "
                "got tensor with shape {}".format(tuple(x.shape))
            )
        batch_size, timesteps, C, H, W = x.size()

        # Fold time into the batch axis so the backbone sees ordinary images.
        # reshape (rather than view) also accepts non-contiguous inputs.
        x = x.reshape(batch_size * timesteps, C, H, W)

        # Feature extraction through the frozen ResNet
        with torch.no_grad():
            features = self.resnet(x)

        # Restore the time axis and flatten each frame's features for the LSTM
        features = features.reshape(batch_size, timesteps, -1)

        # Sequence processing through LSTM
        lstm_out, _ = self.lstm(features)

        # Classify from the LSTM output at the final timestep
        out = self.fc(lstm_out[:, -1, :])
        return out
    
def preprocess_frame(frame, size=(224, 224), normalize=True):
    """Convert one decoded video frame into a tensor for the ResNet backbone.

    Frames read with ``cv2.VideoCapture`` are in BGR channel order, while the
    pretrained torchvision ResNet expects RGB input normalised with the
    ImageNet mean and standard deviation. A 3-channel NumPy frame is therefore
    converted from BGR to RGB and (optionally) normalised; any other input is
    passed through the resize / to-tensor steps unchanged.

    Args:
        frame: Image to preprocess. Assumed to be an ``H x W x 3`` BGR array
            as returned by ``cv2.VideoCapture.read()`` (the only caller
            visible in this notebook).
        size: Target ``(height, width)`` passed to ``transforms.Resize``.
        normalize: When True (default), apply ImageNet mean/std normalisation
            to 3-channel frames. Pass False to get values in ``[0, 1]``.

    Returns:
        Float tensor of shape ``[channels, size[0], size[1]]``.
    """
    is_bgr_image = (
        isinstance(frame, np.ndarray) and frame.ndim == 3 and frame.shape[2] == 3
    )
    if is_bgr_image:
        # ToPILImage interprets a 3-channel array as RGB; without this swap
        # the red and blue channels would be exchanged.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    steps = [
        transforms.ToPILImage(),
        transforms.Resize(size),
        transforms.ToTensor(),
    ]
    if normalize and is_bgr_image:
        # ImageNet statistics documented for the torchvision ResNet weights.
        steps.append(
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        )
    return transforms.Compose(steps)(frame)

# Function to load and preprocess video
def load_video(video_path, max_frames=16):
    """Read the first frames of a video and return them as a model-ready batch.

    Frames are decoded in order from the start of the video, preprocessed
    with ``preprocess_frame`` and stacked along a new time axis. Reading
    stops when ``max_frames`` frames have been collected or the video ends,
    whichever comes first.

    Args:
        video_path: Path (or other source accepted by ``cv2.VideoCapture``)
            of the video to read.
        max_frames: Maximum number of frames to keep.

    Returns:
        Tensor of shape ``[1, num_frames, channels, height, width]`` where
        ``num_frames <= max_frames``.

    Raises:
        RuntimeError: If the video cannot be opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open video: {}".format(video_path))
        # Check the frame budget before decoding so no frame is read and then
        # thrown away once the limit has been reached.
        while len(frames) != max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(preprocess_frame(frame))
    finally:
        # Always release the capture handle, even if preprocessing fails.
        cap.release()

    if not frames:
        # torch.stack on an empty list fails with an unhelpful message;
        # report the actual problem instead.
        raise RuntimeError(
            "No frames could be read from video: {}".format(video_path)
        )

    # Stack frames along the time axis and add a leading batch dimension
    frames_tensor = torch.stack(frames)
    frames_tensor = frames_tensor.unsqueeze(0)
    return frames_tensor

# Hyperparameters
num_classes = 1  # Single logit; turned into a probability with sigmoid below
hidden_size = 256  # LSTM hidden size
num_layers = 2  # Number of LSTM layers

# Model instance
model = RNN_Model(num_classes, hidden_size, num_layers)
model.to(device)
# Switch to inference mode so BatchNorm layers in the ResNet backbone use
# their stored running statistics instead of statistics of this single clip.
model.eval()

# NOTE(review): no trained weights are loaded for the LSTM / linear head in
# this cell, so unless they are restored elsewhere the probability printed
# below comes from a randomly initialised head and is not a real prediction.

video_path = "video_test_dataset/0/miss_4.mp4"
video_tensor = load_video(video_path).to(device)

# Gradients are not needed for a single forward pass.
with torch.no_grad():
    outputs = model(video_tensor)
    # outputs holds one logit for the one clip; sigmoid maps it to [0, 1].
    prob = torch.sigmoid(outputs).item()

print("Probability of miss: {:.2f}%".format(prob * 100))

Probability of miss: 49.59%
