In [1]:
import logging
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from facenet_pytorch import MTCNN
from PIL import Image
from sklearn.metrics import (
    accuracy_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from torch.nn import functional as F

# Configure the root logger once so that progress and informational messages
# emitted by the cells below are shown with a timestamp and severity level.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Model architecture
To load the saved model weights, we must first define the model's architecture. This class must be identical to the `DeepfakeDetector` class used during training.

In [2]:
class DeepfakeDetector(nn.Module):
    """Clip-level deepfake classifier: per-frame CNN features -> BiLSTM -> attention -> MLP.

    The submodule attribute names (``backbone``, ``lstm``, ``attention``,
    ``classifier``) and their layer sizes determine the ``state_dict`` keys and
    shapes, so they must stay in sync with the class used at training time.

    Args:
        num_frames: Number of frames per clip. Stored on the instance for
            callers; ``forward`` itself reads the frame count from the input.
        backbone: ``"efficientnet_b4"`` or ``"resnet50"``.
        dropout_rate: Dropout probability used both between the stacked LSTM
            layers and inside the classification head.

    Raises:
        ValueError: If ``backbone`` is not one of the supported names.
    """

    def __init__(self, num_frames=20, backbone="efficientnet_b4", dropout_rate=0.6):
        super().__init__()
        self.num_frames = num_frames

        # CNN backbone for per-frame feature extraction. The final classification
        # layer is replaced by Identity so the backbone emits pooled features.
        # NOTE: the backbone is built with ImageNet weights; when a trained
        # checkpoint is loaded afterwards those weights are overwritten.
        if backbone == "efficientnet_b4":
            weights = models.EfficientNet_B4_Weights.IMAGENET1K_V1
            self.backbone = models.efficientnet_b4(weights=weights)
            self.backbone.classifier = nn.Identity()
            feature_dim = 1792
        elif backbone == "resnet50":
            weights = models.ResNet50_Weights.IMAGENET1K_V1
            self.backbone = models.resnet50(weights=weights)
            self.backbone.fc = nn.Identity()
            feature_dim = 2048
        else:
            # Previously an unknown name fell through and crashed later with an
            # UnboundLocalError on `feature_dim`; fail early with a clear message.
            raise ValueError(
                f"Unsupported backbone {backbone!r}; "
                "expected 'efficientnet_b4' or 'resnet50'."
            )

        lstm_hidden_size = 512
        # A bidirectional LSTM concatenates forward and backward hidden states.
        temporal_dim = 2 * lstm_hidden_size

        # Temporal processing layers (LSTM)
        self.lstm = nn.LSTM(
            input_size=feature_dim,
            hidden_size=lstm_hidden_size,
            num_layers=3,
            batch_first=True,
            dropout=dropout_rate,
            bidirectional=True,
        )

        # Attention mechanism: one score per frame, normalised with a softmax
        # over the frame axis (dim=1) so the weights of a clip sum to 1.
        self.attention = nn.Sequential(
            nn.Linear(temporal_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Softmax(dim=1),
        )

        # Final classification layers; the last Linear emits a single raw score.
        self.classifier = nn.Sequential(
            nn.Linear(temporal_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        """Score a batch of clips.

        Args:
            x: Tensor of shape ``(batch, num_frames, C, H, W)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with one un-normalised score per clip
            (no sigmoid is applied here).

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                "Expected input of shape (batch, num_frames, C, H, W), "
                f"got {tuple(x.shape)}."
            )
        batch_size, num_frames, C, H, W = x.shape

        # Fold the frame axis into the batch axis so the 2-D CNN sees single
        # images. `reshape` (unlike `view`) also accepts non-contiguous input.
        frames = x.reshape(batch_size * num_frames, C, H, W)

        features = self.backbone(frames)
        features = features.reshape(batch_size, num_frames, -1)

        lstm_out, _ = self.lstm(features)

        # Attention-weighted sum over the frame axis -> one vector per clip.
        attention_weights = self.attention(lstm_out)
        context_vector = (lstm_out * attention_weights).sum(dim=1)

        return self.classifier(context_vector)

# Load Model and set up for inference
This section handles loading the pre-trained model and preparing all necessary components for the inference process, including the face detector (MTCNN) and image transformations.

In [3]:
def load_model_and_dependencies(
    model_path="models/best_deepfake_detector_model.pth",
    num_frames=15,
    image_size=224,
    backbone="efficientnet_b4",
):
    """
    Initializes the model, loads weights, and sets up dependencies.

    Args:
        model_path: Path to the saved ``state_dict`` of the trained detector.
        num_frames: Frames per clip; must match the model's training configuration.
        image_size: Side length (pixels) of the square face crops.
        backbone: Backbone name passed to ``DeepfakeDetector``; must match the
            architecture the checkpoint was trained with.

    Returns:
        A 5-tuple ``(model, mtcnn, val_transforms, device, image_size)``.
        If the weights cannot be loaded, the error is logged and a 5-tuple of
        ``None`` values is returned instead.
    """
    load_failed = (None, None, None, None, None)

    # Set device (GPU if available, otherwise CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"Using device: {device}")

    # Initialize model
    model = DeepfakeDetector(num_frames=num_frames, backbone=backbone).to(device)

    # Load the trained weights.
    # NOTE(security): torch.load unpickles the file; only load checkpoints from
    # a trusted source (consider `weights_only=True` where the installed
    # PyTorch version supports it).
    try:
        model.load_state_dict(torch.load(model_path, map_location=device))
        logging.info(f"Successfully loaded model weights from {model_path}")
    except FileNotFoundError:
        logging.error(
            f"Model file not found at {model_path}. Please update the MODEL_PATH variable."
        )
        return load_failed
    except Exception as e:
        # Deliberately broad: any load problem (corrupt file, mismatched keys)
        # is reported and signalled through the all-None return value.
        logging.error(f"Error loading model: {e}")
        return load_failed

    model.eval()  # Disable dropout and use BatchNorm running statistics

    # Initialize MTCNN for face detection.
    # NOTE(review): `select_largest=True` and `selection_method="probability"`
    # express different preferences; presumably the explicit selection_method
    # wins when MTCNN picks a single face - confirm against the installed
    # facenet-pytorch version. The arguments are kept exactly as before.
    mtcnn = MTCNN(
        image_size=image_size,
        margin=20,
        post_process=False,
        device=device,
        select_largest=True,
        selection_method="probability",
    )
    logging.info("MTCNN face detector initialized.")

    # Define validation transforms (must match those used during training)
    val_transforms = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )

    return model, mtcnn, val_transforms, device, image_size

# Video inference and bounding box visualization
This function performs the core task: processing a video frame by frame, detecting faces, running the deepfake prediction, and drawing bounding boxes with the results on each frame.

In [4]:
def _predict_fake_probability(face_image, model, transforms, device, image_size):
    """Return the model's sigmoid score for one cropped PIL face image."""
    face_resized = face_image.resize((image_size, image_size))
    face_tensor = transforms(face_resized)

    # The model expects a sequence of frames. We replicate the single face
    # tensor to match the required input shape [1, num_frames, C, H, W].
    # (assumes `transforms` yields a (C, H, W) tensor - TODO confirm)
    input_tensor = (
        face_tensor.unsqueeze(0)
        .repeat(model.num_frames, 1, 1, 1)
        .unsqueeze(0)
        .to(device)
    )
    return torch.sigmoid(model(input_tensor)).item()


def _draw_prediction(frame, box, prob):
    """Draw a colour-coded bounding box and FAKE/REAL label onto a BGR frame."""
    x1, y1, x2, y2 = box
    if prob > 0.5:
        label = f"FAKE: {prob:.2%}"
        color = (0, 0, 255)  # Red for Fake (BGR)
    else:
        label = f"REAL: {1-prob:.2%}"
        color = (0, 255, 0)  # Green for Real (BGR)

    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
    # Keep the label inside the frame when the face touches the top edge.
    label_y = max(y1 - 10, 20)
    cv2.putText(
        frame,
        label,
        (x1, label_y),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,
        color,
        2,
    )


def detect_deepfake_in_video(
    video_path, output_path, model, mtcnn, transforms, device, image_size
):
    """
    Processes a video to detect deepfakes, draws bounding boxes, and saves the output.

    Every frame is read, faces are located with ``mtcnn.detect``, each face crop
    is scored by ``model`` and the annotated frame is written to ``output_path``.

    Args:
        video_path: Path of the input video.
        output_path: Path of the annotated output video (written with the
            ``mp4v`` codec).
        model: Detector exposing ``num_frames`` and returning one raw score per
            clip; it is called under ``torch.no_grad()``.
        mtcnn: Face detector whose ``detect`` returns ``(boxes, probs)``.
        transforms: Callable applied to the resized PIL face crop.
        device: Device the input tensors are moved to.
        image_size: Side length (pixels) the face crops are resized to.

    Returns:
        None. Problems opening the input or output are logged and the function
        returns early.
    """
    if not os.path.exists(video_path):
        logging.error(f"Video not found at {video_path}")
        return

    logging.info(f"Starting deepfake detection on {video_path}")

    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        if not cap.isOpened():
            logging.error(f"Could not open video at {video_path}")
            return

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes
        # the duration of the output. Fall back when the container reports
        # no usable value (0, NaN or inf).
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not 0 < fps < float("inf"):
            logging.warning(f"Invalid FPS ({fps}) reported; falling back to 30.0")
            fps = 30.0

        # Define the codec and create VideoWriter object to save the output video
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            logging.error(f"Could not open output video for writing at {output_path}")
            return

        frame_count = 0
        with torch.no_grad():
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_h, frame_w = frame.shape[:2]

                # Convert from BGR (OpenCV) to RGB for PIL and model
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(frame_rgb)

                # Detect faces and their bounding boxes
                boxes, _ = mtcnn.detect(pil_image)

                if boxes is not None:
                    for box in boxes:
                        # Detector boxes may extend past the image border;
                        # clamp them so the crop contains only real pixels.
                        x1 = max(0, int(box[0]))
                        y1 = max(0, int(box[1]))
                        x2 = min(frame_w, int(box[2]))
                        y2 = min(frame_h, int(box[3]))
                        if x2 <= x1 or y2 <= y1:
                            continue  # Degenerate box: nothing to classify

                        face = pil_image.crop((x1, y1, x2, y2))
                        prob = _predict_fake_probability(
                            face, model, transforms, device, image_size
                        )
                        _draw_prediction(frame, (x1, y1, x2, y2), prob)

                out.write(frame)
                frame_count += 1
                if frame_count % 100 == 0:
                    logging.info(f"Processed {frame_count} frames...")
    finally:
        # Always release the handles, even if detection or inference raised.
        cap.release()
        if out is not None:
            out.release()

    logging.info(f"Detection complete. Output video saved to {output_path}")

# Main execution block

In [5]:
if __name__ == "__main__":
    # --- User Configuration ---

    INPUT_VIDEO_PATH = "./id59_id61_0006.mp4"
    OUTPUT_VIDEO_PATH = "./output_video.mp4"

    # Load model and dependencies
    model, mtcnn, val_transforms, device, image_size = load_model_and_dependencies()

    # The loader signals failure by returning None for every component; test
    # for that explicitly rather than relying on the truthiness of the model.
    if model is not None:
        # Create the output directory if needed (no-op when it already exists)
        output_dir = os.path.dirname(OUTPUT_VIDEO_PATH)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Run the deepfake detection process
        detect_deepfake_in_video(
            video_path=INPUT_VIDEO_PATH,
            output_path=OUTPUT_VIDEO_PATH,
            model=model,
            mtcnn=mtcnn,
            transforms=val_transforms,
            device=device,
            image_size=image_size,
        )

2025-10-09 12:18:38,807 - INFO - Using device: cuda
2025-10-09 12:18:39,525 - INFO - Successfully loaded model weights from models/best_deepfake_detector_model.pth
2025-10-09 12:18:39,537 - INFO - MTCNN face detector initialized.
2025-10-09 12:18:39,537 - INFO - Starting deepfake detection on ./id59_id61_0006.mp4
2025-10-09 12:18:43,614 - INFO - Processed 100 frames...
2025-10-09 12:18:47,127 - INFO - Processed 200 frames...
2025-10-09 12:18:51,206 - INFO - Processed 300 frames...
2025-10-09 12:18:55,595 - INFO - Processed 400 frames...
2025-10-09 12:18:58,516 - INFO - Detection complete. Output video saved to ./output_video.mp4
