# Step 1: Upload Video to Colab


In [1]:
# Colab-only helper module for moving files between the browser and the runtime.
from google.colab import files
# Opens the browser file picker and blocks until the upload finishes. The
# recorded output of this cell shows the chosen file being saved under its
# original name (project_video.mp4); the result is kept in `uploaded`.
uploaded = files.upload()

Saving project_video.mp4 to project_video.mp4


# Step 2: Load Pretrained Models

In [2]:
import torch
import torchvision.transforms as T
from torchvision.models.segmentation import deeplabv3_resnet101
from torchvision.models import efficientnet_b0
from PIL import Image
import cv2
import numpy as np

In [3]:
# Pick the compute device once. The original cell called .cuda() directly,
# which raises on a runtime without a GPU; on a GPU runtime this resolves to
# the same device as before.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Segmentation Model (pretrained weights, inference mode)
seg_model = deeplabv3_resnet101(pretrained=True).eval().to(device)

# Load Classification Model (pretrained weights, inference mode)
clf_model = efficientnet_b0(pretrained=True).eval().to(device)

# Per-channel normalisation statistics. Both pipelines used these exact
# values, so they are defined once instead of being duplicated.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]
_normalize = T.Normalize(mean=NORM_MEAN, std=NORM_STD)

# Transforms
# Segmentation input: tensor conversion + normalisation, no resize.
seg_transform = T.Compose([
    T.ToTensor(),
    _normalize,
])

# Classification input: resized to 224x224 first, then the same
# tensor conversion + normalisation.
clf_transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    _normalize,
])

Downloading: "https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth" to /root/.cache/torch/hub/checkpoints/deeplabv3_resnet101_coco-586e9e4e.pth
100%|██████████| 233M/233M [00:01<00:00, 200MB/s]
Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 192MB/s]


# Step 3: Process Video and Predict

In [9]:
# Load video
video_path = '/content/project_video.mp4'
cap = cv2.VideoCapture(video_path)
# VideoCapture does not raise on a missing or unreadable file; without this
# check the cell would continue with width = height = fps = 0 and silently
# produce an empty output.
if not cap.isOpened():
    raise RuntimeError(f"Could not open input video: {video_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating with int() turns e.g. 29.97 into
# 29, so the written video would play at a different speed than the source.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # The backend reported no usable rate (0 or NaN). 30.0 is an arbitrary
    # fallback so the writer can still be created -- adjust if the real rate
    # of the source is known.
    fps = 30.0

# Output video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('output.mp4', fourcc, fps, (width, height))
# A writer that failed to open drops every frame silently, so fail loudly.
if not out.isOpened():
    cap.release()
    raise RuntimeError("Could not open 'output.mp4' for writing")

In [10]:
from torchvision.models import resnet50
from torchvision import models
from torchvision import transforms
from torchvision.datasets import ImageNet
import torchvision

# Download class labels
!wget -q https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt

# Load them into a list
with open("imagenet_classes.txt") as f:
    classes = [line.strip() for line in f.readlines()]

In [11]:
# Frame-by-frame pipeline: segment the frame, box each foreground region,
# classify the cropped region, draw box + label, and append the annotated
# frame to the output video.

# Bounding boxes with an area below this many pixels are skipped as noise.
MIN_BOX_AREA = 1000

# Send inputs to whichever device each model already lives on instead of
# hard-coding .cuda(), so this cell follows the models wherever they were put.
seg_device = next(seg_model.parameters()).device
clf_device = next(clf_model.parameters()).device

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # end of stream or read failure

        # OpenCV delivers BGR; convert to an RGB PIL image for the transforms.
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        input_tensor = seg_transform(pil_image).unsqueeze(0).to(seg_device)

        with torch.no_grad():
            output = seg_model(input_tensor)['out'][0]
            # Per-pixel index of the highest-scoring channel, as a uint8 image.
            seg_mask = output.argmax(0).byte().cpu().numpy()

        # findContours treats every non-zero pixel as foreground, so all
        # non-zero class indices are merged into one binary mask here
        # (assumes index 0 is the background class -- TODO confirm).
        # RETR_EXTERNAL keeps only outer boundaries: the previous RETR_TREE
        # also returned the contours of holes inside a region, and since the
        # hierarchy was discarded those background gaps were boxed and
        # classified as if they were objects.
        contours, _ = cv2.findContours(seg_mask, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            if w * h < MIN_BOX_AREA:
                continue  # skip small noise

            obj_img = pil_image.crop((x, y, x + w, y + h))
            obj_tensor = clf_transform(obj_img).unsqueeze(0).to(clf_device)

            with torch.no_grad():
                pred = clf_model(obj_tensor)
                label = classes[pred.argmax().item()]

            # Draw bounding box and label
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
            # Clamp the text baseline so labels of boxes touching the top edge
            # stay inside the frame (y - 10 would otherwise go negative and
            # the text would be drawn off-screen).
            label_y = max(y - 10, 15)
            cv2.putText(frame, label, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 255, 0), 1, cv2.LINE_AA)

        out.write(frame)
finally:
    # Always release both handles, even if a frame raises part-way through;
    # otherwise the output file is left unfinalised and the capture stays open.
    # cv2.destroyAllWindows() was dropped: this cell never opens a window.
    cap.release()
    out.release()

# Step 4: Download Output Video

In [12]:
# Colab-only helper module; re-imported here so this cell can run on its own.
from google.colab import files
# Ask the browser to download 'output.mp4' -- the same filename the
# VideoWriter in Step 3 was created with.
files.download('output.mp4')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>