In [None]:
!pip install transformers

In [None]:
import cv2
from transformers import DPTImageProcessor, DPTForDepthEstimation
import torch
import numpy as np
from PIL import Image

# Set up the input video file
input_file = "/content/drive/MyDrive/Pexels Videos 2711276.mp4"
cap = cv2.VideoCapture(input_file)
if not cap.isOpened():
    # Fail loudly: otherwise fps/frame size read back as 0 and the script
    # silently produces an empty output file.
    raise IOError(f"Could not open input video: {input_file}")

# Set up the output video file
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
output_file = "output.mp4"
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
# playback speed/duration of the output. VideoWriter accepts a float fps.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some containers report 0 (or NaN) for the frame rate; fall back to a
    # common default so the writer still gets a usable value.
    fps = 30.0
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
out = cv2.VideoWriter(output_file, fourcc, fps, frame_size)
if not out.isOpened():
    # A writer that failed to open drops every frame without raising.
    cap.release()
    raise IOError(f"Could not open output video for writing: {output_file}")

# Set up the DPT model, on GPU when one is available (CPU otherwise)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
model.to(device)

# Process each frame of the video: estimate depth, render it as an 8-bit
# grayscale image and append it to the output video.
# The try/finally guarantees the capture and the writer are released (and
# the output file finalized) even if inference fails part-way through.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No more frames (or a read error): stop processing
            break

        # Convert the OpenCV BGR image to PIL RGB image
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Prepare the image for the model and move the tensors to its device
        inputs = processor(images=image, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run the model without tracking gradients
        with torch.no_grad():
            predicted_depth = model(**inputs).predicted_depth

            # Interpolate the depth map to original size.
            # PIL's size is (width, height); interpolate expects (height, width).
            prediction = torch.nn.functional.interpolate(
                predicted_depth.unsqueeze(1),
                size=image.size[::-1],
                mode="bicubic",
                align_corners=False,
            )

        # Take the single (batch, channel) plane explicitly; unlike squeeze()
        # this cannot also drop a spatial dimension of size 1.
        output = prediction[0, 0].cpu().numpy()

        # Normalize so the largest depth value maps to 255
        peak = float(np.max(output))
        if peak > 0:
            output = output * 255 / peak
        else:
            # No positive values: avoid dividing by zero, write a black frame
            output = np.zeros_like(output)
        # Bicubic interpolation can overshoot below 0; clip before the cast
        # so negative values do not wrap around in uint8.
        output = np.clip(output, 0, 255).astype("uint8")

        # Write the output frame to the output video file
        out.write(cv2.cvtColor(output, cv2.COLOR_GRAY2BGR))
finally:
    cap.release()
    out.release()

