In [None]:
# @title Cell 1 - Environment setup (stable versions)

# Install PyTorch (CUDA 12.1 build) - works on Colab GPUs
!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121

# Other dependencies
!pip install opencv-python matplotlib tqdm

# Install HuggingFace and other utilities
!pip install git+https://github.com/facebookresearch/segment-anything.git
!pip install git+https://github.com/IDEA-Research/GroundingDINO.git
!pip install huggingface_hub

# Make sure ffmpeg is installed
!apt-get update -qq
!apt-get install -y ffmpeg

# Clone repos (for models and scripts)
!git clone https://github.com/facebookresearch/segment-anything.git
!git clone https://github.com/IDEA-Research/GroundingDINO.git

print("✅ Environment ready. Next step: download model weights")


In [None]:
# @title Cell 2 - Download model weights for GroundingDINO and SAM

import os

# Create directories
os.makedirs("weights", exist_ok=True)

# GroundingDINO weights
if not os.path.exists("weights/groundingdino_swint_ogc.pth"):
    !wget -O weights/groundingdino_swint_ogc.pth https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

# SAM weights (ViT-H, most accurate)
if not os.path.exists("weights/sam_vit_h_4b8939.pth"):
    !wget -O weights/sam_vit_h_4b8939.pth https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth

print("✅ Model weights downloaded and stored in ./weights")


In [None]:
# @title Cell 3 - Upload video and inspect properties
# Uploads one video, resamples it to 12 fps as output.mp4, and prints the
# properties of the uploaded file.
import os
from google.colab import files
import subprocess

# Upload video
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a video.")

# Get uploaded filename (assume only one file; the first one is used)
video_path = list(uploaded.keys())[0]
output_video = "output.mp4"

# ffmpeg cannot read from and write to the same file. If the upload is itself
# named like the output, move it aside before converting.
if os.path.abspath(video_path) == os.path.abspath(output_video):
    renamed_upload = "input_" + os.path.basename(video_path)
    os.replace(video_path, renamed_upload)
    video_path = renamed_upload
print(f"✅ Uploaded video: {video_path}")

# Same file under the name used for the conversion step.
input_video = video_path

# Resample to 12 fps. The argument-list form passes the filename to ffmpeg
# verbatim, so names containing quotes, spaces or `$` cannot break the command.
convert = subprocess.run(
    ["ffmpeg", "-y", "-i", input_video, "-filter:v", "fps=12", output_video],
    capture_output=True, text=True, errors="replace",
)
if convert.returncode != 0:
    # Show the end of ffmpeg's log so the reason for the failure is visible.
    print(convert.stderr[-2000:])
    raise RuntimeError(f"ffmpeg failed to convert {input_video} to {output_video}")
from IPython.display import clear_output

# Inspect video. The output is captured and printed explicitly so it appears
# in the cell output; ffprobe is used because `ffmpeg -i` without an output
# file always exits with an error.
print("\n--- Video Properties ---")
probe = subprocess.run(
    ["ffprobe", "-hide_banner", video_path],
    capture_output=True, text=True, errors="replace",
)
print(probe.stderr or probe.stdout)


In [None]:
# @title Cell 4 - Extract frames and audio from the input video
# Splits the 12 fps video into PNG frames under ./frames and copies its audio
# track (when there is one) to audio.aac.

import os
import subprocess

# Create folders
os.makedirs("frames", exist_ok=True)
video12fps = "/content/output.mp4"

# Remove frames left over from a previous run: a shorter video would otherwise
# leave old trailing frames behind that every later cell would still process.
for stale_frame in os.listdir("frames"):
    if stale_frame.endswith(".png"):
        os.remove(os.path.join("frames", stale_frame))

# Extract frames (as PNG to avoid compression artifacts)
frames_result = subprocess.run([
    "ffmpeg", "-y", "-i", video12fps,
    "-q:v", "2",  # kept from the original command; presumably a no-op for lossless PNG
    "frames/frame_%06d.png"
])
if frames_result.returncode != 0:
    raise RuntimeError(f"ffmpeg failed to extract frames from {video12fps}")

print("✅ Frames extracted to ./frames")

# Extract audio (if exists). Always start from a clean slate so that audio
# belonging to a previously processed video is never reused for this one.
audio_path = "audio.aac"
if os.path.exists(audio_path):
    os.remove(audio_path)

audio_result = subprocess.run(
    ["ffmpeg", "-y", "-i", video12fps, "-vn", "-acodec", "copy", audio_path],
    capture_output=True, text=True, errors="replace",
)
audio_ok = (
    audio_result.returncode == 0
    and os.path.exists(audio_path)
    and os.path.getsize(audio_path) > 0
)
if audio_ok:
    print("✅ Audio extracted to audio.aac")
else:
    # Do not leave an empty/partial file behind for later cells to pick up.
    if os.path.exists(audio_path):
        os.remove(audio_path)
    print("⚠️ No audio extracted (the video may have no audio track)")


In [None]:
# @title Cell 5 - Run GroundingDINO on frames to produce detections
# For every PNG in ./frames, runs the text-prompted detector and writes one
# JSON file per frame to ./detections.

import torch
import json
import os
from tqdm import tqdm
import cv2

# === Prompt input ===
prompt = 'dress'  #@param {type: "string"}
box_threshold = 0.3  #@param {type: "number"}
text_threshold = 0.25

# Make the cloned GroundingDINO checkout importable
import sys
sys.path.append("GroundingDINO")

from groundingdino.util.inference import load_model, load_image, predict

# Load model
model_path = "weights/groundingdino_swint_ogc.pth"
config_path = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
dino_model = load_model(config_path, model_path)

# Prepare output folder
os.makedirs("detections", exist_ok=True)


def _to_abs_xyxy(norm_box, width, height):
    """Convert one normalized [cx, cy, w, h] box to absolute [x1, y1, x2, y2] pixels."""
    cx, cy, bw, bh = norm_box
    return [
        (cx - bw/2) * width,
        (cy - bh/2) * height,
        (cx + bw/2) * width,
        (cy + bh/2) * height,
    ]


# Process all frames
frame_files = sorted(name for name in os.listdir("frames") if name.endswith(".png"))
print(f"Processing {len(frame_files)} frames with prompt: {prompt}")

for frame_file in tqdm(frame_files):
    # image_source = np.array, image = tensor
    image_source, image = load_image(os.path.join("frames", frame_file))

    # Run prediction
    boxes, logits, phrases = predict(
        model=dino_model,
        image=image,
        caption=prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold
    )

    # Frame size in pixels, used to scale the normalized boxes
    h, w, _ = image_source.shape
    abs_boxes = [_to_abs_xyxy(box, w, h) for box in boxes.tolist()]

    # Save detection metadata next to the frame name, with a .json extension
    detection = {
        "boxes": abs_boxes,
        "phrases": phrases,
        "logits": logits.tolist()
    }
    det_path = os.path.join("detections", frame_file.replace(".png", ".json"))
    with open(det_path, "w") as f:
        json.dump(detection, f)

print("✅ All detections saved in ./detections as absolute pixel coords")


In [None]:
# @title Cell 5.5 – Preview bounding boxes from detections 12fps(Video)
# Draws the saved detection boxes on every frame, writes the result as a
# 12 fps MP4 under ./previews and offers it for download.

import cv2
import json
import os
import numpy as np
from tqdm import tqdm

# Prepare output folder
os.makedirs("previews", exist_ok=True)

# Get frame list
frame_files = sorted([f for f in os.listdir("frames") if f.endswith(".png")])
if not frame_files:
    raise RuntimeError("No PNG frames found in ./frames - extract frames first.")

# Read first frame to set video size
first_frame = cv2.imread(os.path.join("frames", frame_files[0]))
if first_frame is None:
    raise RuntimeError(f"Could not read frames/{frame_files[0]}")
h, w = first_frame.shape[:2]

# Create video writer (MP4, 12 fps - the rate the frames were extracted at)
out_path = "previews/preview_boxes.mp4"
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
video = cv2.VideoWriter(out_path, fourcc, 12, (w, h))
if not video.isOpened():
    raise RuntimeError(f"Could not open video writer for {out_path}")

try:
    for frame_file in tqdm(frame_files):
        frame_path = os.path.join("frames", frame_file)
        det_path = os.path.join("detections", frame_file.replace(".png", ".json"))

        # Load frame; an unreadable file would otherwise be passed to write() as None
        image = cv2.imread(frame_path)
        if image is None:
            print("⚠️ Could not read frame, skipped:", frame_path)
            continue

        # Frames without a detection JSON are written unchanged
        if os.path.exists(det_path):
            with open(det_path, "r") as f:
                det = json.load(f)

            # Draw boxes; zip stops at the shorter of boxes/phrases
            for box, phrase in zip(det["boxes"], det.get("phrases", [])):
                # Plain Python ints (truncated toward zero) for the OpenCV drawing calls
                x1, y1, x2, y2 = (int(v) for v in box)
                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(image, phrase, (x1, max(y1 - 5, 0)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        # Write frame to video
        video.write(image)
finally:
    # Always finalize the file, even if a frame raised part-way through
    video.release()

print(f"✅ Video saved: {out_path}")

from google.colab import files
files.download("/content/previews/preview_boxes.mp4")



In [None]:
# @title Cell 6 - Run SAM on detections to generate masks
# For every frame that has a detection JSON, prompts SAM with the detected
# boxes and saves one merged binary mask (0/255) per frame to ./masks.

import torch
import numpy as np
import json
from tqdm import tqdm
import cv2
import os

# Import SAM
import sys
sys.path.append("segment-anything")
from segment_anything import sam_model_registry, SamPredictor

# Load SAM model. Prefer the GPU; fall back to CPU so the cell still runs
# (slowly) on a runtime without CUDA instead of failing on `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"
sam_checkpoint = "weights/sam_vit_h_4b8939.pth"
sam = sam_model_registry["vit_h"](checkpoint=sam_checkpoint)
sam.to(device)
predictor = SamPredictor(sam)

# Prepare mask folder
os.makedirs("masks", exist_ok=True)

# Process frames
frame_files = sorted([f for f in os.listdir("frames") if f.endswith(".png")])

saved_count = 0
skipped_count = 0

for frame_file in tqdm(frame_files):
    frame_path = os.path.join("frames", frame_file)
    det_path = os.path.join("detections", frame_file.replace(".png", ".json"))

    # Skip if no detections file exists for this frame
    if not os.path.exists(det_path):
        skipped_count += 1
        continue

    # Load frame; count an unreadable file as skipped instead of crashing
    image = cv2.imread(frame_path)
    if image is None:
        print("⚠️ Could not read frame, skipped:", frame_path)
        skipped_count += 1
        continue

    # Load detections
    with open(det_path, "r") as f:
        det = json.load(f)
    boxes = np.array(det["boxes"])

    if len(boxes) == 0:
        # Save empty mask if nothing detected. The image is deliberately not
        # handed to the predictor here: set_image is only needed for prediction.
        mask = np.zeros(image.shape[:2], dtype=np.uint8)
    else:
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        predictor.set_image(image_rgb)

        # Map the pixel-space boxes to the predictor's input resolution
        transformed_boxes = predictor.transform.apply_boxes_torch(
            torch.tensor(boxes, device=device),
            image.shape[:2]
        )
        # One mask per box (multimask_output=False)
        masks, _, _ = predictor.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=transformed_boxes,
            multimask_output=False
        )

        # Merge all masks into one binary mask
        mask = masks.sum(dim=0).cpu().numpy()
        mask = (mask > 0).astype(np.uint8) * 255

    # Fix shape (remove leading channel if exists)
    mask = np.squeeze(mask)

    # Save mask with debug check
    mask_path = os.path.join("masks", frame_file)
    success = cv2.imwrite(mask_path, mask)
    if not success:
        print("❌ Failed to save:", mask_path, "| shape:", mask.shape, "| dtype:", mask.dtype, "| unique:", np.unique(mask))
    else:
        saved_count += 1

print(f"✅ All masks processed. Saved: {saved_count}, Skipped: {skipped_count}, Total frames: {len(frame_files)}")


In [None]:
# @title Cell 7 - Generate masked frames for Output A (B&W) and Output B (Grey overlay)
# Output A: the dilated mask as a 3-channel black & white image.
# Output B: the frame with mid-grey blended in through a feathered copy of the
#           dilated mask, followed by a light blur of the whole image.

import os
import cv2
import numpy as np
from tqdm import tqdm

# Prepare output folders
for out_dir in ("output_bw", "output_grey"):
    os.makedirs(out_dir, exist_ok=True)

frame_files = sorted(name for name in os.listdir("frames") if name.endswith(".png"))

# Settings shared by every frame
iterations = 20   # expand ~20px (3x3 kernel applied 20 times)
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
grey_color = (126, 126, 126)  # mid-grey
grey_layer = np.array(grey_color, dtype=float)

for frame_file in tqdm(frame_files):
    # Load images
    frame = cv2.imread(os.path.join("frames", frame_file))
    mask = cv2.imread(os.path.join("masks", frame_file), cv2.IMREAD_GRAYSCALE)

    # A frame without a saved mask is treated as "nothing detected"
    if mask is None:
        mask = np.zeros(frame.shape[:2], dtype=np.uint8)

    # --- Expand mask (without blur for BW output) ---
    expanded = cv2.dilate(mask, kernel, iterations=iterations)

    # --- Output A: black & white mask (expanded, not blurred) ---
    bw = cv2.merge([expanded] * 3)  # 3-channel B&W
    cv2.imwrite(os.path.join("output_bw", frame_file), bw)

    # --- Smooth expanded mask for blending in grey overlay ---
    smooth = cv2.GaussianBlur(expanded, (41, 41), 0)
    smooth = cv2.normalize(smooth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # --- Output B: grey overlay ---
    # Per-pixel blend weight in [0, 1], broadcast over the three colour channels
    alpha = smooth.astype(float) / 255.0
    weight = alpha[:, :, np.newaxis]
    grey = ((1 - weight) * frame + weight * grey_layer).astype(np.uint8)

    # Apply blur only to output_grey
    grey = cv2.GaussianBlur(grey, (9, 9), 0)

    cv2.imwrite(os.path.join("output_grey", frame_file), grey)

print("✅ Output frames saved in ./output_bw (expanded) and ./output_grey (expanded + blurred)")


In [None]:
# @title Cell 8 - Re-encode frames into videos and reattach audio
# Builds output_bw.mp4 and output_grey.mp4 from the processed frame folders,
# muxing in audio.aac when that file is available.

import os
import subprocess
import re


def encode_video(frame_pattern, output_path, audio_path="audio.aac", fps=12):
    """Encode an image sequence to H.264 MP4, attaching audio if present.

    Args:
        frame_pattern: ffmpeg image-sequence pattern, e.g. "dir/frame_%06d.png".
        output_path: path of the MP4 file to write (overwritten if it exists).
        audio_path: audio file to mux in; ignored when missing or empty, so
            videos without an audio track still get encoded.
        fps: frame rate of the image sequence.

    Returns:
        True if audio was attached, False if the video was written silent.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
    """
    has_audio = os.path.exists(audio_path) and os.path.getsize(audio_path) > 0

    cmd = ["ffmpeg", "-y", "-framerate", str(fps), "-i", frame_pattern]
    if has_audio:
        cmd += ["-i", audio_path]
    cmd += ["-c:v", "libx264", "-pix_fmt", "yuv420p"]
    if has_audio:
        # -shortest stops at the end of the shorter of the two inputs
        cmd += ["-c:a", "aac", "-shortest"]
    cmd.append(output_path)

    result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    if result.returncode != 0:
        # Show the end of ffmpeg's log so the reason for the failure is visible.
        print(result.stderr[-2000:])
        raise RuntimeError(f"ffmpeg failed to create {output_path}")
    return has_audio


# Rebuild Output A (B&W)
bw_has_audio = encode_video("output_bw/frame_%06d.png", "output_bw.mp4")

# Rebuild Output B (Grey overlay)
grey_has_audio = encode_video("output_grey/frame_%06d.png", "output_grey.mp4")

if not (bw_has_audio and grey_has_audio):
    print("⚠️ audio.aac not found or empty - videos were created without audio")

print("✅ Videos created: output_bw.mp4 and output_grey.mp4")


In [18]:
# @title Cell 9 Download

from google.colab import files

# Offer both rendered videos for download, B&W mask first, then grey overlay.
for result_path in ("/content/output_bw.mp4", "/content/output_grey.mp4"):
    files.download(result_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# @title Clear All
# Deletes the per-video working folders so the pipeline can be run again on a
# new upload. Safe to run at any point: folders that do not exist are skipped
# instead of aborting the cleanup half-way.
import os
import shutil

for work_dir in (
    '/content/frames',
    '/content/detections',
    '/content/masks',
    '/content/output_bw',
    '/content/output_grey',
):
    if os.path.isdir(work_dir):
        shutil.rmtree(work_dir)

# audio.aac is a per-video intermediate as well; remove it so audio from a
# previous video cannot be attached to the next one.
if os.path.exists('/content/audio.aac'):
    os.remove('/content/audio.aac')