In [None]:
!pip install supervision
!ln -s /home/ubuntu/gdino/GroundingDINO/groundingdino .
!ln -s /home/ubuntu/gdino/GroundingDINO/weights .

In [None]:
import os

from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2

model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
GDINO_PATH = "/home/ubuntu/gdino/GroundingDINO"
IMAGE_PATH = f"{GDINO_PATH}/.asset/cat_dog.jpeg"
TEXT_PROMPT = "chair . person . dog ."
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_image.jpg", annotated_frame)

In [3]:
image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

cv2.imwrite("annotated_image.jpg", annotated_frame)

True

In [None]:
!pip install opencv-python

### FUNCTIONS

In [24]:
from ast import Tuple
import cv2
from scipy.fftpack import sc_diff
import matplotlib.pyplot as plt
import cv2 as cv
from typing import Any, Generator
import numpy as np
import torch
from PIL import Image
import groundingdino.datasets.transforms as T

def is_iterable(obj):
    try:
        iter(obj)
        return True
    except TypeError:
        return False
class Stack:
    def __init__(self, max_size):
        self.stack = []
        self.max_size = max_size

    def push(self, item):
        if len(self.stack) == self.max_size:
            self.stack.pop(0)  # Remove the oldest item
        self.stack.append(item)

    def pop(self):
        if len(self.stack) < 1:
            return None
        return self.stack.pop()

    def size(self):
        return len(self.stack)
    def __str__(self):
        return str(self.stack)
    def __iter__(self):
        return iter(self.stack)

def extract_frames(video_path, output_folder, frames_limit=100, skip=0):
    """
        write each frame to a file
    """
    # Open the video file
    video = cv2.VideoCapture(video_path)

    # Check if video opened successfully
    if not video.isOpened():
        print("Could not open video")
        return

    frame_taken = 0
    iteration = -1
    success = True
    files = []
    while (frames_limit > 0 and frame_taken < frames_limit) or (frames_limit == 0 and success is True):
        iteration += 1
        
        # Read the next frame from the video. If you read at the end of the video, success will be False
        success, frame = video.read()
        # print(frame_count)

        # Break the loop if the video is finished
        if not success:
            break
        if skip != 0 and iteration % skip != 0:
            continue
        
        

        # Save the frame into the output folder
        cv2.imwrite(f"{output_folder}/frame{frame_taken}.jpg", frame)
        files.append(f"{output_folder}/frame{frame_taken}.jpg")

        frame_taken +=1

    # Release the video file
    video.release()
    return files

def generate_frames(video_file: str, frames_limit=10) -> Generator[np.ndarray, None, None]:
    """
        yield each frame as byte array
    """
    video = cv2.VideoCapture(video_file)
    frame_count = 0

    while video.isOpened():
        success, frame = video.read()

        if not ((frames_limit > 0 and frame_count < frames_limit) or (frames_limit == 0 and success is True)):
            break

        yield frame
        frame_count += 1

    video.release()

def plot_image(image: np.ndarray, size: int = 12) -> None:
    %matplotlib inline
    plt.figure(figsize=(size, size))
    plt.imshow(image[...,::-1])
    plt.show()

def zoom_at(img, zoom=1, angle=0, coord=None):
    
    cy, cx = [ i/2 for i in img.shape[:-1] ] if coord is None else coord[::-1]
    
    rot_mat = cv2.getRotationMatrix2D((cx,cy), angle, zoom)
    result = cv2.warpAffine(img, rot_mat, img.shape[1::-1], flags=cv2.INTER_LINEAR)
    
    return result

def convert_ndarray(frame: np.ndarray[Any]) ->  torch.Tensor:
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image_source = Image.fromarray(frame_rgb)

    # image_source = Image.fromarray(arr).convert("RGB")
    # image_source = Image.open(image_path).convert("RGB")
    # image_source = Image.open(image_path).convert("RGB")
    # image = np.asarray(image_source)
    image_transformed, _ = transform(image_source, None)
    return image_transformed

def add_text_to_frame2(frame, text, position=(50, 50), font_scale=1, font_color=(0, 0, 255), thickness=4):
    """
    Adds text to a single frame.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    if is_iterable(text):
        for line in text:
            cv2.putText(frame, line, position, font, font_scale, font_color, thickness)
            # position = (position[0], position[1] + 50)
        # text = " ".join(text)
    else:
        cv2.putText(frame, text, position, font, font_scale, font_color, thickness)
    return frame

### Extract images

In [101]:
files = extract_frames(
    video_path="input/basketball.mp4", 
    output_folder="output/mid", 
    frames_limit=100, 
    skip=100
)
files


['output/mid/frame0.jpg',
 'output/mid/frame1.jpg',
 'output/mid/frame2.jpg',
 'output/mid/frame3.jpg',
 'output/mid/frame4.jpg',
 'output/mid/frame5.jpg',
 'output/mid/frame6.jpg',
 'output/mid/frame7.jpg',
 'output/mid/frame8.jpg',
 'output/mid/frame9.jpg',
 'output/mid/frame10.jpg',
 'output/mid/frame11.jpg',
 'output/mid/frame12.jpg',
 'output/mid/frame13.jpg',
 'output/mid/frame14.jpg',
 'output/mid/frame15.jpg',
 'output/mid/frame16.jpg',
 'output/mid/frame17.jpg',
 'output/mid/frame18.jpg',
 'output/mid/frame19.jpg',
 'output/mid/frame20.jpg',
 'output/mid/frame21.jpg',
 'output/mid/frame22.jpg',
 'output/mid/frame23.jpg',
 'output/mid/frame24.jpg',
 'output/mid/frame25.jpg',
 'output/mid/frame26.jpg',
 'output/mid/frame27.jpg']

### Using files

In [2]:
import cv2
vid_cap = cv2.VideoCapture('input/basketball.mp4')
fps = vid_cap.get(cv2.CAP_PROP_FPS)
ret, frame = vid_cap.read()
vid_cap.release()
resolution_size = (int(frame.shape[0]), int(frame.shape[1]))
resolution_size, fps


((1520, 2704), 59.94005994005994)

In [None]:
import os

import torch
from torchvision.ops import box_convert

# size = (int(img0.shape[1]/2), int(img0.shape[0]/2))
resolution = (int(1024), int(1024))
# Create a new video
h264 = cv.VideoWriter_fourcc('h','2','6','4')
mp4v = cv.VideoWriter_fourcc('m', 'p', '4', 'v')
new_video = cv2.VideoWriter("new_video1.mp4", mp4v, fps, resolution)

for file in files:
    f_name = os.path.basename(file)
    print(f_name)
    image_source, image = load_image(file)
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption="basketball",
        box_threshold=BOX_TRESHOLD,
        text_threshold=TEXT_TRESHOLD
    )
    if boxes.shape[0] == 0:
        continue

    annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

    h, w, _ = image_source.shape
    boxes = boxes * torch.Tensor([w, h, w, h])
    xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    x = xyxy[0][0]
    y = xyxy[0][1]

    # cv2.imwrite(f"output/predict/{f_name}", annotated_frame)
    # cv.imwrite('zoom_frame0.jpg', zoom_at(annotated_frame, 1.5, coord=(264.5, 275)) )
    
    # plot_image(annotated_frame, 8)
    # plot_image(zoom_at(annotated_frame, 2, coord=(x, y)), 1280)
    new_video.write(annotated_frame)
    # print(boxes, logits, phrases)
    print("--------------")

print("Releasing video")
new_video.release()

In [None]:
# !conda install -y -c conda-forge ipywidgets # or pip install ipywidgets
# !conda install -n base -c conda-forge jupyterlab_widgets

# !conda install -n base -c conda-forge widgetsnbextension
# !which python
# !pip install ipywidgets widgetsnbextension pandas-profiling



### Using generator

In [None]:
from typing import Tuple
from tqdm.notebook import tqdm
import groundingdino.datasets.transforms as T
import PIL
import torch
from PIL import Image
from torchvision.ops import box_convert

def load_ndarr_image(numpy_image) -> Tuple[np.array, torch.Tensor]:
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    # image_source = Image.open(image_path).convert("RGB")
    PIL_image = Image.fromarray(np.uint8(numpy_image)).convert('RGB')

    image = np.asarray(PIL_image)
    image_transformed, _ = transform(PIL_image, None)
    return image, image_transformed

frame_iterator = iter(generate_frames(video_file="input/basketball.mp4", frames_limit=5))
frames_data = []
for frame in tqdm(frame_iterator, total=5):
    # print(frame)
    print("-------------------")
    image_source, image = load_ndarr_image(frame)
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption="basketball",
        box_threshold=BOX_TRESHOLD,
        text_threshold=TEXT_TRESHOLD
    )
    frames_data.append((boxes, logits, phrases))

    annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
    h, w, _ = image_source.shape
    boxes = boxes * torch.Tensor([w, h, w, h])
    xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()

    # cv2.imwrite(f"output/predict/{f_name}", annotated_frame)
    print(boxes, logits, phrases, xyxy)
    plot_image(annotated_frame, 16)


### Zooming

In [48]:
import cv2 as cv

img = cv.imread('output/mid/frame0.jpg')
z_img = zoom_at(img, 1.5, coord=(264.5, 275))
# print(z_img)
    

cv.imwrite('frame0.jpg', img )
cv.imwrite('zoom_frame0.jpg', zoom_at(img, 1.5, coord=(264.5, 275)) )

True

### Generate video

In [None]:
# !sudo apt-get install -y ffmpeg x264 libx264-dev
# !pip install jupyterlab_widgets ipywidgets
!pip install --upgrade ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [21]:
x = Stack()
x.push("Hello")
x.push("World")


def write_frame(vid_writer, source_frame, history: Stack,  boxes, logits, phrases):
    annotated_frame = annotate(image_source=source_frame, boxes=boxes, logits=logits, phrases=phrases)
    h, w, _ = frame.shape
    boxes = boxes * torch.Tensor([w, h, w, h])
    xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    x = xyxy[0][0]
    y = xyxy[0][1]
    zoom_frame = zoom_at(frame, 2, coord=(x, y))

    print("Zoom ->>>>>", (x, y))
    history.push(f"Zoom at: {x}, {y} ---- phrases: {phrases}")
    add_text_to_frame2(zoom_frame, history, position=(50, 150))
    vid_writer.write(zoom_frame)

In [25]:
import os
import cv2
from tqdm.notebook import tqdm
from groundingdino.util.inference import load_model, load_image, predict, annotate
import torch
from torchvision.ops import box_convert

h264 = cv2.VideoWriter_fourcc('h','2','6','4')
mp4v = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
mp4v_2 = cv2.VideoWriter_fourcc(*'MP4V')
vid_name = "output/new_video1.mp4"


# vid_cap = cv.VideoCapture("input/basketball.mp4")

# fps = vid_cap.get(cv2.CAP_PROP_FPS)
# ret, frame = vid_cap.read()
# vid_cap.release()
# resolution_size = (int(frame.shape[0]), int(frame.shape[1]))


model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25
cap = cv2.VideoCapture("input/basketball.mp4")

fps = cap.get(cv2.CAP_PROP_FPS)
width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# cap.open("input/basketball.mp4")
exist_fourcc = cap.get(cv2.CAP_PROP_FOURCC)
fourcc = cv2.VideoWriter_fourcc(*'MP4V')

cap.release()
print("-----------", exist_fourcc)
# return
try:
    os.remove(vid_name)
except:
    pass
new_video = cv2.VideoWriter(vid_name, fourcc, fps, (width, height))


if not new_video.isOpened():
    print("Error: Could not open output video.")
    exit()

test
frame_iterator = iter(generate_frames(video_file="input/basketball.mp4", frames_limit=120))
frames_data = []
counter = 0
previouse_state = {}
history = Stack(15)
for frame in tqdm(frame_iterator, total=5):
    # print(frame)
    print(f'\rCount: {counter}', end='', flush=True)
    
    transformed_array = convert_ndarray(frame)


    boxes, logits, phrases = predict(
        model=model,
        image=transformed_array,
        caption="basketball",
        box_threshold=BOX_TRESHOLD,
        text_threshold=TEXT_TRESHOLD
    )
    # print(boxes, logits, phrases)

    # Can't detect any object
    if boxes.shape[0] == 0:
        write_frame(new_video, frame, history, previouse_state["boxes"], previouse_state["logits"], previouse_state["phrases"])
        continue

    previouse_state = {
        "boxes": boxes,
        "logits": logits,
        "phrases": phrases
    }
    write_frame(new_video, frame, history,  boxes, logits, phrases)
    counter += 1

print("Releasing video")
new_video.release()




final text_encoder_type: bert-base-uncased
----------- 1668703592.0


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/5 [00:00<?, ?it/s]

Count: 0



Zoom ->>>>> (416.02338, 858.37134)
Count: 1Zoom ->>>>> (416.93814, 862.1715)
Count: 2Zoom ->>>>> (413.7442, 864.8559)
Count: 3Zoom ->>>>> (418.1336, 869.54034)
Count: 4Zoom ->>>>> (413.4619, 873.45197)
Count: 5Zoom ->>>>> (409.07446, 877.97296)
Count: 6Zoom ->>>>> (409.99716, 883.97845)
Count: 7Zoom ->>>>> (410.34915, 884.84686)
Count: 8Zoom ->>>>> (407.07065, 890.44293)
Count: 9Zoom ->>>>> (407.07065, 890.44293)
Count: 9Zoom ->>>>> (407.4342, 896.73676)
Count: 10Zoom ->>>>> (407.57465, 897.67487)
Count: 11Zoom ->>>>> (611.7635, 801.2432)
Count: 12Zoom ->>>>> (408.3813, 899.4302)
Count: 13Zoom ->>>>> (409.71753, 899.5812)
Count: 14Zoom ->>>>> (410.57266, 899.47534)
Count: 15Zoom ->>>>> (411.42215, 899.6825)
Count: 16Zoom ->>>>> (414.72546, 899.962)
Count: 17Zoom ->>>>> (416.83835, 900.0826)
Count: 18Zoom ->>>>> (418.22287, 900.07965)
Count: 19Zoom ->>>>> (417.86465, 899.1971)
Count: 20Zoom ->>>>> (418.0332, 898.4385)
Count: 21Zoom ->>>>> (418.9025, 897.1738)
Count: 22Zoom ->>>>> (418.2