In [1]:
# Copyright (c) Facebook, Inc. and its affiliates.
import argparse
import glob
import multiprocessing as mp
import numpy as np
import os
import tempfile
import time
import warnings
import cv2
import sys
import mss
import torch
from tqdm import tqdm

from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger

sys.path.insert(0, 'Detic/third_party/CenterNet2/')
sys.path.insert(0, 'Detic/')
# sys.path.insert(0, 'Detic/')
from centernet.config import add_centernet_config
from detic.config import add_detic_config

from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer, _create_text_labels

from detic.modeling.utils import reset_cls_test
import detectron2.data.transforms as T
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

In [2]:
# Vocabulary name -> path of the .npy classifier file for that vocabulary
# (paths are relative to the directory that contains the Detic checkout).
BUILDIN_CLASSIFIER = dict(
    lvis='Detic/datasets/metadata/lvis_v1_clip_a+cname.npy',
    objects365='Detic/datasets/metadata/o365_clip_a+cnamefix.npy',
    openimages='Detic/datasets/metadata/oid_clip_a+cname.npy',
    coco='Detic/datasets/metadata/coco_clip_a+cname.npy',
)

# Vocabulary name -> dataset name passed to MetadataCatalog.get().
BUILDIN_METADATA_PATH = dict(
    lvis='lvis_v1_val',
    objects365='objects365_v2_val',
    openimages='oid_val_expanded',
    coco='coco_2017_val',
)

In [5]:
def setup_cfg(config_path, weights_path, detic_root='Detic/', device="cpu", score_thresh=0.6):
    """
    Build a frozen detectron2 config for a Detic model.

    Input:
        config_path: path to the Detic YAML config file
        weights_path: path or URL stored in MODEL.WEIGHTS
        detic_root: directory of the Detic checkout; the (relative)
            CAT_FREQ_PATH from the config is resolved against it. The default
            keeps the previous hard-coded 'Detic/' prefix, which only works
            when the notebook's working directory contains the checkout.
        device: value written to MODEL.DEVICE before the config file is merged
        score_thresh: test-time score threshold applied to the RetinaNet,
            ROI-heads and panoptic-FPN settings
    Output: the frozen config node
    """
    cfg = get_cfg()
    cfg.MODEL.DEVICE = device
    add_centernet_config(cfg)
    add_detic_config(cfg)
    cfg.merge_from_file(config_path)
    cfg.merge_from_list(["MODEL.WEIGHTS", weights_path])
    # Set score_threshold for builtin models
    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = score_thresh
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = score_thresh
    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = score_thresh
    cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = 'rand'  # load later
    cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True
    # os.path.join leaves an already-absolute CAT_FREQ_PATH untouched and
    # yields exactly 'Detic/<path>' for the default root.
    cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = os.path.join(
        detic_root, cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH
    )
    cfg.freeze()
    return cfg

In [9]:
# NOTE: the config path below is machine-specific (absolute path on one
# developer's Windows box); point it at your own Detic checkout before running.
# The weights are fetched from the public Detic model URL.
cfg = setup_cfg(
    config_path="C:\\Users\\arshs\\OneDrive\\Documents\\GitHub\\mimic\\third_party\\Detic\\configs\\Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml",
    weights_path="https://dl.fbaipublicfiles.com/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth",
)

In [10]:
# Select the LVIS vocabulary: classifier file path, dataset metadata,
# and the number of "thing" classes listed in that metadata.
classifier = BUILDIN_CLASSIFIER["lvis"]
metadata = MetadataCatalog.get(BUILDIN_METADATA_PATH["lvis"])
num_classes = len(metadata.thing_classes)

In [11]:
# Build a detectron2 DefaultPredictor from the frozen config, then call
# reset_cls_test with the model, the classifier file path and the class count.
# NOTE(review): presumably this replaces the placeholder classifier
# (ZEROSHOT_WEIGHT_PATH is set to 'rand' in setup_cfg) with the LVIS
# vocabulary — confirm against detic.modeling.utils.
predictor = DefaultPredictor(cfg)
reset_cls_test(predictor.model, classifier, num_classes)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


FileNotFoundError: [Errno 2] No such file or directory: 'Detic/datasets/metadata/lvis_v1_train_cat_info.json'

In [None]:
# Sample still image; other images tried here: dog_bike_car.jpg, kitchen.jpeg
original_image = cv2.imread("dog_bike_car.jpg")

# Resize transform used by preprocess_img; both entries of the short-edge
# range are MIN_SIZE_TEST, capped by MAX_SIZE_TEST.
aug = T.ResizeShortestEdge([cfg.INPUT.MIN_SIZE_TEST] * 2, cfg.INPUT.MAX_SIZE_TEST)

In [8]:
def preprocess_img(original_image):
    """
    Turn one HxWxC image array into a model input dict.

    Input: image array whose first two dimensions are height and width
    Output: dict with
        "image": float32 CxHxW tensor of the image after the module-level
                 `aug` resize transform
        "height", "width": size of the ORIGINAL (un-resized) image
    """
    height, width = original_image.shape[:2]
    image = aug.get_transform(original_image).apply_image(original_image)
    # HWC -> CHW, as float32
    image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
    # Tensor.to() is not in-place: the result must be re-assigned, otherwise
    # the call is a no-op.
    image = image.to("cpu")
    return {"image": image, "height": height, "width": width}

In [9]:
def detic_forward(batched_inputs):
    """
    Run the global `predictor`'s model on a batch, without gradients.

    Input: list of input dicts (as built by preprocess_img)
    Output: (predictions, batched_feats)
        predictions: whatever predictor.model returns for the batch
        batched_feats: one tensor per prediction, obtained by taking
            pred['instances'].feats, flattening it to 2-D when it has more
            than two dimensions, and passing it through the `cls_score.linear`
            layer of the last box predictor.
    """
    def _project(instances):
        # Per-instance features -> output of the last box predictor's linear layer.
        feats = instances.feats
        if feats.dim() > 2:
            feats = torch.flatten(feats, start_dim=1)
        return predictor.model.roi_heads.box_predictor[-1].cls_score.linear(feats)

    with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
        outputs = predictor.model(batched_inputs)
        return outputs, [_project(single['instances']) for single in outputs]

In [None]:
# Read 'query2.mp4', run the detector on every frame, draw the predicted boxes
# and labels, and write the annotated frames to 'query2Out2.mp4'.
# Boxes whose label contains 'knife' are drawn thicker and in green, all other
# boxes in red (colors are RGB here: frames are converted before drawing and
# converted back to BGR before writing).
cap = cv2.VideoCapture('query2.mp4')
if not cap.isOpened():
    raise FileNotFoundError("Could not open input video 'query2.mp4'")
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
source_fps = cap.get(cv2.CAP_PROP_FPS)
fps = int(source_fps)
out = cv2.VideoWriter(
    "query2Out2.mp4",
    cv2.VideoWriter_fourcc(*"mp4v"),
    # Keep the source frame rate so the output plays at the original speed;
    # fall back to 30 when the container does not report one.
    source_fps if source_fps > 0 else 30,
    (frame_width, frame_height),
)

try:
    for f in tqdm(range(frame_count)):
        ret, frame = cap.read()
        if not ret:
            # The reported frame count can exceed the decodable frames.
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        batched_inputs = [preprocess_img(frame)]
        preds, feats = detic_forward(batched_inputs)
        predictions = preds[0]
        features = feats[0]
        instances = predictions['instances']
        labels = _create_text_labels(instances.pred_classes.tolist(), instances.scores, metadata.get("thing_classes", None))
        bboxes = instances.pred_boxes.tensor.cpu().numpy().astype(int).tolist()
        scores = instances.scores.tolist()
        for i, box in enumerate(bboxes):
            is_knife = 'knife' in labels[i]
            color = (0, 255, 0) if is_knife else (255, 0, 0)
            # OpenCV expects points as tuples of ints.
            cv2.rectangle(frame, tuple(box[:2]), tuple(box[2:]), color, 4 if is_knife else 2)
            cv2.putText(frame, f'{labels[i]}: {scores[i]:.2f}', (box[0], box[1] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 2, color, 3)
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        out.write(frame)
finally:
    # Always release both handles so the output file is finalized even when
    # a frame fails mid-way.
    cap.release()
    out.release()

In [10]:
def cos_sim(query1, query2, eps=1e-8):
    """
    Pairwise cosine similarity between the rows of two 2-D tensors.

    Input:
        query1: n x d tensor
        query2: m x d tensor
        eps: lower bound on the product of norms, so all-zero rows give a
             similarity of 0 instead of NaN
    Output: n x m tensor; entry (i, j) is cos(query1[i], query2[j]).
            The result is always 2-D, including when n == 1 or m == 1.
    """
    norm1 = torch.linalg.norm(query1, dim=-1, keepdim=True)  # n x 1
    norm2 = torch.linalg.norm(query2, dim=-1, keepdim=True)  # m x 1
    denom = torch.clamp(norm1 @ norm2.T, min=eps)            # n x m
    # A plain matrix product keeps the n x m shape; an unconditional squeeze
    # would drop size-1 dimensions and mis-broadcast against `denom`.
    return (query1 @ query2.T) / denom

In [11]:
def get_outputs(path):
    """
    Run a single image file through the model.

    Input: path to image
    Output: (features, labels, bboxes, scores) for the detections in the image
        features: per-detection feature tensor returned by detic_forward
                  (NOTE(review): previously documented as 512-d — confirm)
        labels: text labels built by _create_text_labels
        bboxes: list of integer boxes, one 4-element list per detection
        scores: list of float scores
    Raises: FileNotFoundError if the image cannot be read
    """
    frame = cv2.imread(path)
    if frame is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {path!r}")
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    batched_inputs = [preprocess_img(frame)]
    preds, feats = detic_forward(batched_inputs)
    instances = preds[0]['instances']
    features = feats[0]
    labels = _create_text_labels(instances.pred_classes.tolist(), instances.scores, metadata.get("thing_classes", None))
    # .cpu() is a no-op on CPU tensors and keeps .numpy() valid on other devices.
    bboxes = instances.pred_boxes.tensor.cpu().numpy().astype(int).tolist()
    scores = instances.scores.tolist()
    return features, labels, bboxes, scores

In [12]:
# Run both query images through the detector; each call yields the
# per-detection features, text labels, integer boxes and scores.
features1, labels1, bboxes1, scores1 = get_outputs('query1.png')
features2, labels2, bboxes2, scores2 = get_outputs('query2.png')

In [35]:
# Cosine similarity between every detection of image 1 and every detection of
# image 2 -> array of shape (#features1, #features2).
sim = cos_sim(features1, features2).numpy()
# For each detection in image 1, show the label (from labels2) of its most
# similar detection in image 2.
np.asarray(labels2)[sim.argmax(axis=-1)]

array(['person 91%', 'knife 91%', 'tomato 94%', 'tomato 94%',
       'lettuce 77%', 'tomato 87%', 'chopping_board 89%', 'jean 76%',
       'button 82%', 'lettuce 70%', 'cucumber 65%', 'cucumber 65%',
       'tomato 87%', 'button 82%', 'cucumber 65%', 'tomato 87%',
       'tomato 87%', 'button 82%', 'tomato 94%', 'chopping_board 89%',
       'tomato 87%', 'cucumber 65%', 'cucumber 65%', 'tomato 87%',
       'cucumber 65%', 'cucumber 65%', 'cucumber 90%', 'cucumber 65%'],
      dtype='<U18')

In [36]:
# Labels detected in image 1, for side-by-side comparison with the matches above.
np.asarray(labels1)

array(['person 94%', 'knife 92%', 'tomato 89%', 'apple 85%', 'bowl 82%',
       'bell_pepper 80%', 'chopping_board 79%', 'pot 78%', 'button 76%',
       'bowl 76%', 'green_onion 74%', 'bell_pepper 73%', 'grape 73%',
       'button 72%', 'green_onion 71%', 'bread 70%', 'carrot 70%',
       'button 70%', 'cherry 69%', 'baguet 69%', 'tomato 69%',
       'garlic 65%', 'brussels_sprouts 65%', 'chili_(vegetable) 63%',
       'garlic 63%', 'garlic 62%', 'green_onion 61%', 'green_onion 60%'],
      dtype='<U21')