In [9]:
from fisheye.unfish import FisheyeFlatten
import numpy as np
import argparse

import cv2
import torch
from PIL import Image
from supervision import VideoInfo
from summarization.DSNet.src.kts.cpd_auto import cpd_auto
from action_detection.yowov2.config import build_dataset_config, build_model_config
from action_detection.yowov2.dataset.transforms import BaseTransform
from action_detection.yowov2.models import build_model
from action_detection.yowov2.utils.misc import load_weight
from summarization.DSNet.src.helpers.video_helper import FeatureExtractor
from summarization.DSNet.src.helpers import init_helper, vsumm_helper, bbox_helper
from summarization.DSNet.src.modules.model_zoo import get_model

In [10]:
parser = argparse.ArgumentParser(description='YOWOv2 Demo')

parser.add_argument('-size', '--img_size', default=224, type=int,
                    help='the size of input frame')
parser.add_argument('--show', action='store_true', default=False,
                    help='show the visulization results.')
parser.add_argument('--cuda', action='store_true', default=False,
                    help='use cuda.')
parser.add_argument('--save_folder', default='det_results/', type=str,
                    help='Dir to save results')
parser.add_argument('-vs', '--vis_thresh', default=0.3, type=float,
                    help='threshold for visualization')
parser.add_argument('--video', default='9Y_l9NsnYE0.mp4', type=str,
                    help='AVA video name.')
parser.add_argument('--gif', action='store_true', default=False,
                    help='generate gif.')

parser.add_argument('-d', '--dataset', default='ava_v2.2',
                    help='ava_v2.2')
parser.add_argument('--pose', action='store_true', default=False,
                    help='show 14 action pose of AVA.')


parser.add_argument('-v', '--version', default='yowo_v2_large', type=str,
                    help='build YOWOv2')
parser.add_argument('--weight', default='checkpoints/yowo2/yowo_v2_large.pth',
                    type=str, help='Trained state_dict file path to open')
parser.add_argument('-ct', '--conf_thresh', default=0.1, type=float,
                    help='confidence threshold')
parser.add_argument('-nt', '--nms_thresh', default=0.5, type=float,
                    help='NMS threshold')
parser.add_argument('--topk', default=40, type=int,
                    help='NMS threshold')
parser.add_argument('-K', '--len_clip', default=16, type=int,
                    help='video clip length.')
parser.add_argument('-m', '--memory', action="store_true", default=False,
                    help="memory propagate.")

parser.add_argument('--deep_sort_weights', type=str,
                    default='summarization/deep_sort_pytorch/deep_sort/deep/checkpoint/ckpt.t7',
                    help='ckpt.t7 path')
parser.add_argument('--source', type=str, default='0', help='source')

parser.add_argument('--save-txt', action='store_true', help='save MOT compliant results to *.txt')

parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 16 17')

parser.add_argument("--config_deepsort", type=str, default="summarization/deep_sort_pytorch/configs/deep_sort.yaml")

def kts(n_frames, features, sample_rate):
    seq_len = len(features)
    picks = np.arange(0, seq_len) * sample_rate

    kernel = np.matmul(features, features.T)
    change_points, _ = cpd_auto(kernel, seq_len - 1, 1, verbose=False)
    change_points *= sample_rate
    change_points = np.hstack((0, change_points, n_frames))
    begin_frames = change_points[:-1]
    end_frames = change_points[1:]
    change_points = np.vstack((begin_frames, end_frames - 1)).T

    n_frame_per_seg = end_frames - begin_frames
    return change_points, n_frame_per_seg, picks

def multi_hot_vis(args, frame, out_bboxes, orig_w, orig_h, class_names, act_pose=False):
    # visualize detection results
    for bbox in out_bboxes:
        x1, y1, x2, y2 = bbox[:4]

        cls_conf = bbox[5:]

        # rescale bbox
        x1, x2 = int(x1 * orig_w), int(x2 * orig_w)
        y1, y2 = int(y1 * orig_h), int(y2 * orig_h)

        # score = obj * cls
        det_conf = float(bbox[4])
        cls_scores = np.sqrt(det_conf * cls_conf)

        indices = np.where(cls_scores > args.vis_thresh)
        scores = cls_scores[indices]
        indices = list(indices[0])
        scores = list(scores)

        if len(scores) > 0:
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            blk = np.zeros(frame.shape, np.uint8)
            font = cv2.FONT_HERSHEY_SIMPLEX
            coord = []
            text = []
            text_size = []

            for _, cls_ind in enumerate(indices):
                text.append("[{:.2f}] ".format(scores[_]) + str(class_names[cls_ind]))
                text_size.append(cv2.getTextSize(text[-1], font, fontScale=0.5, thickness=1)[0])
                coord.append((x1 + 3, y1 + 14 + 20 * _))
                cv2.rectangle(blk, (coord[-1][0] - 1, coord[-1][1] - 12),
                              (coord[-1][0] + text_size[-1][0] + 1, coord[-1][1] + text_size[-1][1] - 4), (0, 255, 0),
                              cv2.FILLED)
            frame = cv2.addWeighted(frame, 1.0, blk, 0.5, 1)
            for t in range(len(text)):
                cv2.putText(frame, text[t], coord[t], font, 0.5, (0, 0, 0), 1)

    return frame



In [11]:
source = r"test_videos/video6.mp4"

Fisheye Rectification

In [12]:
camera_matrix = np.load('checkpoints/fisheye/camera_matrix.npy')
dist_coeffs = np.load('checkpoints/fisheye/dist_coeffs.npy')

width = 0
height = 0
i = 0
vid_writer = None
vid = cv2.VideoCapture(source)
output_name = source.rsplit('/', 1)[0]
frames_number = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
frames_size = VideoInfo.from_video_path(source).resolution_wh
rectification = FisheyeFlatten(frames_size, 1, camera_matrix, dist_coeffs)

frame_list = []

while True:
    _, frame = vid.read()
    i += 1
    if frame is None:
        break
    f = rectification(frame)
    frame_list.append(f)
    if (width == 0) or (height == 0):
        width = f.shape[1]
        height = f.shape[0]
    cv2.imshow("frame", f)
    cv2.waitKey(1)
#         vid_writer = cv2.VideoWriter(f"D:/results/{output_name}", cv2.VideoWriter_fourcc(*"mp4v"), 25, (width, height))
# 
#     vid_writer.write(f)
# vid_writer.release()
# destroyAllWindows()

Action Detection

In [18]:
video_clip = []
sample_rate = 16
basetransform = BaseTransform(img_size=height)
args = parser.parse_args(args=[])
d_cfg = build_dataset_config(args)
m_cfg = build_model_config(args)
class_names = d_cfg['label_map']
num_classes = 7
class_colors = [(np.random.randint(255),
                     np.random.randint(255),
                     np.random.randint(255)) for _ in range(num_classes)]

if args.cuda:
    print('use cuda')
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model, _ = build_model(
        args=args,
        d_cfg=d_cfg,
        m_cfg=m_cfg,
        device='cuda',
        num_classes=num_classes,
        trainable=False
    )

print(args.weight)

model = load_weight(model=model, path_to_ckpt=args.weight)

for frame in frame_list:
    frame_rgb = frame[..., (2, 1, 0)]
    frame_pil = Image.fromarray(frame_rgb.astype(np.uint8))
    
    video_clip.append(frame_pil)
    orig_h, orig_w = frame_rgb.shape[:2]
    
    if len(video_clip) < sample_rate:
        continue
        
    x, _ = basetransform(video_clip)

    x = torch.stack(x, dim=1)
    x = x.unsqueeze(0).to(device)
    
    outputs = model(x)

Dataset Config: AVA_V2.2 
Model Config: YOWO_V2_LARGE 
Build YOWO_V2_LARGE ...
num_classes 7
2D Backbone: YOLO_FREE_LARGE
--pretrained: False
FPN: pafpn_elan
Head: Decoupled Head
Head: Decoupled Head
Head: Decoupled Head
3D Backbone: RESNEXT101
--pretrained: False
Head: Decoupled Head
Head: Decoupled Head
Head: Decoupled Head
checkpoints/yowo2/yowo_v2_large.pth
Finished loading model!
16


KeyboardInterrupt: 

Video summarization

In [20]:
n_frames = 0
googlenet_model = FeatureExtractor()
feature_bank = []
summarize_args = init_helper.get_arguments()

summarize_model = get_model(summarize_args.model, **vars(args))
summarize_model = summarize_model.eval().to(summarize_args.device)
state_dict = torch.load(summarize_args.ckpt_path,map_location=lambda storage, loc: storage)
summarize_model.load_state_dict(state_dict)


for frame in frame_list:
    n_frames += 1
    if n_frames % sample_rate == 0:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        feature = googlenet_model.run(frame)
        feature_bank.append(feature)

feature_bank = np.array(feature_bank)
cps, nfps, picks = kts(n_frames, feature_bank, sample_rate)

seq_len = len(feature_bank)

with torch.no_grad():
    seq_torch = torch.from_numpy(feature_bank).unsqueeze(0).to("cuda")

    pred_cls, pred_bboxes = summarize_model.predict(seq_torch)

    pred_bboxes = np.clip(pred_bboxes, 0, seq_len).round().astype(np.int32)

    pred_cls, pred_bboxes = bbox_helper.nms(pred_cls, pred_bboxes, args.nms_thresh)
    pred_summ = vsumm_helper.bbox2summary(
        seq_len, pred_cls, pred_bboxes, cps, n_frames, nfps, picks)

cap = cv2.VideoCapture(args.source)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(args.save_path, fourcc, fps, (width, height))

frame_idx = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break

    if pred_summ[frame_idx]:
        out.write(frame)

    frame_idx += 1

out.release()
cap.release()


usage: ipykernel_launcher.py [-h] [--device {cuda,cpu}] [--seed SEED]
                             [--splits SPLITS [SPLITS ...]]
                             [--max-epoch MAX_EPOCH] [--model-dir MODEL_DIR]
                             [--log-file LOG_FILE] [--lr LR]
                             [--weight-decay WEIGHT_DECAY]
                             [--lambda-reg LAMBDA_REG]
                             [--nms-thresh NMS_THRESH] [--ckpt-path CKPT_PATH]
                             [--sample-rate SAMPLE_RATE] [--source SOURCE]
                             [--save-path SAVE_PATH]
                             [--base-model {attention,lstm,linear,bilstm,gcn}]
                             [--num-head NUM_HEAD] [--num-feature NUM_FEATURE]
                             [--num-hidden NUM_HIDDEN]
                             [--neg-sample-ratio NEG_SAMPLE_RATIO]
                             [--incomplete-sample-ratio INCOMPLETE_SAMPLE_RATIO]
                             [--pos-iou-thresh POS

AttributeError: 'tuple' object has no attribute 'tb_frame'