# Домашнее задание

ML-разработчикам необходимо не только уметь обучать нейронные сети и генерировать новые идеи, но ещё и уметь встраивать наработки в pipeline. В этом домашнем задании нам предстоит сделать из frustum-детектора production-ready (ну почти) решение, которое может работать на сырых данных.

К сожалению, frustum-pointnet работает независимо для каждой 2D детекции. В этом домашнем задании вам предстоит написать обертку над frustum-pointnet, которая будет работать над целыми облаками. Вам также нужно будет воспользоваться 2D детектором, чтобы находить коробки на изображении.

Во второй части задания вам нужно будет написать оценку качества работы вашего алгоритма, которая становится чуть сложнее, когда на сцене может находиться много объектов.

## Часть 1

Ниже написан класс, который вам нужно реализовать. Чтобы воспользоваться предобученной сетью, позаимствуйте код из
https://github.com/charlesq34/frustum-pointnets/blob/master/train/test.py

Предобученные модели лежат здесь: https://shapenet.cs.stanford.edu/media/frustum_pointnets_snapshots.zip

В частности, вам нужно модифицировать функцию `get_session_and_ops` - функция должна уметь работать без глобальных флагов. После этого посмотрите, как эта функция используется.
Выход сети преобразуется в понятный формат в функции `write_detection_results`.

In [None]:
# ! pip install "tensorflow==1.14"
import os
import sys
import glob
from tqdm import tqdm
from collections import namedtuple
import numpy as np
import cv2
import pickle
from matplotlib import pyplot as plt 

from ssd import SSD
sys.path.append(os.path.join(os.getcwd(), 'frustum_pointnets/train'))
sys.path.append(os.path.join(os.getcwd(), 'frustum_pointnets/train/test_'))
sys.path.append(os.path.join(os.getcwd(), 'frustum_pointnets/kitti'))
import provider
from test_ import inference, get_session_and_ops
from kitti_util import Calibration
from kitti_object import get_lidar_in_image_fov, kitti_object

from shapely.geometry import Polygon
from shapely.affinity import rotate

In [None]:
# One 3D detection: centre `xyz`, heading `angle`, box size `lwh`
# (length, width, height) and the `confidence` of the originating 2D box.
Detection = namedtuple('Detection', ['xyz', 'angle', 'lwh', 'confidence'])
# All detections of one frame; `detections` is a list of Detection.
Scene = namedtuple('Scene', ['detections'])


class PipelineDetector(object):
    """Whole-frame 3D detector: 2D detector + frustum-pointnet.

    A 2D detector proposes boxes on the image, the lidar points falling
    inside each box form a frustum, and frustum-pointnet regresses one 3D
    box per frustum.
    """

    def __init__(self, ssd_detector, ssd_threshold, frustum_pointnet,
                 frustum_batch_size, frustum_num_pts):
        """Store the components and build the frustum-pointnet session.

        Args:
            ssd_detector: object whose `predict(image)` returns a dict with
                the keys 'detection_classes', 'detection_scores' and
                'detection_boxes'.
            ssd_threshold: minimal 2D score for a box to be used.
            frustum_pointnet: callable invoked as
                `f(sess, ops, point_clouds, one_hot, batch_size)`.
            frustum_batch_size: batch size handed to `frustum_pointnet`.
            frustum_num_pts: number of points sampled per frustum.
        """
        self.ssd_detector = ssd_detector
        self.ssd_threshold = ssd_threshold

        self.frustum_pointnet = frustum_pointnet
        self.frustum_batch_size = frustum_batch_size
        self.frustum_num_pts = frustum_num_pts
        self.frustum_sess, self.frustum_ops = get_session_and_ops(
            self.frustum_batch_size, self.frustum_num_pts)

    def predict(self, velo_pts, image, calib):
        """Detect vehicles in one frame.

        Args:
            velo_pts: (N, 4) lidar points; columns 0-2 are projected with
                `calib`, column 3 is copied through unchanged.
            image: image array; its first two dimensions are height, width.
            calib: calibration object providing `project_velo_to_rect` and
                `project_image_to_rect`.

        Returns:
            Scene with one Detection per usable 2D box, or None when no 2D
            box passes the filters or no box contains lidar points.
        """
        detection = self.ssd_detector.predict(image)
        # NOTE(review): assumes the detector returns 1-D class/score arrays
        # and that class id 1 is the vehicle class - confirm against SSD.
        classes = np.asarray(detection['detection_classes'])
        scores = np.asarray(detection['detection_scores'])
        keep = (classes == 1) & (scores >= self.ssd_threshold)
        boxes_2d = np.asarray(detection['detection_boxes'])[keep]
        box_scores = scores[keep]
        if len(boxes_2d) == 0:
            # Nothing to lift to 3D; skip the lidar projection entirely.
            return None

        # Lidar points expressed in the rectified camera frame, with the
        # fourth input column carried over.
        rect_pts = np.zeros_like(velo_pts)
        rect_pts[:, :3] = calib.project_velo_to_rect(velo_pts[:, :3].copy())
        rect_pts[:, 3] = velo_pts[:, 3]

        img_height, img_width = image.shape[:2]
        _, img_pts, in_img_mask = get_lidar_in_image_fov(
            velo_pts[:, :3].copy(), calib, 0, 0, img_width, img_height,
            return_more=True)

        frustum_examples = []
        frustum_angles = []
        # Scores are collected next to the frustums so that they stay
        # aligned when a 2D box is dropped for containing no lidar points.
        frustum_scores = []

        for box, score in zip(boxes_2d, box_scores):
            # Boxes are scaled by (height, width), i.e. they are treated as
            # normalized (ymin, xmin, ymax, xmax).
            box = (box.reshape((2, 2)) * image.shape[:2]).astype(int)
            (ul_y, ul_x), (lr_y, lr_x) = box
            box_mask = ((img_pts[:, 0] >= ul_x) & (img_pts[:, 0] < lr_x) &
                        (img_pts[:, 1] >= ul_y) & (img_pts[:, 1] < lr_y))
            frustum_pts = rect_pts[in_img_mask & box_mask]
            if len(frustum_pts) == 0:
                continue

            # Direction of the frustum axis: back-project the 2D box centre
            # at an arbitrary depth (only the direction matters).
            uvdepth = np.zeros((1, 3))
            uvdepth[0, :2] = [(ul_x + lr_x) / 2.0, (ul_y + lr_y) / 2.0]
            uvdepth[0, 2] = 20
            center_rect = calib.project_image_to_rect(uvdepth)
            frustum_angle = -1 * np.arctan2(center_rect[0, 2],
                                            center_rect[0, 0])
            frustum_angle += np.pi / 2.0

            # Rotate the frustum to the canonical view and resample it to a
            # fixed number of points. The global RNG is deliberately not
            # reseeded so a seed set by the caller stays in effect.
            point_cloud = provider.rotate_pc_along_y(frustum_pts.copy(),
                                                     frustum_angle)
            idx = np.random.choice(len(point_cloud),
                                   size=self.frustum_num_pts, replace=True)

            frustum_examples.append(point_cloud[idx])
            frustum_angles.append(frustum_angle)
            frustum_scores.append(score)

        num_frustums = len(frustum_examples)
        if num_frustums == 0:
            return None

        batch = np.array(frustum_examples)
        # Pad with copies of the last frustum so that the batch length is a
        # multiple of the batch size; padded outputs are never read back.
        pad = (-num_frustums) % self.frustum_batch_size
        if pad:
            batch = np.concatenate([batch, np.repeat(batch[-1:], pad, axis=0)])
        one_hot_batch = np.tile(np.array([1, 0, 0]), (len(batch), 1))

        predictions = self.frustum_pointnet(self.frustum_sess,
                                            self.frustum_ops,
                                            batch,
                                            one_hot_batch,
                                            self.frustum_batch_size)
        _, centers, heading_cls, heading_res, size_cls, size_res, _ = predictions

        scene = Scene([])
        # Iterate over the frustums, not over the prediction tuple.
        for i in range(num_frustums):
            height, width, length, tx, ty, tz, ry = \
                provider.from_prediction_to_label_format(centers[i],
                                                         heading_cls[i],
                                                         heading_res[i],
                                                         size_cls[i],
                                                         size_res[i],
                                                         frustum_angles[i])
            scene.detections.append(
                Detection(xyz=np.array((tx, ty, tz)),
                          angle=ry,
                          lwh=np.array((length, width, height)),
                          confidence=frustum_scores[i]))
        return scene

## Часть 2

Для оценки качества работы 3D детекторов обычно используется average precision. Как измерить precision и recall детектора?

У каждой коробки детектора есть confidence. После того, как мы зафиксировали порог, у нас остается часть детекций.
Давайте теперь посмотрим на сцену сверху: bird's eye view. Забудем про координату z.

Далее мы можем посчитать IoU между всеми коробками ground truth и нашими детекциями. Давайте решим, что если IoU больше 0.7, то мы считаем, что увидели gt-коробку, и относим эту детекцию к True Positive (TP). Если gt-коробка не нашла пару — это False Negative. Если детекция не нашла пару — это False Positive.

Ваша задача написать код подсчета метрики average precision построенного детектора.

In [None]:
def get_gt_bbox(labels, calib, types=('Car', 'Van')):
    """Build bird's-eye-view polygons for the ground-truth objects.

    Args:
        labels: iterable of label objects exposing `type`, `t`, `l`, `w`
            and `ry`.
        calib: calibration object providing `project_ref_to_velo`.
        types: label types to keep (immutable default, so the default
            cannot be altered between calls).

    Returns:
        List of polygons (as built by `get_polygon`) that survive
        `filter_by_distance`.
    """
    bbox = []
    for label in labels:
        if label.type not in types:
            continue
        # NOTE(review): the label position is converted with
        # project_ref_to_velo; confirm that this is the intended source
        # frame for `label.t`.
        location_ref = np.array(label.t).reshape(1, -1)
        location_velo = calib.project_ref_to_velo(location_ref).reshape(-1,)
        # Only the x/y part is used: the evaluation ignores the z axis.
        bbox.append(get_polygon(location_velo[:2], label.w, label.l, label.ry))
    return filter_by_distance(bbox)

def get_pred_bbox(detections, calib):
    """Build bird's-eye-view polygons for predicted detections.

    Args:
        detections: iterable of objects exposing `xyz`, `lwh` and `angle`.
        calib: calibration object providing `project_ref_to_velo`.

    Returns:
        List of polygons that survive `filter_by_distance`; detections
        dropped by that filter leave no placeholder in the result.
    """
    def _footprint(det):
        # Project the centre, then keep only x/y for the top-down view.
        center_velo = calib.project_ref_to_velo(det.xyz.reshape(1, -1)).reshape(-1,)
        length, width, _ = det.lwh
        return get_polygon(center_velo[:2], width, length, det.angle)

    footprints = [_footprint(det) for det in detections]
    return filter_by_distance(footprints)
            
def get_polygon(xy, width, length, angle):
    """Return a rotated rectangle polygon centred at `xy`.

    The rectangle is created around the origin with `width` along x and
    `length` along y, rotated by `-angle` radians and then shifted by `xy`.
    """
    corner_signs = ((-1, -1), (-1, 1), (1, 1), (1, -1))
    footprint = Polygon([(sx * width / 2, sy * length / 2)
                         for sx, sy in corner_signs])
    turned = rotate(footprint, -angle, use_radians=True)
    # The exterior ring repeats its first vertex at the end; keep the four
    # distinct corners only.
    corners = np.array(turned.exterior.coords)[:4]
    corners += xy
    return Polygon(corners)

def filter_by_distance(bbox, max_dist=40):
    """Keep the boxes whose centroid lies within `max_dist` of the origin.

    Distances are compared in squared form, so a centroid exactly at
    `max_dist` is kept. Input order is preserved.
    """
    limit_sq = max_dist ** 2
    kept = []
    for box in bbox:
        center = box.centroid
        if center.x ** 2 + center.y ** 2 <= limit_sq:
            kept.append(box)
    return kept

In [None]:
# from eval_det.py

def compute_iou(polygon1, polygon2):
    """Return the intersection-over-union of two polygons.

    Both arguments must provide `intersection(other)` and `union(other)`
    returning objects with an `area` attribute. When the union has no area
    (degenerate polygons) the IoU is defined as 0.0 instead of raising
    ZeroDivisionError.
    """
    union_area = polygon1.union(polygon2).area
    if union_area <= 0:
        return 0.0
    return polygon1.intersection(polygon2).area / union_area

def voc_ap(rec, prec):
    """Area under the precision/recall curve (all-points interpolation).

    Args:
        rec: recall values, one per detection, in ranking order.
        prec: precision values aligned with `rec`.

    Returns:
        Average precision as a scalar.
    """
    # Sentinels close the curve at recall 0 and recall 1.
    recall = np.concatenate(([0.], rec, [1.]))
    precision = np.concatenate(([0.], prec, [0.]))

    # Precision envelope: each entry becomes the maximum of itself and
    # everything to its right (running maximum taken from the end).
    envelope = np.maximum.accumulate(precision[::-1])[::-1]

    # Only positions where recall changes contribute a rectangle.
    steps = np.flatnonzero(recall[1:] != recall[:-1])
    widths = recall[steps + 1] - recall[steps]
    return np.sum(widths * envelope[steps + 1])

def eval_det_cls(pred, gt, ovthresh=0.7):
    """Compute precision/recall and average precision for a single class.

    Detections of all images are ranked by score and matched greedily:
    each detection is compared with the ground-truth boxes of its own
    image, and it is a true positive when its best IoU exceeds `ovthresh`
    and that ground-truth box has not been claimed by a higher-ranked
    detection. Everything else is a false positive.

    Boxes are kept in plain Python lists and only ever passed to
    `compute_iou`, so any box type that function accepts can be used.

    Input:
        pred: map of {img_id: [(bbox, score)]}
        gt: map of {img_id: [bbox]}
        ovthresh: scalar, IoU threshold
    Output:
        rec: numpy array of length nd (number of detections)
        prec: numpy array of length nd
        ap: scalar, average precision
    """
    # Ground truth per image together with "already matched" flags.
    class_recs = {}  # {img_id: {'bbox': bbox list, 'det': matched list}}
    npos = 0
    for img_id, gt_boxes in gt.items():
        gt_boxes = list(gt_boxes)
        npos += len(gt_boxes)
        class_recs[img_id] = {'bbox': gt_boxes, 'det': [False] * len(gt_boxes)}
    # Images that only have predictions get an empty ground-truth record.
    for img_id in pred:
        if img_id not in class_recs:
            class_recs[img_id] = {'bbox': [], 'det': []}

    # Flatten the detections of all images.
    image_ids = []
    confidence = []
    boxes = []
    for img_id, img_dets in pred.items():
        for box, score in img_dets:
            image_ids.append(img_id)
            confidence.append(score)
            boxes.append(box)

    # Rank by decreasing confidence.
    order = np.argsort(-np.asarray(confidence, dtype=float))
    boxes = [boxes[i] for i in order]
    image_ids = [image_ids[i] for i in order]

    # Walk down the ranking and mark TPs and FPs.
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        record = class_recs[image_ids[d]]
        ovmax = -np.inf
        jmax = -1
        for j, gt_box in enumerate(record['bbox']):
            iou = compute_iou(boxes[d], gt_box)
            if iou > ovmax:
                ovmax = iou
                jmax = j

        if ovmax > ovthresh and not record['det'][jmax]:
            tp[d] = 1.
            record['det'][jmax] = True
        else:
            # Either too little overlap or a duplicate of a matched box.
            fp[d] = 1.

    # Cumulative precision / recall along the ranking.
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    # Without any ground truth nothing can be recalled; report 0 instead
    # of dividing by zero.
    rec = tp / float(npos) if npos > 0 else np.zeros(nd)
    # The eps guard keeps the division defined even if tp + fp is 0.
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec)

    return rec, prec, ap

In [None]:
def get_detections_and_gt(dataset_len):
    """Run the detection pipeline over the dataset and collect BEV boxes.

    Args:
        dataset_len: number of frames to process (indices 0..dataset_len-1).

    Returns:
        (det_bboxs, gt_bboxs):
            det_bboxs: {frame idx: [[polygon, confidence], ...]}
            gt_bboxs: {frame idx: [polygon, ...]}
        Every processed frame gets an entry in both maps, including frames
        without detections, so their ground truth still counts as missed
        objects in the evaluation.
    """
    ssd = SSD()
    detector = PipelineDetector(ssd_detector=ssd,
                                ssd_threshold=0.5,
                                frustum_pointnet=inference,
                                frustum_batch_size=1,
                                frustum_num_pts=1024)
    dataset = kitti_object(root_dir='./')

    det_bboxs = dict()
    gt_bboxs = dict()

    for idx in tqdm(range(dataset_len)):
        img = dataset.get_image(idx)
        lidar = dataset.get_lidar(idx)
        calib = dataset.get_calibration(idx)
        labels = dataset.get_label_objects(idx)

        scene = detector.predict(lidar, img, calib)
        # `predict` may return None; treat that as "no detections".
        frame_detections = [] if scene is None else scene.detections

        box_with_conf = []
        for det in frame_detections:
            # Convert one detection at a time: get_pred_bbox drops far-away
            # boxes, and per-detection calls keep each polygon paired with
            # its own confidence.
            polygons = get_pred_bbox([det], calib)
            if polygons:
                box_with_conf.append([polygons[0], det.confidence])

        det_bboxs[idx] = box_with_conf
        gt_bboxs[idx] = get_gt_bbox(labels, calib)
    return det_bboxs, gt_bboxs

In [None]:
# Number of frames to process = number of files matching 'training/calib/*txt'
# (presumably one calibration file per frame - verify against the dataset).
dataset_len = len(glob.glob(os.path.join('training/calib', '*txt')))
# Run the detector on frames 0..dataset_len-1 and collect predicted and
# ground-truth boxes keyed by frame index.
det_bboxs, gt_bboxs = get_detections_and_gt(dataset_len)

In [None]:
# Evaluate with the default IoU threshold; only the average precision (the
# third returned value) is kept, the recall/precision arrays are discarded.
_, _, ap = eval_det_cls(det_bboxs, gt_bboxs)