In [4]:
import os

application_path = '../'

In [5]:
import sys

sys.path.append(os.path.join(application_path, 'tools'))

from monodepth2.infer import load_model
from tracktor_utils import tracker_obj
from tracktor.utils import interpolate
from torchvision.transforms import ToTensor, Compose, Resize, ToPILImage

from monodepth2.infer import infer_depth as monodepth_infer
from statsmodels.nonparametric.kernel_regression import KernelReg
from scipy.signal import savgol_filter

import cv2
import torch
import numpy as np

from tqdm import tqdm
from PIL import Image
from glob import glob

import pickle 

from tqdm import trange
from human_depth_dataset.dataset import (
    RGBDPeopleDataset, 
    KittiHumanDepthDataset
)

from human_depth_dataset.evaluate_depth import evaluateDepths, \
    calculateTrueErrors

import matplotlib.pyplot as plt

%matplotlib inline

In [6]:
def kalmanfilter(x,p,z,r):
    # p - estimate unceratininty 
    # r - measurement unceratininty ( σ2 )  
    # z - Measured System State

    # Kalman gain calculation
    K =  p/(p+r)
    # estimate current state
    x1 = x + K*(z-x)
    # update current estimate uncertainity
    p1 = (1-K)*p

    return (x1,p1)

In [12]:
tracker = tracker_obj(os.path.join(application_path, "tracking_wo_bnw"))

In [13]:
sequences = [0, 1, 2]

In [14]:
inference = {'name': 'monodepth'}

encoder, depth_decoder, (feed_width, feed_height) = load_model("mono+stereo_1024x320")
inference['encoder'] = encoder
inference['depth_decoder'] = depth_decoder
inference['input_size'] = (feed_width, feed_height)

-> Loading model from  models/mono+stereo_1024x320
   Loading pretrained encoder
   Loading pretrained decoder


In [17]:
depth_tracks = {}
depth_tracks_smoothed = {}
depth_tracks_p = {}

depth_frame_dict = {}

depth_merger = 'median'
for seq in tqdm(sequences, leave=True):
    depth_frame_dict[seq] = {}
    tracker.reset()
    
    frames = glob(f'../human_depth_dataset/data/rgbd/rgb/*_{seq}.ppm')
    frames = sorted(frames, key=lambda x: int(x.split('/')[-1].split('_')[1])) 
    
    for frame in frames:
        if 'combined' in frame:
            continue
        depth_frame_dict[seq][frame.split('/')[-1]] = []
        img_pil = Image.open(frame)
        
        depth_map, depth_im = monodepth_infer(inference['encoder'],
                                                  inference['depth_decoder'],
                                                  inference['input_size'],
                                                  img_pil)
        depth_map = depth_map[0, 0] * 5.4
        
        transforms = Compose([
            Resize((749, 1333)),
            ToTensor(),
        ])
        frame_batch = {
            'img': transforms(img_pil).unsqueeze(0)[:, :3, :, :]#.cuda()
        }
                
        tracker.step(frame_batch)
        results = tracker.get_results()
        results = interpolate(results)
        for t, r in results.items():
            x1, y1, x2, y2 = map(int, r[max(r, key=int)])
            m = np.zeros_like(depth_map)
            y1 = int(y1 * m.shape[0] / 749)
            y2 = int(y2 * m.shape[0] / 749)

            x1 = int(x1 * m.shape[1] / 1333)
            x2 = int(x2 * m.shape[1] / 1333)

            m[y1:y2, x1:x2] = 1
            person_depth = depth_map * m
            try:
                if depth_merger == 'mean':
                    avg_depth = person_depth[np.where(person_depth != 0)].mean()
                elif depth_merger == 'median': 
                    avg_depth = np.median(person_depth[np.where(person_depth != 0)])
                else:
                    raise Exception("Undefined depth_merger error!")
                x, y = int((x1 + x2) / 2), int((y1 + y2) / 2)

                if t not in depth_tracks:
                    depth_tracks[t] = [avg_depth]
                else: 
                    depth_tracks[t].append(avg_depth)
                    
                avg_depth_s = avg_depth
                p = 1
                if len(depth_tracks[t]) > 1:
                    avg_depth_s = depth_tracks_smoothed[t][-1]
                    p = depth_tracks_p[t][-1]
                
                avg_depth_s, p = kalmanfilter(avg_depth_s, p, avg_depth, 1)
                
                if t not in depth_tracks_smoothed:
                    depth_tracks_smoothed[t] = [avg_depth_s]
                else: 
                    depth_tracks_smoothed[t].append(avg_depth_s)
                    
                if t not in depth_tracks_p:
                    depth_tracks_p[t] = [p]
                else: 
                    depth_tracks_p[t].append(p)
                    
                depth_frame_dict[seq][frame.split('/')[-1]].append({
                    'box': [x1, y1, x2, y2],
                    'depth': avg_depth_s
                })
                
            except ValueError as e:
                print(e)
                continue

100%|██████████| 3/3 [13:15<00:00, 265.31s/it]


In [22]:

from torch.utils.data import Dataset

class RGBDPeopleDataset(Dataset):
    def __init__(self, data_root='../data/rgbd/', mask_file='../data/rgbd/yolact.pkl'):
        rgb_dir = 'rgb'
        depth_dir = 'depth'

        self.rgb_files = [os.path.join(data_root, rgb_dir, filename) for
                          filename in
                          os.listdir(os.path.join(data_root, rgb_dir)) if 'combined' not in filename]

        self.depth_files = []
        for path in self.rgb_files:
            filename = os.path.splitext(os.path.basename(path))[0]
            self.depth_files.append(
                os.path.join(data_root, depth_dir, filename+'.pgm'))
            
        self.masks = None
        if mask_file != None:
            with open(mask_file, 'rb') as f:
                self.masks = pickle.load(f)

    def __len__(self):
        return len(self.rgb_files)

    def __getitem__(self, idx):
        rgb_np = np.rot90(cv2.imread(self.rgb_files[idx]))
        rgb_np = cv2.cvtColor(rgb_np, cv2.COLOR_BGR2RGB)
        rgb_im = pil.fromarray(rgb_np)

        depth = cv2.imread(self.depth_files[idx], -1).newbyteorder()

        # According to the dataset paper: http://www2.informatik.uni-freiburg.de/~spinello/spinelloIROS11.pdf
        depth = 8 * 0.075 * 594.2 / (1084 - depth)
        depth = np.rot90(depth)
        
        index = self.rgb_files[idx].split('/')[-1]
        mask = self.masks[index] if self.masks != None else None

        return {'rgb': rgb_im, 'depth': depth, 'index': index, 'mask': mask}

In [23]:
rgbd_dataset = RGBDPeopleDataset( 
    '../human_depth_dataset/data/rgbd/',
    '../human_depth_dataset/data/rgbd/yolact.pkl'
)

In [25]:
import cv2
import pickle
import os.path
import numpy as np
import PIL.Image as pil
from torch.utils.data import Dataset


In [27]:
index

'seq0_0285_0.ppm'

In [35]:
stats = []

for i in trange(len(rgbd_dataset)):
    item = rgbd_dataset.__getitem__(i)
    mask = item['mask']['mask']
    index = item['index']
    depth = item['depth']
    
    seq, ind, _ = index.split('_')   
    tracktor_boxes = depth_frame_dict[int(index[-5])][index]
    
    if len(mask):
        matching = np.zeros((mask.shape[0], len(tracktor_boxes)))
        
        for ii, m in enumerate(mask):
            for jj, box in enumerate(tracktor_boxes):
                x1, y1, x2, y2 = box['box']
                matching[ii][jj] = m[y1:y2, x1:x2].sum() / m.sum()#((y2 - y1) * (x2 - x1))
        
        mask_assign = {}
        for jj, _ in enumerate(tracktor_boxes):
            current_masks = list(set(range(len(mask))) - set(mask_assign.keys()))
            if len(current_masks):
                mask_thres = np.where(matching[current_masks, jj] > 0.3)
                if len(mask_thres[0]):
                    mask_assign[matching[mask_thres[0], jj].argmax()] = jj
        
        for m, t in mask_assign.items():
            person_depth = depth * mask[m, :, :, 0]
            gt = np.median(person_depth[np.where(person_depth != 0)])
            predicted = tracktor_boxes[t]['depth']
            stats.append(evaluateDepths(predicted, gt))
    else:
        continue

100%|██████████| 3399/3399 [00:37<00:00, 91.20it/s]


In [36]:
calculateTrueErrors([s[0] for s in stats], [s[1] for s in stats])

Total number of pixels:  122.0
    0.4696,     1.4129,     0.1829,     2.6743,     0.5131,     0.3115,     0.6230,     0.7869


In [10]:
# import matplotlib.pyplot as plt
# import matplotlib.patches as patches
# from PIL import Image
# import numpy as np

# # Create figure and axes
# fig,ax = plt.subplots()

# # Display the image
# ax.imshow(mask[0, :, :, 0])

# # Create a Rectangle patch
# for b in tracktor_boxes:
#     x1, y1, x2, y2 = b['box']
#     rect = patches.Rectangle((x1,y1),(x2 - x1),(y2- y1),linewidth=1,edgecolor='r',facecolor='none')

#     # Add the patch to the Axes
#     ax.add_patch(rect)

# plt.show()

In [11]:
# rect = patches.Rectangle((50,100),40,30,linewidth=1,edgecolor='r',facecolor='none')

# plt.imshow(mask[0, :, :, 0])