In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
from dataclasses import dataclass, field
from scipy.spatial.transform import Rotation as R  # rotation axis ??? left-hand / clockwise
from typing import List, Tuple
from pathlib import Path as path

%matplotlib ipympl

In [2]:
'''
    Get 3D label and bounding boxes (from two images) for each scene
'''
def get_location(captures):
    # coeff = np.array([-0.00389254, 0.49512566, 0.13212298])

    def _find(l, s):
        for elem in l:
            if elem['id'] == s or elem['id'] == f'{s}_0':
                values = elem['values']
                for i, value in enumerate(values):
                    if value['labelName'] == 'human':
                        break
                else:
                    return None
                return values[i]
    
    def _get_bbox(view):
        '''
            For each view, extract the 2D bounding box and
        '''
        bbox_3d, bbox_2d = _find(view['annotations'], 'bounding box 3D'), _find(view['annotations'], 'bounding box')

        q = R.from_quat(bbox_3d.get('rotation'))

        obj_location = q.apply(bbox_3d.get('translation'), inverse=True) + view.get('position')
        # pre-process 2D bbox
        bbox = {
            'center': np.array(bbox_2d['origin']) + np.array(bbox_2d['dimension'])/2,
            'size': np.array(bbox_2d['dimension'])/2,
        }
        return obj_location, bbox

    # angles = [
    #     [10, 245, 0],
    #     [7, 120, 0],
    # ]
    angles = [
        [0, 225, 0],
        [0, 135, 0],
    ]
    bboxes, cameras = [], []
    for i, view in enumerate(captures):  # per view
        obj_location, bbox_2d = _get_bbox(view)

        bboxes.append(bbox_2d)
        cameras.append({
            'filename': view['filename'],
            'position': view['position'],
            'rotation': angles[i],
            'quaternion': view.get('rotation'),
            'intrinsic': view.get('matrix')
        })

    return {
        '3D_location': obj_location,
        'bboxes': bboxes,
        'cameras': cameras,
    }

In [3]:
@dataclass
class Camera:
    position: np.ndarray#=field(default_factory=np.array([0., 0., 0.]))
    angle: np.ndarray#=field(default_factory=np.array([0., 1., 0.]))  # angle in degree
    intrinsic_: np.ndarray#=field(default_factory=np.array([0., 1., 0.]))  # angle in degree
    quaternion: np.ndarray=field(default_factory=np.array([1., 0., 0., 0.]))  # angle in quaternion

    focal_length: float=20.
    resolution: np.ndarray=np.array([3840, 2160])
    sensor_size: np.ndarray=np.array([30, 30])
    fov: float=73.73979

    def __post_init__(self):
        # print('info', self.position, self.angle, self.quaternion)
        pitch, yaw, roll = self.angle  # R_x, R_y, R_z

        sin_yaw, cos_yaw = np.sin(np.deg2rad(yaw)), np.cos(np.deg2rad(yaw))
        sin_pitch, cos_pitch = np.sin(np.deg2rad(pitch)), np.cos(np.deg2rad(pitch))
        sin_roll, cos_roll = np.sin(np.deg2rad(roll)), np.cos(np.deg2rad(roll))
        
        self.rotation_yaw = np.array([
            [cos_yaw, 0, -sin_yaw],
            [0, 1, 0],
            [sin_yaw, 0, cos_yaw],       
        ])
        self.rotation_pitch = np.array([
            [1, 0, 0],
            [0, cos_pitch, -sin_pitch],
            [0, sin_pitch, cos_pitch],
        ])
        self.rotation_roll = np.array([
            [cos_roll, -sin_roll, 0],
            [sin_roll, cos_roll, 0],
            [0, 0, 1]
        ])
        # self.rotation = self.rotation_yaw @ self.rotation_pitch @ self.rotation_roll
        self.rotation = self.rotation_roll @ self.rotation_pitch @ self.rotation_yaw

        self.intrinsic = np.array([
            [self.focal_length*self.resolution[0]/self.sensor_size[0], 0, self.resolution[0]/2],
            [0, self.focal_length*self.resolution[1]/self.sensor_size[1], self.resolution[1]/2],
            [0, 0, 1]
        ])
        self.intrinsic_ = np.array(self.intrinsic_).reshape((3, 3))

    def pixel2ray(self, pixel):
        pixel = np.append(pixel, np.zeros((pixel.shape[0], 1)) + 1, axis=-1)
        # camera_coor = pixel @ self.intrinsic_
        camera_coor = pixel @ np.linalg.inv(self.intrinsic)
        camera_coor = camera_coor / camera_coor[:, -1]
        # camera_coor = pixel @ np.linalg.inv(self.intrinsic_)
        world_coor = (camera_coor @ self.rotation)
        vector = world_coor - np.array(self.position)
        directional_vector = vector / np.linalg.norm(vector)

        # print(self.quaternion)
        # q = R.from_quat(self.quaternion)
        # world_coor = q.apply(camera_coor, inverse=True) - self.position
        # directional_vector = world_coor / np.linalg.norm(world_coor)
        
        return [Line(origin=coor, direction=vec) for coor, vec in zip(world_coor, directional_vector)]
        
@dataclass
class Line:
    origin: np.ndarray
    direction: np.ndarray


def find_points(line_a: Line, line_b: Line):
    n = np.cross(line_a.direction, line_b.direction)
    d = np.abs(np.dot(n, line_a.origin - line_b.origin)) / np.linalg.norm(n)
    
    t_a = np.dot(np.cross(line_b.direction, n), (line_b.origin - line_a.origin)) / np.dot(n, n)
    t_b = np.dot(np.cross(line_a.direction, n), (line_b.origin - line_a.origin)) / np.dot(n, n)

    p_a = line_a.origin + t_a * line_a.direction
    p_b = line_b.origin + t_b * line_b.direction

    return (p_a + p_b) / 2

colors = ['red', 'blue']

In [7]:
folder_path = path.joinpath(path.cwd(), 'solo_9')

mse = []
for i in range(10):  # per scene
    # Extract 3D label and bboxes of each image from a scene
    file_path = path.joinpath(folder_path, f'sequence.{i}')
    with open(path.joinpath(file_path, 'step0.frame_data.json')) as json_file:
        data = json.load(json_file)
    captures = data['captures']
    output = get_location(captures)
    bboxes, cameras = output['bboxes'], output['cameras']

    cameras_ = [
        Camera(position=camera['position'], angle=camera['rotation'], intrinsic_=camera['intrinsic'], quaternion=camera['quaternion'])
        for camera in output['cameras']
    ]

    lines = []
    for j, (bbox, camera) in enumerate(zip(output.get('bboxes'), cameras_)):
        pixels = np.array([
            bbox['center']
        ])
        rays = camera.pixel2ray(pixels)
        stack_origin = np.stack([camera.position]*pixels.shape[0])
        lines.append(rays[-1])

    point = find_points(*lines)
    m = np.sum((point - output['3D_location'])**2)
    print(f'sample {i}')
    print(point)
    print(output['3D_location'])
    print('lpm', m)
    print('-'*30)
    mse.append(m)
    # break


sample 0
[0.15049528 0.37517559 0.45076814]
[ 0.04815929  1.11557293 -1.53010453]
lpm 4.482517429456927
------------------------------
sample 1
[0.15049208 0.37519587 0.45065092]
[-0.32352058  1.11557269  3.04011358]
lpm 7.478162685552228
------------------------------
sample 2
[0.15049078 0.37517599 0.45073344]
[0.32269124 1.11557293 0.21541381]
lpm 0.6332159538254851
------------------------------
sample 3
[0.15044741 0.37511251 0.45071041]
[1.9712949  1.11557293 2.46421274]
lpm 7.917958857848313
------------------------------
sample 4
[0.15050851 0.37520949 0.45070796]
[-0.78690091  1.11557293  0.36978478]
lpm 1.4334230116463162
------------------------------
sample 5
[0.15050319 0.3751933  0.450744  ]
[-0.38637322  1.11557269 -0.971902  ]
lpm 2.8603195873491516
------------------------------


KeyError: 'values'