In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from scipy.spatial.transform import Rotation as R
from typing import List, Tuple

# Select the ipympl backend so matplotlib figures render as interactive widgets.
%matplotlib ipympl

In [16]:
def extract_(l, s):
    '''
        Look up an annotation by id and return the first of its values.

        An entry matches when its 'id' equals `s` or `s` followed by '_0'.
        Entries are scanned in order and the first match wins; when nothing
        matches the result is None.
    '''
    alias = f'{s}_0'
    hits = (entry['values'][0] for entry in l if entry['id'] == s or entry['id'] == alias)
    return next(hits, None)
            
# Load the step0 frame annotations. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding.
with open('./solo_1/sequence.0/step0.frame_data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# captures = data['captures']
# obj_1, obj_2 = captures[0], captures[1]
# img_1, img_2 = obj_1['filename'], obj_2['filename']
# obj_1_3d, bbox_1 = extract_(obj_1['annotations'], 'bounding box 3D'), extract_(obj_1['annotations'], 'bounding box')
# obj_2_3d, bbox_2 = extract_(obj_2['annotations'], 'bounding box 3D'), extract_(obj_2['annotations'], 'bounding box')


def extract(captures):
    '''
        Collect per-view labels from a list of captures.

        Returns a dict with two lists, both in capture order:
          'bboxes'  -- {'3D': object location, '2D': {'center', 'size'}} per view
          'cameras' -- {'filename', 'position'} per view
    '''

    def _object_location(view, box_3d):
        '''
            Capture position plus the 3D-box translation multiplied by the
            rotation matrix obtained from the box quaternion.
        '''
        rot_matrix = R.from_quat(box_3d.get('rotation')).as_matrix()
        return view.get('position') + np.dot(box_3d['translation'], rot_matrix)

    def _centered_box(box_2d):
        '''
            Re-express an origin/dimension box as its centre and half of its
            dimension (the latter is stored under 'size').
        '''
        origin = np.array(box_2d['origin'])
        half_extent = np.array(box_2d['dimension'])/2
        return {
            'center': origin + half_extent,
            'size': np.array(box_2d['dimension'])/2,
        }

    bboxes, cameras = [], []

    for capture in captures:  # one entry per view
        box_3d = extract_(capture['annotations'], 'bounding box 3D')
        box_2d = extract_(capture['annotations'], 'bounding box')

        location = _object_location(capture, box_3d)
        bboxes.append({'3D': location, '2D': _centered_box(box_2d)})

        cameras.append({
            'filename': capture['filename'],
            'position': capture['position'],
        })

    return {'bboxes': bboxes, 'cameras': cameras}

# Run the extraction over every capture of the loaded frame.
output = extract(data['captures'])
bboxes = output['bboxes']
cameras = output['cameras']

# Show the labels of the first two views.
print(bboxes[0])
print(bboxes[1])

{'3D': array([-2.00287218,  1.11557254, -2.92380786]), '2D': {'center': array([1884.,  734.]), 'size': array([ 47., 176.])}}
{'3D': array([-2.00286907,  1.11557279, -2.92380769]), '2D': {'center': array([2306. ,  811.5]), 'size': array([106. , 262.5])}}


In [3]:
@dataclass
class Camera:
    '''
        Pinhole camera: optics (focal length, resolution, sensor size) plus pose
        (position and three rotation angles in degrees).

        `__post_init__` derives three extra attributes:
          rotation   -- 3x3 rotation matrix built from `angle`
          unit_angle -- `[0, 0, 1] @ rotation` (the third row of `rotation`), normalised
          intrinsic  -- 3x3 matrix holding the focal length in pixel units and the
                        image centre as principal point
    '''
    focal_length: float
    resolution: Tuple[int, int]   # (x, y); resolution / 2 is used as the principal point
    sensor_size: Tuple[int, int]  # (x, y); presumably in the same unit as focal_length -- TODO confirm

    position: np.ndarray  # added to the rotated camera-frame points in pixel2ray
    angle: np.ndarray     # three angles in degrees, unpacked as (beta, alpha, gamma)

    def __post_init__(self):
        '''
            Build `rotation`, `unit_angle` and `intrinsic` from the dataclass fields.
        '''
        beta, alpha, gamma = self.angle  # R_x, R_y, R_z | pitch, yaw, roll

        sin_beta, cos_beta = np.sin(np.deg2rad(beta)), np.cos(np.deg2rad(beta))
        sin_alpha, cos_alpha = np.sin(np.deg2rad(alpha)), np.cos(np.deg2rad(alpha))
        sin_gamma, cos_gamma = np.sin(np.deg2rad(gamma)), np.cos(np.deg2rad(gamma))

        # NOTE(review): the matrix below has the form Rz(alpha) @ Ry(beta) @ Rx(gamma),
        # while the labels above pair beta with R_x and alpha with R_y -- confirm
        # which axis convention the dataset uses.
        self.rotation = np.array([
            [cos_alpha*cos_beta, cos_alpha*sin_beta*sin_gamma - sin_alpha*cos_gamma, cos_alpha*sin_beta*cos_gamma + sin_alpha*sin_gamma],
            [sin_alpha*cos_beta, sin_alpha*sin_beta*sin_gamma + cos_alpha*cos_gamma, sin_alpha*sin_beta*cos_gamma - cos_alpha*sin_gamma],
            [-sin_beta, cos_beta*sin_gamma, cos_beta*cos_gamma]
        ])

        # Row-vector product: picks the third row of the rotation matrix.
        unit_vec = np.array([0, 0, 1])
        v = unit_vec @ self.rotation
        self.unit_angle = v / np.linalg.norm(v)

        # Focal length converted to pixel units per axis; principal point at the image centre.
        self.intrinsic = np.array([
            [self.focal_length*self.resolution[0]/self.sensor_size[0], 0, self.resolution[0]/2],
            [0, self.focal_length*self.resolution[1]/self.sensor_size[1], self.resolution[1]/2],
            [0, 0, 1]
        ])

    def pixel2ray(self, pixel):
        '''
            Back-project pixel coordinates into rays.

            pixel: array-like of shape (N, 2), or a single pixel of shape (2,).

            Returns a Line. `origin` is the homogeneous pixel [u, v, 1] multiplied
            by the inverse intrinsic, then by `rotation` (row-vector convention),
            then shifted by `position`. `direction` is the unit vector pointing
            from `position` to that point. Both have shape (N, 3), or (3,) when a
            single (2,) pixel was given.
        '''
        pixel = np.asarray(pixel, dtype=float)
        single = pixel.ndim == 1
        pixels = np.atleast_2d(pixel)

        # Append the homogeneous coordinate 1 to every pixel.
        homogeneous = np.append(pixels, np.ones((pixels.shape[0], 1)), axis=-1)
        translation = np.asarray(self.position, dtype=float)

        camera_coor = homogeneous @ np.linalg.inv(self.intrinsic.T)
        world_coor = (camera_coor @ self.rotation) + translation

        vector = world_coor - translation
        # Normalise each ray on its own; a norm over the whole array would only
        # give unit vectors when exactly one pixel is passed.
        directional_vector = vector / np.linalg.norm(vector, axis=-1, keepdims=True)

        if single:
            world_coor, directional_vector = world_coor[0], directional_vector[0]

        return Line(origin=world_coor, direction=directional_vector)


@dataclass
class Line:
    '''
        A line described by a point on it and a direction vector.
    '''
    origin: np.ndarray     # a point on the line
    direction: np.ndarray  # vector the line extends along from `origin`


def find_points(line_a: 'Line', line_b: 'Line'):
    '''
        Midpoint of the shortest segment joining two 3D lines.

        Each argument needs an `origin` and a `direction` holding a 3-vector;
        arrays with extra length-1 axes (e.g. shape (1, 3)) are squeezed first.

        For each line the parameter of its closest point to the other line is
        computed, and the average of the two closest points is returned as an
        array of shape (3,). If the lines intersect this is the intersection.

        When the directions are exactly parallel (zero cross product) there is
        no unique closest pair; an all-NaN array is returned in that case.
    '''
    origin_a = np.squeeze(np.asarray(line_a.origin, dtype=float))
    origin_b = np.squeeze(np.asarray(line_b.origin, dtype=float))
    dir_a = np.squeeze(np.asarray(line_a.direction, dtype=float))
    dir_b = np.squeeze(np.asarray(line_b.direction, dtype=float))

    # Common perpendicular of the two directions.
    n = np.cross(dir_a, dir_b)
    denom = np.dot(n, n)
    if denom == 0:
        # Parallel directions: keep the NaN result explicit instead of dividing 0 by 0.
        return np.full(origin_a.shape, np.nan)

    offset = origin_b - origin_a
    t_a = np.dot(np.cross(dir_b, n), offset) / denom
    t_b = np.dot(np.cross(dir_a, n), offset) / denom

    p_a = origin_a + t_a * dir_a
    p_b = origin_b + t_b * dir_b

    return (p_a + p_b) / 2



In [17]:
def get_location(bboxes_list, cameras_list):
    '''
        Print each bounding-box entry next to the camera entry at the same
        index; pairing stops at the end of the shorter list.
    '''
    for pair in zip(bboxes_list, cameras_list):
        print(*pair)

# Wrap this frame's bboxes and cameras in one-element lists, so the function sees a single pair.
get_location([bboxes], [cameras])

[{'3D': array([-2.00287218,  1.11557254, -2.92380786]), '2D': {'center': array([1884.,  734.]), 'size': array([ 47., 176.])}}, {'3D': array([-2.00286907,  1.11557279, -2.92380769]), '2D': {'center': array([2306. ,  811.5]), 'size': array([106. , 262.5])}}] [{'filename': 'step0.camera_0.png', 'position': [8.0, 2.5, 3.0]}, {'filename': 'step0.camera.png', 'position': [-7.0, 2.5, 3.0]}]
