В данном варианте используется детектор особых точек SIFT и дескриптор на основе модели kornia. Для построения карты глубины используется нейросетевая модель MiDAs.
Для восстановления позы камеры используется функция cv2.solvePnPRansac. 
Из видео выбран каждый 10 кадр, для существенного смещения камеры.

In [35]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from transformers import DPTForDepthEstimation, DPTFeatureExtractor
import torch
import os
from tqdm import tqdm
import kornia as K
import kornia.feature as KF
from kornia_moons.feature import *


In [36]:
model = KF.HardNet(True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_depth = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(device)
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")


Some weights of DPTForDepthEstimation were not initialized from the model checkpoint at Intel/dpt-large and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.convolution1.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

The class DPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use DPTImageProcessor instead.



In [37]:
def local_descriptor_function(image, key_points, model):
  with torch.no_grad():
    model.eval()
    timg = K.color.rgb_to_grayscale(K.image_to_tensor(image, False).float()) / 255.
    lafs = laf_from_opencv_SIFT_kpts(key_points)
    affine = KF.LAFAffNetShapeEstimator(True)
    orienter = KF.LAFOrienter(32, angle_detector = KF.OriNet(True))
    orienter.eval()
    affine.eval()
    lafs_new = orienter(affine(lafs, timg), timg)
    patches = KF.extract_patches_from_pyramid(timg, lafs_new, 32)
    B, N, CH, H, W = patches.size()
    descs = model(patches.view(B * N, CH, H, W)).view(B * N, -1)
  return descs.detach().cpu().numpy()

In [38]:
def points_matching_function(image_1, image_2, model):

  detector_type = cv2.SIFT_create(8000)
  key_points_1 = detector_type.detect(image_1, None)
  key_points_2 = detector_type.detect(image_2, None)
  descriptor_1 = local_descriptor_function(image_1, key_points_1, model)
  descriptor_2 = local_descriptor_function(image_2, key_points_2, model)
  dists, idxs = KF.match_smnn(torch.from_numpy(descriptor_1), torch.from_numpy(descriptor_2), 0.3)
  tentatives = cv2_matches_from_kornia(dists, idxs)
  return key_points_1,key_points_2,tentatives


In [39]:
# Получить список файлов с изображениями
image_folder = "./frames"
image_files = sorted([os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith(".png")])

In [40]:
# --- Функция для получения карты глубины ---
def estimate_depth(image):
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model_depth(**inputs)
        depth = outputs.predicted_depth[0].cpu().numpy()
    depth_rescaled = (depth - depth.min()) / (depth.max() - depth.min())  # Нормализация
    return depth_rescaled

In [41]:
# --- Функция для восстановления движения камеры ---
def compute_camera_motion(kp1, kp2, matches, depth_map1, K):
    points3D_1 = []
    points2D_2 = []
    
    h, w = depth_map1.shape  # Размер карты глубины
    
    for match in matches:
        u1, v1 = int(kp1[match.queryIdx].pt[0]), int(kp1[match.queryIdx].pt[1])
        u2, v2 = int(kp2[match.trainIdx].pt[0]), int(kp2[match.trainIdx].pt[1])
        
        # Проверка, что координаты находятся в пределах карты глубины
        if 0 <= u1 < w and 0 <= v1 < h:
            Z1 = depth_map1[v1, u1]
            if Z1 > 0:  # Проверка валидности глубины
                X1 = (u1 - K[0, 2]) * Z1 / K[0, 0]
                Y1 = (v1 - K[1, 2]) * Z1 / K[1, 1]
                points3D_1.append([X1, Y1, Z1])
                points2D_2.append([u2, v2])
    
    if len(points3D_1) == 0 or len(points2D_2) == 0:
        raise ValueError("Недостаточно валидных точек для восстановления движения камеры.")
    
    points3D_1 = np.array(points3D_1)
    points2D_2 = np.array(points2D_2, dtype=np.float32)

    _, rvec, tvec, _ = cv2.solvePnPRansac(points3D_1, points2D_2, K, None)
    R, _ = cv2.Rodrigues(rvec)
    return R, tvec

In [42]:
from mpl_toolkits.mplot3d import Axes3D

def create_trajectory(poses):
    trajectory = [np.array([0, 0, 0])]
    current_pose = np.eye(4)
    pose_cam = [np.array([[0, 0, 0],[0, 0, 0],[0, 0, 0]])]

    for R, t in poses:
        T = np.eye(4)
        T[:3, :3] = R
        T[:3, 3] = t.T
        current_pose = np.dot(current_pose, T)
        trajectory.append(current_pose[:3, 3])
        pose_cam.append(current_pose[:3, :3])

    return np.array(trajectory),np.array(pose_cam)

In [43]:
import plotly.graph_objects as go
import numpy as np
# Создание фигуры

def plot_trajectory_3D(trajectory, pose_cam):
    fig = go.Figure()


    fig.add_trace(go.Scatter3d(
        x=trajectory[:, 0],
        y=trajectory[:, 1],
        z=trajectory[:, 2],
        mode='lines+markers',
        marker=dict(size=5, color='blue'),
        line=dict(color='blue')
    ))

    # Добавление ориентации камеры в каждой точке траектории
    for i, (R, t) in enumerate(zip(pose_cam,trajectory)):
        # Направление камеры (ось Z камеры)
        camera_direction = R @ np.array([0, 0, 1])  # Направление оси Z камеры
        camera_direction_end = t + camera_direction * 0.5  # Конец вектора направления

        # Добавляем линию, представляющую направление камеры
        fig.add_trace(go.Scatter3d(
            x=[t[0], camera_direction_end[0]],
            y=[t[1], camera_direction_end[1]],
            z=[t[2], camera_direction_end[2]],
            mode='lines',
            line=dict(color='green', width=2),
            name=f'Camera Direction {i}' if i == 0 else None,
            showlegend=False if i > 0 else True
        ))

    # Настройка макета графика
    fig.update_layout(
        title='Camera Motion Trajectory',
        scene=dict(
            xaxis_title='X-axis',
            yaxis_title='Y-axis',
            zaxis_title='Z-axis'
        ),
        showlegend=True
    )

    # Показать график
    fig.show()

In [44]:
K_ = np.array([[3000 ,   0.    , 960],
       [  0.    , 3000 , 540 ],
      [  0.    ,   0.    ,   1.]], dtype=np.float32) # Матрица камеры
sift = cv2.SIFT_create()
bf = cv2.BFMatcher()
trajectory = [[0, 0, 0]]
T_global = np.eye(4)
poses = []
frame_counter = 0

for i in tqdm(range(len(image_files) - 1)):
   try:
         img1 = cv2.cvtColor(cv2.imread(image_files[i]), cv2.COLOR_BGR2RGB)
         img2 = cv2.cvtColor(cv2.imread(image_files[i + 1]), cv2.COLOR_BGR2RGB)
 
         kp1, kp2, good_matches = points_matching_function(img1, img2, model)

         depth_map1 = estimate_depth(img1)
      
         # Обработка текущего кадра
         depth_map2 = estimate_depth(img2)
   
         # Восстановление движения камеры
         R, t = compute_camera_motion(kp1, kp2, good_matches, depth_map1, K_)
         poses.append((R, t))
      

         # Переход к следующему кадру
         depth_map1 = depth_map2
         kp1 = kp2
         #print(frame_counter)
   

         #trajectory = np.array(trajectory)
         
         
   except:
      raise Warning
   
trajectory,pose_cam = create_trajectory(poses)

plot_trajectory_3D(trajectory,pose_cam)

  0%|          | 0/113 [00:00<?, ?it/s]


`LAFAffNetShapeEstimator` default behaviour is changed and now it does preserve original LAF orientation. Make sure your code accounts for this.

100%|██████████| 113/113 [30:36<00:00, 16.25s/it]


Данный метод дал самые точные результаты. Время вычисления составило 30:36.
