In [None]:
# --------------------------------------------------------------
#   sfm_point_generator.py
#
#   Description:
#       This script generates 3D points from video data using 
#       Structure from Motion (SfM).
#   
#   Author: Özden Özel
#   Created: 2026-01-28
#
# --------------------------------------------------------------


import os
import cv2 as cv
from matplotlib import colors
import torch
from typing import List, Tuple
import numpy as np
import matplotlib.pyplot as plt


def generate_sfm_points(video_path: str, cam_traj_path: str,  output_path: str):
    """
    Generates 3D points from video data using Structure from Motion (SfM).

    NOTE: work in progress. The function currently only validates that the
    input files exist and opens the video; no points are generated and nothing
    is written to ``output_path`` yet.

    Args:
        video_path (str): Path to the input video file.
        cam_traj_path (str): Path to the camera trajectory file.
        output_path (str): Path to save the generated 3D points (not used yet).

    Raises:
        FileNotFoundError: If the video file or the camera trajectory file
            does not exist.
    """

    # Check if the video file exists
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    # Check if the camera trajectory file exists
    if not os.path.exists(cam_traj_path):
        raise FileNotFoundError(f"Camera trajectory file not found: {cam_traj_path}")

    # Read the video file
    cap = cv.VideoCapture(video_path)
    try:
        # TODO: read the frames and the camera trajectory, then generate and
        # save the SfM points.
        pass
    finally:
        # Always give the capture handle back, even if processing fails.
        cap.release()
    
    

def frame_to_sfm_points(frame: torch.Tensor, depth: torch.Tensor, R: torch.Tensor, t: torch.Tensor, cam_specs: dict, frame_index: int = 1) -> Tuple[List[int], List[torch.Tensor], List[torch.Tensor]]:
    """
    Back-projects every pixel of a frame that has a valid depth to a 3D world point.

    The camera model is a pinhole camera whose principal point is the image
    centre (``cam_specs['cx']`` / ``cam_specs['cy']`` are currently NOT used):

        x = f * Xc / Zc --> Xc = x * Zc / f
        y = f * Yc / Zc --> Yc = y * Zc / f

        [Xc, Yc, Zc]^T = R * [Xw, Yw, Zw]^T + t
            --> [Xw, Yw, Zw]^T = R^-1 * ([Xc, Yc, Zc]^T - t)

    Args:
        frame: Image indexed as ``frame[y, x]``; its first two dimensions are
            (height, width). Tensor or anything ``torch.as_tensor`` accepts.
        depth: Depth map of shape (height, width). A depth of 0 marks an
            invalid pixel, which is skipped.
        R: 3x3 camera rotation matrix (must be invertible).
        t: Camera translation with 3 elements.
        cam_specs (dict): Camera specifications; only ``'focal_length'`` is read.
        frame_index (int): Index stored for every generated point. Defaults to 1.

    Returns:
        Tuple[List[int], List[torch.Tensor], List[torch.Tensor]]: Three parallel
        lists in row-major pixel order: the frame index of each point, its
        world coordinates as a (3, 1) tensor, and its colour ``frame[y, x]``.

    Raises:
        ValueError: If ``depth`` does not have the same height and width as ``frame``.
    """

    focal_length = cam_specs['focal_length']

    frame = torch.as_tensor(frame)
    depth = torch.as_tensor(depth)
    if not torch.is_floating_point(depth):
        depth = depth.to(torch.get_default_dtype())

    height, width = frame.shape[0], frame.shape[1]
    if tuple(depth.shape) != (height, width):
        raise ValueError(
            f"Depth shape {tuple(depth.shape)} does not match frame size {(height, width)}"
        )

    # Pixel coordinates of all valid-depth pixels, in row-major order
    # (the same order a nested `for y: for x:` loop would visit them).
    rows, cols = torch.nonzero(depth != 0, as_tuple=True)
    skipped = depth.numel() - rows.numel()
    if skipped:
        print(f"Skipped {skipped} pixel(s) with invalid (zero) depth.")

    # Camera coordinates for all valid pixels at once, stacked as (3, N).
    Zc = depth[rows, cols]
    Xc = (cols.to(depth.dtype) - width * .5) * Zc / focal_length
    Yc = (rows.to(depth.dtype) - height * .5) * Zc / focal_length
    cam_coords = torch.stack((Xc, Yc, Zc), dim=0)

    # Convert to world coordinates. The inverse is computed once, and the
    # translation is a (3, 1) column so it broadcasts over the N points.
    R_inv = torch.inverse(torch.as_tensor(R, dtype=depth.dtype))
    t_column = torch.as_tensor(t, dtype=depth.dtype).reshape(3, 1)
    world = R_inv @ (cam_coords - t_column)

    # Unpack into the per-point lists promised by the return type.
    world_coords = list(world.T.contiguous().unsqueeze(-1).unbind(0))
    point_colors = list(frame[rows, cols].unbind(0))
    frame_indices = [frame_index] * len(world_coords)

    return frame_indices, world_coords, point_colors















def converge_sfm_points(points_list: List[torch.Tensor], delta: List[Tuple[float, float, float]]) -> torch.Tensor:
    """
    Placeholder for merging multiple sets of 3D points into a single coherent set.

    Not implemented yet: the function ignores its arguments and currently
    returns ``None``.

    Args:
        points_list (List[torch.Tensor]): List of 3D point tensors.
        delta (List[Tuple[float, float, float]]): Not used yet.
            NOTE(review): presumably one (x, y, z) offset per point set - confirm.
    Returns:
        torch.Tensor: Intended to be the converged 3D points; ``None`` for now.
    """
    return None





if __name__ == "__main__":
    # Manual smoke test: back-project one image + depth map that live next to
    # this script and show the resulting point cloud in a 3D scatter plot.
    script_dir = os.path.dirname(__file__)
    frame_path = os.path.join(script_dir, 'dog.jpg')
    depth_path = os.path.join(script_dir, 'depth.npy')

    frame = cv.imread(frame_path)  # returns None if file unreadable
    if frame is None:
        raise FileNotFoundError(f"Image not found or unreadable: {frame_path}")
    frame = torch.Tensor(frame)

    if not os.path.exists(depth_path):
        raise FileNotFoundError(f"Depth file not found: {depth_path}")
    depth = np.load(depth_path)
    depth = torch.Tensor(depth)

    print(f"Frame shape: {frame.shape}, Depth shape: {depth.shape}")
    print(f"Frame type: {type(frame)}, Depth type: {type(depth)}")

    # Placeholder camera pose and intrinsics for the smoke test.
    # NOTE(review): R is not orthonormal, so it is not a true rotation matrix.
    R = torch.Tensor([
        [1, 2, 4],
        [0, 4, 2],
        [0, 0, 2]
        ])
    t = torch.Tensor([1, 2, 1])

    cam_specs = {
        'focal_length': 50e-3,
        'cx': 320,
        'cy': 240
    }

    frame_indices, world_coords, colors = frame_to_sfm_points(frame, depth, R, t, cam_specs)

    if not world_coords:
        raise ValueError("No points were generated; nothing to plot.")

    # world_coords is a list of per-point tensors. Stack it into one row per
    # point so the columns are the X, Y and Z coordinates of ALL points
    # (indexing the list directly would only pick out the first three points).
    points = torch.stack(world_coords).reshape(len(world_coords), -1)

    # Plot at most ~50k points so the interactive 3D view stays responsive.
    max_plot_points = 50_000
    stride = max(1, -(-points.shape[0] // max_plot_points))
    points = points[::stride]
    xs, ys, zs = points[:, 0], points[:, 1], points[:, 2]

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    print(f"x_shape: {xs.shape}, y_shape: {ys.shape}, z_shape: {zs.shape}")
    ax.scatter(xs.numpy(), ys.numpy(), zs.numpy(), s=2)

    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_zlabel("Z")

    plt.show()



In [27]:
import os
import cv2 as cv
from matplotlib import colors
import torch
from typing import List, Tuple
import numpy as np
import matplotlib.pyplot as plt


# Scratch cell: back-project a single pixel, (x, y) = (100, 100), step by step
# and print every intermediate tensor with its shape.
script_dir = "C:\\Users\\ozden\\source\\repos\\Video2World\\src\\utils"
frame_path = os.path.join(script_dir, 'dog.jpg')
depth_path = os.path.join(script_dir, 'depth.npy')

frame = cv.imread(frame_path)  # returns None if file unreadable
if frame is None:
    raise FileNotFoundError(f"Image not found or unreadable: {frame_path}")
frame = torch.Tensor(frame)

if not os.path.exists(depth_path):
    raise FileNotFoundError(f"Depth file not found: {depth_path}")
depth = np.load(depth_path)
depth = torch.Tensor(depth)

print(f"Frame shape: {frame.shape}, Depth shape: {depth.shape}")
print(f"Frame type: {type(frame)}, Depth type: {type(depth)}\n")

# Placeholder camera pose used for the experiment.
# NOTE(review): R is not orthonormal, so it is not a true rotation matrix.
R = torch.Tensor([
    [1, 2, 4], 
    [0, 4, 2], 
    [0, 0, 2]
    ])
t = torch.Tensor([1, 2, 1])

# Only 'focal_length' is read below; 'cx' and 'cy' are not used in this cell.
cam_specs = {
    'focal_length': 50e-3,
    'cx': 320,
    'cy': 240
}


print(f"R: {R}, R_shape: {R.shape}")
print(f"t: {t}, t_shape: {t.shape}")

print("\n================================================================")
print("================================================================\n")

# ========================================================================
# ========================================================================

# Pixel under inspection.
y = 100
x = 100
focal_length = cam_specs['focal_length']

Zc = depth[y, x]
if Zc == 0:
    # A depth of exactly 0 is treated as "no measurement" for this pixel.
    print(f"Invalid depth at pixel ({x}, {y}), skipping.")
else: 
    # Pinhole back-projection; pixel offsets are measured from the image centre.
    Xc = (x - frame.shape[1] * .5) * Zc / focal_length
    Yc = (y - frame.shape[0] * .5) * Zc / focal_length
    
    # Form the camera coordinates
    cam_coords = torch.tensor([[Xc], [Yc], [Zc]])
    color = frame[y, x]
    
    print(f"cam_coords: {cam_coords}, cam_Coords_shape: {cam_coords.shape}\n")
    print(f"color: {color}, color_shape: {color.shape}\n")
    
    # Convert to world coordinates
    R_inv = torch.inverse(R)
    t_tensor = t.reshape(3, 1)

    print(f"R_inv: {R_inv}, R_inv_shape: {R_inv.shape}\n")
    print(f"t_tensor: {t_tensor}, t_tensor_shape: {t_tensor.shape}\n")
    # Debug print only: shows the transposed vector. The transform below uses
    # the untransposed cam_coords so that it lines up with t_tensor.
    print(f"torch.transpose(cam_coords, 0, 1): {torch.transpose(cam_coords, 0, 1)}, shape: {torch.transpose(cam_coords, 0, 1).shape}\n")

    # Xw = R^-1 * (Xc - t)
    world_coords = R_inv @ (cam_coords - t_tensor)
    colors = color
    
    print(f"world_coords: {world_coords}, world_coords_shape: {world_coords.shape}\n")
    print(f"colors: {colors}, colors_shape: {colors.shape}\n")

Frame shape: torch.Size([1213, 1546, 3]), Depth shape: torch.Size([1213, 1546])
Frame type: <class 'torch.Tensor'>, Depth type: <class 'torch.Tensor'>

R: tensor([[1., 2., 4.],
        [0., 4., 2.],
        [0., 0., 2.]]), R_shape: torch.Size([3, 3])
t: tensor([1., 2., 1.]), t_shape: torch.Size([3])


cam_coords: tensor([[-3.8006e+04],
        [-2.8604e+04],
        [ 2.8237e+00]]), cam_Coords_shape: torch.Size([3, 1])

color: tensor([ 5., 11., 10.]), color_shape: torch.Size([3])

R_inv: tensor([[ 1.0000, -0.5000, -1.5000],
        [ 0.0000,  0.2500, -0.2500],
        [ 0.0000,  0.0000,  0.5000]]), R_inv_shape: torch.Size([3, 3])

t_tensor: tensor([[1.],
        [2.],
        [1.]]), t_tensor_shape: torch.Size([3, 1])

torch.transpose(cam_coords, 0, 1): tensor([[-3.8006e+04, -2.8604e+04,  2.8237e+00]]), shape: torch.Size([1, 3])

world_coords: tensor([[-2.3707e+04],
        [-7.1519e+03],
        [ 9.1183e-01]]), world_coords_shape: torch.Size([3, 1])

colors: tensor([ 5., 11., 10.]), 

In [1]:
import os
import cv2 as cv
import torch
from typing import List, Tuple
import numpy as np
import matplotlib.pyplot as plt


# Scratch cell: back-project a region of the frame pixel by pixel and collect
# the world coordinates and colours of every pixel with a valid depth.
script_dir = "C:\\Users\\ozden\\source\\repos\\Video2World\\src\\utils"
frame_path = os.path.join(script_dir, 'dog.jpg')
depth_path = os.path.join(script_dir, 'depth.npy')

frame = cv.imread(frame_path)  # returns None if file unreadable
if frame is None:
    raise FileNotFoundError(f"Image not found or unreadable: {frame_path}")
frame = torch.Tensor(frame)

if not os.path.exists(depth_path):
    raise FileNotFoundError(f"Depth file not found: {depth_path}")
depth = np.load(depth_path)
depth = torch.Tensor(depth)

print(f"Frame shape: {frame.shape}, Depth shape: {depth.shape}")
print(f"Frame type: {type(frame)}, Depth type: {type(depth)}\n")

# Placeholder camera pose used for the experiment.
# NOTE(review): R is not orthonormal, so it is not a true rotation matrix.
R = torch.Tensor([
    [1, 2, 4],
    [0, 4, 2],
    [0, 0, 2]
    ])
t = torch.Tensor([1, 2, 1])

# Only 'focal_length' is read below; 'cx' and 'cy' are not used in this cell.
cam_specs = {
    'focal_length': 50e-3,
    'cx': 320,
    'cy': 240
}


print(f"R: {R}, R_shape: {R.shape}\n")
print(f"t: {t}, t_shape: {t.shape}")

print("\n================================================================")
print("================================================================\n")

# ========================================================================
# ========================================================================

focal_length = cam_specs['focal_length']

# Loop invariants: the pose does not change per pixel, so invert R and reshape
# t once instead of once per pixel.
R_inv = torch.inverse(R)
t_tensor = t.reshape(3, 1)
half_width = frame.shape[1] * .5
half_height = frame.shape[0] * .5

world_coords = []
colors = []

# NOTE: the "-800" is a debug crop - only the top-left region of the frame is
# processed to keep the pure-Python loop short.
for y in range(frame.shape[0]-800):
    for x in range(frame.shape[1]-800):
        if y % 100 == 0 and x % 100 == 0:
            print(f"Processing pixel ({x}, {y})")

        Zc = depth[y, x]
        if Zc == 0:
            # A depth of exactly 0 is treated as "no measurement".
            print(f"Invalid depth at pixel ({x}, {y}), skipping.")
            continue

        # Pinhole back-projection; offsets are measured from the image centre.
        Xc = (x - half_width) * Zc / focal_length
        Yc = (y - half_height) * Zc / focal_length

        # Form the camera coordinates
        cam_coords = torch.tensor([[Xc], [Yc], [Zc]])
        color = frame[y, x]

        # Convert to world coordinates: Xw = R^-1 * (Xc - t)
        world_coords.append(R_inv @ (cam_coords - t_tensor))
        colors.append(color)

print(f"world_coords.shape: {len(world_coords)}")
print(f"colors.shape: {len(colors)}")

Frame shape: torch.Size([1213, 1546, 3]), Depth shape: torch.Size([1213, 1546])
Frame type: <class 'torch.Tensor'>, Depth type: <class 'torch.Tensor'>

R: tensor([[1., 2., 4.],
        [0., 4., 2.],
        [0., 0., 2.]]), R_shape: torch.Size([3, 3])

t: tensor([1., 2., 1.]), t_shape: torch.Size([3])


Processing pixel (0, 0)
Processing pixel (100, 0)
Processing pixel (200, 0)
Processing pixel (300, 0)
Processing pixel (400, 0)
Processing pixel (500, 0)
Processing pixel (600, 0)
Processing pixel (700, 0)
Processing pixel (0, 100)
Processing pixel (100, 100)
Processing pixel (200, 100)
Processing pixel (300, 100)
Processing pixel (400, 100)
Processing pixel (500, 100)
Processing pixel (600, 100)
Processing pixel (700, 100)
Processing pixel (0, 200)
Processing pixel (100, 200)
Processing pixel (200, 200)
Processing pixel (300, 200)
Processing pixel (400, 200)
Processing pixel (500, 200)
Processing pixel (600, 200)
Processing pixel (700, 200)
Processing pixel (0, 300)
Processing pixel (10

In [None]:
# Call the method: a bare `depth.norm` only displays the bound method object,
# not the norm of the depth map.
depth.norm()

world_coords.len: 308098, world_coords[0].shape: torch.Size([3, 1])


AttributeError: 'list' object has no attribute 'shape'

In [None]:
import torch
from dataclasses import dataclass


@dataclass
class SFMPoint:
    """
    A single SfM point: position, covariance, colour and opacity, all stored as tensors.

    Equality is defined field by field with ``torch.equal``; comparing tensors
    with plain ``==`` is element-wise and would not yield a single bool.
    Because ``__eq__`` is defined without ``__hash__``, instances are unhashable.
    """
    coords: torch.Tensor       # 3D coordinates
    covariance: torch.Tensor   # Covariance matrix
    color: torch.Tensor        # Color information
    alpha: torch.Tensor        # Opacity

    def __eq__(self, other):
        """Return True when every tensor field of both points is equal.

        Returns ``NotImplemented`` for non-``SFMPoint`` operands so Python can
        try the other operand's comparison; ``point == other`` then still
        evaluates to ``False`` for unrelated types.
        """
        if not isinstance(other, SFMPoint):
            return NotImplemented
        return (torch.equal(self.coords, other.coords) and
                torch.equal(self.covariance, other.covariance) and
                torch.equal(self.color, other.color) and
                torch.equal(self.alpha, other.alpha))



def _demo_point(xyz):
    """Build an SFMPoint at `xyz` with identity covariance, colour (255, 0, 0) and alpha 1.0."""
    return SFMPoint(
        coords=torch.tensor(xyz),
        covariance=torch.eye(3),
        color=torch.tensor([255, 0, 0]),
        alpha=torch.tensor(1.0),
    )


# Two sample points that differ only in their second coordinate.
p1 = _demo_point([1.0, 3.0, 3.0])
p2 = _demo_point([1.0, 2.0, 3.0])

# Show both points, then the result of the custom equality check.
print(f"Point 1: {p1}")
print(f"Point 2: {p2}")
print(f"Point 1: {p1}")
print(f"Are points equal? {p1 == p2}")

Point 1: SFMPoint(coords=tensor([1., 2., 3.]), covariance=tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]), color=tensor([255,   0,   0]), alpha=tensor(1.))
Point 2: SFMPoint(coords=tensor([1., 2., 3.]), covariance=tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]), color=tensor([255,   0,   0]), alpha=tensor(1.))
Point 1: SFMPoint(coords=tensor([1., 2., 3.]), covariance=tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]), color=tensor([255,   0,   0]), alpha=tensor(1.))
Are points equal? True
