
Commit

Support multi-res occ grid & prop net (#176)
* multres grid

* prop

* benchmark with prop and occ

* benchmark blender with weight_decay

* docs

* bump version
liruilong940607 committed Mar 15, 2023
1 parent 82fd69c commit b53f8a4
Showing 35 changed files with 2,342 additions and 1,408 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -118,4 +118,5 @@ venv.bak/
.vsocde

benchmarks/
outputs/
outputs/
data
6 changes: 0 additions & 6 deletions docs/source/apis/generated/nerfacc.ray_resampling.rst

This file was deleted.

1 change: 0 additions & 1 deletion docs/source/apis/utils.rst
@@ -17,7 +17,6 @@ Utils
render_weight_from_alpha
render_visibility

ray_resampling
pack_data
unpack_data
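The retained `pack_data` / `unpack_data` utilities deal with rays that carry different numbers of samples by flattening them into one contiguous buffer plus per-ray bookkeeping. A rough numpy illustration of such a packed layout (our own sketch of the idea; the names, shapes, and the `(start, count)` convention here are assumptions, not the library's actual signatures):

```python
import numpy as np

def pack(per_ray_samples):
    # Concatenate variable-length per-ray sample lists into one flat array,
    # recording (start, count) per ray so they can be recovered.
    counts = np.array([len(s) for s in per_ray_samples])
    starts = np.concatenate([[0], np.cumsum(counts)[:-1]])
    packed_info = np.stack([starts, counts], axis=-1)  # (n_rays, 2)
    data = np.concatenate(per_ray_samples)             # (total_samples,)
    return packed_info, data

def unpack(packed_info, data):
    # Invert pack(): slice each ray's samples back out of the flat buffer.
    return [data[s : s + c] for s, c in packed_info]

info, flat = pack([np.array([1.0, 2.0]), np.array([3.0]), np.array([])])
```

Packing like this lets CUDA kernels launch over one flat sample buffer instead of a padded `(n_rays, max_samples)` tensor.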

12 changes: 8 additions & 4 deletions docs/source/examples/ngp.rst
@@ -7,7 +7,7 @@ See code `examples/train_ngp_nerf.py` at our `github repository`_ for details.

Benchmarks
------------
*updated on 2022-10-12*
*updated on 2023-03-14*

Here we trained an `Instant-NGP Nerf`_ model on the `Nerf-Synthetic dataset`_. We follow the same
settings as the Instant-NGP paper, which uses the train split for training and the test split for
@@ -30,11 +30,15 @@ memory footprint is about 3GB.
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
|(training time) | 309s | 258s | 256s | 316s | 292s | 207s | 218s | 250s | 263s |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
|Ours 20k steps | 35.50 | 36.16 | 29.14 | 35.23 | 37.15 | 31.71 | 24.88 | 29.91 | 32.46 |
|Ours (occ) 20k steps | 35.81 | 36.87 | 29.59 | 35.70 | 37.45 | 33.63 | 24.98 | 30.64 | 33.08 |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
|(training time) | 287s | 274s | 269s | 317s | 269s | 244s | 249s | 257s | 271s |
|(training time) | 288s | 255s | 247s | 319s | 274s | 238s | 247s | 252s | 265s |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
|Ours (prop) 20k steps | 34.06 | 34.32 | 27.93 | 34.27 | 36.47 | 31.39 | 24.39 | 30.57 | 31.68 |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+
|(training time) | 238s | 236s | 250s | 235s | 235s | 236s | 236s | 236s | 240s |
+-----------------------+-------+-------+---------+-------+-------+-------+-------+-------+-------+

.. _`Instant-NGP Nerf`: https://github.com/NVlabs/instant-ngp/tree/51e4107edf48338e9ab0316d56a222e0adf87143
.. _`github repository`: https://github.com/KAIR-BAIR/nerfacc/tree/76c0f9817da4c9c8b5ccf827eb069ee2ce854b75
.. _`github repository`: https://github.com/KAIR-BAIR/nerfacc/
.. _`Nerf-Synthetic dataset`: https://drive.google.com/drive/folders/1JDdLGDruGNXWnM1eqY1FNL9PlStjaKWi
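Both estimators benchmarked above serve the same goal: concentrating samples where the per-sample compositing weights are non-negligible, the occupancy grid by skipping empty space up front, the proposal network by learning where to resample. A minimal numpy sketch of the standard volume-rendering weights both approaches feed into (illustrative math only, not nerfacc's fused kernels):

```python
import numpy as np

# Densities and interval lengths for the samples along one ray.
sigmas = np.array([0.0, 0.5, 2.0, 0.1])
dts = np.full(4, 0.25)

# Standard volume-rendering quantities:
alphas = 1.0 - np.exp(-sigmas * dts)  # per-interval opacity
# Transmittance: probability the ray survives up to each sample.
trans = np.concatenate([[1.0], np.cumprod(1.0 - alphas)[:-1]])
weights = trans * alphas  # compositing weights, sum <= 1
```

Samples with near-zero density get near-zero weight, which is exactly what the occupancy grid prunes and what the proposal network learns not to allocate.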
15 changes: 11 additions & 4 deletions docs/source/examples/unbounded.rst
@@ -5,7 +5,7 @@ See code `examples/train_ngp_nerf.py` at our `github repository`_ for details.

Benchmarks
------------
*updated on 2022-11-07*
*updated on 2023-03-14*

Here we trained an `Instant-NGP Nerf`_ on the `MipNerf360`_ dataset. We used the train
split for training and the test split for evaluation. Our experiments are conducted on a
@@ -32,12 +32,19 @@ that takes from `MipNerf360`_.
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| MipNerf360 (~days) | 26.98 | 24.37 | 33.46 | 29.55 | 32.23 | 31.63 | 26.40 | 29.23 |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| Ours (~20 mins) | 25.41 | 22.97 | 30.71 | 27.34 | 30.32 | 31.00 | 23.43 | 27.31 |
| Ours (occ) | 24.76 | 22.38 | 29.72 | 26.80 | 28.02 | 30.67 | 22.39 | 26.39 |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| Ours (Training time) | 25min | 17min | 19min | 23min | 28min | 20min | 17min | 21min |
| Ours (Training time) | 323s | 302s | 300s | 337s | 347s | 320s | 322s | 322s |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| Ours (prop) | 25.44 | 23.21 | 30.62 | 26.75 | 30.63 | 30.93 | 25.20 | 27.54 |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+
| Ours (Training time) | 308s | 304s | 308s | 306s | 313s | 301s | 287s | 304s |
+----------------------+-------+-------+-------+-------+-------+-------+-------+-------+

Note that `Ours (prop)` is essentially a `Nerfacto`_ model.

.. _`Instant-NGP Nerf`: https://arxiv.org/abs/2201.05989
.. _`MipNerf360`: https://arxiv.org/abs/2111.12077
.. _`Nerf++`: https://arxiv.org/abs/2010.07492
.. _`github repository`: https://github.com/KAIR-BAIR/nerfacc/tree/76c0f9817da4c9c8b5ccf827eb069ee2ce854b75
.. _`github repository`: https://github.com/KAIR-BAIR/nerfacc/
.. _`Nerfacto`: https://github.com/nerfstudio-project/nerfstudio/blob/main/nerfstudio/models/nerfacto.py
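For unbounded scenes like these, a common ingredient (introduced by the `MipNerf360`_ paper) is a contraction that maps all of space into a bounded ball before querying the radiance field. A hedged numpy sketch of that mapping (our own illustration of the idea, not necessarily the exact function this commit ships):

```python
import numpy as np

def contract(x):
    # MipNeRF-360 style contraction: identity inside the unit ball,
    # squashes everything else into a ball of radius 2.
    x = np.asarray(x, dtype=np.float64)
    norm = np.linalg.norm(x, axis=-1, keepdims=True)
    safe = np.maximum(norm, 1e-9)  # avoid division by zero at the origin
    return np.where(norm <= 1.0, x, (2.0 - 1.0 / safe) * x / safe)
```

This keeps a fixed-resolution grid meaningful even when cameras see out to effectively infinite depth.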
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -123,7 +123,7 @@ Links:
.. toctree::
:glob:
:maxdepth: 1
:caption: Example Usages
:caption: Example Usages and Benchmarks

examples/*

2 changes: 1 addition & 1 deletion examples/datasets/dnerf_synthetic.py
@@ -86,7 +86,7 @@ def __init__(
near: float = None,
far: float = None,
batch_over_images: bool = True,
device: str = "cuda:0",
device: str = "cpu",
):
super().__init__()
assert split in self.SPLITS, "%s" % split
91 changes: 78 additions & 13 deletions examples/datasets/nerf_360_v2.py
@@ -22,7 +22,7 @@
from scene_manager import SceneManager


def _load_colmap(root_fp: str, subject_id: str, split: str, factor: int = 1):
def _load_colmap(root_fp: str, subject_id: str, factor: int = 1):
assert factor in [1, 2, 4, 8]

data_dir = os.path.join(root_fp, subject_id)
@@ -134,12 +134,66 @@ def _load_colmap(root_fp: str, subject_id: str, split: str, factor: int = 1):
"test": all_indices[all_indices % 8 == 0],
"train": all_indices[all_indices % 8 != 0],
}
indices = split_indices[split]
# All per-image quantities must be re-indexed using the split indices.
images = images[indices]
camtoworlds = camtoworlds[indices]
return images, camtoworlds, K, split_indices


def similarity_from_cameras(c2w, strict_scaling):
    """
    reference: nerf-factory
    Get a similarity transform to normalize the dataset
    from c2w (OpenCV convention) cameras.
    :param c2w: (N, 4, 4)
    :return: T (4, 4), scale (float)
    """
    t = c2w[:, :3, 3]
    R = c2w[:, :3, :3]

    # (1) Rotate the world so that z+ is the up axis.
    # We estimate the up axis by averaging the camera up axes.
    ups = np.sum(R * np.array([0, -1.0, 0]), axis=-1)
    world_up = np.mean(ups, axis=0)
    world_up /= np.linalg.norm(world_up)

    up_camspace = np.array([0.0, -1.0, 0.0])
    c = (up_camspace * world_up).sum()
    cross = np.cross(world_up, up_camspace)
    skew = np.array(
        [
            [0.0, -cross[2], cross[1]],
            [cross[2], 0.0, -cross[0]],
            [-cross[1], cross[0], 0.0],
        ]
    )
    if c > -1:
        R_align = np.eye(3) + skew + (skew @ skew) * 1 / (1 + c)
    else:
        # In the unlikely case the original data has a y+ up axis,
        # rotate 180 degrees about the x axis.
        R_align = np.array([[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])

    R = R_align @ R
    fwds = np.sum(R * np.array([0, 0.0, 1.0]), axis=-1)
    t = (R_align @ t[..., None])[..., 0]

    # (2) Recenter the scene using camera center rays:
    # find the closest point to the origin for each camera's center ray.
    nearest = t + (fwds * -t).sum(-1)[:, None] * fwds

    # Use the median rather than the mean for robustness to outliers.
    translate = -np.median(nearest, axis=0)

    transform = np.eye(4)
    transform[:3, 3] = translate
    transform[:3, :3] = R_align

    # (3) Rescale the scene using camera distances.
    scale_fn = np.max if strict_scaling else np.median
    scale = 1.0 / scale_fn(np.linalg.norm(t + translate, axis=-1))

    return transform, scale
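Step (2) above picks, for each camera, the point on its central ray closest to the origin and recenters on the median of those points; step (3) then rescales by the median camera distance. A quick numeric check of that logic with two toy cameras looking down +z (toy values, not from any dataset):

```python
import numpy as np

# Two cameras at z=5, offset along x, both looking down +z.
fwds = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]])
t = np.array([[1.0, 0.0, 5.0], [3.0, 0.0, 5.0]])

# Closest point to the origin on each center ray t + s * fwd (s = -t.fwd):
nearest = t + (fwds * -t).sum(-1)[:, None] * fwds  # -> (1,0,0) and (3,0,0)
translate = -np.median(nearest, axis=0)            # -> (-2, 0, 0)

# Median camera distance after recentering sets the scale.
scale = 1.0 / np.median(np.linalg.norm(t + translate, axis=-1))
```

Both recentered cameras sit at distance sqrt(26) from the new origin, so the similarity scale is 1/sqrt(26).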


class SubjectLoader(torch.utils.data.Dataset):
@@ -169,7 +223,7 @@ def __init__(
far: float = None,
batch_over_images: bool = True,
factor: int = 1,
device: str = "cuda:0",
device: str = "cpu",
):
super().__init__()
assert split in self.SPLITS, "%s" % split
@@ -184,14 +184,25 @@ def __init__(
)
self.color_bkgd_aug = color_bkgd_aug
self.batch_over_images = batch_over_images
self.images, self.camtoworlds, self.K = _load_colmap(
root_fp, subject_id, split, factor
self.images, self.camtoworlds, self.K, split_indices = _load_colmap(
root_fp, subject_id, factor
)
# normalize the scene
T, sscale = similarity_from_cameras(
self.camtoworlds, strict_scaling=False
)
self.images = torch.from_numpy(self.images).to(device).to(torch.uint8)
self.camtoworlds = np.einsum("nij, ki -> nkj", self.camtoworlds, T)
self.camtoworlds[:, :3, 3] *= sscale
# split
indices = split_indices[split]
self.images = self.images[indices]
self.camtoworlds = self.camtoworlds[indices]
# to tensor
self.images = torch.from_numpy(self.images).to(torch.uint8).to(device)
self.camtoworlds = (
torch.from_numpy(self.camtoworlds).to(device).to(torch.float32)
torch.from_numpy(self.camtoworlds).to(torch.float32).to(device)
)
self.K = torch.tensor(self.K).to(device).to(torch.float32)
self.K = torch.tensor(self.K).to(torch.float32).to(device)
self.height, self.width = self.images.shape[1:3]

def __len__(self):
@@ -275,7 +340,7 @@ def fetch_data(self, index):
value=(-1.0 if self.OPENGL_CAMERA else 1.0),
) # [num_rays, 3]

# [n_cams, height, width, 3]
# [num_rays, 3]
directions = (camera_dirs[:, None, :] * c2w[:, :3, :3]).sum(dim=-1)
origins = torch.broadcast_to(c2w[:, :3, -1], directions.shape)
viewdirs = directions / torch.linalg.norm(
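The corrected shape comment reflects that `directions` here is per-ray, not per-camera. The same pixel-to-ray math in a small standalone numpy sketch (toy intrinsics, OpenCV convention; illustrative only, not the loader's exact code):

```python
import numpy as np

# Toy pinhole intrinsics: focal length 100, principal point at (50, 50).
K = np.array([[100.0, 0.0, 50.0],
              [0.0, 100.0, 50.0],
              [0.0, 0.0, 1.0]])
c2w = np.eye(4)  # camera at the origin, aligned with the world

x, y = 50.0, 50.0  # the principal point maps to the optical axis
camera_dir = np.array([(x - K[0, 2]) / K[0, 0],
                       (y - K[1, 2]) / K[1, 1],
                       1.0])
direction = c2w[:3, :3] @ camera_dir  # rotate into world space
origin = c2w[:3, 3]                   # ray origin = camera center
viewdir = direction / np.linalg.norm(direction)
```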
12 changes: 6 additions & 6 deletions examples/datasets/nerf_synthetic.py
@@ -79,7 +79,7 @@ def __init__(
near: float = None,
far: float = None,
batch_over_images: bool = True,
device: str = "cuda:0",
device: torch.device = torch.device("cpu"),
):
super().__init__()
assert split in self.SPLITS, "%s" % split
@@ -110,19 +110,19 @@ def __init__(
self.images, self.camtoworlds, self.focal = _load_renderings(
root_fp, subject_id, split
)
self.images = torch.from_numpy(self.images).to(device).to(torch.uint8)
self.camtoworlds = (
torch.from_numpy(self.camtoworlds).to(device).to(torch.float32)
)
self.images = torch.from_numpy(self.images).to(torch.uint8)
self.camtoworlds = torch.from_numpy(self.camtoworlds).to(torch.float32)
self.K = torch.tensor(
[
[self.focal, 0, self.WIDTH / 2.0],
[0, self.focal, self.HEIGHT / 2.0],
[0, 0, 1],
],
dtype=torch.float32,
device=device,
) # (3, 3)
self.images = self.images.to(device)
self.camtoworlds = self.camtoworlds.to(device)
self.K = self.K.to(device)
assert self.images.shape[1:3] == (self.HEIGHT, self.WIDTH)

def __len__(self):
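The loaders now default to `device="cpu"` and cast dtype before the final `.to(device)`. The ordering matters for transfer size: casting images to `uint8` on the CPU means a quarter of the bytes cross to the GPU compared with moving `float32` first. A numpy-based check of that ratio (illustrative; numpy stands in for the tensor sizes):

```python
import numpy as np

images = np.zeros((10, 64, 64, 3), dtype=np.float32)
bytes_as_float32 = images.nbytes                 # moved first, cast later
bytes_as_uint8 = images.astype(np.uint8).nbytes  # cast first, moved later
ratio = bytes_as_float32 / bytes_as_uint8        # 4x fewer bytes to transfer
```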
