#### Fine-tune an Open3D pre-trained PointNet++ model with AI2-THOR synthetic data (which is already saved in 3D point-cloud format)

In [1]:
from ai2thor.controller import Controller
import numpy as np
import cv2
import os

# Start the simulator in FloorPlan1 with the depth and instance-segmentation
# render passes switched on, so every event carries those frames.
controller = Controller(
    scene='FloorPlan1',
    renderDepthImage=True,
    renderInstanceSegmentation=True,
)

# Folder that receives every captured frame; created up front if missing.
root = "../../data/synthetic_data"
os.makedirs(root, exist_ok=True)

def save_data(event, idx):
    """Write one simulator event to disk as a PNG plus two ``.npy`` arrays.

    Args:
        event: Object exposing ``frame``, ``depth_frame`` and
            ``instance_segmentation_frame``.
        idx: Frame index embedded in the three output file names.
    """
    # The frame is converted from RGB to BGR because that is the channel
    # order cv2.imwrite expects.
    bgr_image = cv2.cvtColor(event.frame, cv2.COLOR_RGB2BGR)
    cv2.imwrite(f'{root}/rgb_{idx}.png', bgr_image)

    # Depth and segmentation are stored unmodified as NumPy arrays.
    np.save(f'{root}/depth_{idx}.npy', event.depth_frame)
    np.save(f'{root}/seg_{idx}.npy', event.instance_segmentation_frame)

# Collect 10 frames, taking one "MoveAhead" step before each capture.
# try/finally guarantees the simulator is shut down even when a step or a
# file write raises part-way through the capture.
# NOTE(review): a step that fails (e.g. the agent is blocked) would
# presumably yield a duplicate of the previous frame — confirm whether the
# action's success flag should be checked before saving.
try:
    for i in range(10):
        controller.step("MoveAhead")
        save_data(controller.last_event, i)
finally:
    controller.stop()

thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%   0.0 s/B]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%  31.0 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%   5.6 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%  11.7 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%  20.4 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%  38.4 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%  41.0 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%  71.1 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0%  89.7 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0% 103.1 KiB/s]  of 775.MB
thor-OSXIntel64-f0825767cd50d69f666c7f282e54abfe58f1e917.zip: [   0% 108

##### Convert RGB-D + Seg to 3D Point Cloud

In [None]:
# https://github.com/allenai/ai2thor/releases
def depth_to_point_cloud(depth, fx, fy, cx, cy):
    """Back-project a depth map into one 3D point per pixel (pinhole model).

    Args:
        depth: 2-D array of shape (H, W); each value is used directly as z.
        fx: Focal length along the column axis, in pixels.
        fy: Focal length along the row axis, in pixels.
        cx: Principal-point column, in pixels.
        cy: Principal-point row, in pixels.

    Returns:
        Array of shape (H, W, 3) holding (x, y, z) for every pixel.
    """
    height, width = depth.shape
    # rows[r, c] == r and cols[r, c] == c for every pixel of the image.
    rows, cols = np.indices((height, width))
    x = (cols - cx) * depth / fx
    y = (rows - cy) * depth / fy
    return np.stack([x, y, depth], axis=-1)

# Pinhole intrinsics for the captured frames.
# The Controller is created without width/height/fieldOfView, so AI2-THOR's
# documented defaults are assumed here: 300x300 frames, 90-degree FOV.
# NOTE(review): keep these three values in sync with the Controller call.
FRAME_WIDTH = 300
FRAME_HEIGHT = 300
FIELD_OF_VIEW_DEG = 90.0

# Focal length in pixels from the field of view: f = (size / 2) / tan(fov / 2).
# Presumably the FOV is vertical (Unity convention); with square frames and
# square pixels the result is the same either way, so fx == fy.
fx = fy = 0.5 * FRAME_HEIGHT / np.tan(np.deg2rad(FIELD_OF_VIEW_DEG) / 2.0)
# The principal point sits at the image centre, not at the image corner.
cx = FRAME_WIDTH / 2.0
cy = FRAME_HEIGHT / 2.0

def create_point_cloud(idx):
    """Build a coloured, labelled point cloud from one saved frame.

    Reads ``rgb_{idx}.png``, ``depth_{idx}.npy`` and ``seg_{idx}.npy`` from
    ``root`` and back-projects the depth map with the module-level
    intrinsics ``fx``, ``fy``, ``cx``, ``cy``.

    Args:
        idx: Index of the frame to convert.

    Returns:
        Array of shape (N, 7) with columns [x, y, z, R, G, B, label], where
        N is the number of pixels and colours are scaled to [0, 1]. This
        layout is suitable for RandLA-Net, PointNet++ and similar models.

    Raises:
        FileNotFoundError: If the RGB image cannot be read.
    """
    rgb_path = f'{root}/rgb_{idx}.png'
    bgr = cv2.imread(rgb_path)
    if bgr is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {rgb_path}")
    # imread yields BGR; convert so the colour columns really are R, G, B.
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    depth = np.load(f'{root}/depth_{idx}.npy')
    seg = np.load(f'{root}/seg_{idx}.npy')

    pc = depth_to_point_cloud(depth, fx, fy, cx, cy)
    color = rgb / 255.0
    # NOTE(review): only the first channel of the segmentation frame is used
    # as the label. If that frame is a 3-channel colour image, different
    # objects sharing the same first-channel value collapse into one label —
    # confirm this is intended or map colours to object/class IDs instead.
    label = seg[:, :, 0]

    # Flatten the image grid so every pixel becomes one row.
    pc = pc.reshape(-1, 3)
    color = color.reshape(-1, 3)
    label = label.reshape(-1, 1)

    return np.concatenate([pc, color, label], axis=1)

# Convert each of the 10 captured frames and store the resulting (N, 7)
# array next to its source files.
for i in range(10):
    out_path = f'{root}/pointcloud_{i}.npy'
    pc = create_point_cloud(i)
    np.save(out_path, pc)

##### Dataset class for PointNet++


In [None]:
import torch
from torch.utils.data import Dataset
import glob

class AI2THORDataset(Dataset):
    """Fixed-size random samples drawn from saved ``pointcloud_*.npy`` files.

    Each file is expected to hold an (N, 7) array whose columns are
    [x, y, z, three feature channels, label].

    Args:
        data_dir: Directory containing the ``pointcloud_*.npy`` files.
        num_points: Number of points returned per item (default 1024).

    Raises:
        ValueError: If ``num_points`` is not positive.
    """

    def __init__(self, data_dir, num_points=1024):
        if num_points <= 0:
            raise ValueError(f"num_points must be positive, got {num_points}")
        self.num_points = num_points
        # Sort so that index -> file is stable across runs and platforms;
        # glob returns entries in arbitrary order.
        self.files = sorted(glob.glob(f"{data_dir}/pointcloud_*.npy"))

    def __len__(self):
        """Return the number of point-cloud files found."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load one cloud and return ``(xyz, features, labels)`` tensors.

        Returns:
            Tuple of a float32 (num_points, 3) coordinate tensor, a float32
            (num_points, 3) feature tensor and an int64 (num_points,) label
            tensor, all indexed by the same random sample of rows.

        Raises:
            ValueError: If the stored cloud contains no points.
        """
        data = np.load(self.files[idx])  # (N, 7)
        num_available = len(data)
        if num_available == 0:
            raise ValueError(f"Point cloud is empty: {self.files[idx]}")

        # Draw distinct points whenever the cloud is large enough; fall back
        # to sampling with replacement only to pad clouds that are too small.
        idxs = np.random.choice(
            num_available,
            self.num_points,
            replace=num_available < self.num_points,
        )
        sample = data[idxs]

        return (
            torch.tensor(sample[:, 0:3], dtype=torch.float32),
            torch.tensor(sample[:, 3:6], dtype=torch.float32),
            torch.tensor(sample[:, 6].astype(np.int64), dtype=torch.long),
        )

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class PointNetSetAbstraction(nn.Module):
    """Shared per-point MLP followed by a max over the point axis.

    Args:
        in_channels: Number of input channels per point.
        mlp: Output widths, one per Conv1d -> BatchNorm1d -> ReLU stage.
    """

    def __init__(self, in_channels, mlp):
        super().__init__()
        widths = [in_channels, *mlp]
        stages = []
        # Pair each stage's input width with its output width.
        for c_in, c_out in zip(widths, widths[1:]):
            stages += [
                nn.Conv1d(c_in, c_out, kernel_size=1),
                nn.BatchNorm1d(c_out),
                nn.ReLU(),
            ]
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Map a B x C x N tensor to B x C_out by pooling over all N points."""
        per_point = self.mlp(x)
        pooled, _ = per_point.max(dim=2)  # global max pooling
        return pooled

class PointNetPlusPlus(nn.Module):
    """Cloud-level classifier: one pooled feature stage plus two linear layers.

    Args:
        num_classes: Length of the logit vector produced per cloud.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # The pooled stage takes 6 channels, i.e. the concatenation built in
        # forward() must be 6 channels wide.
        self.sa1 = PointNetSetAbstraction(in_channels=6, mlp=[64, 128, 256])
        self.fc1 = nn.Linear(in_features=256, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, xyz, features):
        """Return one B x num_classes logit tensor for a batch of clouds.

        Args:
            xyz: Tensor laid out as B x N x C1.
            features: Tensor laid out as B x N x C2 (C1 + C2 must equal 6).
        """
        per_point = torch.cat((xyz, features), dim=2)  # B x N x C
        channels_first = per_point.transpose(1, 2)  # B x C x N
        pooled = self.sa1(channels_first)  # B x 256
        hidden = torch.relu(self.fc1(pooled))
        return self.fc2(hidden)

In [None]:
from torch.utils.data import DataLoader

# Labels are read from a single channel of the saved segmentation frame.
# Assuming that frame is 8-bit, 256 classes cover every value the label
# column can take — TODO confirm, or replace with a real class mapping.
NUM_CLASSES = 256
NUM_EPOCHS = 10

dataset = AI2THORDataset(f"{root}")
if len(dataset) == 0:
    # Fail with a clear message instead of an opaque sampler error.
    raise FileNotFoundError(f"No pointcloud_*.npy files found in {root}")
loader = DataLoader(dataset, batch_size=4, shuffle=True)

model = PointNetPlusPlus(num_classes=NUM_CLASSES).to("cpu")  # Adjust classes as needed
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    num_batches = 0
    for xyz, feat, labels in loader:
        pred = model(xyz, feat)  # B x NUM_CLASSES: one logit vector per cloud
        # The dataset yields one label per point (B x N), which
        # CrossEntropyLoss rejects against B x C logits. Use each cloud's most
        # frequent point label as its single target.
        # NOTE(review): per-point segmentation would need a model that emits
        # B x C x N logits instead.
        target = labels.mode(dim=1).values
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    # Report the mean over the epoch rather than only the last batch.
    print(f"Epoch {epoch}: Loss = {running_loss / num_batches:.4f}")