In [1]:
import os
import random
import shutil
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F  # Import for F.mse_loss
from torchvision import models
import cv2
import yaml
from tqdm import tqdm

In [2]:
!pip install open3d


Collecting open3d
  Downloading open3d-0.19.0-cp311-cp311-manylinux_2_31_x86_64.whl.metadata (4.3 kB)
Collecting dash>=2.6.0 (from open3d)
  Downloading dash-3.0.4-py3-none-any.whl.metadata (10 kB)
Collecting configargparse (from open3d)
  Downloading configargparse-1.7.1-py3-none-any.whl.metadata (24 kB)
Collecting addict (from open3d)
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting pyquaternion (from open3d)
  Downloading pyquaternion-0.9.9-py3-none-any.whl.metadata (1.4 kB)
Collecting flask>=3.0.0 (from open3d)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting werkzeug>=3.0.0 (from open3d)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting retrying (from dash>=2.6.0->open3d)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading open3d-0.19.0-cp311-cp311-manylinux_2_31_x86_64.whl (447.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.7/447.7 MB[0m [31m3.8 MB/s[0m eta 

In [3]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import cv2
import numpy as np
from scipy.spatial.transform import Rotation as R
import torchvision.transforms as T
import os
import yaml
from PIL import Image


class PoseDataset(Dataset):
    def __init__(self,  rgb_dir, depth_dir,linemod_root, augment=False):

        self.rgb_dir = rgb_dir
        self.depth_dir = depth_dir
        self.linemod_root = linemod_root
        self.RGB_img_filenames = sorted([
            f for f in os.listdir(rgb_dir) if f.endswith(".png")
        ])
        self.depth_img_filenames = sorted([
            f for f in os.listdir(depth_dir) if f.endswith(".png")
        ])


        # Preload gt.yml data for all classes
        self.gt_data = {}
        for class_id in range(1, 16):
            class_str = f"{class_id:02d}"
            gt_path = os.path.join(linemod_root, class_str, "gt.yml")
            if os.path.exists(gt_path):
                with open(gt_path, 'r') as f:
                    self.gt_data[class_str] = yaml.safe_load(f)

        
        self.rgb_transform = T.Compose([
            T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
            T.RandomHorizontalFlip(),
            T.Resize((224, 224)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225])
        ])
        self.depth_transform = T.Compose([
            # T.ToTensor(),
            # T.Resize((224, 224)),
            T.Resize((224, 224)),
            T.ToTensor()  # Converts PIL float32 to FloatTensor [1, H, W]
        ])

    def __len__(self):
        return len(self.RGB_img_filenames)

    def normalize_depth(self,depth):
        depth = np.array(depth).astype(np.float32)
        return (depth - depth.min()) / (depth.max() - depth.min() + 1e-8)

    def __getitem__(self, idx):

        RGB_filename = self.RGB_img_filenames[idx]
        class_id_str, img_id_str = RGB_filename.split("_")
        img_id = int(os.path.splitext(img_id_str)[0])

        # Load RGB image
        rgb_path = os.path.join(self.rgb_dir, RGB_filename)
        rgb_img = Image.open(rgb_path).convert("RGB")
        rgb_tensor = self.rgb_transform(rgb_img)

        # Load Depth image
        # depth_path = os.path.join(self.depth_dir, RGB_filename)
        # depth_img = Image.open(depth_path).convert("I")   # Single-channel

        # # Normalize and convert to PIL for transforms
        # depth_np_norm = self.normalize_depth(depth_img)
        # depth_img_norm = Image.fromarray((depth_np_norm * 255).astype(np.uint8))

        # depth_tensor = self.depth_transform(depth_img_norm)
           # Load Depth image (PIL single channel)
        depth_path = os.path.join(self.depth_dir, RGB_filename)
        depth_img = Image.open(depth_path).convert("I")  # 32-bit integer depth

        # Convert to float numpy, normalize (e.g., scale mm->meters or divide by max)
        depth_np = np.array(depth_img).astype(np.float32)
        depth_np /= 1000.0  # if in mm, convert to meters, adjust as per your data

        # Optional: clip depth values to a max distance (e.g., 2 meters)
        depth_np = np.clip(depth_np, 0, 2.0)

        # Normalize depth to [0,1] by dividing by max depth value (2.0)
        depth_np /= 2.0

        # Convert normalized float depth to PIL image in 'F' mode (32-bit float)
        depth_img_float = Image.fromarray(depth_np).convert('F')

        # Apply depth transforms (Resize -> ToTensor)
        depth_tensor = self.depth_transform(depth_img_float)  # [1, H, W], float32 in [0,1]


        # Load pose from GT file
        pose_list = self.gt_data[class_id_str][img_id]
        pose = next(item for item in pose_list if item['obj_id'] == int(class_id_str))

        R_mat = np.array(pose['cam_R_m2c']).reshape(3, 3).astype(np.float32)
        quat = R.from_matrix(R_mat).as_quat().astype(np.float32)
        quat /= np.linalg.norm(quat)
        t_vec = np.array(pose['cam_t_m2c'], dtype=np.float32) / 1000.0  #  to meters

        return {
            'RGB_image': rgb_tensor,
            'depth_image': depth_tensor,
            'rotation': torch.tensor(quat, dtype=torch.float32),
            'rotation_matrix': torch.tensor(R_mat, dtype=torch.float32),
            'translation': torch.tensor(t_vec, dtype=torch.float32),
            'class_id': int(class_id_str),
            'filename': RGB_filename
        }




In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

class PoseNet6D_MLP_ConcatFusion(nn.Module):
    def __init__(self, pretrained=True, compress_rgb=True):
        super(PoseNet6D_MLP_ConcatFusion, self).__init__()

        # RGB branch (ResNet50)
        resnet_rgb = models.resnet50(
            weights=models.ResNet50_Weights.IMAGENET1K_V2 if pretrained else None
        )
        self.rgb_backbone = nn.Sequential(*list(resnet_rgb.children())[:-1])  # (B, 2048, 1, 1)

        # Compress RGB features to 512-dim
        self.compress_rgb = compress_rgb
        if compress_rgb:
            self.rgb_compress = nn.Linear(2048, 512)

        # Depth branch (ResNet18, 1-channel input)
        resnet_depth = models.resnet18(weights=None)
        resnet_depth.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.depth_backbone = nn.Sequential(*list(resnet_depth.children())[:-1])  # (B, 512, 1, 1)

        # MLP gating to produce 2 gating scalars
        self.gate_mlp = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, 2),
            nn.Sigmoid()
        )

        # Pose regression layers (input = 1024 after concat)
        self.fc_rot = nn.Linear(1024, 4)
        self.fc_trans = nn.Linear(1024, 3)

    def forward(self, rgb_img, depth_img):
        # RGB feature extraction
        rgb_feat = self.rgb_backbone(rgb_img).squeeze(-1).squeeze(-1)  # (B, 2048)
        if self.compress_rgb:
            rgb_feat = self.rgb_compress(rgb_feat)  # (B, 512)

        # Depth feature extraction
        depth_feat = self.depth_backbone(depth_img).squeeze(-1).squeeze(-1)  # (B, 512)

        # Gating
        concat_feat = torch.cat([rgb_feat, depth_feat], dim=1)  # (B, 1024)
        gates = self.gate_mlp(concat_feat)                      # (B, 2)
        rgb_gate = gates[:, 0].unsqueeze(1)
        depth_gate = gates[:, 1].unsqueeze(1)

        # Apply gates
        gated_rgb_feat = rgb_feat * rgb_gate
        gated_depth_feat = depth_feat * depth_gate

        # Fusion by concatenation (final feature = 1024)
        fused_feat = torch.cat([gated_rgb_feat, gated_depth_feat], dim=1)  # (B, 1024)

        # Pose regression
        rot = self.fc_rot(fused_feat)
        trans = self.fc_trans(fused_feat)
        rot = F.normalize(rot, dim=1)

        return rot, trans


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

class PoseNet6D_MLP_ConcatFusion_512(nn.Module):
    def __init__(self, pretrained=True, compress_rgb=True):
        super(PoseNet6D_MLP_ConcatFusion_512, self).__init__()

        # RGB branch (ResNet50)
        resnet_rgb = models.resnet50(
            weights=models.ResNet50_Weights.IMAGENET1K_V2 if pretrained else None
        )
        self.rgb_backbone = nn.Sequential(*list(resnet_rgb.children())[:-1])  # (B, 2048, 1, 1)

        # Compress RGB features to 512-dim
        self.compress_rgb = compress_rgb
        if compress_rgb:
            self.rgb_compress = nn.Linear(2048, 512)

        # Depth branch (ResNet18, 1-channel input)
        resnet_depth = models.resnet18(weights=None)
        resnet_depth.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.depth_backbone = nn.Sequential(*list(resnet_depth.children())[:-1])  # (B, 512, 1, 1)

        # MLP gating to produce 2 gating scalars
        self.gate_mlp = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, 2),
            nn.Sigmoid()
        )

        # Fusion projection layer after concatenation (1024 → 512)
        self.fusion_fc = nn.Linear(1024, 512)

        # Pose regression layers
        self.fc_rot = nn.Linear(512, 4)
        self.fc_trans = nn.Linear(512, 3)

    def forward(self, rgb_img, depth_img):
        # RGB feature extraction
        rgb_feat = self.rgb_backbone(rgb_img).squeeze(-1).squeeze(-1)  # (B, 2048)
        if self.compress_rgb:
            rgb_feat = self.rgb_compress(rgb_feat)  # (B, 512)

        # Depth feature extraction
        depth_feat = self.depth_backbone(depth_img).squeeze(-1).squeeze(-1)  # (B, 512)

        # Gating
        concat_feat = torch.cat([rgb_feat, depth_feat], dim=1)  # (B, 1024)
        gates = self.gate_mlp(concat_feat)                      # (B, 2)
        rgb_gate = gates[:, 0].unsqueeze(1)
        depth_gate = gates[:, 1].unsqueeze(1)

        # Apply gates
        gated_rgb_feat = rgb_feat * rgb_gate
        gated_depth_feat = depth_feat * depth_gate

        # Fusion by concatenation
        fused_feat = torch.cat([gated_rgb_feat, gated_depth_feat], dim=1)  # (B, 1024)
        fused_feat = self.fusion_fc(fused_feat)  # (B, 512)

        # Pose regression
        rot = self.fc_rot(fused_feat)
        trans = self.fc_trans(fused_feat)
        rot = F.normalize(rot, dim=1)

        return rot, trans


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

class PoseNet6D_MLP_Fusion(nn.Module):
    def __init__(self, pretrained=True, compress_rgb=True):
        super(PoseNet6D_MLP_Fusion, self).__init__()

        # RGB branch (ResNet50)
        resnet_rgb = models.resnet50(
            weights=models.ResNet50_Weights.IMAGENET1K_V2 if pretrained else None
        )
        self.rgb_backbone = nn.Sequential(*list(resnet_rgb.children())[:-1])  # (B, 2048, 1, 1)

        # Compress RGB features to 512-dim
        self.compress_rgb = compress_rgb
        if compress_rgb:
            self.rgb_compress = nn.Linear(2048, 512)

        # Depth branch (ResNet18, 1-channel input)
        resnet_depth = models.resnet18(weights=None)
        resnet_depth.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.depth_backbone = nn.Sequential(*list(resnet_depth.children())[:-1])  # (B, 512, 1, 1)

        # MLP gating to produce 2 gating scalars (for RGB and Depth) from concatenated features
        # Input size = 512 + 512 = 1024 (after compression)
        self.gate_mlp = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, 2),      # 2 gating weights: one for RGB, one for Depth
            nn.Sigmoid()            # Output in [0,1]
        )

        # Pose regression layers (after fusion)
        self.fc_rot = nn.Linear(512, 4)   # quaternion
        self.fc_trans = nn.Linear(512, 3) # translation

    def forward(self, rgb_img, depth_img):
        # RGB feature extraction
        rgb_feat = self.rgb_backbone(rgb_img).squeeze(-1).squeeze(-1)  # (B, 2048)
        if self.compress_rgb:
            rgb_feat = self.rgb_compress(rgb_feat)  # (B, 512)

        # Depth feature extraction
        depth_feat = self.depth_backbone(depth_img).squeeze(-1).squeeze(-1)  # (B, 512)

        # Concatenate features for gating MLP
        concat_feat = torch.cat([rgb_feat, depth_feat], dim=1)  # (B, 1024)
        gates = self.gate_mlp(concat_feat)                      # (B, 2), values in [0,1]

        # Split gating weights
        rgb_gate = gates[:, 0].unsqueeze(1)     # (B,1)
        depth_gate = gates[:, 1].unsqueeze(1)   # (B,1)

        # Apply gates to features (element-wise multiply)
        gated_rgb_feat = rgb_feat * rgb_gate    # (B, 512)
        gated_depth_feat = depth_feat * depth_gate  # (B, 512)

        # Fuse gated features (sum or concat; here we sum)
        fused_feat = gated_rgb_feat + gated_depth_feat  # (B, 512)

        # Pose regression
        rot = self.fc_rot(fused_feat)       # (B, 4)
        trans = self.fc_trans(fused_feat)   # (B, 3)

        # Normalize quaternion to unit length
        rot = F.normalize(rot, dim=1)

        return rot, trans


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

class PoseNet6D(nn.Module):
    def __init__(self, pretrained=True, compress_rgb=True):
        super(PoseNet6D, self).__init__()

        # ==== RGB branch: ResNet50 ====
        resnet_rgb = models.resnet50(
            weights=models.ResNet50_Weights.IMAGENET1K_V2 if pretrained else None
        )
        self.rgb_backbone = nn.Sequential(*list(resnet_rgb.children())[:-1])  # Output: (B, 2048, 1, 1)

        #  compression layer for RGB features
        self.compress_rgb = compress_rgb
        if compress_rgb:
            self.rgb_compress = nn.Linear(2048, 512)  # Match depth feature dim

        # ==== Depth branch: ResNet18 modified for 1-channel input ====
        resnet_depth = models.resnet18(weights=None)  # No pretrained weights for depth
        resnet_depth.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.depth_backbone = nn.Sequential(*list(resnet_depth.children())[:-1])  # Output: (B, 512, 1, 1)

        # ==== Pose regression ====
        self.fc_rot = nn.Linear(1024, 4)   # 512 + 512 -> Quaternion
        self.fc_trans = nn.Linear(1024, 3) # 512 + 512 -> Translation

    def forward(self, rgb_img, depth_img):
        # RGB features
        rgb_feat = self.rgb_backbone(rgb_img).squeeze(-1).squeeze(-1)  # (B, 2048)
        if self.compress_rgb:
            rgb_feat = self.rgb_compress(rgb_feat)  # (B, 512)

        # Depth features
        depth_feat = self.depth_backbone(depth_img).squeeze(-1).squeeze(-1)  # (B, 512)

        # Concatenate features
        feat = torch.cat([rgb_feat, depth_feat], dim=1)  # (B, 1024)

        # Predict pose
        rot = self.fc_rot(feat)         # (B, 4)
        trans = self.fc_trans(feat)     # (B, 3)
        rot = F.normalize(rot, dim=1)   # Normalize quaternion
        return rot, trans


In [8]:
import torch
import torch.nn.functional as F

def mse_pose_loss(pred_q, pred_t, gt_q, gt_t):
    return torch.mean((pred_q - gt_q)**2) + torch.mean((pred_t - gt_t)**2)

def angle_pose_loss(pred_q, pred_t, gt_q, gt_t):
    pred_q = F.normalize(pred_q, dim=1)
    gt_q = F.normalize(gt_q, dim=1)
    cos_sim = torch.sum(pred_q * gt_q, dim=1).clamp(-1+1e-7, 1-1e-7)
    angle_loss = torch.mean(1 - cos_sim.abs())
    trans_loss = torch.mean((pred_t - gt_t)**2)
    return angle_loss + trans_loss

def smooth_l1_pose_loss(pred_q, pred_t, gt_q, gt_t):
    return F.smooth_l1_loss(pred_q, gt_q) + F.smooth_l1_loss(pred_t, gt_t)

def pose_loss(pred_q, pred_t, gt_q, gt_t):
    rot_loss = 1 - torch.sum(pred_q * gt_q, dim=1)**2
    trans_loss = torch.mean((pred_t - gt_t)**2, dim=1)
    return rot_loss.mean() + trans_loss.mean()


In [9]:
def train_model(model, dataloader, optimizer, device, scaler=None):
    model.train()
    total_loss = 0.0

    for batch in dataloader:
        rgb = batch['RGB_image'].to(device)
        depth = batch['depth_image'].to(device)
        gt_q = batch['rotation'].to(device)
        gt_t = batch['translation'].to(device)

        optimizer.zero_grad()

        if scaler:  # Mixed precision
            with torch.cuda.amp.autocast():
                pred_q, pred_t = model(rgb, depth)
                loss = pose_loss(pred_q, pred_t, gt_q, gt_t)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            pred_q, pred_t = model(rgb, depth)
            loss = pose_loss(pred_q, pred_t, gt_q, gt_t)
               # Debug print (only for the first batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # optional
            optimizer.step()
        # Print per-batch loss
        print(f"[Batch {i+1}/{len(dataloader)}] Loss: {loss.item():.4f}")
        total_loss += loss.item()

    return total_loss / len(dataloader)


In [10]:
def validate_model(model, dataloader, device):
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            rgb = batch['RGB_image'].to(device)
            depth = batch['depth_image'].to(device)
            gt_q = batch['rotation'].to(device)
            gt_t = batch['translation'].to(device)

            pred_q, pred_t = model(rgb, depth)
            loss = pose_loss(pred_q, pred_t, gt_q, gt_t)
            total_loss += loss.item()

    return total_loss / len(dataloader)


In [11]:
from scipy.spatial.transform import Rotation as R
import numpy as np
import open3d as o3d
from tqdm import tqdm
import os
from collections import defaultdict


def load_model_points(models_dir, class_id):
    class_id = int(class_id)  # Ensure it's an integer
    model_path = os.path.join(models_dir, f"obj_{class_id:02d}.ply")
    if not os.path.exists(model_path):
        print(f"[ERROR] File not found: {model_path}")
    mesh = o3d.io.read_triangle_mesh(model_path)
    return np.asarray(mesh.vertices).astype(np.float32)


def compute_ADD(R_pred, t_pred, R_gt, t_gt, model_points):
    pred_pts = (R_pred @ model_points.T).T + t_pred
    gt_pts = (R_gt @ model_points.T).T + t_gt
    return np.mean(np.linalg.norm(pred_pts - gt_pts, axis=1))

def evaluate_ADD_per_class(model, dataloader, device, threshold=0.1):
    model.eval()

    models_dir = os.path.join("/kaggle/input/linemod/Linemod_preprocessed", "models")

    add_scores_per_class = defaultdict(list)
    correct_counts_per_class = defaultdict(int)
    total_counts_per_class = defaultdict(int)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating ADD per class"):
            rgb = batch['RGB_image'].to(device)
            depth = batch['depth_image'].to(device)
            gt_q = batch['rotation'].cpu().numpy()
            gt_t = batch['translation'].cpu().numpy()
            class_ids = batch['class_id'].cpu().numpy()

            pred_q, pred_t = model(rgb, depth)
            pred_q = pred_q.cpu().numpy()
            pred_t = pred_t.cpu().numpy()

            # Normalize predicted quaternions
            norms = np.linalg.norm(pred_q, axis=1, keepdims=True) + 1e-8  # avoid division by zero
            pred_q = pred_q / norms

            for i in range(len(gt_q)):
                class_id = class_ids[i]
                R_gt = R.from_quat(gt_q[i]).as_matrix()
                R_pred = R.from_quat(pred_q[i]).as_matrix()
                t_gt = gt_t[i]
                t_pred = pred_t[i]

                model_points = load_model_points(models_dir, class_id)
                model_points = model_points / 1000.0
                add = compute_ADD(R_pred, t_pred, R_gt, t_gt, model_points)

                add_scores_per_class[class_id].append(add)
                total_counts_per_class[class_id] += 1
                if add < threshold:
                    correct_counts_per_class[class_id] += 1

    print("\n=== ADD per class ===")
    mean_adds = []
    for class_id in sorted(add_scores_per_class.keys()):
        mean_add = np.mean(add_scores_per_class[class_id])
        mean_adds.append(mean_add)
        acc = 100.0 * correct_counts_per_class[class_id] / total_counts_per_class[class_id]
        print(f"Class {class_id:02d} → Mean ADD: {mean_add:.4f} m, Accuracy (<{threshold*100:.0f}cm): {acc:.2f}%")
    overall_mean_add = np.mean(mean_adds)
    print(f"\n=== Overall Mean ADD (averaged over classes): {overall_mean_add:.4f} m ===")

    return add_scores_per_class, correct_counts_per_class, total_counts_per_class,overall_mean_add

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [12]:
import torch
import random
import numpy as np

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)


In [18]:
import os
import random
import shutil
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

RGB_cropped_dir = "/kaggle/input/rgboutput/RGB_crop/train/train_cropped_objects"
depth_cropped_dir = "/kaggle/input/depthoutput/depth_crop/train/train_cropped_objects"
linemod_root = "/kaggle/input/linemod/Linemod_preprocessed/data"
save_path = "/kaggle/working/test_dataset.pt"
set_seed(42)
# Load dataset
RGB_image_files = [f for f in os.listdir(RGB_cropped_dir) if f.endswith(".png")]

test_dataset = PoseDataset(
    rgb_dir=RGB_cropped_dir,
    depth_dir=depth_cropped_dir,
    linemod_root=linemod_root
)
print(f"len of test_dataset is={len(test_dataset)}")
torch.save(test_dataset, save_path)
print(f"test_dataset saved at: {save_path}")



test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
print("test loader done")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PoseNet6D_MLP_ConcatFusion() #PoseNet6D_MLP_ConcatFusion   PoseNet6D_MLP_Fusion
model.to(device)
# Load model weights
model.load_state_dict(torch.load("/kaggle/input/loss-dataset/mse-loss/best_model.pth"))
print("model loading done")
model.to(device)
model.eval()
# Evaluation
with torch.no_grad():
    add_scores_per_class, correct_counts_per_class, total_counts_per_class, overall_mean_add = evaluate_ADD_per_class(
        model, test_loader, device
    )
# add_scores_per_class, correct_counts_per_class, total_counts_per_class,overall_mean_add= evaluate_ADD_per_class(model, test_loader, device)




len of test_dataset is=2373
test_dataset saved at: /kaggle/working/test_dataset.pt
test loader done
model loading done


Evaluating ADD per class: 100%|██████████| 149/149 [01:35<00:00,  1.56it/s]



=== ADD per class ===
Class 01 → Mean ADD: 0.0552 m, Accuracy (<10cm): 90.86%
Class 02 → Mean ADD: 0.0372 m, Accuracy (<10cm): 95.58%
Class 04 → Mean ADD: 0.0428 m, Accuracy (<10cm): 93.37%
Class 05 → Mean ADD: 0.0433 m, Accuracy (<10cm): 94.44%
Class 06 → Mean ADD: 0.0451 m, Accuracy (<10cm): 90.96%
Class 08 → Mean ADD: 0.0427 m, Accuracy (<10cm): 91.62%
Class 09 → Mean ADD: 0.0525 m, Accuracy (<10cm): 89.95%
Class 10 → Mean ADD: 0.0516 m, Accuracy (<10cm): 88.83%
Class 11 → Mean ADD: 0.0459 m, Accuracy (<10cm): 95.11%
Class 12 → Mean ADD: 0.0444 m, Accuracy (<10cm): 91.94%
Class 13 → Mean ADD: 0.0475 m, Accuracy (<10cm): 90.75%
Class 14 → Mean ADD: 0.0432 m, Accuracy (<10cm): 91.35%
Class 15 → Mean ADD: 0.0408 m, Accuracy (<10cm): 95.11%

=== Overall Mean ADD (averaged over classes): 0.0456 m ===


In [14]:
# test_dataset = torch.load("/content/drive/MyDrive/test_dataset.pt", weights_only=False)
# print("test_dataset loaded.")
# test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)
# print("test loader done")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = PoseNet6D_MLP_GatingFusion()
# model.to(device)
# # Load model weights
# model.load_state_dict(torch.load("/content/drive/MyDrive/yolo_models/linemod_yolo_v8n/extension_model_all/final_model.pth"))
# print("model loading done")
# model.to(device)
# add_scores_per_class, correct_counts_per_class, total_counts_per_class,overall_mean_add= evaluate_ADD_per_class(model, test_loader, device)



In [15]:
# from scipy.spatial.transform import Rotation as R
# import numpy as np
# import open3d as o3d
# from tqdm import tqdm
# import os
# from collections import defaultdict
# mean_adds = []
# for class_id in sorted(add_scores_per_class.keys()):
#     mean_add = np.mean(add_scores_per_class[class_id])
#     mean_adds.append(mean_add)
#     acc = 100.0 * correct_counts_per_class[class_id] / total_counts_per_class[class_id]
#     print(f"Class {class_id:02d} → Mean ADD: {mean_add:.4f} m")
# overall_mean_add = np.mean(mean_adds)
# print(f"\n=== Overall Mean ADD (averaged over classes): {overall_mean_add:.4f} m ===")