In [1]:
from models import MVCNN
from tools import ImgDataset
import torch
from tqdm import tqdm

# Calculating Importance

## Initialization

In [2]:
device = 'mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
model = MVCNN.SVCNN('svcnn')
weights = torch.load('../../../MVCNN/MVCNN/model-00050.pth', map_location=device)
model.load_state_dict(weights)
model.to(device)
model.eval()

SVCNN(
  (net_1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  

In [4]:
# Extract CNN feature extractor from the model
feature_extractor = model.net_1
classifier = model.net_2

feature_extractor.eval()
classifier.eval()

num_views = 12  # Number of views per model

## Baseline Accuracy

In [None]:
test_dataset = ImgDataset.MultiviewImgDataset(
    root_dir='../../../MVCNN/ModelNet40-12View/*/test',
    scale_aug=False,
    rot_aug=False,
    test_mode=True,
    num_models=100,
    num_views=12,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

In [None]:
baseline_correct = 0
baseline_total = 0

with torch.no_grad():
    for labels, views, paths in tqdm(test_loader, desc="Baseline Eval"):
        # Expected input: views shape = [batch_size, num_views, C, H, W]
        # But you're getting: views shape = [num_views, batch_size, C, H, W]
        
        # First, transpose to match expected format
        if views.dim() == 5:
            # If views is [V, N, C, H, W], transpose to [N, V, C, H, W]
            if views.shape[0] == 12:  # assuming 12 views
                views = views.transpose(0, 1)  # Now [N, V, C, H, W]
        
        N, V, C, H, W = views.shape
        
        # Reshape exactly like the original validation code
        in_data = views.view(-1, C, H, W).to(device)  # [N*V, C, H, W]
        target = labels.to(device).repeat_interleave(V)  # [N*V]
        
        # Process through the model (like original validation)
        # If using separate feature extractor and classifier
        features = feature_extractor(in_data)  # [N*V, feature_dim, H', W']
        
        # Pool features
        pooled_features = torch.nn.functional.adaptive_avg_pool2d(features, (7, 7))
        pooled_features = pooled_features.view(N*V, -1)  # [N*V, feature_dim]
        
        # Get predictions for all views
        out_data = classifier(pooled_features)  # [N*V, num_classes]
            
            # If using a single end-to-end model
        out_data = model(in_data)  # Replace 'model' with your actual model
        
        pred = torch.max(out_data, 1)[1]  # [N*V]
        
        # Now do majority voting like the original validation code
        for i in range(N):
            obj_preds = pred[i*V:(i+1)*V]  # Predictions for all views of object i
            obj_target = target[i*V]  # True label for object i
            
            # Majority voting across views
            obj_pred = torch.mode(obj_preds.cpu())[0].to(device)
            
            if obj_pred == obj_target:
                baseline_correct += 1
            baseline_total += 1

print(f"Baseline Accuracy: {baseline_correct/baseline_total:.4f}")
print(f"Total objects processed: {baseline_total}")

## Accuracy Of Each View

In [None]:
test_dataset = ImgDataset.SingleImgDataset(
    root_dir='../../../MVCNN/ModelNet40-12View/*/test',
    scale_aug=False,
    rot_aug=False,
    test_mode=True,
    num_models=100,
    num_views=12,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

"---------------------------------------------------------------------------------------------------------------------"

# More efficient version - processes all single views at once
single_view_results = {i: {'correct': 0, 'total': 0} for i in range(num_views)}

with torch.no_grad():
    for labels, views, paths in tqdm(test_loader, desc="Single View Analysis"):
        # SingleImgDataset should return [batch_size, C, H, W]
        if views.dim() == 4:
            N, C, H, W = views.shape
            view_data = views.to(device)
            target = labels.to(device)
            
            # Get predictions using the full model
            out_data = model(view_data)
            pred = torch.max(out_data, 1)[1]
            
            # Extract view index from paths and accumulate results
            for i in range(N):
                path = paths[i]
                # Extract view index from path (assuming format like "object_shaded_X.png")
                view_idx = int(path.split('_')[-1].split('.')[0])
                
                correct = (pred[i] == target[i]).item()
                single_view_results[view_idx]['correct'] += correct
                single_view_results[view_idx]['total'] += 1
        else:
            print(f"Unexpected tensor shape: {views.shape}")
            break

# Print results
view_order = [0, 1, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9]
print("\nAccuracy using only one view:")
for view_idx in view_order:
    if single_view_results[view_idx]['total'] > 0:
        accuracy = single_view_results[view_idx]['correct'] / single_view_results[view_idx]['total']
        print(f"View {view_idx} only: {accuracy:.4f}")

# Summary statistics
accuracies = [(view_idx, single_view_results[view_idx]['correct'] / single_view_results[view_idx]['total']) 
              for view_idx in range(num_views) if single_view_results[view_idx]['total'] > 0]
accuracies.sort(key=lambda x: x[1], reverse=True)

print("\nViews ranked by accuracy:")
for view_idx, acc in accuracies:
    print(f"View {view_idx}: {acc:.4f}")

print(f"\nBest single view: View {accuracies[0][0]} ({accuracies[0][1]:.4f})")
print(f"Worst single view: View {accuracies[-1][0]} ({accuracies[-1][1]:.4f})")
print(f"Average single view accuracy: {sum(acc for _, acc in accuracies) / len(accuracies):.4f}")

Single View Analysis: 100%|██████████| 22320/22320 [02:52<00:00, 129.56it/s]


Accuracy using only one view:
View 0 only: 0.7005
View 1 only: 0.7522
View 10 only: 0.7699
View 11 only: 0.7435
View 2 only: 0.7522
View 3 only: 0.7312
View 4 only: 0.7349
View 5 only: 0.6984
View 6 only: 0.6285
View 7 only: 0.7118
View 8 only: 0.7285
View 9 only: 0.7312

Views ranked by accuracy:
View 10: 0.7699
View 1: 0.7522
View 2: 0.7522
View 11: 0.7435
View 4: 0.7349
View 3: 0.7312
View 9: 0.7312
View 8: 0.7285
View 7: 0.7118
View 0: 0.7005
View 5: 0.6984
View 6: 0.6285

Best single view: View 10 (0.7699)
Worst single view: View 6 (0.6285)
Average single view accuracy: 0.7236





## Accuracy While Removing Views

#### Use Multi View Dataset

In [14]:
test_dataset = ImgDataset.MultiviewImgDataset(
    root_dir='../../../MVCNN/ModelNet40-12View/*/test',
    scale_aug=False,
    rot_aug=False,
    test_mode=True,
    num_models=100,
    num_views=12,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

"---------------------------------------------------------------------------------------------------------------------"

drop_results = {i: {'correct': 0, 'total': 0} for i in range(num_views)}

with torch.no_grad():
    for labels, views, paths in tqdm(test_loader, desc="Drop View Analysis", dynamic_ncols=True):
        if views.dim() == 5 and views.shape[0] == 12:
            views = views.transpose(0, 1)  # Now [N, V, C, H, W]
        
        N, V, C, H, W = views.shape
        
        # For each object, test dropping each view
        for i in range(N):
            obj_views = views[i]  # [V, C, H, W] - all 12 views for object i
            obj_label = labels[i].to(device)
            
            # Test dropping each view
            for drop_view in range(V):
                # Create tensor with all views except the dropped one
                remaining_views = torch.cat([
                    obj_views[:drop_view], 
                    obj_views[drop_view+1:]
                ], dim=0).to(device)  # [V-1, C, H, W]
                
                # Get predictions for remaining views only
                features = feature_extractor(remaining_views)
                pooled_features = torch.nn.functional.adaptive_avg_pool2d(features, (7, 7))
                pooled_features = pooled_features.view(V-1, -1)
                out_data = classifier(pooled_features)
                preds = torch.max(out_data, 1)[1]  # [V-1]
                
                # Majority voting on remaining predictions
                obj_pred = torch.mode(preds.cpu())[0].to(device)
                
                if obj_pred == obj_label:
                    drop_results[drop_view]['correct'] += 1
                drop_results[drop_view]['total'] += 1

# Print results
view_order = [0, 1, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9]
print("\nAccuracy when dropping each view:")
for view_idx in view_order:
    accuracy = drop_results[view_idx]['correct'] / drop_results[view_idx]['total']
    print(f"Drop view {view_idx}: {accuracy:.4f}")

Drop View Analysis: 100%|██████████| 233/233 [16:04<00:00,  4.14s/it]


Accuracy when dropping each view:
Drop view 0: 0.8978
Drop view 1: 0.8973
Drop view 10: 0.9000
Drop view 11: 0.8978
Drop view 2: 0.9005
Drop view 3: 0.9000
Drop view 4: 0.8995
Drop view 5: 0.8989
Drop view 6: 0.8995
Drop view 7: 0.9016
Drop view 8: 0.8984
Drop view 9: 0.8989



