# Evaluation: Confusion Matrix and Clinical Emphasis

This notebook evaluates the trained MLP on extracted features with a focus on clinical safety: false negatives are dangerous, so recall is prioritized.

In [1]:
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Locations of the evaluation inputs: validation features, validation labels,
# and the trained model checkpoint, all resolved relative to feature_dir.
feature_dir = Path('.')
val_feat_path = feature_dir.joinpath('val_features.npy')
val_lbl_path = feature_dir.joinpath('val_labels.npy')
model_path = feature_dir.joinpath('best_mlp_model.pt')

# Stop at the first required file that is absent and report its absolute path.
first_missing = next(
    (
        candidate
        for candidate in (val_feat_path, val_lbl_path, model_path)
        if not candidate.exists()
    ),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(f'Missing file: {first_missing.resolve()}')

print('Files found.')

Files found.


In [3]:
# Read the validation feature array and its label array from disk
val_features = np.load(val_feat_path)
val_labels = np.load(val_lbl_path)

# Print both shapes as a quick sanity check of what was loaded
for caption, loaded in (('Val features:', val_features), ('Val labels:', val_labels)):
    print(caption, loaded.shape)

Val features: (16, 512)
Val labels: (16,)


In [4]:
# MLP classifier architecture (must stay identical to the one used in training)
class MLPClassifier(nn.Module):
    """Fully connected classifier that maps feature vectors to class logits.

    Layer stack:
        Linear(input_dim, 256) -> ReLU -> Dropout(0.3)
        -> Linear(256, 128) -> ReLU -> Dropout(0.2)
        -> Linear(128, num_classes)

    Args:
        input_dim: Width of each input feature vector.
        num_classes: Number of logits produced per sample.
    """

    def __init__(self, input_dim=512, num_classes=2):
        super().__init__()
        # The position of each layer inside the Sequential fixes its
        # state_dict key (net.0, net.3, net.6 for the Linear layers),
        # so the order below must not change.
        layers = [
            nn.Linear(input_dim, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the raw logits."""
        logits = self.net(x)
        return logits

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
# Load model
# NOTE(review): torch.load unpickles the file unless weights-only loading is
# in effect, so model_path must only point at checkpoints from a trusted source.
checkpoint = torch.load(model_path, map_location=device)

# Architecture hyperparameters: prefer the values stored in the checkpoint,
# otherwise fall back to the validation feature width and two classes.
input_dim = checkpoint.get('input_dim', val_features.shape[1])
num_classes = checkpoint.get('num_classes', 2)

# Fail fast with a clear message when the features do not fit the model's
# first layer, instead of hitting a shape error in the middle of inference.
if val_features.shape[1] != input_dim:
    raise ValueError(
        f'Validation features have width {val_features.shape[1]}, '
        f'but the model expects input_dim={input_dim}'
    )

model = MLPClassifier(input_dim=input_dim, num_classes=num_classes).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
# Evaluation mode turns the Dropout layers off for inference
model.eval()

print('Model loaded.')

Model loaded.


In [6]:
# Wrap the validation arrays in a DataLoader for batched inference
batch_size = 64

# Features become float32 tensors and labels int64 tensors
features_tensor = torch.from_numpy(val_features).to(torch.float32)
labels_tensor = torch.from_numpy(val_labels).to(torch.int64)

val_ds = TensorDataset(features_tensor, labels_tensor)
# shuffle=False keeps the samples in their stored order
val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size)

print(f'Val batches: {len(val_loader)}')

Val batches: 1


In [7]:
# Run the model over the validation loader and collect predictions with labels
all_preds = []
all_labels = []
with torch.no_grad():  # gradients are not needed for evaluation
    for batch_features, batch_targets in val_loader:
        batch_logits = model(batch_features.to(device))
        # The predicted class is the index of the largest logit in each row
        batch_preds = batch_logits.argmax(dim=1)
        all_preds.append(batch_preds.cpu().numpy())
        all_labels.append(batch_targets.numpy())

# Stitch the per-batch arrays into flat arrays covering the whole set
y_true = np.concatenate(all_labels)
y_pred = np.concatenate(all_preds)

In [8]:
# Binary confusion matrix with the precision and recall derived from it
# Label convention: 0 = Normal, 1 = Pneumonia
actual_pos = y_true == 1
actual_neg = y_true == 0
flagged_pos = y_pred == 1
flagged_neg = y_pred == 0

# Count each of the four outcome cells
tp = int(np.count_nonzero(actual_pos & flagged_pos))
tn = int(np.count_nonzero(actual_neg & flagged_neg))
fp = int(np.count_nonzero(actual_neg & flagged_pos))
fn = int(np.count_nonzero(actual_pos & flagged_neg))

# Both ratios fall back to 0.0 when their denominator is empty
predicted_positive_total = tp + fp
if predicted_positive_total > 0:
    precision = tp / predicted_positive_total
else:
    precision = 0.0

actual_positive_total = tp + fn
if actual_positive_total > 0:
    recall = tp / actual_positive_total
else:
    recall = 0.0

# Rows are the true class, columns the predicted class
confusion = np.array([[tn, fp], [fn, tp]])

print('Confusion Matrix (rows=true, cols=pred):')
print(confusion)
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Confusion Matrix (rows=true, cols=pred):
[[1 7]
 [0 8]]
Precision: 0.5333
Recall: 1.0000


## Health-Focused Interpretation

- **False negatives (FN)** are the most dangerous outcome because they can miss pneumonia cases.
- **Recall** is prioritized to minimize FN. A higher recall means fewer missed pneumonia cases.
- **Precision** is the fraction of predicted pneumonia cases that are actually pneumonia; in this setting, however, recall is the primary safety metric.

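If recall is low, consider lowering the decision threshold or retraining with class weighting to reduce false negatives. Recall should not be read in isolation, though: a model that labels every case as pneumonia reaches a recall of 1.0. In the run above, recall is 1.0000 while 7 of the 8 normal cases are false positives, and the validation set contains only 16 samples, so these results should be interpreted with caution.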