# Baseline Model Evaluation on Real Images

This notebook evaluates the baseline ResNet18 model, which was trained on simulated images, against the real-image test set.

In [None]:
import os
import torch
import torch.nn as nn
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

## Configuration

In [None]:
# Filesystem locations: the real-image test split and the saved baseline checkpoint
REAL_TEST_DIR = 'data/real/test'
MODEL_PATH = 'model/baseline_resnet18.pth'

# Evaluation settings
BATCH_SIZE = 32

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

## 1. Data Transforms & Loader

In [None]:
# Preprocessing applied to every test image: resize to 224x224, convert to a
# tensor, then normalise each channel with the mean/std values given below.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Build the dataset from the folder tree and iterate it in a fixed (unshuffled) order.
test_dataset = datasets.ImageFolder(root=REAL_TEST_DIR, transform=transform)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# Class names as discovered by ImageFolder
classes = test_dataset.classes
print(
    f'Classes: {classes}',
    f'Number of test samples: {len(test_dataset)}',
    sep='\n',
)

## 2. Load Model

In [None]:
# Rebuild the ResNet18 architecture without pretrained weights. Calling the
# constructor with no arguments keeps that default on both old and new
# torchvision releases and avoids the deprecated `pretrained` keyword.
model = models.resnet18()

# Swap the classification head so its output size matches the number of classes.
model.fc = nn.Linear(model.fc.in_features, len(classes))

# NOTE(review): depending on the torch version, torch.load may unpickle arbitrary
# objects -- only load checkpoints that come from a trusted source.
state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state_dict)

model = model.to(DEVICE)
# Switch to inference mode (left as the last expression, as in the original cell).
model.eval()

## 3. Evaluate on Real Test Set

In [None]:
# Accumulate ground-truth labels and predicted class indices over the whole loader.
all_labels, all_preds = [], []

with torch.no_grad():  # inference only, no gradients needed
    for batch_images, batch_targets in test_loader:
        logits = model(batch_images.to(DEVICE))
        # Index of the largest logit along the class dimension
        batch_preds = logits.max(dim=1)[1]
        all_labels.extend(batch_targets.cpu().numpy())
        all_preds.extend(batch_preds.cpu().numpy())

all_labels, all_preds = np.array(all_labels), np.array(all_preds)

# Overall accuracy: fraction of samples whose prediction equals the label
accuracy = (all_preds == all_labels).mean()
print(f'Baseline Model Accuracy on Real Test Set: {accuracy:.4f}')

# Per-class precision / recall / F1
report = classification_report(
    all_labels,
    all_preds,
    target_names=classes,
)
print('Classification Report:\n', report)

## 4. Confusion Matrix

In [None]:
# Confusion matrix: confusion_matrix(true, predicted), drawn as an annotated heatmap
cm = confusion_matrix(all_labels, all_preds)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=classes,
    yticklabels=classes,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix on Real Test Set')
plt.show()

## 5. Show Sample Misclassified Images

In [None]:
import random
from torchvision.utils import make_grid  # no longer used here; kept in case other cells rely on the name

# Positions (in evaluation order) where the prediction differs from the label.
misclassified_idx = np.where(all_preds != all_labels)[0]
print(f'Total Misclassified Samples: {len(misclassified_idx)}')

if len(misclassified_idx) > 0:
    # Show at most 12 randomly chosen mistakes. Because the loader was built with
    # shuffle=False, these positions line up with test_dataset.samples.
    sample_idx = random.sample(list(misclassified_idx), min(12, len(misclassified_idx)))

    # Display-only preprocessing: resize and convert to a tensor, without the
    # normalisation step, so colours look natural.
    to_display = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    n_cols = 4
    n_rows = -(-len(sample_idx) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 3.2 * n_rows), squeeze=False)

    # Hide every axis first so unused cells in the last row stay blank.
    for ax in axes.flat:
        ax.axis('off')

    for ax, idx in zip(axes.flat, sample_idx):
        path, _ = test_dataset.samples[idx]
        # Use the dataset's own loader: torchvision.transforms does not expose
        # `Image`, so the previous `transforms.Image.open` call failed.
        img = to_display(test_dataset.loader(path))
        ax.imshow(np.transpose(img.numpy(), (1, 2, 0)))
        # Put the true/predicted class names on each image (previously computed
        # but never displayed).
        ax.set_title(
            f'True: {classes[all_labels[idx]]}\nPred: {classes[all_preds[idx]]}',
            fontsize=9,
        )

    fig.suptitle('Sample Misclassified Images')
    fig.tight_layout()
    plt.show()