# Baseline Model Evaluation Report

This notebook provides a comprehensive evaluation of the baseline ResNet18 model (`melanoma_model_weights.pth`) created in `3_create_first_model.ipynb`.

## Evaluation Metrics:
1. Overall Accuracy
2. Per-class Precision, Recall, F1-Score
3. **Melanoma Recall** (PRIMARY METRIC for medical applications)
4. Confusion Matrix
5. ROC Curves and AUC scores
6. Classification Report
7. Model architecture summary

## Purpose:
- Establish baseline performance for comparison with improved models
- Identify weaknesses in baseline model
- Document starting point before optimization

In [1]:
import os
import json
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    roc_auc_score,
)
from sklearn.preprocessing import label_binarize

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


## 1. Load Baseline Model

In [2]:
# Load the baseline ResNet18 model
model = models.resnet18(pretrained=False)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 3)  # 3 classes: benign, suspicious, melanoma

model_path = "melanoma_model_weights.pth"

if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location=device))
    print(f"‚úì Baseline model loaded from: {model_path}")
else:
    raise FileNotFoundError(f"Model file not found: {model_path}")

model = model.to(device)
model.eval()

print(f"\nModel Architecture: ResNet18")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")



‚úì Baseline model loaded from: melanoma_model_weights.pth

Model Architecture: ResNet18
Number of parameters: 11,178,051
Trainable parameters: 11,178,051


## 2. Prepare Test Dataset

In [3]:
class HAM10000Dataset(Dataset):
    """Dataset class for HAM10000."""

    def __init__(self, image_dir, ann_dir, image_files, transform=None):
        self.image_dir = image_dir
        self.ann_dir = ann_dir
        self.image_files = image_files
        self.transform = transform

        # Class mapping from original labels to 3-class groups
        self.group_map = {
            "melanoma": "melanoma",
            "basal cell carcinoma": "suspicious",
            "actinic keratoses": "suspicious",
            "melanocytic nevi": "benign",
            "benign keratosis-like lesions": "benign",
            "dermatofibroma": "benign",
            "vascular lesions": "benign",
        }

        self.group_to_idx = {"benign": 0, "suspicious": 1, "melanoma": 2}
        self.idx_to_group = {0: "benign", 1: "suspicious", 2: "melanoma"}

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)

        # Load image
        image = Image.open(img_path).convert("RGB")

        # Load annotation
        ann_path = os.path.join(self.ann_dir, img_name + ".json")
        try:
            with open(ann_path, "r") as f:
                ann = json.load(f)
            original_label = ann["objects"][0]["classTitle"]
            group_label = self.group_map[original_label]
            label = self.group_to_idx[group_label]
        except Exception as e:
            print(f"Error loading {ann_path}: {e}")
            label = 0  # Default to benign

        # Apply transforms
        if self.transform:
            image = self.transform(image)

        return image, label

In [4]:
# Load test set files
from sklearn.model_selection import train_test_split

image_dir = "data/ham10000/ds/img"
ann_dir = "data/ham10000/ds/ann"

# Get all image files
all_files = [f for f in os.listdir(image_dir) if f.endswith(".jpg")]
print(f"Total images found: {len(all_files)}")

# Split into train/val/test (same as in other notebooks for consistency)
train_files, test_files = train_test_split(all_files, test_size=0.2, random_state=42)
train_files, val_files = train_test_split(train_files, test_size=0.2, random_state=42)

print(f"\nDataset splits:")
print(f"  Training:   {len(train_files)} images ({len(train_files)/len(all_files)*100:.1f}%)")
print(f"  Validation: {len(val_files)} images ({len(val_files)/len(all_files)*100:.1f}%)")
print(f"  Test:       {len(test_files)} images ({len(test_files)/len(all_files)*100:.1f}%)")

# Create test dataset (NO AUGMENTATION)
test_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

test_dataset = HAM10000Dataset(image_dir, ann_dir, test_files, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

print(f"\n‚úì Test dataset created with {len(test_dataset)} images")

Total images found: 10015

Dataset splits:
  Training:   6409 images (64.0%)
  Validation: 1603 images (16.0%)
  Test:       2003 images (20.0%)

‚úì Test dataset created with 2003 images


## 3. Run Model Evaluation

In [5]:
def evaluate_model(model, test_loader, device):
    """Comprehensive model evaluation."""
    model.eval()

    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for images, batch_labels in tqdm(test_loader, desc="Evaluating"):
            images = images.to(device)
            outputs = model(images)
            probs = F.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch_labels.numpy())
            all_probs.extend(probs.cpu().numpy())

    return np.array(all_preds), np.array(all_labels), np.array(all_probs)


# Run evaluation
print("Evaluating baseline model on test set...\n")
preds, labels, probs = evaluate_model(model, test_loader, device)

print(f"‚úì Evaluation complete")
print(f"  Predictions generated: {len(preds)}")
print(f"  Probability scores computed: {probs.shape}")

Evaluating baseline model on test set...



Evaluating:   0%|          | 0/63 [00:05<?, ?it/s]



RuntimeError: DataLoader worker (pid(s) 19976, 16676) exited unexpectedly

## 4. Overall Performance Metrics

In [None]:
class_names = ["benign", "suspicious", "melanoma"]

# Calculate overall accuracy
test_acc = accuracy_score(labels, preds)

print("=" * 70)
print("BASELINE MODEL PERFORMANCE REPORT")
print("=" * 70)
print(f"\nModel: ResNet18 (melanoma_model_weights.pth)")
print(f"Test Set Size: {len(labels)} images")
print(f"Evaluation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\n" + "=" * 70)
print(f"OVERALL ACCURACY: {test_acc:.4f} ({test_acc*100:.2f}%)")
print("=" * 70)

## 5. Per-Class Performance (Clinical Metrics)

In [None]:
print("\n" + "=" * 70)
print("CLINICAL METRICS - PER CLASS PERFORMANCE")
print("=" * 70)

for i, class_name in enumerate(class_names):
    class_preds_binary = (preds == i).astype(int)
    class_labels_binary = (labels == i).astype(int)

    recall = recall_score(class_labels_binary, class_preds_binary, zero_division=0)
    precision = precision_score(class_labels_binary, class_preds_binary, zero_division=0)
    f1 = f1_score(class_labels_binary, class_preds_binary, zero_division=0)
    support = np.sum(labels == i)

    print(f"\n{class_name.upper()}:")
    print(f"  Support (# samples):  {support}")
    print(f"  Precision:            {precision:.4f} ({precision*100:.2f}%)")
    print(
        f"  Recall (Sensitivity): {recall:.4f} ({recall*100:.2f}%) {'üî¥ CRITICAL!' if class_name == 'melanoma' else ''}"
    )
    print(f"  F1-Score:             {f1:.4f}")

# Calculate macro and weighted averages
macro_precision = precision_score(labels, preds, average="macro", zero_division=0)
macro_recall = recall_score(labels, preds, average="macro", zero_division=0)
macro_f1 = f1_score(labels, preds, average="macro", zero_division=0)

weighted_precision = precision_score(labels, preds, average="weighted", zero_division=0)
weighted_recall = recall_score(labels, preds, average="weighted", zero_division=0)
weighted_f1 = f1_score(labels, preds, average="weighted", zero_division=0)

# Melanoma-specific recall (PRIMARY METRIC)
melanoma_recall = recall_score((labels == 2).astype(int), (preds == 2).astype(int), zero_division=0)

print("\n" + "=" * 70)
print("AGGREGATE METRICS")
print("=" * 70)
print(f"\nMacro Average (equal weight per class):")
print(f"  Precision: {macro_precision:.4f}")
print(f"  Recall:    {macro_recall:.4f}")
print(f"  F1-Score:  {macro_f1:.4f}")

print(f"\nWeighted Average (by class support):")
print(f"  Precision: {weighted_precision:.4f}")
print(f"  Recall:    {weighted_recall:.4f}")
print(f"  F1-Score:  {weighted_f1:.4f}")

print("\n" + "=" * 70)
print(f"üî¥ PRIMARY CLINICAL METRIC - MELANOMA RECALL: {melanoma_recall:.4f} ({melanoma_recall*100:.2f}%)")
print("=" * 70)

# Clinical interpretation
if melanoma_recall >= 0.95:
    print("\n‚úÖ EXCELLENT: Melanoma recall ‚â•95% - Clinically acceptable")
elif melanoma_recall >= 0.90:
    print("\n‚úì GOOD: Melanoma recall ‚â•90% - Acceptable with monitoring")
elif melanoma_recall >= 0.85:
    print("\n‚ö†Ô∏è WARNING: Melanoma recall <90% - Needs improvement")
else:
    print("\n‚ùå CRITICAL: Melanoma recall <85% - NOT clinically safe")

## 6. Classification Report

In [None]:
print("\n" + "=" * 70)
print("DETAILED CLASSIFICATION REPORT")
print("=" * 70)
print(classification_report(labels, preds, target_names=class_names, digits=4))

## 7. Confusion Matrix

In [None]:
# Calculate confusion matrix
cm = confusion_matrix(labels, preds)

print("\nConfusion Matrix (Raw Counts):")
print("=" * 70)
print("              Predicted")
print("         Benign  Suspicious  Melanoma")
print(f"Benign      {cm[0,0]:4d}      {cm[0,1]:4d}      {cm[0,2]:4d}")
print(f"Suspicious  {cm[1,0]:4d}      {cm[1,1]:4d}      {cm[1,2]:4d}")
print(f"Melanoma    {cm[2,0]:4d}      {cm[2,1]:4d}      {cm[2,2]:4d}")

# Visualize confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={"label": "Count"},
)
plt.ylabel("True Label", fontsize=12, fontweight="bold")
plt.xlabel("Predicted Label", fontsize=12, fontweight="bold")
plt.title("Confusion Matrix - Baseline ResNet18 Model", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.savefig("baseline_confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

print("\n‚úì Confusion matrix saved as: baseline_confusion_matrix.png")

## 8. Normalized Confusion Matrix (Percentages)

In [None]:
# Normalize confusion matrix by row (true labels)
cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt=".2%",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={"label": "Percentage"},
)
plt.ylabel("True Label", fontsize=12, fontweight="bold")
plt.xlabel("Predicted Label", fontsize=12, fontweight="bold")
plt.title("Normalized Confusion Matrix - Baseline ResNet18 Model\n(Row percentages)", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.savefig("baseline_confusion_matrix_normalized.png", dpi=300, bbox_inches="tight")
plt.show()

print("‚úì Normalized confusion matrix saved as: baseline_confusion_matrix_normalized.png")

# Highlight critical errors
print("\n" + "=" * 70)
print("CRITICAL ERROR ANALYSIS")
print("=" * 70)
melanoma_as_benign = cm[2, 0]  # True melanoma predicted as benign
melanoma_total = cm[2, :].sum()
print(f"\nüî¥ MOST DANGEROUS ERROR:")
print(
    f"   Melanoma misclassified as Benign: {melanoma_as_benign}/{melanoma_total} ({melanoma_as_benign/melanoma_total*100:.2f}%)"
)
print(f"   These are FALSE NEGATIVES - patient doesn't get treatment!")

melanoma_as_suspicious = cm[2, 1]  # True melanoma predicted as suspicious
print(f"\n‚ö†Ô∏è MODERATE RISK:")
print(
    f"   Melanoma misclassified as Suspicious: {melanoma_as_suspicious}/{melanoma_total} ({melanoma_as_suspicious/melanoma_total*100:.2f}%)"
)
print(f"   These still get medical attention (better than benign)")

benign_as_melanoma = cm[0, 2]  # True benign predicted as melanoma
benign_total = cm[0, :].sum()
print(f"\n‚ö†Ô∏è FALSE ALARM (less critical):")
print(
    f"   Benign misclassified as Melanoma: {benign_as_melanoma}/{benign_total} ({benign_as_melanoma/benign_total*100:.2f}%)"
)
print(f"   These cause unnecessary anxiety but are safer than false negatives")

## 9. ROC Curves and AUC Scores

In [None]:
# Binarize labels for ROC curve (one-vs-rest)
labels_binarized = label_binarize(labels, classes=[0, 1, 2])
n_classes = 3

# Compute ROC curve and AUC for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(labels_binarized[:, i], probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves
plt.figure(figsize=(12, 8))
colors = ["blue", "orange", "red"]
line_styles = ["-", "--", "-."]

for i, (color, ls) in enumerate(zip(colors, line_styles)):
    plt.plot(
        fpr[i], tpr[i], color=color, linestyle=ls, linewidth=2, label=f"{class_names[i]} (AUC = {roc_auc[i]:.4f})"
    )

plt.plot([0, 1], [0, 1], "k--", linewidth=1, label="Random Classifier (AUC = 0.5000)")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate", fontsize=12, fontweight="bold")
plt.ylabel("True Positive Rate (Recall)", fontsize=12, fontweight="bold")
plt.title("ROC Curves - Baseline ResNet18 Model\n(One-vs-Rest)", fontsize=14, fontweight="bold")
plt.legend(loc="lower right", fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("baseline_roc_curves.png", dpi=300, bbox_inches="tight")
plt.show()

print("‚úì ROC curves saved as: baseline_roc_curves.png")

print("\n" + "=" * 70)
print("AUC SCORES (Area Under ROC Curve)")
print("=" * 70)
for i, class_name in enumerate(class_names):
    print(f"{class_name:12s}: {roc_auc[i]:.4f}")

# Macro average AUC
macro_auc = np.mean(list(roc_auc.values()))
print(f"\nMacro Average AUC: {macro_auc:.4f}")
print("\nInterpretation:")
print("  AUC = 1.0: Perfect classifier")
print("  AUC = 0.9-1.0: Excellent")
print("  AUC = 0.8-0.9: Good")
print("  AUC = 0.7-0.8: Fair")
print("  AUC = 0.5-0.7: Poor")
print("  AUC = 0.5: Random guess")

## 10. Model Weaknesses and Improvement Opportunities

In [None]:
print("=" * 70)
print("BASELINE MODEL WEAKNESSES & IMPROVEMENT OPPORTUNITIES")
print("=" * 70)

# Analyze performance gaps
print("\n1. MELANOMA DETECTION (Most Critical):")
if melanoma_recall < 0.90:
    print(f"   ‚ö†Ô∏è Melanoma recall ({melanoma_recall:.4f}) is below clinical threshold (0.90)")
    print(f"   ‚Üí Need to increase sensitivity for melanoma class")
    print(f"   ‚Üí Consider class weighting, focal loss, or ensemble methods")
else:
    print(f"   ‚úì Melanoma recall ({melanoma_recall:.4f}) meets clinical threshold")

print("\n2. CLASS IMBALANCE:")
class_counts = [np.sum(labels == i) for i in range(3)]
print(f"   Benign:     {class_counts[0]} samples")
print(f"   Suspicious: {class_counts[1]} samples")
print(f"   Melanoma:   {class_counts[2]} samples")
imbalance_ratio = max(class_counts) / min(class_counts)
print(f"   Imbalance ratio: {imbalance_ratio:.2f}:1")
if imbalance_ratio > 3:
    print(f"   ‚ö†Ô∏è Significant class imbalance detected")
    print(f"   ‚Üí Use weighted loss function")
    print(f"   ‚Üí Consider oversampling minority classes")

print("\n3. MODEL ARCHITECTURE:")
print(f"   Current: ResNet18 (~11M parameters)")
print(f"   ‚Üí Try deeper models: ResNet50, EfficientNet, DenseNet")
print(f"   ‚Üí Implement ensemble methods for robustness")

print("\n4. DATA AUGMENTATION:")
print(f"   Current: Unknown (check training notebook)")
print(f"   ‚Üí Add rotation, flipping, color jitter")
print(f"   ‚Üí Test different augmentation intensities")

print("\n5. HYPERPARAMETER OPTIMIZATION:")
print(f"   Current: Manual selection")
print(f"   ‚Üí Use Optuna for systematic hyperparameter search")
print(f"   ‚Üí Optimize learning rate, batch size, weight decay")

print("\n6. OPTIMIZATION METRIC:")
print(f"   Current: Likely accuracy-based")
print(f"   ‚Üí Switch to recall-focused optimization")
print(f"   ‚Üí Use composite score: 70% melanoma recall + 30% macro recall")

print("\n" + "=" * 70)
print("RECOMMENDED NEXT STEPS:")
print("=" * 70)
print("1. Implement class-weighted loss function")
print("2. Add comprehensive data augmentation")
print("3. Try EfficientNet-B0 or ResNet50 architectures")
print("4. Run Optuna hyperparameter optimization")
print("5. Optimize for melanoma recall (not accuracy)")
print("6. Consider ensemble of top 3 models")
print("\n‚Üí See 6_model_improvement.ipynb for implementation")

## 11. Save Evaluation Report to File

In [None]:
report_path = "baseline_model_evaluation_report.txt"

with open(report_path, "w") as f:
    f.write("=" * 80 + "\n")
    f.write("BASELINE MODEL EVALUATION REPORT\n")
    f.write("=" * 80 + "\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Model: ResNet18 (melanoma_model_weights.pth)\n")
    f.write(f"Test Set Size: {len(labels)} images\n")
    f.write("=" * 80 + "\n\n")

    # Overall metrics
    f.write("OVERALL PERFORMANCE\n")
    f.write("-" * 80 + "\n")
    f.write(f"Overall Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)\n")
    f.write(f"Macro Precision:  {macro_precision:.4f}\n")
    f.write(f"Macro Recall:     {macro_recall:.4f}\n")
    f.write(f"Macro F1-Score:   {macro_f1:.4f}\n")
    f.write("\n")

    # Per-class metrics
    f.write("PER-CLASS PERFORMANCE\n")
    f.write("-" * 80 + "\n")
    for i, class_name in enumerate(class_names):
        class_preds_binary = (preds == i).astype(int)
        class_labels_binary = (labels == i).astype(int)
        recall = recall_score(class_labels_binary, class_preds_binary, zero_division=0)
        precision = precision_score(class_labels_binary, class_preds_binary, zero_division=0)
        f1 = f1_score(class_labels_binary, class_preds_binary, zero_division=0)
        support = np.sum(labels == i)

        f.write(f"\n{class_name.upper()}:\n")
        f.write(f"  Support:   {support}\n")
        f.write(f"  Precision: {precision:.4f}\n")
        f.write(f"  Recall:    {recall:.4f}\n")
        f.write(f"  F1-Score:  {f1:.4f}\n")

    # Primary metric
    f.write("\n" + "=" * 80 + "\n")
    f.write(f"PRIMARY CLINICAL METRIC - MELANOMA RECALL: {melanoma_recall:.4f} ({melanoma_recall*100:.2f}%)\n")
    f.write("=" * 80 + "\n\n")

    # AUC scores
    f.write("AUC SCORES\n")
    f.write("-" * 80 + "\n")
    for i, class_name in enumerate(class_names):
        f.write(f"{class_name:12s}: {roc_auc[i]:.4f}\n")
    f.write(f"\nMacro Average: {macro_auc:.4f}\n\n")

    # Confusion matrix
    f.write("CONFUSION MATRIX\n")
    f.write("-" * 80 + "\n")
    f.write("              Predicted\n")
    f.write("         Benign  Suspicious  Melanoma\n")
    f.write(f"Benign      {cm[0,0]:4d}      {cm[0,1]:4d}      {cm[0,2]:4d}\n")
    f.write(f"Suspicious  {cm[1,0]:4d}      {cm[1,1]:4d}      {cm[1,2]:4d}\n")
    f.write(f"Melanoma    {cm[2,0]:4d}      {cm[2,1]:4d}      {cm[2,2]:4d}\n\n")

    # Critical errors
    f.write("CRITICAL ERROR ANALYSIS\n")
    f.write("-" * 80 + "\n")
    f.write(
        f"Melanoma as Benign (FALSE NEGATIVE): {melanoma_as_benign}/{melanoma_total} ({melanoma_as_benign/melanoma_total*100:.2f}%)\n"
    )
    f.write(
        f"Melanoma as Suspicious: {melanoma_as_suspicious}/{melanoma_total} ({melanoma_as_suspicious/melanoma_total*100:.2f}%)\n"
    )
    f.write(
        f"Benign as Melanoma (FALSE POSITIVE): {benign_as_melanoma}/{benign_total} ({benign_as_melanoma/benign_total*100:.2f}%)\n\n"
    )

    f.write("=" * 80 + "\n")
    f.write("END OF REPORT\n")
    f.write("=" * 80 + "\n")

print(f"\n‚úì Evaluation report saved to: {report_path}")

## Summary

This evaluation provides a comprehensive baseline for your melanoma detection model. Key takeaways:

1. **Primary Metric**: Melanoma recall is the most important for patient safety
2. **False Negatives**: Most dangerous - missing melanoma cases
3. **False Positives**: Less critical but cause unnecessary anxiety
4. **Improvement Path**: See `6_model_improvement.ipynb` for optimization strategies

### Files Generated:
- `baseline_confusion_matrix.png`
- `baseline_confusion_matrix_normalized.png`
- `baseline_roc_curves.png`
- `baseline_model_evaluation_report.txt`

### Next Steps:
Run `6_model_improvement.ipynb` to:
- Optimize hyperparameters with Optuna
- Test advanced architectures
- Improve melanoma recall
- Compare against this baseline