In [None]:
import torch
import wandb
import os
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.init as init
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau


# Pick the compute backend: CUDA first, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# One seed shared by every random number generator seeded below.
seed = 42
np.random.seed(seed)     # NumPy
torch.manual_seed(seed)  # PyTorch

if torch.cuda.is_available():
    # Seed the current GPU, then every GPU (relevant for multi-GPU setups).
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# Load ResNet-50 with pre-trained weights. `weights=` replaces the deprecated
# `pretrained=True` flag and selects the same IMAGENET1K_V1 checkpoint.
# NOTE(review): `model` is reassigned to a ResNet-18 in a later cell, so this
# download looks unused — confirm whether this cell is still needed.
model = models.resnet50(weights='IMAGENET1K_V1')

In [None]:
# Initialize Weights & Biases.
# Credentials must never live in the notebook: wandb.login() picks the key up
# from the WANDB_API_KEY environment variable or from previously stored
# credentials, and otherwise prompts for it.
# NOTE(review): a key that was ever committed in plain text should be revoked.
wandb.login()

# Start exactly one run and name it at creation time. Calling wandb.init() a
# second time in a notebook replaces the active run, which would drop a name
# assigned afterwards through wandb.run.name.
wandb.init(project="Assignment5", entity="usf-magma", name="bermudezm")
config = wandb.config

# Define paths to your local dataset
annotations_path = "Assignment 05/archive/annotations"  # Update this to your local annotations path
images_path = "Assignment 05/archive/images"            # Update this to your local images path

# Stop right here when the annotations folder is missing; continuing would
# only fail a line later inside os.listdir with a less helpful message.
if not os.path.exists(annotations_path):
    raise FileNotFoundError(f"Error: The annotations path {annotations_path} does not exist!")

# Print contents of the annotations folder
print("Folders in annotations path:", os.listdir(annotations_path))

# Preprocessing applied to every image: resize to 224x224, convert to a
# tensor, then normalise each channel with the mean/std values below
# (the commonly used ImageNet channel statistics).
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]
_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
transform = transforms.Compose(_preprocessing_steps)

# Get class labels
def get_class_labels(annotations_path):
    """Map every breed folder under ``annotations_path`` to an integer class index.

    Only sub-directories count as classes; plain files are ignored. Folder
    names are sorted before numbering because ``os.listdir`` returns entries
    in arbitrary order, which would otherwise let the same breed receive a
    different index on another machine and make saved checkpoints ambiguous.

    Args:
        annotations_path: directory containing one sub-folder per breed.

    Returns:
        dict mapping folder name -> index in ``0 .. n_classes - 1``, numbered
        in sorted folder-name order.
    """
    breed_folders = sorted(
        entry for entry in os.listdir(annotations_path)
        if os.path.isdir(os.path.join(annotations_path, entry))
    )
    return {breed_folder: index for index, breed_folder in enumerate(breed_folders)}

# Breed-folder name -> integer class index; read by load_stanford_dogs and by
# the training cells (num_classes / class_names).
class_labels = get_class_labels(annotations_path)

# Load Stanford Dogs dataset
def load_stanford_dogs(images_path, annotations_path, transform):
    """Eagerly load every ``.jpg`` of every known breed as ``(image, class_index)`` pairs.

    Breed folders are discovered under ``annotations_path``; the images are
    read from the folder of the same name under ``images_path``. Class indices
    are looked up in the notebook-level ``class_labels`` mapping; folders that
    are missing from it are skipped.

    NOTE: every transformed image is kept in the returned list, so memory use
    grows with the size of the dataset.

    Args:
        images_path: directory holding one sub-folder of images per breed.
        annotations_path: directory holding one sub-folder per breed.
        transform: callable applied to each RGB ``PIL.Image``; its return
            value is what gets stored.

    Returns:
        list of ``(transformed_image, class_index)`` tuples, ordered by breed
        folder name and then by file name.
    """
    dataset = []
    # Sorted listings: os.listdir order is arbitrary, and the fixed-seed
    # train/test split is only reproducible when its input order is as well.
    for breed_folder in sorted(os.listdir(annotations_path)):
        annotation_folder_path = os.path.join(annotations_path, breed_folder)
        image_folder_path = os.path.join(images_path, breed_folder)

        if not os.path.isdir(annotation_folder_path) or not os.path.exists(image_folder_path):
            print(f"Skipping {breed_folder}, image folder not found!")
            continue

        class_index = class_labels.get(breed_folder, -1)
        if class_index == -1:
            continue

        for image_file in sorted(os.listdir(image_folder_path)):
            image_path = os.path.join(image_folder_path, image_file)
            if not (image_file.endswith(".jpg") and os.path.exists(image_path)):
                continue
            # The context manager releases the file handle as soon as the
            # pixels have been converted, instead of leaving it to the GC.
            with Image.open(image_path) as raw_image:
                image = transform(raw_image.convert("RGB"))
            dataset.append((image, class_index))

    return dataset

# Load every image into memory as (tensor, class_index) pairs.
full_dataset = load_stanford_dogs(images_path, annotations_path, transform)

# Validate before splitting: train_test_split raises its own, less helpful
# error on an empty dataset, so a check placed after the split never runs.
if len(full_dataset) == 0:
    raise ValueError("Error: dataset is empty! Check image paths.")

# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split repeatable.
trainset, testset = train_test_split(full_dataset, test_size=0.2, random_state=42)

print(f"Train set size: {len(trainset)} images")
print(f"Test set size: {len(testset)} images")

# Create data loaders. The batch size is named once so that the value logged
# to W&B below cannot drift away from the one actually used.
batch_size = 32
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

# Log dataset information to W&B
wandb.config.update({
    "dataset": "Stanford Dogs",
    "num_classes": len(class_labels),
    "train_size": len(trainset),
    "test_size": len(testset),
    "batch_size": batch_size
})

print(f"Loaded Stanford Dogs dataset with {len(class_labels)} classes and {len(trainset)} images.")

In [None]:
# Define the path to your dataset
dataset_path = "Assignment 05/archive/images/Images"  # Images directory

# Extract class names from folder names. Only sub-directories are classes:
# stray files (e.g. .DS_Store) would otherwise inflate len(classes), which a
# later cell uses as the size of the classifier head. Sorted for a stable order.
# NOTE(review): training labels are indexed through `class_labels`, which is
# built from `annotations_path` — confirm both trees contain the same breed
# folders so that len(classes) == len(class_labels).
classes = tuple(sorted(
    entry for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
))

print(f"Detected {len(classes)} classes:")
print(classes)

In [None]:
# Load pre-trained ResNet-18
model = models.resnet18(weights='IMAGENET1K_V1')

# Replace the final layer so it emits one logit per detected class
# (expected to be 120 for Stanford Dogs — depends on the folders found).
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, len(classes))

# Move model to the selected device. The selection mirrors the one made in
# the first cell (CUDA, then MPS, then CPU); a plain cuda/cpu choice here
# would silently push Apple-silicon machines back onto the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model = model.to(device)

# Define loss function and optimizer
# NOTE(review): the training cells create their own optimizer over
# model.fc.parameters(), so this one appears to be superseded — confirm.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Log model details to W&B
wandb.config.update({
    "model": "ResNet-18",
    "pretrained": True,
    "num_classes": len(classes),
    "learning_rate": 0.001
})

# Have W&B record gradients and parameters every 100 logged batches.
wandb.watch(model, log="all", log_freq=100)

In [None]:
# Feature extraction: gradients stay enabled only for the parameters of the
# final fully connected layer; every other parameter is frozen.
_head_param_ids = {id(head_param) for head_param in model.fc.parameters()}
for param in model.parameters():
    param.requires_grad = id(param) in _head_param_ids

# Tally trainable versus total parameter counts in a single pass.
total_params = 0
trainable_params = 0
for param in model.parameters():
    param_size = param.numel()
    total_params += param_size
    if param.requires_grad:
        trainable_params += param_size
print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params:.2%} of total)")

# Record the counts and the fine-tuning approach in the W&B run config.
_freeze_summary = {
    "trainable_params": trainable_params,
    "total_params": total_params,
    "approach": "feature_extraction"
}
wandb.config.update(_freeze_summary)

In [None]:
# Keep the network on the selected device.
model = model.to(device)

# Cross-entropy loss; Adam only receives the final layer's parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

wandb.init(project="Assignment5", entity="usf-magma")
# Hyperparameters recorded for this run.
_run_hyperparameters = {
    "optimizer": "Adam",
    "learning_rate": 0.001,
    "criterion": "CrossEntropyLoss",
    "epochs": 5  # We'll train for just 5 epochs for this example
}
wandb.config.update(_run_hyperparameters)

# Class bookkeeping read by train_model: names in index order, and the count.
class_names = list(class_labels)
num_classes = len(class_names)

# Training loop
def train_model(model, trainloader, testloader, criterion, optimizer, num_epochs=5):
    """Train ``model`` and evaluate it on ``testloader`` after every epoch.

    Reads the notebook-level names ``device``, ``num_classes`` and
    ``class_names`` and logs to the active W&B run. Whenever test accuracy
    improves, the weights are written to
    ``resnet18_stanforddogs_epoch_<n>.pth`` and uploaded with ``wandb.save``.

    Args:
        model: network to optimise.
        trainloader: loader yielding ``(inputs, labels)`` training batches.
        testloader: loader yielding ``(inputs, labels)`` evaluation batches.
        criterion: loss callable ``criterion(outputs, labels)``; assumed to
            return a per-batch mean (the default reduction).
        optimizer: optimizer stepped once per training batch.
        num_epochs: number of passes over ``trainloader``.

    Returns:
        The same ``model`` object with the weights of the last epoch (the
        best-epoch weights only exist on disk).
    """
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # ---- Training phase ----
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        all_train_preds = []
        all_train_labels = []

        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size so the epoch loss is a
            # per-sample average even when the last batch is smaller.
            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            # Plain Python ints for the F1 computation.
            all_train_preds.extend(predicted.cpu().tolist())
            all_train_labels.extend(labels.cpu().tolist())

            # Running statistics every 100 batches.
            if i % 100 == 99:
                batch_acc = 100. * correct / total
                batch_loss = running_loss / total
                print(f'Batch {i+1}, Loss: {batch_loss:.4f}, Acc: {batch_acc:.2f}%')

                wandb.log({
                    "train_batch_loss": batch_loss,
                    "train_batch_acc": batch_acc,
                    "epoch": epoch + i/len(trainloader)
                })

        train_loss = running_loss / len(trainloader.dataset)
        train_acc = 100. * correct / total
        train_f1_macro = f1_score(all_train_labels, all_train_preds, average='macro')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, F1-macro: {train_f1_macro:.4f}')

        # ---- Evaluation phase ----
        model.eval()
        test_loss = 0.0
        correct = 0
        total = 0
        class_correct = torch.zeros(num_classes, dtype=torch.long)
        class_total = torch.zeros(num_classes, dtype=torch.long)
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for inputs, labels in testloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item() * inputs.size(0)
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                # Per-class counts via bincount. No .squeeze() on the hit
                # mask: squeezing a batch of one yields a 0-dim tensor that
                # cannot be indexed per sample.
                labels_cpu = labels.cpu()
                hits = predicted.eq(labels).cpu()
                class_total += torch.bincount(labels_cpu, minlength=num_classes)
                class_correct += torch.bincount(labels_cpu[hits], minlength=num_classes)

                # Kept for the confusion matrix and the F1 score.
                all_preds.extend(predicted.cpu().tolist())
                all_labels.extend(labels_cpu.tolist())

        test_loss = test_loss / len(testloader.dataset)
        test_acc = 100. * correct / total
        test_f1_macro = f1_score(all_labels, all_preds, average='macro')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%, F1-macro: {test_f1_macro:.4f}')

        # Accuracy per class; classes absent from the test set report 0.
        per_class_acc = [
            100 * hit_count / seen if seen > 0 else 0
            for hit_count, seen in zip(class_correct.tolist(), class_total.tolist())
        ]

        # One wandb.log call per epoch so that all of the epoch's metrics
        # share a single W&B step instead of being spread over three.
        epoch_metrics = {
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "train_f1_macro": train_f1_macro,
            "test_loss": test_loss,
            "test_acc": test_acc,
            "test_f1_macro": test_f1_macro
        }
        for class_idx, class_acc in enumerate(per_class_acc):
            epoch_metrics[f"class_acc_{class_names[class_idx]}"] = class_acc
        epoch_metrics["confusion_matrix"] = wandb.plot.confusion_matrix(
            probs=None,
            y_true=all_labels,
            preds=all_preds,
            class_names=class_names
        )
        wandb.log(epoch_metrics)

        # Checkpoint whenever test accuracy improves.
        if test_acc > best_acc:
            best_acc = test_acc
            best_f1 = test_f1_macro
            torch.save(model.state_dict(), f'resnet18_stanforddogs_epoch_{epoch+1}.pth')
            wandb.save(f'resnet18_stanforddogs_epoch_{epoch+1}.pth')

            # Best-so-far metrics go into the W&B run summary.
            wandb.run.summary["best_accuracy"] = best_acc
            wandb.run.summary["best_f1_macro"] = best_f1
            wandb.run.summary["best_epoch"] = epoch + 1

    print(f'Best test accuracy: {best_acc:.2f}%')
    return model

# Train with train_model's default num_epochs (none is passed here); the
# function returns the model object it was given.
model = train_model(model, trainloader, testloader, criterion, optimizer)

# Finish the W&B run
wandb.finish()

In [None]:
# Keep the network on the selected device.
model = model.to(device)

# Cross-entropy loss; a fresh Adam instance over the final layer's parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

wandb.init(project="Assignment5", entity="usf-magma")
# Hyperparameters recorded for this run.
_run_hyperparameters = {
    "optimizer": "Adam",
    "learning_rate": 0.001,
    "criterion": "CrossEntropyLoss",
    "epochs": 15  # Updated to 15 epochs
}
wandb.config.update(_run_hyperparameters)

# Class bookkeeping read by train_model: names in index order, and the count.
class_names = list(class_labels)
num_classes = len(class_names)

# Training loop
def train_model(model, trainloader, testloader, criterion, optimizer, num_epochs=15):  # Updated to 15 epochs
    """Train ``model`` and evaluate it on ``testloader`` after every epoch.

    Reads the notebook-level names ``device``, ``num_classes`` and
    ``class_names`` and logs to the active W&B run. Whenever test accuracy
    improves, the weights are written to
    ``resnet18_stanforddogs_epoch_<n>.pth`` and uploaded with ``wandb.save``.

    Args:
        model: network to optimise.
        trainloader: loader yielding ``(inputs, labels)`` training batches.
        testloader: loader yielding ``(inputs, labels)`` evaluation batches.
        criterion: loss callable ``criterion(outputs, labels)``; assumed to
            return a per-batch mean (the default reduction).
        optimizer: optimizer stepped once per training batch.
        num_epochs: number of passes over ``trainloader``.

    Returns:
        The same ``model`` object with the weights of the last epoch (the
        best-epoch weights only exist on disk).
    """
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # ---- Training phase ----
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        all_train_preds = []
        all_train_labels = []

        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size so the epoch loss is a
            # per-sample average even when the last batch is smaller.
            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            # Plain Python ints for the F1 computation.
            all_train_preds.extend(predicted.cpu().tolist())
            all_train_labels.extend(labels.cpu().tolist())

            # Running statistics every 100 batches.
            if i % 100 == 99:
                batch_acc = 100. * correct / total
                batch_loss = running_loss / total
                print(f'Batch {i+1}, Loss: {batch_loss:.4f}, Acc: {batch_acc:.2f}%')

                wandb.log({
                    "train_batch_loss": batch_loss,
                    "train_batch_acc": batch_acc,
                    "epoch": epoch + i/len(trainloader)
                })

        train_loss = running_loss / len(trainloader.dataset)
        train_acc = 100. * correct / total
        train_f1_macro = f1_score(all_train_labels, all_train_preds, average='macro')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, F1-macro: {train_f1_macro:.4f}')

        # ---- Evaluation phase ----
        model.eval()
        test_loss = 0.0
        correct = 0
        total = 0
        class_correct = torch.zeros(num_classes, dtype=torch.long)
        class_total = torch.zeros(num_classes, dtype=torch.long)
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for inputs, labels in testloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item() * inputs.size(0)
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                # Per-class counts via bincount. No .squeeze() on the hit
                # mask: squeezing a batch of one yields a 0-dim tensor that
                # cannot be indexed per sample.
                labels_cpu = labels.cpu()
                hits = predicted.eq(labels).cpu()
                class_total += torch.bincount(labels_cpu, minlength=num_classes)
                class_correct += torch.bincount(labels_cpu[hits], minlength=num_classes)

                # Kept for the confusion matrix and the F1 score.
                all_preds.extend(predicted.cpu().tolist())
                all_labels.extend(labels_cpu.tolist())

        test_loss = test_loss / len(testloader.dataset)
        test_acc = 100. * correct / total
        test_f1_macro = f1_score(all_labels, all_preds, average='macro')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%, F1-macro: {test_f1_macro:.4f}')

        # Accuracy per class; classes absent from the test set report 0.
        per_class_acc = [
            100 * hit_count / seen if seen > 0 else 0
            for hit_count, seen in zip(class_correct.tolist(), class_total.tolist())
        ]

        # One wandb.log call per epoch so that all of the epoch's metrics
        # share a single W&B step instead of being spread over three.
        epoch_metrics = {
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "train_f1_macro": train_f1_macro,
            "test_loss": test_loss,
            "test_acc": test_acc,
            "test_f1_macro": test_f1_macro
        }
        for class_idx, class_acc in enumerate(per_class_acc):
            epoch_metrics[f"class_acc_{class_names[class_idx]}"] = class_acc
        epoch_metrics["confusion_matrix"] = wandb.plot.confusion_matrix(
            probs=None,
            y_true=all_labels,
            preds=all_preds,
            class_names=class_names
        )
        wandb.log(epoch_metrics)

        # Checkpoint whenever test accuracy improves.
        if test_acc > best_acc:
            best_acc = test_acc
            best_f1 = test_f1_macro
            torch.save(model.state_dict(), f'resnet18_stanforddogs_epoch_{epoch+1}.pth')
            wandb.save(f'resnet18_stanforddogs_epoch_{epoch+1}.pth')

            # Best-so-far metrics go into the W&B run summary.
            wandb.run.summary["best_accuracy"] = best_acc
            wandb.run.summary["best_f1_macro"] = best_f1
            wandb.run.summary["best_epoch"] = epoch + 1

    print(f'Best test accuracy: {best_acc:.2f}%')
    return model

# Train with train_model's default num_epochs (none is passed here); the
# function returns the model object it was given.
model = train_model(model, trainloader, testloader, criterion, optimizer)

# Finish the W&B run
wandb.finish()


In [None]:
# Data augmentation pipeline: random horizontal flip, random rotation of up
# to 15 degrees, tensor conversion and channel normalisation.
# NOTE(review): nothing in this cell passes `train_transforms` to a dataset
# or loader, so the augmentation looks unused — confirm.
_augmentation_steps = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
train_transforms = transforms.Compose(_augmentation_steps)

# Keep the network on the selected device.
model = model.to(device)

# Cross-entropy loss; AdamW with weight decay over the final layer only.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.fc.parameters(), lr=0.005, weight_decay=1e-4)

# Halve the learning rate after 3 epochs without a lower monitored loss.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

# Early-stopping state, read and updated by train_model through `global`.
patience = 5
patience_counter = 0
best_loss = float("inf")

wandb.init(project="Assignment5", entity="usf-magma")
# Hyperparameters recorded for this run.
_run_hyperparameters = {
    "optimizer": "AdamW",
    "learning_rate": 0.005,
    "criterion": "CrossEntropyLoss",
    "epochs": 15  # Increased epochs for better convergence
}
wandb.config.update(_run_hyperparameters)

# Class bookkeeping read by train_model: names in index order, and the count.
class_names = list(class_labels)
num_classes = len(class_names)

# Training loop
def train_model(model, trainloader, testloader, criterion, optimizer, num_epochs=15):
    """Train with LR scheduling on plateau and early stopping on the test loss.

    Reads the notebook-level names ``device``, ``class_names``, ``scheduler``
    and ``patience``, and updates the notebook-level ``best_loss`` and
    ``patience_counter`` (declared ``global``), so early-stopping state
    carries over between calls unless those variables are reset. Whenever the
    test loss improves, the weights are written to ``best_model.pth`` and
    uploaded with ``wandb.save``.

    Args:
        model: network to optimise.
        trainloader: loader yielding ``(inputs, labels)`` training batches.
        testloader: loader yielding ``(inputs, labels)`` evaluation batches.
        criterion: loss callable ``criterion(outputs, labels)``; assumed to
            return a per-batch mean (the default reduction).
        optimizer: optimizer stepped once per training batch.
        num_epochs: maximum number of epochs; training stops earlier once the
            test loss has failed to improve ``patience`` times in a row.

    Returns:
        The same ``model`` object with the weights of the last epoch run (the
        lowest-loss weights only exist in ``best_model.pth``).
    """
    global best_loss, patience_counter
    best_acc = 0.0
    # Only this many misclassified images are logged per epoch, so only this
    # many are kept — retaining every wrong sample wastes memory.
    max_logged_mistakes = 10

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # ---- Training phase ----
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        all_train_preds = []
        all_train_labels = []

        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size so the epoch loss is a
            # per-sample average even when the last batch is smaller.
            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            all_train_preds.extend(predicted.cpu().tolist())
            all_train_labels.extend(labels.cpu().tolist())

        train_loss = running_loss / len(trainloader.dataset)
        train_acc = 100. * correct / total
        train_f1_macro = f1_score(all_train_labels, all_train_preds, average='macro')

        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, F1-macro: {train_f1_macro:.4f}')

        # ---- Evaluation phase ----
        model.eval()
        test_loss = 0.0
        correct = 0
        total = 0
        all_preds = []
        all_labels = []
        misclassified_samples = []

        with torch.no_grad():
            for inputs, labels in testloader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                test_loss += loss.item() * inputs.size(0)
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                all_preds.extend(predicted.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

                # Keep the first few wrong predictions as (image, true, pred).
                if len(misclassified_samples) < max_logged_mistakes:
                    wrong_positions = (predicted != labels).nonzero(as_tuple=True)[0].tolist()
                    room_left = max_logged_mistakes - len(misclassified_samples)
                    for pos in wrong_positions[:room_left]:
                        misclassified_samples.append(
                            (inputs[pos].cpu(), labels[pos].item(), predicted[pos].item())
                        )

        test_loss = test_loss / len(testloader.dataset)
        test_acc = 100. * correct / total
        test_f1_macro = f1_score(all_labels, all_preds, average='macro')
        # Track the best accuracy so the final summary line reports it
        # instead of the initial 0.00%.
        best_acc = max(best_acc, test_acc)

        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%, F1-macro: {test_f1_macro:.4f}')

        # One wandb.log call per epoch: scalars and sample images share a
        # single W&B step instead of one step per image.
        epoch_metrics = {
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "train_f1_macro": train_f1_macro,
            "test_loss": test_loss,
            "test_acc": test_acc,
            "test_f1_macro": test_f1_macro
        }
        if misclassified_samples:
            epoch_metrics["Misclassified Sample"] = [
                wandb.Image(img, caption=f"True: {class_names[true_label]} | Pred: {class_names[pred_label]}")
                for img, true_label, pred_label in misclassified_samples
            ]
        wandb.log(epoch_metrics)

        # Let the scheduler react to the evaluation loss.
        scheduler.step(test_loss)

        # Early stopping on the evaluation loss.
        if test_loss < best_loss:
            best_loss = test_loss
            patience_counter = 0
            torch.save(model.state_dict(), f'best_model.pth')
            wandb.save('best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    print(f'Best test accuracy: {best_acc:.2f}%')
    return model

# Train with train_model's default num_epochs (none is passed here); the
# function returns the model object it was given.
model = train_model(model, trainloader, testloader, criterion, optimizer)

# Finish the W&B run
wandb.finish()


In [None]:
# Ensure the model is on the correct device
model = model.to(device)

# Set up loss function and optimizer (only the final layer is optimised)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

# The previous training cell ends with wandb.finish(), so no run is active
# at this point; start a new one before touching wandb.config or wandb.log.
wandb.init(project="Assignment5", entity="usf-magma")
# Log hyperparameters to W&B
wandb.config.update({
    "optimizer": "Adam",
    "learning_rate": 0.001,
    "criterion": "CrossEntropyLoss",
    "epochs": 5  # We'll train for just 5 epochs for this example
})

# Class bookkeeping read by train_model
num_classes = len(class_labels)
class_names = list(class_labels.keys())

# Training loop
def train_model(model, trainloader, criterion, optimizer, num_epochs=5, eval_loader=None):
    """Train ``model`` and evaluate it after every epoch.

    Reads the notebook-level names ``device``, ``num_classes`` and
    ``class_names`` and logs to the active W&B run. Whenever test accuracy
    improves, the weights are written to
    ``resnet18_stanforddogs_epoch_<n>.pth`` and uploaded with ``wandb.save``.

    Args:
        model: network to optimise.
        trainloader: loader yielding ``(inputs, labels)`` training batches.
        criterion: loss callable ``criterion(outputs, labels)``; assumed to
            return a per-batch mean (the default reduction).
        optimizer: optimizer stepped once per training batch.
        num_epochs: number of passes over ``trainloader``.
        eval_loader: loader yielding ``(inputs, labels)`` evaluation batches.
            When omitted, the notebook-level ``testloader`` is used, which is
            what this function previously read implicitly.

    Returns:
        The same ``model`` object with the weights of the last epoch (the
        best-epoch weights only exist on disk).
    """
    if eval_loader is None:
        # Backward-compatible fallback to the notebook-level loader.
        eval_loader = testloader

    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # ---- Training phase ----
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        all_train_preds = []
        all_train_labels = []

        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size so the epoch loss is a
            # per-sample average even when the last batch is smaller.
            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            # Plain Python ints for the F1 computation.
            all_train_preds.extend(predicted.cpu().tolist())
            all_train_labels.extend(labels.cpu().tolist())

            # Running statistics every 100 batches.
            if i % 100 == 99:
                batch_acc = 100. * correct / total
                batch_loss = running_loss / total
                print(f'Batch {i+1}, Loss: {batch_loss:.4f}, Acc: {batch_acc:.2f}%')

                wandb.log({
                    "train_batch_loss": batch_loss,
                    "train_batch_acc": batch_acc,
                    "epoch": epoch + i/len(trainloader)
                })

        train_loss = running_loss / len(trainloader.dataset)
        train_acc = 100. * correct / total
        train_f1_macro = f1_score(all_train_labels, all_train_preds, average='macro')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, F1-macro: {train_f1_macro:.4f}')

        # ---- Evaluation phase ----
        model.eval()
        test_loss = 0.0
        correct = 0
        total = 0
        class_correct = torch.zeros(num_classes, dtype=torch.long)
        class_total = torch.zeros(num_classes, dtype=torch.long)
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for inputs, labels in eval_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item() * inputs.size(0)
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                # Per-class counts via bincount. No .squeeze() on the hit
                # mask: squeezing a batch of one yields a 0-dim tensor that
                # cannot be indexed per sample.
                labels_cpu = labels.cpu()
                hits = predicted.eq(labels).cpu()
                class_total += torch.bincount(labels_cpu, minlength=num_classes)
                class_correct += torch.bincount(labels_cpu[hits], minlength=num_classes)

                # Kept for the confusion matrix and the F1 score.
                all_preds.extend(predicted.cpu().tolist())
                all_labels.extend(labels_cpu.tolist())

        test_loss = test_loss / len(eval_loader.dataset)
        test_acc = 100. * correct / total
        test_f1_macro = f1_score(all_labels, all_preds, average='macro')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%, F1-macro: {test_f1_macro:.4f}')

        # Accuracy per class; classes absent from the test set report 0.
        per_class_acc = [
            100 * hit_count / seen if seen > 0 else 0
            for hit_count, seen in zip(class_correct.tolist(), class_total.tolist())
        ]
        for class_idx, class_acc in enumerate(per_class_acc):
            print(f'Accuracy of {class_names[class_idx]}: {class_acc:.2f}%')

        # One wandb.log call per epoch so that all of the epoch's metrics
        # share a single W&B step instead of being spread over three.
        epoch_metrics = {
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "train_f1_macro": train_f1_macro,
            "test_loss": test_loss,
            "test_acc": test_acc,
            "test_f1_macro": test_f1_macro
        }
        for class_idx, class_acc in enumerate(per_class_acc):
            epoch_metrics[f"class_acc_{class_names[class_idx]}"] = class_acc
        epoch_metrics["confusion_matrix"] = wandb.plot.confusion_matrix(
            probs=None,
            y_true=all_labels,
            preds=all_preds,
            class_names=class_names
        )
        wandb.log(epoch_metrics)

        # Checkpoint whenever test accuracy improves.
        if test_acc > best_acc:
            best_acc = test_acc
            best_f1 = test_f1_macro
            torch.save(model.state_dict(), f'resnet18_stanforddogs_epoch_{epoch+1}.pth')
            wandb.save(f'resnet18_stanforddogs_epoch_{epoch+1}.pth')

            # Best-so-far metrics go into the W&B run summary.
            wandb.run.summary["best_accuracy"] = best_acc
            wandb.run.summary["best_f1_macro"] = best_f1
            wandb.run.summary["best_epoch"] = epoch + 1

    print(f'Best test accuracy: {best_acc:.2f}%')
    return model

# Train the model (without passing testloader)
model = train_model(model, trainloader, criterion, optimizer)

# Save the trained model (weights of the last epoch)
torch.save(model.state_dict(), 'resnet18_stanforddogs_final.pth')

# Finish the W&B run, as every other training cell does, so that buffered
# metrics are flushed; the call does nothing when no run is active.
wandb.finish()