# Model Testing on Test Set

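Runs inference with the trained `BirdCNN` model on every test audio file and reports the final metrics on the test set: accuracy, macro F1 score, cross-entropy loss, and the confusion matrix.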

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Add parent directory to path for imports
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import our custom modules
from utils.inference import perform_audio_inference
from utils.models import BirdCNN
from utils.metrics import plot_confusion_matrix

# Report the environment once every import above has resolved.
cuda_available = torch.cuda.is_available()
device_name = 'cuda' if cuda_available else 'cpu'

print("All imports successful!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_available}")
print(f"Device: {device_name}")

All imports successful!
PyTorch version: 2.7.1+cu128
CUDA available: True
Device: cuda


In [None]:
# Config: number of classes plus the locations of the weights, test audio and labels
NUM_CLASSES = 27
MODEL_WEIGHTS_PATH = os.path.join(os.pardir, 'models', 'bird_cnn.pth')
TEST_AUDIO_DIR = os.path.join(os.pardir, 'database', 'audio', 'test')
TEST_METADATA_PATH = os.path.join(os.pardir, 'database', 'meta', 'test_data.csv')

# Read the test metadata and summarise what it contains
print("Loading test metadata...")
test_df = pd.read_csv(TEST_METADATA_PATH)
print(f"✓ Loaded {len(test_df)} test samples")
print(f"✓ Columns: {test_df.columns.tolist()}")

# The label column drives both the class count and the per-class distribution
class_id_column = test_df['class_id']
print(f"✓ Classes represented: {class_id_column.nunique()}")
print("✓ Class distribution:")
print(class_id_column.value_counts().sort_index())

# Preview of the metadata table
print("\nFirst 5 rows:")
print(test_df.head())

In [None]:
# Check the test directory and pair every metadata row with its audio file
print("Verifying audio files...")
audio_files = os.listdir(TEST_AUDIO_DIR)
print(f"✓ Found {len(audio_files)} audio files in test directory")

# Rows whose audio exists on disk go to test_files; the others are recorded by name
test_files = []
missing_files = []

for _, meta_row in test_df.iterrows():
    filename = meta_row['filename']
    audio_path = os.path.join(TEST_AUDIO_DIR, filename)

    # Guard clause: remember the missing file and move on to the next row
    if not os.path.exists(audio_path):
        missing_files.append(filename)
        continue

    test_files.append({
        'filename': filename,
        'path': audio_path,
        'class_id': meta_row['class_id'],
    })

print(f"✓ Valid files: {len(test_files)}")
if not missing_files:
    print("✓ All metadata files found in audio directory")
else:
    print(f"⚠ Missing files: {len(missing_files)}")
    print(f"  First few missing: {missing_files[:5]}")

print(f"\nReady to test on {len(test_files)} audio files")

In [None]:
# Define evaluation metrics functions
def calculate_metrics(y_true, y_pred_probs, y_pred_classes):
    """
    Calculate the evaluation metrics reported for the test set.

    Args:
        y_true: True class labels (numpy array)
        y_pred_probs: Predicted probabilities (numpy array, shape: [n_samples, n_classes])
        y_pred_classes: Predicted class labels (numpy array)

    Returns:
        dict: Dictionary with the keys 'accuracy', 'f1_macro', 'cross_entropy'
        and 'confusion_matrix' (a NUM_CLASSES x NUM_CLASSES array).
    """
    # Fixed label set shared by the cross entropy and the confusion matrix, so
    # both are defined over all NUM_CLASSES classes even when some classes have
    # no evaluated samples.
    class_labels = np.arange(NUM_CLASSES)

    metrics = {}

    # Accuracy
    metrics['accuracy'] = accuracy_score(y_true, y_pred_classes)

    # F1 Score (macro average); zero_division=0 scores classes with no
    # predicted samples as 0 instead of emitting a warning.
    metrics['f1_macro'] = f1_score(y_true, y_pred_classes, average='macro', zero_division=0)

    # Cross Entropy Loss
    # Clip probabilities to avoid log(0)
    y_pred_probs_clipped = np.clip(y_pred_probs, 1e-15, 1 - 1e-15)
    # Pass the labels explicitly: without them log_loss infers the class set
    # from y_true and rejects y_pred_probs whenever y_true does not contain
    # every class (e.g. a class is absent from the test set or all of its
    # files failed during inference).
    metrics['cross_entropy'] = log_loss(y_true, y_pred_probs_clipped, labels=class_labels)

    # Confusion Matrix
    metrics['confusion_matrix'] = confusion_matrix(y_true, y_pred_classes, labels=class_labels)

    return metrics

def print_metrics(metrics):
    """Print accuracy, macro F1 and cross entropy framed by 60-character rules."""
    rule = "=" * 60

    # Header: rule, title, rule (one line each)
    print(rule, "EVALUATION METRICS", rule, sep="\n")

    accuracy = metrics['accuracy']
    print(f"Accuracy: {accuracy:.4f} ({accuracy * 100:.2f}%)")
    print(f"F1 Score (Macro): {metrics['f1_macro']:.4f}")
    print(f"Cross Entropy Loss: {metrics['cross_entropy']:.4f}")

    print(rule)

In [None]:
# Run inference on all test files
print("Starting inference on test set...")
print(f"Total files to process: {len(test_files)}")

# Storage for results
all_predictions = []
all_probabilities = []
all_true_labels = []
failed_files = []

# Process each test file
for i, file_info in enumerate(tqdm(test_files, desc="Processing audio files")):
    try:
        # Perform inference
        probabilities = perform_audio_inference(
            audio_path=file_info['path'],
            model_class=BirdCNN,
            model_weights_path=MODEL_WEIGHTS_PATH,
            reduce_noise=True
        )
        
        # Get predicted class (argmax of probabilities)
        predicted_class = np.argmax(probabilities)
        
        # Store results
        all_probabilities.append(probabilities)
        all_predictions.append(predicted_class)
        all_true_labels.append(file_info['class_id'])
        
        # Progress update every 50 files
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1}/{len(test_files)} files")
            
    except Exception as e:
        print(f"Failed to process {file_info['filename']}: {str(e)}")
        failed_files.append(file_info['filename'])
        continue

print(f"\n✓ Inference completed!")
print(f"✓ Successfully processed: {len(all_predictions)} files")
print(f"✓ Failed files: {len(failed_files)}")

if failed_files:
    print(f"Failed files: {failed_files[:5]}{'...' if len(failed_files) > 5 else ''}")

In [None]:
# Calculate evaluation metrics
if len(all_predictions) > 0:
    print("Calculating evaluation metrics...")
    
    # Convert to numpy arrays
    y_true = np.array(all_true_labels)
    y_pred_classes = np.array(all_predictions)
    y_pred_probs = np.array(all_probabilities)
    
    # Calculate metrics
    metrics = calculate_metrics(y_true, y_pred_probs, y_pred_classes)
    
    # Print results
    print_metrics(metrics)
    
    # Additional detailed statistics
    print(f"\nDetailed Statistics:")
    print(f"Total test samples: {len(y_true)}")
    print(f"Correctly classified: {np.sum(y_true == y_pred_classes)}")
    print(f"Misclassified: {np.sum(y_true != y_pred_classes)}")
    
    # Per-class accuracy
    print(f"\nPer-class Statistics:")
    unique_classes = np.unique(y_true)
    print(f"Classes present in test set: {len(unique_classes)}")
    
    for class_id in sorted(unique_classes):
        class_mask = (y_true == class_id)
        class_correct = np.sum((y_true == y_pred_classes) & class_mask)
        class_total = np.sum(class_mask)
        class_acc = class_correct / class_total if class_total > 0 else 0
        print(f"Class {class_id:2d}: {class_correct:2d}/{class_total:2d} = {class_acc:.3f}")

else:
    print("❌ No successful predictions to evaluate!")

In [None]:
# Plot Confusion Matrix
if len(all_predictions) > 0:
    print("Generating confusion matrix visualization...")
    
    # Create confusion matrix plot
    plt.figure(figsize=(15, 12))
    
    # Use the plot_confusion_matrix function from metrics
    plot_confusion_matrix(
        metrics['confusion_matrix'], 
        title="Test Set Confusion Matrix",
        figsize=(15, 12),
        show_counts=True
    )
    
    plt.tight_layout()
    plt.show()
    
    # Print confusion matrix statistics
    cm = metrics['confusion_matrix']
    print(f"\nConfusion Matrix Shape: {cm.shape}")
    print(f"Total predictions: {np.sum(cm)}")
    print(f"Diagonal sum (correct predictions): {np.trace(cm)}")
    
    # Find most confused classes
    print(f"\nMost Confused Class Pairs (excluding diagonal):")
    cm_off_diag = cm.copy()
    np.fill_diagonal(cm_off_diag, 0)
    
    # Get indices of highest confusion values
    max_indices = np.unravel_index(np.argsort(cm_off_diag.ravel())[-10:], cm_off_diag.shape)
    
    for i in range(len(max_indices[0])-1, -1, -1):  # Reverse to show highest first
        true_class = max_indices[0][i]
        pred_class = max_indices[1][i]
        count = cm_off_diag[true_class, pred_class]
        if count > 0:
            print(f"True: {true_class:2d} → Predicted: {pred_class:2d} ({count:2d} times)")

else:
    print("❌ No predictions available for confusion matrix!")

In [None]:
# Save detailed results to CSV
if len(all_predictions) > 0:
    print("Saving detailed results...")
    
    # Create results dataframe
    results_df = pd.DataFrame({
        'filename': [test_files[i]['filename'] for i in range(len(all_predictions))],
        'true_class': y_true,
        'predicted_class': y_pred_classes,
        'correct': y_true == y_pred_classes,
        'max_probability': [max(probs) for probs in y_pred_probs],
        'prediction_confidence': [y_pred_probs[i][y_pred_classes[i]] for i in range(len(y_pred_classes))]
    })
    
    # Add individual class probabilities
    for class_id in range(NUM_CLASSES):
        results_df[f'prob_class_{class_id}'] = [probs[class_id] for probs in y_pred_probs]
    
    # Save to CSV
    results_csv_path = 'test_results_detailed.csv'
    results_df.to_csv(results_csv_path, index=False)
    print(f"✓ Detailed results saved to: {results_csv_path}")
    
    # Summary statistics
    print(f"\nFinal Summary:")
    print(f"{'='*60}")
    print(f"Model Performance on Test Set")
    print(f"{'='*60}")
    print(f"Total samples tested: {len(y_true)}")
    print(f"Accuracy: {metrics['accuracy']:.4f} ({metrics['accuracy']*100:.2f}%)")
    print(f"F1 Score (Macro): {metrics['f1_macro']:.4f}")
    print(f"Cross Entropy Loss: {metrics['cross_entropy']:.4f}")
    print(f"Correctly classified: {np.sum(y_pred_classes == y_true)}/{len(y_true)}")
    print(f"Average prediction confidence: {np.mean([results_df.loc[i, 'prediction_confidence'] for i in range(len(results_df))]):.4f}")
    print(f"{'='*60}")
    
    # Show some examples of correct and incorrect predictions
    print(f"\nExample Correct Predictions:")
    correct_mask = results_df['correct'] == True
    if correct_mask.sum() > 0:
        correct_examples = results_df[correct_mask].nlargest(5, 'prediction_confidence')
        for _, row in correct_examples.iterrows():
            print(f"  {row['filename']}: True={row['true_class']}, Pred={row['predicted_class']}, Conf={row['prediction_confidence']:.3f}")
    
    print(f"\nExample Incorrect Predictions:")
    incorrect_mask = results_df['correct'] == False
    if incorrect_mask.sum() > 0:
        incorrect_examples = results_df[incorrect_mask].nlargest(5, 'prediction_confidence')
        for _, row in incorrect_examples.iterrows():
            print(f"  {row['filename']}: True={row['true_class']}, Pred={row['predicted_class']}, Conf={row['prediction_confidence']:.3f}")

print("\n🎉 Model testing completed successfully!")