In [None]:
# Cell 1: Environment Setup and Dependencies
import os
import sys
import platform
import time
from datetime import datetime
from pathlib import Path
import json
import yaml
import shutil
import random

# Deep learning
import torch
from ultralytics import YOLO

# Data analysis
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from PIL import Image, ImageDraw, ImageFont
import cv2

# Progress tracking
from tqdm.notebook import tqdm

# Print Python and environment information
print(f"Python version: {platform.python_version()}")
print(f"Platform: {platform.platform()}")

# Check for CUDA and report which device evaluation will run on
try:
    import torch
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU device: {torch.cuda.get_device_name(0)}")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.2f} GB")
    else:
        print("CUDA is not available - evaluation will use CPU")
except ImportError:
    print("PyTorch is not installed - you'll need to install it with pip install torch torchvision")

# Check for other required libraries
import importlib

# pip distribution names whose importable module name is not simply the
# distribution name with '-' replaced by '_'. Without this mapping,
# scikit-learn was looked up as "scikit_learn" and always reported as missing.
IMPORT_NAME_OVERRIDES = {'scikit-learn': 'sklearn'}

required_packages = ['numpy', 'matplotlib', 'pandas', 'ultralytics', 'seaborn', 'scikit-learn']
for package in required_packages:
    import_name = IMPORT_NAME_OVERRIDES.get(package, package.replace('-', '_'))
    try:
        module = importlib.import_module(import_name)
    except ImportError:
        print(f"❌ {package} is NOT installed - use pip install {package}")
        continue

    # Not every package exposes __version__; report it only when present.
    version = getattr(module, '__version__', None)
    if version is None:
        print(f"✅ {package} is installed (version unknown)")
    else:
        print(f"✅ {package} is installed (version: {version})")

# Manually set the project root path to ensure accuracy
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
project_root = "/home/peter/Desktop/TU PHD/WildlifeDetectionSystem"
print(f"\nProject root path: {project_root}")

# Output the current working directory for reference
print(f"Current working directory: {os.getcwd()}")

print("\nEnvironment setup check complete!")

In [None]:
# Cell 2: Load Training Results Configuration
# Find and load the configuration generated by the model training notebook
import os
import json
import yaml
from datetime import datetime

# Hard-coded absolute project root, repeated here (Cell 1 sets the same value)
# so that this cell can be run on its own. Every path below is derived from it.
# NOTE(review): machine-specific path; adjust when running on another machine.
project_root = "/home/peter/Desktop/TU PHD/WildlifeDetectionSystem"
print(f"Project root path: {project_root}")

def find_latest_config(config_dir, prefix="training_config_"):
    """Return the full path of the newest config JSON in ``config_dir``.

    Candidates are the directory entries that start with ``prefix`` and end
    with ``.json``. "Newest" is the lexicographically greatest file name,
    which matches chronological order for names of the form
    ``training_config_YYYYMMDD_HHMM.json``. Returns None when there is no
    candidate. ``config_dir`` must exist (``os.listdir`` raises otherwise).
    """
    candidates = []
    for entry in os.listdir(config_dir):
        if entry.startswith(prefix) and entry.endswith('.json'):
            candidates.append(entry)

    if not candidates:
        return None

    # The timestamp is embedded in the name, so the greatest name is the newest.
    newest = max(candidates)
    return os.path.join(config_dir, newest)

# Define paths
config_dir = os.path.join(project_root, "config")

# Defaults for everything read from the training configuration. They keep the
# rest of this cell (and the truthiness checks in later cells) working when no
# configuration can be loaded; previously that case ended in a NameError while
# building `evaluation_config` below.
latest_config = None
training_config = {}
timestamp = None
input_config = {}
class_names = []
taxonomic_groups = {}
standard_dataset_path = None
hierarchical_dataset_path = None
standard_model_path = None
hierarchical_model_path = None
standard_model_dir = None
hierarchical_model_dir = None


def _resolve_model_path(configured_path, model_dir, label):
    """Return the best available weights path for one model and report on it.

    Args:
        configured_path: "best model" path stored in the training config (may be None).
        model_dir: training run directory from the config (may be None); used to
            look for ``weights/best.pt`` when ``configured_path`` is unusable.
        label: "standard" or "hierarchical"; only used in the printed messages.

    Returns:
        ``configured_path`` when it exists, otherwise ``<model_dir>/weights/best.pt``
        when that exists, otherwise ``configured_path`` unchanged (possibly None).
    """
    if configured_path and os.path.exists(configured_path):
        print(f"✅ {label.capitalize()} model exists: {configured_path}")
        return configured_path

    if configured_path:
        print(f"❌ {label.capitalize()} model not found: {configured_path}")
    else:
        print(f"❓ {label.capitalize()} model path not found in configuration")

    # Try the expected location inside the run directory. This is attempted for
    # a stale configured path as well as for a missing one.
    if model_dir:
        potential_model_path = os.path.join(model_dir, "weights", "best.pt")
        if os.path.exists(potential_model_path):
            print(f"✅ Found {label} model at: {potential_model_path}")
            return potential_model_path
        print(f"❌ Could not find {label} model at expected location: {potential_model_path}")

    return configured_path


if not os.path.exists(config_dir):
    print(f"❌ Config directory not found: {config_dir}")
    print("Please run notebook 2 (model training) first")
else:
    # Try to find the latest config file
    latest_config = find_latest_config(config_dir)

    if latest_config and os.path.exists(latest_config):
        print(f"Found configuration from notebook 2: {latest_config}")

        # Load configuration
        with open(latest_config, 'r', encoding='utf-8') as f:
            training_config = json.load(f)

        # Extract key paths and parameters ("timestamp" and "input" are required keys)
        timestamp = training_config["timestamp"]
        input_config = training_config["input"]

        # Get class names and taxonomic groups
        class_names = input_config["class_names"]
        taxonomic_groups = input_config["taxonomic_groups"]

        # Get dataset paths
        standard_dataset_path = input_config["standard_dataset"]
        hierarchical_dataset_path = input_config["hierarchical_dataset"]

        # Get actual model run directories if present
        standard_model_dir = training_config.get("standard_model", {}).get("actual_model_path")
        hierarchical_model_dir = training_config.get("hierarchical_model", {}).get("actual_model_path")

        print(f"\nLoaded configuration with timestamp: {timestamp}")
        print(f"Number of classes: {len(class_names)}")
        print(f"Number of taxonomic groups: {len(taxonomic_groups)}")

        # Check that the model files exist, falling back to the run directory
        standard_model_path = _resolve_model_path(
            training_config.get("standard_best_model_path"),
            standard_model_dir,
            "standard",
        )
        hierarchical_model_path = _resolve_model_path(
            training_config.get("hierarchical_best_model_path"),
            hierarchical_model_dir,
            "hierarchical",
        )
    else:
        print(f"❌ Configuration from notebook 2 not found in {config_dir}")
        print("Please run notebook 2 (model training) first")

# Define the output paths for this notebook
reports_dir = os.path.join(project_root, "reports")
evaluation_dir = os.path.join(reports_dir, "evaluation")
timestamp_now = datetime.now().strftime("%Y%m%d_%H%M")

# Create evaluation directory with timestamp
evaluation_timestamp_dir = os.path.join(evaluation_dir, f"evaluation_{timestamp_now}")
os.makedirs(evaluation_timestamp_dir, exist_ok=True)

print(f"\nEvaluation results will be saved to: {evaluation_timestamp_dir}")

# Save the evaluation configuration for reference and tracking
evaluation_config = {
    "notebook": "03_model_evaluation",
    "timestamp": timestamp_now,
    "input": {
        "training_config": latest_config,
        "standard_model": standard_model_path,
        "hierarchical_model": hierarchical_model_path,
        "standard_dataset": standard_dataset_path,
        "hierarchical_dataset": hierarchical_dataset_path,
        "class_names": class_names,
        "taxonomic_groups": taxonomic_groups
    },
    "output": {
        "evaluation_dir": evaluation_timestamp_dir,
        "reports_dir": reports_dir
    }
}

# Save configuration (the config directory may not exist yet on a fresh checkout)
os.makedirs(config_dir, exist_ok=True)
evaluation_config_path = os.path.join(config_dir, f"evaluation_config_{timestamp_now}.json")
with open(evaluation_config_path, 'w') as f:
    json.dump(evaluation_config, f, indent=2)

print(f"\nEvaluation configuration saved to: {evaluation_config_path}")

In [None]:
# Cell 3: Model Loading and Basic Testing
# Load both standard and hierarchical models for evaluation

def load_and_inspect_model(model_path, model_type="standard"):
    """Load YOLO weights from ``model_path`` and print a short summary.

    Args:
        model_path: path to the weights file; may be None or empty.
        model_type: label used in the printed messages.

    Returns:
        The loaded ``YOLO`` object, or None when the path is empty or missing,
        or when loading or summarising the model raises.
    """
    path_is_usable = bool(model_path) and os.path.exists(model_path)
    if not path_is_usable:
        print(f"❌ {model_type.capitalize()} model path not found or invalid: {model_path}")
        return None

    try:
        print(f"Loading {model_type} model from: {model_path}")
        yolo = YOLO(model_path)

        # Summary header, then the wrapped network's basic properties
        print(f"\n{model_type.capitalize()} Model Information:")
        network = yolo.model
        print(f"- Architecture: {network.name if hasattr(network, 'name') else 'YOLOv8'}")
        print(f"- Number of classes: {network.nc}")
        print(f"- Input size: {network.args.get('imgsz', 640)}px")

        # Preview at most five class names, with an ellipsis when there are more
        if hasattr(yolo, "names") and yolo.names:
            preview = ', '.join(list(yolo.names.values())[:5])
            suffix = '...' if len(yolo.names) > 5 else ''
            print(f"- Classes: {preview}{suffix}")
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip this model.
        print(f"Error loading {model_type} model: {e}")
        return None

    return yolo

def test_model_on_sample(model, image_path, conf_threshold=0.25, model_type="standard"):
    """Run ``model`` on a single image, print the top detections and save the result.

    The first prediction result is saved as
    ``<evaluation_timestamp_dir>/visualizations/<model_type>_sample_detection.jpg``
    (``evaluation_timestamp_dir`` is a module-level name set in an earlier cell).

    Args:
        model: loaded model exposing ``predict`` and ``names``; falsy means "not loaded".
        image_path: path of the image to run on.
        conf_threshold: confidence threshold forwarded to ``model.predict``.
        model_type: label used in messages and in the output file name.

    Returns:
        The first prediction result, or None when the model or image is missing,
        the prediction list is empty, or any step raises.
    """
    if not model:
        print(f"❌ {model_type.capitalize()} model not loaded.")
        return None

    if not os.path.exists(image_path):
        print(f"❌ Sample image not found: {image_path}")
        return None

    try:
        print(f"\nTesting {model_type} model on sample image: {os.path.basename(image_path)}")
        predictions = model.predict(source=image_path, conf=conf_threshold, verbose=False)

        if not (predictions and len(predictions) > 0):
            print("No detections found in the sample image.")
            return None

        first_result = predictions[0]
        print(f"Detections: {len(first_result.boxes)}")

        # Report up to three boxes, highest confidence first
        detected = first_result.boxes
        if len(detected) > 0:
            print("\nTop detections:")

            scores = detected.conf.cpu().numpy()
            best_first = np.argsort(scores)[::-1][:3]  # Top 3

            for rank, box_idx in enumerate(best_first, start=1):
                label = model.names[int(detected.cls[box_idx].item())]
                score = scores[box_idx]
                print(f"  {rank}. {label}: {score:.2f}")

        # Make sure the visualization directory exists, then save the result there
        vis_dir = os.path.join(evaluation_timestamp_dir, "visualizations")
        os.makedirs(vis_dir, exist_ok=True)

        result_path = os.path.join(vis_dir, f"{model_type}_sample_detection.jpg")
        first_result.save(filename=result_path)
        print(f"Visualization saved to: {result_path}")

        return first_result
    except Exception as e:
        # Best-effort smoke test: report and continue with the rest of the notebook.
        print(f"Error testing {model_type} model: {e}")
        return None

# Load both detectors; each name is None when its weights could not be loaded.
standard_model = load_and_inspect_model(model_path=standard_model_path, model_type="standard")

hierarchical_model = load_and_inspect_model(model_path=hierarchical_model_path, model_type="hierarchical")

# Find a sample image for testing
def find_sample_image(dataset_path):
    """Return the path of one sample image, or None when none can be found.

    The dataset's ``images/val`` directory is searched first. If it is missing
    or holds no images, ``<project_root>/data/raw_images`` is walked recursively
    (``project_root`` is a module-level name).

    Candidates are sorted by file name so the same image is picked on every
    run; ``os.listdir`` / ``os.walk`` give no ordering guarantee on their own.

    Args:
        dataset_path: dataset root directory; may be None, in which case only
            the raw-images fallback is tried.

    Returns:
        Path to a ``.jpg`` / ``.jpeg`` / ``.png`` file (case-insensitive), or None.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    if dataset_path:
        val_images_dir = os.path.join(dataset_path, "images", "val")
        if os.path.isdir(val_images_dir):
            image_files = sorted(
                f for f in os.listdir(val_images_dir) if f.lower().endswith(image_exts)
            )
            if image_files:
                return os.path.join(val_images_dir, image_files[0])

    # If validation set not found, try raw images
    raw_images_dir = os.path.join(project_root, "data", "raw_images")
    if os.path.isdir(raw_images_dir):
        for root, dirs, files in os.walk(raw_images_dir):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            image_files = sorted(f for f in files if f.lower().endswith(image_exts))
            if image_files:
                return os.path.join(root, image_files[0])

    return None

# Find a sample image
sample_image_path = find_sample_image(standard_dataset_path)

# Always bind the result names (None = test not run), matching how the later
# evaluation cells bind their result variables even when a model is skipped.
standard_results = None
hierarchical_results = None

if not sample_image_path:
    print("❌ No sample images found for testing.")
else:
    print(f"Found sample image: {sample_image_path}")

    # Test both models on the sample image
    if standard_model:
        standard_results = test_model_on_sample(standard_model, sample_image_path, conf_threshold=0.25, model_type="standard")

    if hierarchical_model:
        hierarchical_results = test_model_on_sample(hierarchical_model, sample_image_path, conf_threshold=0.25, model_type="hierarchical")

In [None]:
# Cell 4: Comprehensive Model Evaluation
# Evaluate both models on the validation set with detailed metrics

def evaluate_model(model, dataset_path, conf_thresholds=None, iou_threshold=0.7, model_type="standard"):
    """Validate ``model`` on the dataset at one or more confidence thresholds.

    Args:
        model: loaded model exposing ``val(...)``; falsy means "not loaded".
        dataset_path: dataset root directory that must contain ``data.yaml``.
        conf_thresholds: iterable of confidence thresholds; defaults to ``[0.25]``.
        iou_threshold: value forwarded to ``model.val`` as ``iou``.
        model_type: label used in messages and stored in the result.

    Returns:
        None when the model is missing or ``data.yaml`` does not exist.
        Otherwise a dict with ``model_type``, ``dataset`` and ``thresholds``,
        where ``thresholds`` maps ``str(threshold)`` to either a metrics dict
        (precision, recall, mAP50, mAP50-95, val_images, speed_ms, and
        optionally class_metrics) or ``{"error": message}`` when validation
        at that threshold raised.
    """
    if not model:
        print(f"❌ {model_type.capitalize()} model not loaded.")
        return None

    # Avoid a mutable default argument; None means "just the usual 0.25".
    if conf_thresholds is None:
        conf_thresholds = [0.25]

    # Find the YAML file
    yaml_path = os.path.join(dataset_path, "data.yaml")
    if not os.path.exists(yaml_path):
        print(f"❌ YAML file not found: {yaml_path}")
        return None

    # Create results dictionary
    results = {
        "model_type": model_type,
        "dataset": dataset_path,
        "thresholds": {}
    }

    print(f"\nEvaluating {model_type} model on validation set...")

    # Use CUDA device 0 if available, otherwise the CPU
    device = 0 if torch.cuda.is_available() else 'cpu'

    # Evaluate at each confidence threshold
    for conf_threshold in conf_thresholds:
        print(f"\nEvaluating at confidence threshold: {conf_threshold}")

        try:
            # Run validation
            val_results = model.val(
                data=yaml_path,
                conf=conf_threshold,
                iou=iou_threshold,
                verbose=True,
                device=device
            )

            # NOTE(review): `nc` is presumably the number of classes, not the
            # number of validation images; the "val_images" key is kept so the
            # saved JSON schema does not change. The attribute is read
            # defensively because a result object without `nc` used to discard
            # every metric for this threshold via the except branch below.
            class_count = getattr(val_results, "nc", None)
            speed = val_results.speed

            # Extract key metrics
            metrics = {
                "precision": float(val_results.box.mp),
                "recall": float(val_results.box.mr),
                "mAP50": float(val_results.box.map50),
                "mAP50-95": float(val_results.box.map),
                "val_images": int(class_count) if class_count is not None else None,
                "speed_ms": float(speed['inference']) + float(speed['preprocess']) + float(speed['postprocess'])
            }

            # Get class-specific metrics if available
            # NOTE(review): stored as-is; confirm it is JSON-serializable before
            # the evaluation config is dumped.
            if hasattr(val_results, 'class_metrics'):
                metrics["class_metrics"] = val_results.class_metrics

            # Store results for this threshold
            results["thresholds"][str(conf_threshold)] = metrics

            # Print summary
            print(f"\nResults at threshold {conf_threshold}:")
            print(f"- Precision: {metrics['precision']:.4f}")
            print(f"- Recall: {metrics['recall']:.4f}")
            print(f"- mAP50: {metrics['mAP50']:.4f}")
            print(f"- mAP50-95: {metrics['mAP50-95']:.4f}")
            print(f"- Speed: {metrics['speed_ms']:.2f} ms per image")

        except Exception as e:
            # Record the failure and continue with the remaining thresholds.
            print(f"Error evaluating at threshold {conf_threshold}: {e}")
            results["thresholds"][str(conf_threshold)] = {"error": str(e)}

    return results

def perform_threshold_analysis(model, dataset_path, model_type="standard"):
    """Evaluate ``model`` over a fixed set of confidence thresholds and plot the trend.

    Runs ``evaluate_model`` at thresholds 0.05, 0.1, 0.25, 0.5 and 0.75, then
    plots precision, recall and mAP50 against the threshold and saves the
    figure as ``<evaluation_timestamp_dir>/<model_type>_threshold_analysis.png``.

    Args:
        model: loaded model; falsy means "not loaded".
        dataset_path: dataset root directory (must contain ``data.yaml``).
        model_type: label used in messages, the plot title and the file name.

    Returns:
        None when the model is missing or ``evaluate_model`` returned nothing.
        Otherwise the ``evaluate_model`` result extended with ``threshold_df``
        (``DataFrame.to_dict()`` of the successful runs) and ``threshold_plot``
        (path of the saved figure). When every threshold failed there is
        nothing to plot: ``threshold_df`` is ``{}`` and ``threshold_plot`` is None.
    """
    if not model:
        print(f"❌ {model_type.capitalize()} model not loaded.")
        return None

    # Define a range of thresholds
    thresholds = [0.05, 0.1, 0.25, 0.5, 0.75]

    # Evaluate model at each threshold
    print(f"\nPerforming threshold analysis for {model_type} model...")
    results = evaluate_model(model, dataset_path, conf_thresholds=thresholds, model_type=model_type)

    if not results:
        return None

    # Keep only thresholds that validated successfully
    threshold_data = [
        {
            "threshold": float(threshold),
            "precision": metrics["precision"],
            "recall": metrics["recall"],
            "mAP50": metrics["mAP50"],
            "mAP50-95": metrics["mAP50-95"],
            "speed_ms": metrics["speed_ms"],
        }
        for threshold, metrics in results["thresholds"].items()
        if "error" not in metrics
    ]

    threshold_df = pd.DataFrame(threshold_data)

    # An empty frame has no columns, so the column lookups below would raise
    # KeyError; return the per-threshold errors without a plot instead.
    if threshold_df.empty:
        print(f"❌ No successful evaluations for {model_type} model - skipping threshold plot.")
        results["threshold_df"] = {}
        results["threshold_plot"] = None
        return results

    # Plot precision, recall and mAP50 against the threshold
    plt.figure(figsize=(10, 6))
    plt.plot(threshold_df["threshold"], threshold_df["precision"], "b-", label="Precision")
    plt.plot(threshold_df["threshold"], threshold_df["recall"], "r-", label="Recall")
    plt.plot(threshold_df["threshold"], threshold_df["mAP50"], "g-", label="mAP50")
    plt.xlabel("Confidence Threshold")
    plt.ylabel("Metric Value")
    plt.title(f"{model_type.capitalize()} Model - Metrics vs Confidence Threshold")
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Save the plot
    plot_path = os.path.join(evaluation_timestamp_dir, f"{model_type}_threshold_analysis.png")
    plt.savefig(plot_path)
    print(f"Threshold analysis plot saved to: {plot_path}")

    # Add results to the output
    results["threshold_df"] = threshold_df.to_dict()
    results["threshold_plot"] = plot_path

    return results

# Threshold sweep for the standard model (skipped when model or dataset is missing)
if not (standard_model and standard_dataset_path):
    standard_eval_results = None
    print("Skipping standard model evaluation due to missing model or dataset.")
else:
    standard_eval_results = perform_threshold_analysis(
        model=standard_model,
        dataset_path=standard_dataset_path,
        model_type="standard",
    )

# Threshold sweep for the hierarchical model
if not (hierarchical_model and hierarchical_dataset_path):
    hierarchical_eval_results = None
    print("Skipping hierarchical model evaluation due to missing model or dataset.")
else:
    hierarchical_eval_results = perform_threshold_analysis(
        model=hierarchical_model,
        dataset_path=hierarchical_dataset_path,
        model_type="hierarchical",
    )

# Record whichever sweeps produced results in the evaluation configuration
for config_key, eval_results in (
    ("standard_eval_results", standard_eval_results),
    ("hierarchical_eval_results", hierarchical_eval_results),
):
    if eval_results:
        evaluation_config[config_key] = eval_results

# Rewrite the evaluation config file with the added results
with open(evaluation_config_path, 'w') as config_file:
    json.dump(evaluation_config, config_file, indent=2)

In [None]:
# Cell 5: Per-Class Performance Analysis
# Analyze performance metrics for each class and taxonomic group

def _extract_per_class_metrics(val_results, index_to_name):
    """Build ``{class name: metrics}`` from a validation result, or None if unavailable.

    Reads ``val_results.box.ap_class_index`` (class ids that have results) and
    ``val_results.box.class_result(i)``, which per the ultralytics ``Metric``
    documentation returns ``(precision, recall, ap50, ap)`` for the i-th entry.
    """
    box = getattr(val_results, "box", None)
    class_indices = getattr(box, "ap_class_index", None)
    if class_indices is None or not hasattr(box, "class_result"):
        return None

    per_class = {}
    for position, class_index in enumerate(class_indices):
        class_index = int(class_index)
        name = index_to_name.get(class_index, f"Class {class_index}")
        precision, recall, ap50, ap = box.class_result(position)
        per_class[name] = {
            "precision": float(precision),
            "recall": float(recall),
            "mAP50": float(ap50),
            "mAP50-95": float(ap),
        }
    return per_class


def analyze_per_class_performance(model, dataset_path, conf_threshold=0.25, model_type="standard", class_names=None, taxonomic_groups=None):
    """Validate ``model`` once and report precision/recall/mAP for every class.

    Saves two figures into ``evaluation_timestamp_dir`` (module-level name):
    ``<model_type>_per_class_map.png`` (bar chart of mAP50) and
    ``<model_type>_precision_recall_by_class.png`` (scatter plot).

    Args:
        model: loaded model exposing ``val(...)`` and ``names``; falsy means "not loaded".
        dataset_path: dataset root directory that must contain ``data.yaml``.
        conf_threshold: confidence threshold forwarded to ``model.val``.
        model_type: "standard" or "hierarchical"; used in messages and file names
            and to choose which configured names act as a fallback.
        class_names: configured class names (indexable by class id), used for the
            standard model when the model itself has no name for an id.
        taxonomic_groups: mapping whose key order is used the same way for the
            hierarchical model.

    Returns:
        Dict with ``model_type``, ``class_metrics``, ``class_df`` and the two
        plot paths, or None when the model or YAML is missing, no per-class
        metrics exist, or any step raises (the traceback is printed).
    """
    if not model:
        print(f"❌ {model_type.capitalize()} model not loaded.")
        return None

    print(f"\nAnalyzing per-class performance for {model_type} model...")

    # Get validation results with per-class metrics
    yaml_path = os.path.join(dataset_path, "data.yaml")
    if not os.path.exists(yaml_path):
        print(f"❌ YAML file not found: {yaml_path}")
        return None

    try:
        # Run validation on CUDA device 0 if available, otherwise on the CPU
        device = 0 if torch.cuda.is_available() else 'cpu'
        val_results = model.val(
            data=yaml_path,
            conf=conf_threshold,
            verbose=False,
            device=device
        )

        # Class id -> display name. The names carried by the result/model come
        # first because they are what the predictions index into; the configured
        # names only fill ids the model has no name for.
        model_names = getattr(val_results, "names", None) or getattr(model, "names", None) or {}
        if isinstance(model_names, dict):
            index_to_name = dict(model_names)
        else:
            index_to_name = dict(enumerate(model_names))

        if model_type == "standard" and class_names:
            fallback_names = list(class_names)
        elif model_type == "hierarchical" and taxonomic_groups:
            fallback_names = list(taxonomic_groups.keys())
        else:
            fallback_names = []
        for class_index, name in enumerate(fallback_names):
            index_to_name.setdefault(class_index, name)

        # Extract per-class metrics. Precision and recall now come from separate
        # values; both used to be filled from the same `box.cls[i]` entry.
        class_metrics = _extract_per_class_metrics(val_results, index_to_name)
        if class_metrics is None:
            # Fallback in case the validation API changes: keep the class list,
            # with all-zero placeholders rather than real measurements.
            print("Warning: Unable to extract per-class metrics directly. Using alternate approach.")
            class_metrics = {
                name: {"precision": 0.0, "recall": 0.0, "mAP50": 0.0, "mAP50-95": 0.0}
                for name in index_to_name.values()
            }

        # Nothing to tabulate or plot (an empty frame would fail in sort_values).
        if not class_metrics:
            print(f"❌ No per-class metrics available for {model_type} model.")
            return None

        # Create a dataframe for easier visualization
        class_df = pd.DataFrame(
            [
                {
                    "class": name,
                    "precision": metrics["precision"],
                    "recall": metrics["recall"],
                    "mAP50": metrics["mAP50"],
                    "mAP50-95": metrics["mAP50-95"],
                }
                for name, metrics in class_metrics.items()
            ]
        )

        # Sort by mAP50 for better visualization
        class_df = class_df.sort_values(by="mAP50", ascending=False)

        # Plot per-class mAP50
        plt.figure(figsize=(12, 8))
        sns.barplot(x="class", y="mAP50", data=class_df)
        plt.xticks(rotation=90)
        plt.xlabel("Class")
        plt.ylabel("mAP50")
        plt.title(f"{model_type.capitalize()} Model - Per-Class mAP50")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        # Save the plot
        map_plot_path = os.path.join(evaluation_timestamp_dir, f"{model_type}_per_class_map.png")
        plt.savefig(map_plot_path)
        print(f"Per-class mAP50 plot saved to: {map_plot_path}")

        # Plot precision vs recall by class
        plt.figure(figsize=(10, 8))
        plt.scatter(class_df["recall"], class_df["precision"], s=100, alpha=0.7)

        # Annotate points with class names
        for _, row in class_df.iterrows():
            plt.annotate(row["class"], (row["recall"], row["precision"]),
                         xytext=(5, 5), textcoords="offset points", fontsize=8)

        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(f"{model_type.capitalize()} Model - Precision vs Recall by Class")
        plt.grid(True, alpha=0.3)
        plt.xlim(0, 1)
        plt.ylim(0, 1)

        # Save the plot
        pr_plot_path = os.path.join(evaluation_timestamp_dir, f"{model_type}_precision_recall_by_class.png")
        plt.savefig(pr_plot_path)
        print(f"Precision-recall by class plot saved to: {pr_plot_path}")

        # Return results
        return {
            "model_type": model_type,
            "class_metrics": class_metrics,
            "class_df": class_df.to_dict(),
            "per_class_map_plot": map_plot_path,
            "precision_recall_plot": pr_plot_path
        }

    except Exception as e:
        # Best-effort analysis: report, show the traceback, and let the notebook continue.
        print(f"Error analyzing per-class performance: {e}")
        import traceback
        traceback.print_exc()
        return None

# Per-class analysis for the standard model (skipped when model or dataset is missing)
if not (standard_model and standard_dataset_path):
    standard_class_results = None
    print("Skipping standard model per-class analysis due to missing model or dataset.")
else:
    standard_class_results = analyze_per_class_performance(
        model=standard_model,
        dataset_path=standard_dataset_path,
        conf_threshold=0.25,
        model_type="standard",
        class_names=class_names,
    )

# Per-class analysis for the hierarchical model
if not (hierarchical_model and hierarchical_dataset_path):
    hierarchical_class_results = None
    print("Skipping hierarchical model per-class analysis due to missing model or dataset.")
else:
    hierarchical_class_results = analyze_per_class_performance(
        model=hierarchical_model,
        dataset_path=hierarchical_dataset_path,
        conf_threshold=0.25,
        model_type="hierarchical",
        taxonomic_groups=taxonomic_groups,
    )

# Record whichever analyses produced results in the evaluation configuration
for config_key, class_results in (
    ("standard_class_results", standard_class_results),
    ("hierarchical_class_results", hierarchical_class_results),
):
    if class_results:
        evaluation_config[config_key] = class_results

# Rewrite the evaluation config file with the added results
with open(evaluation_config_path, 'w') as config_file:
    json.dump(evaluation_config, config_file, indent=2)

In [None]:
# Cell 6: Confusion Matrix Analysis
# Generate and visualize confusion matrices for both models

def generate_confusion_matrix(model, dataset_path, conf_threshold=0.25, model_type="standard", class_names=None, taxonomic_groups=None):
    """Build, plot and save a class-level confusion matrix for ``model``.

    For every validation image that has a non-empty label file, the ground-truth
    class ids are paired *positionally* with the predicted class ids of that
    image; no IoU matching of boxes is performed. Predictions are padded with -1
    ("no detection") or truncated so both lists have the same length.

    Outputs written to ``evaluation_timestamp_dir`` (module-level name):
    ``<model_type>_confusion_matrix.png``,
    ``<model_type>_normalized_confusion_matrix.png`` (only when the matrix has a
    non-zero sum) and ``<model_type>_confusion_matrix.json``.

    Args:
        model: loaded model exposing ``predict`` and ``names``; falsy means "not loaded".
        dataset_path: dataset root containing ``data.yaml``, ``images/val`` and ``labels/val``.
        conf_threshold: confidence threshold forwarded to ``model.predict``.
        model_type: label used in messages, plot titles and file names.
        class_names: only used to compute the local ``names`` list (see NOTE below).
        taxonomic_groups: only used to compute the local ``names`` list (see NOTE below).

    Returns:
        Dict with the raw and normalized matrices (as lists), the class labels
        and the plot paths, or None when inputs are missing, there is no usable
        data, or any step raises (the traceback is printed).
    """
    if not model:
        print(f"❌ {model_type.capitalize()} model not loaded.")
        return None
    
    print(f"\nGenerating confusion matrix for {model_type} model...")
    
    # The YAML file is only checked for existence here; it is not read.
    yaml_path = os.path.join(dataset_path, "data.yaml")
    if not os.path.exists(yaml_path):
        print(f"❌ YAML file not found: {yaml_path}")
        return None
    
    # Find validation images and labels
    val_images_dir = os.path.join(dataset_path, "images", "val")
    val_labels_dir = os.path.join(dataset_path, "labels", "val")
    
    if not os.path.exists(val_images_dir) or not os.path.exists(val_labels_dir):
        print(f"❌ Validation images or labels directory not found.")
        return None
    
    try:
        # Get list of validation images
        val_images = [f for f in os.listdir(val_images_dir) if f.lower().endswith((".jpg", ".jpeg", ".png"))]
        
        if not val_images:
            print("❌ No validation images found.")
            return None
        
        # Parallel lists: y_true[k] is paired with y_pred[k]
        y_true = []
        y_pred = []
        
        # Format is different between standard (class names) and hierarchical (group names)
        # NOTE(review): `names` is not referenced again in this function; the
        # axis labels further down are taken from model.names instead.
        if model_type == "standard" and class_names:
            names = class_names
        elif model_type == "hierarchical" and taxonomic_groups:
            names = list(taxonomic_groups.keys())
        else:
            names = list(model.names.values())
        
        # Process each validation image
        print(f"Processing {len(val_images)} validation images...")
        
        for image_file in tqdm(val_images):
            image_path = os.path.join(val_images_dir, image_file)
            
            # The label file shares the image's base name with a .txt extension
            label_file = os.path.splitext(image_file)[0] + ".txt"
            label_path = os.path.join(val_labels_dir, label_file)
            
            # Skip images without labels
            if not os.path.exists(label_path):
                continue
            
            # Read ground truth labels
            with open(label_path, "r") as f:
                gt_lines = f.readlines()
            
            # Extract class IDs from ground truth: the first field of every line
            # that has at least five whitespace-separated fields
            gt_classes = []
            for line in gt_lines:
                parts = line.strip().split()
                if len(parts) >= 5:
                    cls_id = int(parts[0])
                    gt_classes.append(cls_id)
            
            # Skip if no ground truth classes
            if not gt_classes:
                continue
            
            # Run prediction
            results = model.predict(source=image_path, conf=conf_threshold, verbose=False)
            
            # Extract prediction classes in the order the boxes are returned
            pred_classes = []
            if results and len(results) > 0 and len(results[0].boxes) > 0:
                boxes = results[0].boxes
                for i in range(len(boxes)):
                    cls_id = int(boxes.cls[i].item())
                    pred_classes.append(cls_id)
            
            # Add to y_true and y_pred
            y_true.extend(gt_classes)
            
            # If no predictions, add background class (or -1 for no detection)
            if not pred_classes:
                # Match the length of ground truth with "no detection" classes
                y_pred.extend([-1] * len(gt_classes))
            else:
                # Use the predicted classes
                # Note: This simple approach may not handle multiple objects perfectly
                # For a more accurate confusion matrix, we would need to match predictions to ground truth
                # based on IoU, but this is a simplified approach
                while len(pred_classes) < len(gt_classes):
                    pred_classes.append(-1)  # Pad with "no detection"
                
                # If we have more predictions than ground truth, keep only the first ones
                # NOTE(review): this keeps the highest-confidence predictions only if
                # the boxes are returned in descending confidence order - TODO confirm
                if len(pred_classes) > len(gt_classes):
                    pred_classes = pred_classes[:len(gt_classes)]
                
                y_pred.extend(pred_classes)
        
        # Create the confusion matrix
        if not y_true or not y_pred or len(y_true) != len(y_pred):
            print("❌ Not enough data to create confusion matrix.")
            return None
        
        # Get unique classes; -1 ("no detection") is not given its own row/column
        # NOTE(review): pairs whose prediction is -1 are presumably left out of the
        # matrix by sklearn because -1 is not in `labels` - confirm
        unique_classes = sorted(set(y_true + y_pred))
        if -1 in unique_classes:
            unique_classes.remove(-1)
        
        # Create confusion matrix
        cm = confusion_matrix(
            y_true, 
            y_pred, 
            labels=unique_classes
        )
        
        # Get class names for labeling, with a generic label for ids beyond model.names
        class_labels = []
        for cls_id in unique_classes:
            if cls_id < len(model.names):
                class_labels.append(model.names[cls_id])
            else:
                class_labels.append(f"Class {cls_id}")
        
        # Plot confusion matrix
        plt.figure(figsize=(12, 10))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
        plt.xlabel("Predicted")
        plt.ylabel("Ground Truth")
        plt.title(f"{model_type.capitalize()} Model - Confusion Matrix")
        plt.xticks(rotation=90)
        plt.yticks(rotation=0)
        plt.tight_layout()
        
        # Save the plot
        plot_path = os.path.join(evaluation_timestamp_dir, f"{model_type}_confusion_matrix.png")
        plt.savefig(plot_path)
        print(f"Confusion matrix saved to: {plot_path}")
        
        # Create a normalized confusion matrix.
        # cm_norm and norm_plot_path are only bound inside this branch; every later
        # use repeats the same np.sum(cm) > 0 check before touching them.
        if np.sum(cm) > 0:
            # Row-normalize: each ground-truth row is divided by its total
            cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            
            # Rows with a zero total produce NaN above; replace those with 0
            cm_norm = np.nan_to_num(cm_norm)
            
            # Plot normalized confusion matrix
            plt.figure(figsize=(12, 10))
            sns.heatmap(cm_norm, annot=True, fmt=".2f", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
            plt.xlabel("Predicted")
            plt.ylabel("Ground Truth")
            plt.title(f"{model_type.capitalize()} Model - Normalized Confusion Matrix")
            plt.xticks(rotation=90)
            plt.yticks(rotation=0)
            plt.tight_layout()
            
            # Save the plot
            norm_plot_path = os.path.join(evaluation_timestamp_dir, f"{model_type}_normalized_confusion_matrix.png")
            plt.savefig(norm_plot_path)
            print(f"Normalized confusion matrix saved to: {norm_plot_path}")
        
        # Result returned to the caller (plain lists so it can be JSON-encoded)
        confusion_data = {
            "confusion_matrix": cm.tolist(),
            "normalized_confusion_matrix": cm_norm.tolist() if np.sum(cm) > 0 else None,
            "class_labels": class_labels,
            "confusion_matrix_plot": plot_path,
            "normalized_confusion_matrix_plot": norm_plot_path if np.sum(cm) > 0 else None
        }
        
        # Save confusion matrix as a JSON file (matrices and labels only, no plot paths)
        json_path = os.path.join(evaluation_timestamp_dir, f"{model_type}_confusion_matrix.json")
        with open(json_path, 'w') as f:
            json.dump({
                "matrix": cm.tolist(),
                "normalized_matrix": cm_norm.tolist() if np.sum(cm) > 0 else None,
                "class_names": class_labels
            }, f, indent=2)
        
        print(f"Confusion matrix data saved to: {json_path}")
        
        return confusion_data
    
    except Exception as e:
        # Best-effort: report, print the traceback, and let the notebook continue
        print(f"Error generating confusion matrix: {e}")
        import traceback
        traceback.print_exc()
        return None

# Confusion matrix for the standard model (skipped when model or dataset is missing)
if not (standard_model and standard_dataset_path):
    standard_confusion = None
    print("Skipping standard model confusion matrix due to missing model or dataset.")
else:
    standard_confusion = generate_confusion_matrix(
        model=standard_model,
        dataset_path=standard_dataset_path,
        conf_threshold=0.25,
        model_type="standard",
        class_names=class_names,
    )

# Confusion matrix for the hierarchical model
if not (hierarchical_model and hierarchical_dataset_path):
    hierarchical_confusion = None
    print("Skipping hierarchical model confusion matrix due to missing model or dataset.")
else:
    hierarchical_confusion = generate_confusion_matrix(
        model=hierarchical_model,
        dataset_path=hierarchical_dataset_path,
        conf_threshold=0.25,
        model_type="hierarchical",
        taxonomic_groups=taxonomic_groups,
    )

# Record whichever matrices were produced in the evaluation configuration
for config_key, confusion_result in (
    ("standard_confusion", standard_confusion),
    ("hierarchical_confusion", hierarchical_confusion),
):
    if confusion_result:
        evaluation_config[config_key] = confusion_result

# Rewrite the evaluation config file with the added results
with open(evaluation_config_path, 'w') as config_file:
    json.dump(evaluation_config, config_file, indent=2)

In [None]:
# Cell 7: Error Analysis and Visualization
# Analyze and visualize detection errors for both models

def _load_yolo_labels(label_path):
    """Parse a YOLO label file into ``(class_id, x_center, y_center, width, height)`` tuples.

    Values are returned exactly as stored in the file. Lines with fewer than
    five whitespace-separated fields are ignored.
    """
    boxes = []
    with open(label_path, "r") as label_file:
        for line in label_file:
            parts = line.strip().split()
            if len(parts) >= 5:
                boxes.append((int(parts[0]), *(float(value) for value in parts[1:5])))
    return boxes


def _class_display_name(model, cls_id):
    """Return the model's name for ``cls_id``, or a generic ``Class N`` fallback."""
    if cls_id in model.names:
        return model.names[cls_id]
    return f"Class {cls_id}"


def _draw_labeled_box(image, corners, label, color):
    """Draw a rectangle with a text label just above its top-left corner (in place)."""
    # Cast to built-in ints: the corners may be floats or NumPy scalars.
    x1, y1, x2, y2 = (int(value) for value in corners)
    cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
    cv2.putText(image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)


def _save_false_negative_figure(model, image_path, result, gt_boxes, vis_path):
    """Render ground truth (green) and predictions (red) for one image.

    Returns True when the figure was written to ``vis_path`` and False when
    the image could not be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading image: {image_path}")
        return False

    # OpenCV loads BGR; matplotlib expects RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    img_height, img_width = image.shape[:2]

    # Ground truth is stored as centre/size fractions of the image; scale to pixel corners
    for cls_id, x_center, y_center, width, height in gt_boxes:
        corners = (
            (x_center - width / 2) * img_width,
            (y_center - height / 2) * img_height,
            (x_center + width / 2) * img_width,
            (y_center + height / 2) * img_height,
        )
        _draw_labeled_box(image, corners, _class_display_name(model, cls_id), (0, 255, 0))

    # Predictions (if any) are already in pixel xyxy form
    if result is not None and len(result.boxes) > 0:
        boxes = result.boxes
        for box_idx in range(len(boxes)):
            corners = boxes.xyxy[box_idx].cpu().numpy().astype(int)
            cls_name = _class_display_name(model, int(boxes.cls[box_idx].item()))
            conf = boxes.conf[box_idx].item()
            _draw_labeled_box(image, corners, f"{cls_name} {conf:.2f}", (255, 0, 0))

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.axis("off")
    plt.title("False Negative Example - Red: Predictions, Green: Ground Truth")
    plt.tight_layout()
    plt.savefig(vis_path)
    plt.close()
    return True


def analyze_detection_errors(model, dataset_path, conf_threshold=0.25, model_type="standard", max_examples=5):
    """Find and visualize detection errors (false positives and false negatives).

    Errors are detected with a count-based heuristic, not IoU matching: an
    image is a potential false positive when the model predicts more boxes
    than the ground truth contains, and a potential false negative when it
    predicts fewer. An image without a label file is treated as having no
    ground-truth boxes.

    Args:
        model: Loaded detection model exposing ``predict`` and ``names``.
        dataset_path: Dataset root containing ``data.yaml``, ``images/val``
            and ``labels/val``.
        conf_threshold: Confidence threshold passed to ``model.predict``.
        model_type: Label used in messages and output file names.
        max_examples: Maximum number of examples visualized per error type.

    Returns:
        A dict with ``model_type``, ``false_positives`` and ``false_negatives``
        (each holding ``count`` and ``examples``) and ``report_path``, or
        ``None`` when the model/dataset is unavailable or an error occurs.
        Visualizations and a markdown report are written under
        ``evaluation_timestamp_dir``.
    """
    if not model:
        print(f"❌ {model_type.capitalize()} model not loaded.")
        return None

    print(f"\nAnalyzing detection errors for {model_type} model...")

    # The dataset YAML is only checked for existence; its contents are not read here
    yaml_path = os.path.join(dataset_path, "data.yaml")
    if not os.path.exists(yaml_path):
        print(f"❌ YAML file not found: {yaml_path}")
        return None

    val_images_dir = os.path.join(dataset_path, "images", "val")
    val_labels_dir = os.path.join(dataset_path, "labels", "val")

    if not os.path.exists(val_images_dir) or not os.path.exists(val_labels_dir):
        print("❌ Validation images or labels directory not found.")
        return None

    try:
        val_images = [
            name for name in os.listdir(val_images_dir)
            if name.lower().endswith((".jpg", ".jpeg", ".png"))
        ]

        if not val_images:
            print("❌ No validation images found.")
            return None

        false_positives = []  # (image_path, result)
        false_negatives = []  # (image_path, result or None, gt_boxes)

        print(f"Processing {len(val_images)} validation images for errors...")

        for image_file in tqdm(val_images):
            image_path = os.path.join(val_images_dir, image_file)
            label_path = os.path.join(val_labels_dir, os.path.splitext(image_file)[0] + ".txt")

            # A missing label file means "no objects": every prediction is then a false positive
            gt_boxes = _load_yolo_labels(label_path) if os.path.exists(label_path) else []

            results = model.predict(source=image_path, conf=conf_threshold, verbose=False)
            result = results[0] if results and len(results) > 0 else None
            num_predictions = len(result.boxes) if result is not None else 0

            # Simplified approach: compare box counts rather than matching boxes by IoU
            if num_predictions == 0:
                if gt_boxes:
                    false_negatives.append((image_path, None, gt_boxes))
            elif num_predictions > len(gt_boxes):
                false_positives.append((image_path, result))
            elif num_predictions < len(gt_boxes):
                false_negatives.append((image_path, result, gt_boxes))

        error_dir = os.path.join(evaluation_timestamp_dir, f"{model_type}_error_analysis")
        os.makedirs(error_dir, exist_ok=True)

        # Visualize false positives with YOLO's built-in renderer
        fp_examples = []
        print(f"\nFound {len(false_positives)} images with potential false positives.")

        for example_idx, (image_path, result) in enumerate(false_positives[:max_examples]):
            print(f"Visualizing false positive example {example_idx+1}/{min(max_examples, len(false_positives))}")
            vis_path = os.path.join(error_dir, f"false_positive_{example_idx+1}.jpg")
            result.save(filename=vis_path)
            fp_examples.append({
                "image_path": image_path,
                "visualization": vis_path
            })

        # Visualize false negatives with ground truth and predictions overlaid
        fn_examples = []
        print(f"\nFound {len(false_negatives)} images with potential false negatives.")

        for example_idx, (image_path, result, gt_boxes) in enumerate(false_negatives[:max_examples]):
            print(f"Visualizing false negative example {example_idx+1}/{min(max_examples, len(false_negatives))}")
            vis_path = os.path.join(error_dir, f"false_negative_{example_idx+1}.jpg")
            if _save_false_negative_figure(model, image_path, result, gt_boxes, vis_path):
                fn_examples.append({
                    "image_path": image_path,
                    "visualization": vis_path
                })

        report = {
            "model_type": model_type,
            "false_positives": {
                "count": len(false_positives),
                "examples": fp_examples
            },
            "false_negatives": {
                "count": len(false_negatives),
                "examples": fn_examples
            }
        }

        # Markdown report; image links are relative so the report directory stays portable
        report_path = os.path.join(evaluation_timestamp_dir, f"{model_type}_error_analysis.md")
        sections = (
            ("False Positives", "false positives", "False Positive", false_positives, fp_examples),
            ("False Negatives", "false negatives", "False Negative", false_negatives, fn_examples),
        )
        with open(report_path, "w") as f:
            f.write(f"# {model_type.capitalize()} Model Error Analysis\n\n")
            f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

            for heading, plural, alt_text, flagged, examples in sections:
                f.write(f"## {heading}\n\n")
                f.write(f"Total images with potential {plural}: {len(flagged)}\n\n")

                for number, example in enumerate(examples, start=1):
                    f.write(f"### Example {number}\n\n")
                    f.write(f"Image: {os.path.basename(example['image_path'])}\n\n")
                    f.write(f"![{alt_text}]({os.path.relpath(example['visualization'], evaluation_timestamp_dir)})\n\n")

        print(f"Error analysis report saved to: {report_path}")

        report["report_path"] = report_path
        return report

    except Exception as e:
        # Best-effort analysis: report the failure and let the notebook continue
        print(f"Error analyzing detection errors: {e}")
        import traceback
        traceback.print_exc()
        return None

# Standard model: run the error analysis only when both the model and its dataset are available
if not (standard_model and standard_dataset_path):
    standard_error_analysis = None
    print("Skipping standard model error analysis due to missing model or dataset.")
else:
    standard_error_analysis = analyze_detection_errors(
        standard_model,
        standard_dataset_path,
        conf_threshold=0.25,
        model_type="standard",
        max_examples=5,
    )

# Hierarchical model: same procedure against the hierarchical dataset
if not (hierarchical_model and hierarchical_dataset_path):
    hierarchical_error_analysis = None
    print("Skipping hierarchical model error analysis due to missing model or dataset.")
else:
    hierarchical_error_analysis = analyze_detection_errors(
        hierarchical_model,
        hierarchical_dataset_path,
        conf_threshold=0.25,
        model_type="hierarchical",
        max_examples=5,
    )

# Record whichever analyses actually produced a report (None results are left out)
evaluation_config.update({
    config_key: analysis
    for config_key, analysis in (
        ("standard_error_analysis", standard_error_analysis),
        ("hierarchical_error_analysis", hierarchical_error_analysis),
    )
    if analysis
})

# Persist the updated evaluation config to disk
with open(evaluation_config_path, "w") as f:
    json.dump(evaluation_config, f, indent=2)

In [None]:
# Cell 8: Model Comparison and Summary
# Compare standard and hierarchical models and generate a comprehensive report

def _relative_improvement(std_value, hier_value):
    """Return the percentage change from ``std_value`` to ``hier_value``.

    A zero (or negative) baseline yields ``inf`` when the hierarchical value
    is positive and ``0`` otherwise, since a ratio is undefined there.
    """
    if std_value > 0:
        return (hier_value - std_value) / std_value * 100
    return float('inf') if hier_value > 0 else 0


def _plot_model_comparison(threshold, threshold_data, metric_order, plot_path):
    """Save a grouped bar chart of standard vs. hierarchical values for one threshold."""
    # Only plot metrics that were actually compared, so positions and heights always line up
    plotted = [metric for metric in metric_order if metric in threshold_data]
    standard_values = [threshold_data[metric]["standard"] for metric in plotted]
    hierarchical_values = [threshold_data[metric]["hierarchical"] for metric in plotted]

    # NOTE(review): the figure is left open as in the original code, presumably so the
    # notebook renders it inline - confirm before adding plt.close().
    plt.figure(figsize=(12, 8))

    width = 0.35
    x = np.arange(len(plotted))

    plt.bar(x - width/2, standard_values, width, label="Standard")
    plt.bar(x + width/2, hierarchical_values, width, label="Hierarchical")

    plt.xlabel("Metric")
    plt.ylabel("Value")
    plt.title(f"Model Comparison (Confidence Threshold = {threshold})")
    plt.xticks(x, plotted)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Annotate each metric with its relative improvement just above the taller bar
    for position, metric in enumerate(plotted):
        entry = threshold_data[metric]
        plt.text(position, max(entry["standard"], entry["hierarchical"]) + 0.02,
                 f"{entry['improvement']:.1f}%",
                 ha="center", va="bottom", fontweight="bold")

    plt.tight_layout()
    plt.savefig(plot_path)


def _write_model_comparison_report(comparison_data, metric_order, report_path):
    """Write the markdown comparison report (tables, key findings, recommendations)."""
    # Every threshold in comparison_data holds at least one metric, so `compared` is non-empty
    compared = [
        (threshold, metric, comparison_data[threshold][metric])
        for threshold in comparison_data
        for metric in metric_order
        if metric in comparison_data[threshold]
    ]
    better_metrics = sum(1 for _, _, entry in compared if entry["improvement"] > 0)
    better_ratio = better_metrics / len(compared)

    with open(report_path, "w") as f:
        f.write("# Wildlife Detection Model Comparison\n\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write("## Performance Comparison\n\n")
        f.write("### Standard vs. Hierarchical Model\n\n")

        # The plot is saved next to this report, so a bare file name is enough
        f.write("![Model Comparison](model_comparison.png)\n\n")

        f.write("### Detailed Metrics\n\n")

        for threshold in comparison_data:
            f.write(f"#### Confidence Threshold = {threshold}\n\n")
            f.write("| Metric | Standard | Hierarchical | Improvement |\n")
            f.write("|--------|----------|--------------|-------------|\n")

            for metric in metric_order:
                if metric in comparison_data[threshold]:
                    data = comparison_data[threshold][metric]
                    f.write(f"| {metric} | {data['standard']:.4f} | {data['hierarchical']:.4f} | {data['improvement_percent']} |\n")

            f.write("\n")

        f.write("## Key Findings\n\n")

        if better_ratio > 0.5:
            f.write("The **hierarchical model outperforms the standard model** in the majority of metrics. ")

            # Find the most improved metric across all compared thresholds
            best_improvement = 0
            best_metric = ""
            best_threshold = ""
            for threshold, metric, entry in compared:
                if entry["improvement"] > best_improvement:
                    best_improvement = entry["improvement"]
                    best_metric = metric
                    best_threshold = threshold

            if best_metric:
                f.write(f"The most significant improvement is in **{best_metric}** at threshold {best_threshold}, ")
                f.write(f"with an increase of **{best_improvement:.1f}%**.\n\n")

            f.write("This supports the hypothesis that grouping species into taxonomic categories ")
            f.write("improves detection performance, especially for wildlife with limited training examples.\n\n")
        else:
            f.write("The results are mixed, with the hierarchical model performing better in some metrics ")
            f.write("but the standard model performing better in others.\n\n")

        f.write("## Recommendations\n\n")

        if better_ratio > 0.7:
            f.write("1. **Adopt the hierarchical approach** for wildlife detection as the primary method.\n")
            f.write("2. Consider a two-stage detection pipeline, where the hierarchical model identifies the taxonomic group, ")
            f.write("followed by a specialized model to identify the specific species within that group.\n")
        else:
            f.write("1. Use the **hierarchical model** for general wildlife detection and classification into taxonomic groups.\n")
            f.write("2. Use the **standard model** for specific species identification when high confidence is required.\n")

        f.write("3. Continue collecting additional training data, particularly for rare species with few examples.\n")
        f.write("4. Explore model ensembling techniques to combine the strengths of both approaches.\n")


def compare_models(standard_results, hierarchical_results):
    """Compare the performance of standard and hierarchical models.

    Args:
        standard_results: Evaluation results for the standard model; a dict
            whose ``"thresholds"`` entry maps a threshold string (e.g.
            ``"0.25"``) to a dict of metric values.
        hierarchical_results: Same structure for the hierarchical model.

    Returns:
        A dict with ``comparison_data`` (per-threshold, per-metric values and
        improvement), ``plot_path`` and ``report_path``, or ``None`` when
        either input is empty or the two results share no threshold with at
        least one common metric. The plot and markdown report are written
        under ``evaluation_timestamp_dir``.
    """
    if not standard_results or not hierarchical_results:
        print("❌ Insufficient data for model comparison.")
        return None

    print("Comparing standard and hierarchical models...")

    metrics = ["precision", "recall", "mAP50", "mAP50-95"]
    thresholds = ["0.25", "0.5"]  # Common thresholds to compare

    comparison_data = {}
    for threshold in thresholds:
        if threshold not in standard_results["thresholds"] or threshold not in hierarchical_results["thresholds"]:
            continue

        std_metrics = standard_results["thresholds"][threshold]
        hier_metrics = hierarchical_results["thresholds"][threshold]

        threshold_data = {}
        for metric in metrics:
            if metric in std_metrics and metric in hier_metrics:
                std_value = std_metrics[metric]
                hier_value = hier_metrics[metric]
                improvement = _relative_improvement(std_value, hier_value)

                threshold_data[metric] = {
                    "standard": std_value,
                    "hierarchical": hier_value,
                    "improvement": improvement,
                    "improvement_percent": f"{improvement:.1f}%"
                }

        # A threshold without any shared metric has nothing to plot or tabulate; storing it
        # would later break the chart and divide by zero in the findings summary.
        if threshold_data:
            comparison_data[threshold] = threshold_data
        else:
            print(f"⚠️ No shared metrics at threshold {threshold}; skipping it.")

    if not comparison_data:
        print("❌ No common thresholds found for comparison.")
        return None

    # Use the first compared threshold for the chart
    threshold = next(iter(comparison_data))
    plot_path = os.path.join(evaluation_timestamp_dir, "model_comparison.png")
    _plot_model_comparison(threshold, comparison_data[threshold], metrics, plot_path)
    print(f"Model comparison plot saved to: {plot_path}")

    report_path = os.path.join(evaluation_timestamp_dir, "model_comparison.md")
    _write_model_comparison_report(comparison_data, metrics, report_path)
    print(f"Model comparison report saved to: {report_path}")

    return {
        "comparison_data": comparison_data,
        "plot_path": plot_path,
        "report_path": report_path
    }

def generate_comprehensive_report():
    """Assemble one markdown report from every analysis stored in ``evaluation_config``.

    Reads the module-level model handles, paths and ``evaluation_config``, and
    writes ``comprehensive_evaluation_report.md`` into
    ``evaluation_timestamp_dir``. Sections whose results are missing or empty
    are reduced to their heading. Images and sub-reports are linked by file
    name only.

    Returns:
        A dict with a single ``report_path`` entry.
    """
    print("\nGenerating comprehensive evaluation report...")

    report_path = os.path.join(evaluation_timestamp_dir, "comprehensive_evaluation_report.md")

    # Summarise each model that was loaded; an unloaded model is described by None
    standard_model_info = (
        {
            "path": standard_model_path,
            "classes": len(class_names) if class_names else 0
        }
        if standard_model else None
    )
    hierarchical_model_info = (
        {
            "path": hierarchical_model_path,
            "groups": len(taxonomic_groups) if taxonomic_groups else 0
        }
        if hierarchical_model else None
    )

    # (display title, evaluation_config key prefix, word used for one category)
    variants = (
        ("Standard Model", "standard", "Class"),
        ("Hierarchical Model", "hierarchical", "Group"),
    )
    # Headline metrics are quoted at this confidence threshold
    headline_threshold = "0.25"
    headline_metrics = (
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("mAP50", "mAP50"),
        ("mAP50-95", "mAP50-95"),
    )

    with open(report_path, "w") as f:
        emit = f.write

        def emit_image(alt_text, image_path):
            # Link by file name only (the directory part of image_path is dropped)
            emit(f"![{alt_text}]({os.path.basename(image_path)})\n\n")

        def emit_model_summary(title, info, count_label, count_key, results_key):
            if not info:
                return
            emit(f"### {title}\n\n")
            emit(f"- Model Path: `{info['path']}`\n")
            emit(f"- {count_label}: {info[count_key]}\n")
            eval_results = evaluation_config.get(results_key)
            if eval_results and headline_threshold in eval_results["thresholds"]:
                scores = eval_results["thresholds"][headline_threshold]
                for label, metric_key in headline_metrics:
                    emit(f"- {label}: {scores[metric_key]:.4f}\n")
            emit("\n")

        emit("# Wildlife Detection System - Comprehensive Evaluation Report\n\n")
        emit(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        # --- Models ---
        emit("## Models Evaluated\n\n")
        emit_model_summary("Standard Model", standard_model_info,
                           "Number of Classes", "classes", "standard_eval_results")
        emit_model_summary("Hierarchical Model", hierarchical_model_info,
                           "Number of Taxonomic Groups", "groups", "hierarchical_eval_results")

        # --- Threshold analysis ---
        emit("## Threshold Analysis\n\n")
        for title, prefix, _ in variants:
            eval_results = evaluation_config.get(f"{prefix}_eval_results")
            if eval_results and 'threshold_plot' in eval_results:
                emit(f"### {title}\n\n")
                emit_image(f"{title} Threshold Analysis", eval_results['threshold_plot'])

        # --- Per-class performance ---
        emit("## Per-Class Performance\n\n")
        for title, prefix, unit in variants:
            class_results = evaluation_config.get(f"{prefix}_class_results")
            if not class_results:
                continue
            if 'per_class_map_plot' in class_results:
                emit(f"### {title} - Per-{unit} mAP50\n\n")
                emit_image(f"{title} Per-{unit} mAP50", class_results['per_class_map_plot'])
            if 'precision_recall_plot' in class_results:
                emit(f"### {title} - Precision vs. Recall by {unit}\n\n")
                emit_image(f"{title} Precision-Recall", class_results['precision_recall_plot'])

        # --- Confusion matrices ---
        emit("## Confusion Matrices\n\n")
        for title, prefix, _ in variants:
            confusion = evaluation_config.get(f"{prefix}_confusion")
            if not confusion:
                continue
            if 'confusion_matrix_plot' in confusion:
                emit(f"### {title}\n\n")
                emit_image(f"{title} Confusion Matrix", confusion['confusion_matrix_plot'])
            # The normalized plot entry may be present but empty, so require a value too
            if 'normalized_confusion_matrix_plot' in confusion and confusion['normalized_confusion_matrix_plot']:
                emit(f"### {title} (Normalized)\n\n")
                emit_image(f"{title} Normalized Confusion Matrix", confusion['normalized_confusion_matrix_plot'])

        # --- Error analysis ---
        emit("## Error Analysis\n\n")
        for title, prefix, _ in variants:
            errors = evaluation_config.get(f"{prefix}_error_analysis")
            if not errors:
                continue
            emit(f"### {title}\n\n")
            emit(f"For detailed error analysis, see: [{title} Error Analysis]({os.path.basename(errors['report_path'])})\n\n")
            emit(f"- False positives: {errors['false_positives']['count']}\n")
            emit(f"- False negatives: {errors['false_negatives']['count']}\n\n")

        # --- Model comparison (only present when both models were compared) ---
        comparison = evaluation_config.get('model_comparison')
        if comparison:
            emit("## Model Comparison\n\n")
            emit(f"For detailed comparison, see: [Model Comparison]({os.path.basename(comparison['report_path'])})\n\n")
            emit_image("Model Comparison", comparison['plot_path'])

        # --- Conclusions ---
        emit("## Conclusions and Recommendations\n\n")

        standard_done = evaluation_config.get('standard_eval_results')
        hierarchical_done = evaluation_config.get('hierarchical_eval_results')

        if comparison:
            # Fixed template text, intended as a placeholder for manually written conclusions
            f.writelines((
                "Based on the comprehensive evaluation, we can draw the following conclusions:\n\n",
                "1. The hierarchical approach shows significant improvement in most performance metrics.\n",
                "2. Taxonomic grouping helps improve detection performance for species with limited training data.\n",
                "3. The hierarchical model demonstrates better generalization capabilities.\n\n",
                "Key recommendations for the Wildlife Detection System:\n\n",
                "1. Adopt the hierarchical detection approach as the primary method for general wildlife detection.\n",
                "2. Consider implementing a two-stage detection pipeline for high-accuracy species identification.\n",
                "3. Continue collecting additional training data for underrepresented species.\n",
                "4. Explore model ensembling techniques to further improve performance.\n",
            ))
        elif standard_done or hierarchical_done:
            emit("Partial evaluation has been performed. ")
            if standard_done and not hierarchical_done:
                emit("Only the standard model was evaluated. ")
            elif hierarchical_done and not standard_done:
                emit("Only the hierarchical model was evaluated. ")
            emit("A comprehensive comparison requires evaluation of both models.\n\n")

            if standard_done:
                emit("The standard model shows reasonable performance, but more data is needed for rare species.\n\n")
            if hierarchical_done:
                emit("The hierarchical approach shows promise in improving detection for taxonomic groups.\n\n")

            emit("To complete the evaluation, please evaluate both models and run the model comparison.\n")
        else:
            emit("Insufficient data to draw conclusions. Please complete the model evaluation steps.\n")

    print(f"Comprehensive report saved to: {report_path}")

    return {
        "report_path": report_path
    }

# The head-to-head comparison needs non-empty evaluation results for both models
model_comparison = None
if evaluation_config.get('standard_eval_results') and evaluation_config.get('hierarchical_eval_results'):
    model_comparison = compare_models(
        evaluation_config['standard_eval_results'],
        evaluation_config['hierarchical_eval_results'],
    )

    # compare_models returns None when there was nothing to compare
    if model_comparison:
        evaluation_config["model_comparison"] = model_comparison

# Build the combined report from everything recorded in the config so far
comprehensive_report = generate_comprehensive_report()
if comprehensive_report:
    evaluation_config["comprehensive_report"] = comprehensive_report

# Final write of the evaluation config
with open(evaluation_config_path, "w") as f:
    json.dump(evaluation_config, f, indent=2)

print("\nEvaluation complete!")
print(f"All results and reports have been saved to: {evaluation_timestamp_dir}")
print("Next steps: Proceed to the dashboard integration notebook (04_dashboard_integration.ipynb).")