# CADI AI Model Training Pipeline

Clean modular training pipeline using external Python scripts.
This notebook orchestrates the training process using `!python` commands to run our modular scripts.

## Pipeline Overview:
1. **Environment Setup** - Install dependencies and set paths
2. **Dataset Preparation** - Create data.yaml and validate dataset
3. **Model Training** - Train YOLO model with optimized settings
4. **Evaluation** - Validate and test the trained model

## Requirements:
- Python scripts: `dataset_utils.py`, `train.py`, `external_eval.py` (used by the external validation section)
- Configuration file: `config.yaml`
- Dataset in proper YOLO format
- GPU recommended for training

In [None]:
# !git pull origin main

In [1]:
# Install required packages
!pip install -U -q ultralytics roboflow opencv-python supervision PyYAML "numpy <2"

# Import basic libraries
import os
import sys
import yaml
from pathlib import Path

print("✅ Dependencies installed successfully!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.9/86.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.2/207.2 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K 

In [2]:
# Environment Configuration
# Adjust these paths based on your environment (Kaggle, Colab, or local)

# For Kaggle:
if '/kaggle' in os.getcwd():
    # This is where the repository will be cloned
    PROJECT_DIR = '/kaggle/working/cadi-ai'
    # Assumes the dataset is in /kaggle/input
    DATASET_PATH = '/kaggle/input/cadi-ai-retraining/combined_dataset/combined_dataset'  # Update this
    WORKING_DIR = '/kaggle/working/cadi-training-2508'
    ENVIRONMENT = 'kaggle'

# For Google Colab:
elif '/content' in os.getcwd():
    PROJECT_DIR = '/content/cadi-ai'
    DATASET_PATH = '/content/dataset'  # Update this
    WORKING_DIR = '/content/cadi-training-2508'
    ENVIRONMENT = 'colab'

# For local development:
else:
    PROJECT_DIR = r'c:\Users\Mecha Mino 5 Outlook\Documents\Mino Health AI labs\cadi-ai'
    DATASET_PATH = r'c:\Users\Mecha Mino 5 Outlook\Documents\Mino Health AI labs\cadi-ai\dataset'  # Update this
    WORKING_DIR = r'c:\Users\Mecha Mino 5 Outlook\Documents\Mino Health AI labs\cadi-ai\training_outputs'
    ENVIRONMENT = 'local'

# Create working directory
os.makedirs(WORKING_DIR, exist_ok=True)

# In Kaggle, we need to clone the repo first
if ENVIRONMENT == 'kaggle':
    !git clone https://github.com/minoHealth/cadi-ai.git 
    os.chdir(PROJECT_DIR)
else:
    os.chdir(PROJECT_DIR)

print(f"🔧 Environment: {ENVIRONMENT}")
print(f"📁 Project directory: {os.getcwd()}")
print(f"💾 Working directory: {WORKING_DIR}")


fatal: destination path 'cadi-ai' already exists and is not an empty directory.
🔧 Environment: kaggle
📁 Project directory: /kaggle/working/cadi-ai
💾 Working directory: /kaggle/working/cadi-training-2508
🔧 Environment: kaggle
📁 Project directory: /kaggle/working/cadi-ai
💾 Working directory: /kaggle/working/cadi-training-2508


In [3]:
# Pull the latest changes from the `main` branch of the `origin` remote into
# the repository in the current working directory.
!git pull origin main

From https://github.com/minoHealth/cadi-ai
 * branch            main       -> FETCH_HEAD
Already up to date.


In [4]:
# # Recreate data.yaml with improved class name detection
# print("🔄 Recreating data.yaml with CADI AI class names...")

# !python dataset_utils.py --create-yaml {DATASET_PATH} --output-path {os.path.join(PROJECT_DIR, 'data.yaml')} --cache-dir {os.path.join(WORKING_DIR, 'cache')}

# # print("\n📊 Re-validating with proper class names...")
# !python dataset_utils.py --validate {os.path.join(PROJECT_DIR, 'data.yaml')}

In [5]:
# # Check current data.yaml and config.yaml cache settings
# print("📋 Current data.yaml content:")
# data_yaml_path = os.path.join(PROJECT_DIR, 'data.yaml')
# if os.path.exists(data_yaml_path):
#     with open(data_yaml_path, 'r') as f:
#         data_content = f.read()
#         print(data_content)
# else:
#     print("❌ data.yaml not found")

# print("\n⚙️ Current config.yaml cache setting:")
# config_yaml_path = os.path.join(PROJECT_DIR, 'config.yaml')
# if os.path.exists(config_yaml_path):
#     with open(config_yaml_path, 'r') as f:
#         config = yaml.safe_load(f)
#         cache_setting = config.get('cache', 'not set')
#         print(f"cache: {cache_setting}")
# else:
#     print("❌ config.yaml not found")

# print(f"\n📁 Cache directory: {os.path.join(WORKING_DIR, 'cache')}")
# print(f"📁 Project directory: {PROJECT_DIR}")
# print(f"📁 Working directory: {WORKING_DIR}")

In [6]:
# Step 2: Find optimal batch size
print("🔍 Finding optimal batch size for your hardware...")
print("This may take a few minutes as it tests different batch sizes.")

# We use the --find-batch argument from our updated train.py script
!python train.py --config config.yaml --find-batch

🔍 Finding optimal batch size for your hardware...
This may take a few minutes as it tests different batch sizes.
Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Finding optimal batch size...
🔍 Finding optimal batch size...
  Testing batch size: 32
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo1
Ultralytics 8.3.174 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=32, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=data.yaml, degrees=0.0, deterministic=True

In [None]:
# # Step 3: Start training with optimal settings
# # The `train.py` script will automatically use the settings from `config.yaml`
# # including the auto-detected batch size if you set it to "auto" in the config.

# print("🚀 Starting training...")
# print("Training will save results to the output directory specified in config.yaml.")
# print("You can monitor progress in the output below.")

# !python train.py --config config.yaml --batch 16

🚀 Starting training...
Training will save results to the output directory specified in config.yaml.
You can monitor progress in the output below.
Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
🚀 Starting CADI AI Model Training
💻 System Information:
  Python: 3.11.13
  PyTorch: 2.6.0+cu124
  CUDA available: True
  GPU: Tesla P100-PCIE-16GB
  GPU Memory: 17.1 GB

📁 Validating data paths...
  ✅ train: 2813 images
  ✅ val: 501 images

📦 Using configured batch size: 16
🤖 Loading model: yolo11m.pt
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo1
🏋️ Starting training...
Ultralytics 8.3.174 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
[34m[1mengine/trainer: [0magnostic_n

In [6]:
# 🔄 Resume Training from Last Checkpoint
# This will automatically find and resume from the most recent checkpoint

print("🔄 Resuming training from last checkpoint...")
print("The script will automatically detect the most recent checkpoint and continue training.")
print("✨ Training progress, optimizer state, and epoch count will be restored!")

# Resume training with auto-detection of latest checkpoint
!python train.py --config config.yaml --resume auto --batch 16

🔄 Resuming training from last checkpoint...
The script will automatically detect the most recent checkpoint and continue training.
✨ Training progress, optimizer state, and epoch count will be restored!
Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
🚀 Starting CADI AI Model Training
💻 System Information:
  Python: 3.11.13
  PyTorch: 2.6.0+cu124
  CUDA available: True
  GPU: Tesla P100-PCIE-16GB
  GPU Memory: 17.1 GB

🔄 Auto-detected latest checkpoint: runs/cadi-training-2508/weights/last.pt
   📅 Last modified: Wed Aug  6 13:57:09 2025
🔄 Resuming training from checkpoint: runs/cadi-training-2508/weights/last.pt
📁 Validating data paths...
  ✅ train: 2813 images
  ✅ val: 501 images

📦 Using configured batch size: 16
🤖 Loading mod

In [4]:
# 📊 Check Training Checkpoints Status
import glob
import os
import time


def checkpoint_kind(checkpoint_path):
    """Return "best" if the file itself is named best.pt, otherwise "last".

    Only the final path component is inspected, so a run directory whose name
    happens to contain "best.pt" cannot cause a last.pt file to be mislabeled.
    """
    return "best" if os.path.basename(checkpoint_path) == "best.pt" else "last"


print("📊 Checking for available training checkpoints...")

# Look for checkpoints in common locations (relative to the current directory)
checkpoint_patterns = [
    "runs/**/weights/last.pt",
    "runs/**/weights/best.pt"
]

all_checkpoints = []
for pattern in checkpoint_patterns:
    all_checkpoints.extend(glob.glob(pattern, recursive=True))

if all_checkpoints:
    print(f"\n✅ Found {len(all_checkpoints)} checkpoint(s):")

    # Sort by modification time, newest first
    all_checkpoints.sort(key=os.path.getmtime, reverse=True)

    for i, checkpoint in enumerate(all_checkpoints, 1):
        mod_time = os.path.getmtime(checkpoint)
        file_size = os.path.getsize(checkpoint) / (1024*1024)  # MB

        # The first entry after sorting is the most recently modified file
        status = "🔄 LATEST" if i == 1 else "📁"
        checkpoint_type = checkpoint_kind(checkpoint)

        print(f"{status} {checkpoint}")
        print(f"   📅 Modified: {time.ctime(mod_time)}")
        print(f"   📏 Size: {file_size:.1f} MB")
        print(f"   🏷️  Type: {checkpoint_type} checkpoint")
        print()

    print("💡 To resume training, run the next cell or use:")
    print("   !python train.py --config config.yaml --resume auto")

else:
    print("❌ No checkpoints found.")
    print("   Start fresh training first, then you can resume from checkpoints.")
    print("   Checkpoints are saved in runs/[experiment-name]/weights/")

📊 Checking for available training checkpoints...

✅ Found 2 checkpoint(s):
🔄 LATEST runs/cadi-training-2508/weights/best.pt
   📅 Modified: Thu Aug  7 10:42:45 2025
   📏 Size: 38.7 MB
   🏷️  Type: best checkpoint

📁 runs/cadi-training-2508/weights/last.pt
   📅 Modified: Thu Aug  7 10:42:45 2025
   📏 Size: 38.7 MB
   🏷️  Type: last checkpoint

💡 To resume training, run the next cell or use:
   !python train.py --config config.yaml --resume auto


In [6]:
# Step 4: Evaluate the trained model
# Requires PROJECT_DIR and WORKING_DIR from the environment configuration cell.
import glob
import os
from ultralytics import YOLO

# Find the best model weights from the runs directory
weight_files = glob.glob('runs/**/weights/best.pt', recursive=True)

if weight_files:
    # glob returns matches in arbitrary order, so pick the most recently
    # modified file explicitly instead of relying on the first match.
    best_weights = max(weight_files, key=os.path.getmtime)
    print(f"📊 Evaluating model: {best_weights}")

    # Load the model
    model = YOLO(best_weights)

    # Candidate data.yaml locations, checked in order of preference
    data_yaml_locations = [
        os.path.join(PROJECT_DIR, 'data.yaml'),
        os.path.join(WORKING_DIR, 'data.yaml'),
        'data.yaml'
    ]

    # First existing candidate wins; None when no candidate exists
    data_yaml_path = next(
        (location for location in data_yaml_locations if os.path.exists(location)),
        None,
    )

    if data_yaml_path:
        print(f"📋 Using data.yaml: {data_yaml_path}")

        # Run validation
        print("\n🧪 Running validation...")
        val_results = model.val(data=data_yaml_path)

        print("\n📈 Validation Results:")
        print(f"   mAP50-95: {val_results.box.map:.3f}")
        print(f"   mAP50: {val_results.box.map50:.3f}")
        print(f"   mAP75: {val_results.box.map75:.3f}")

    else:
        print("❌ data.yaml not found in any expected location.")
        print("Expected locations checked:")
        for loc in data_yaml_locations:
            print(f"   {loc}")
        print("\nPlease run the dataset preparation step first.")

else:
    print("❌ No trained model weights found. Please run training first.")
    print("Looking for weights in: runs/**/weights/best.pt")

📊 Evaluating model: runs/cadi-training-2508/weights/best.pt
📋 Using data.yaml: /kaggle/working/cadi-ai/data.yaml

🧪 Running validation...
Ultralytics 8.3.175 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
YOLO11m summary (fused): 125 layers, 20,032,345 parameters, 0 gradients, 67.7 GFLOPs


Downloading https://ultralytics.com/assets/Arial.ttf to '/root/.config/Ultralytics/Arial.ttf': 100%|██████████| 755k/755k [00:00<00:00, 64.0MB/s]


[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 11.5±1.3 MB/s, size: 95.4 KB)


[34m[1mval: [0mScanning /kaggle/input/cadi-ai-retraining/combined_dataset/combined_dataset/valid/labels... 501 images, 0 backgrounds, 0 corrupt: 100%|██████████| 501/501 [00:04<00:00, 123.09it/s]




                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [00:10<00:00,  3.09it/s]
  xa[xa < 0] = -1
  xa[xa < 0] = -1


                   all        501       1715      0.383      0.363      0.326      0.128
               abiotic        256       1020      0.406      0.285      0.258     0.0843
               disease        172        418      0.113     0.0813     0.0356     0.0115
                insect        168        277       0.63      0.722      0.685      0.289
Speed: 0.8ms preprocess, 13.1ms inference, 0.0ms loss, 2.4ms postprocess per image
Results saved to [1mruns/detect/val2[0m

📈 Validation Results:
   mAP50-95: 0.128
   mAP50: 0.326
   mAP75: 0.075
               abiotic        256       1020      0.406      0.285      0.258     0.0843
               disease        172        418      0.113     0.0813     0.0356     0.0115
                insect        168        277       0.63      0.722      0.685      0.289
Speed: 0.8ms preprocess, 13.1ms inference, 0.0ms loss, 2.4ms postprocess per image
Results saved to [1mruns/detect/val2[0m

📈 Validation Results:
   mAP50-95: 0.128
   mAP50: 0

In [None]:
# TODO(review): this assignment was left without a value, which is a
# SyntaxError and stops "Run All". None is only a placeholder; set the intended
# external data location here or delete this cell.
external_data = None

## External Validation & Diagnostics

This section runs the unified `external_eval.py` script. On Kaggle it temporarily overrides the validation path to use the external dataset located at `/kaggle/input/cadi-ai/Data/val`, runs diagnostics (dataset balance, rare class detection performance, anchors, confusion matrix), then restores the original `data.yaml`.

Artifacts (plots, CSVs, JSON report) are saved under `training_outputs/eval/<timestamp>/`.

Adjust parameters (`--conf`, `--auto-problematic-k`, or provide `--problematic`) as needed.

In [None]:
# Run unified evaluation with optional external Kaggle validation set (non-invasive)
import os, yaml, json, glob, pathlib, sys
import subprocess

# NOTE: this rebinds PROJECT_DIR to the current working directory, so the cell
# expects to be run from the project root.
PROJECT_DIR = os.getcwd()
DATA_YAML = os.path.join(PROJECT_DIR, 'data.yaml')
EXTERNAL_VAL = '/kaggle/input/cadi-ai/Data/val'
USE_EXTERNAL = os.path.isdir(EXTERNAL_VAL)

print(f"Using project dir: {PROJECT_DIR}")
print(f"data.yaml present: {os.path.exists(DATA_YAML)}")
print(f"External validation dir exists: {USE_EXTERNAL}")

# Problematic classes and confidence can be provided via env vars
PROBLEMATIC = os.environ.get('CADI_PROBLEMATIC', '')  # e.g. "abiotic,disease"
conf = float(os.environ.get('CADI_EVAL_CONF', '0.2'))
extra_args = []
if PROBLEMATIC:
    extra_args.extend(['--problematic', PROBLEMATIC])
if USE_EXTERNAL:
    extra_args.extend(['--external-val', EXTERNAL_VAL])

cmd = [
    sys.executable,
    'external_eval.py',
    '--config', 'config.yaml',
    '--conf', str(conf),
    '--batch', '16'
] + extra_args

print('Running:', ' '.join(cmd))
# Pass the argument list directly instead of a joined shell string: arguments
# containing spaces (e.g. an interpreter path) stay intact, the env-var value
# is never interpreted by a shell, and returncode is the real exit code rather
# than the platform-specific wait status returned by os.system.
rc = subprocess.run(cmd).returncode
print(f"external_eval.py exit code: {rc}")
if rc != 0:
    print('Warning: external_eval.py did not exit cleanly; the summary below may come from an earlier run.')

# Quick summary: locate latest eval report (newest by modification time)
reports = sorted(glob.glob(os.path.join(PROJECT_DIR, 'training_outputs', 'eval', '*', 'analysis_report.json')), key=os.path.getmtime)
if reports:
    latest = reports[-1]
    with open(latest, 'r') as f:
        rpt = json.load(f)
    print('\nLatest Evaluation Summary:')
    print('  Weights:', rpt.get('weights'))
    print('  External val used:', rpt.get('external_val_used'))
    print('  External val path:', rpt.get('external_val_path'))
    print('  Problematic classes:', rpt.get('problematic_classes'))
    print('  Detection rates:', rpt.get('detection_rates'))
    print('  Output dir:', pathlib.Path(latest).parent)
else:
    print('No evaluation reports found yet.')