# B.2 - Two-Stage Ripeness Classification

**Experiment:** B.2  
**Approach:** Detect → Crop → Classify (2 stages)  
**Objective:** Test if separating detection and classification improves accuracy  
**Classes:** 2 (ripe, unripe)

## Pipeline
1. **Stage 1**: Detector - Detect all FFBs (ripe/unripe)
2. **Crop**: Extract ground-truth bounding boxes, enlarged by a 10% margin (5% on each side)
3. **Stage 2**: Classifier - Classify crops
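4. **Evaluation**: Each stage is evaluated separately on the test set (Stage 1: detection metrics; Stage 2: classification accuracy on ground-truth crops)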

## Training Config
- Stage 1 (Detection): YOLOv11n, epochs=100, patience=30
- Stage 2 (Classification): YOLOv11n-cls, epochs=100, patience=30
- Seeds: 5 (42, 123, 456, 789, 101)
- Other parameters: default

In [None]:
# =============================================================================
# Cell 1: Setup
# =============================================================================
import os
import torch
import numpy as np
import cv2
import shutil
from pathlib import Path
from datetime import datetime
from tqdm.auto import tqdm

# Detect the runtime environment. NOTE: the paths below are Kaggle-specific
# regardless of this flag; it is only used for the informational message.
IS_KAGGLE = os.path.exists('/kaggle/input')
print(f"Running on: {'Kaggle' if IS_KAGGLE else 'Local'}")

# Paths
DATASET_PATH = Path('/kaggle/input/ffb-ripeness-detect')
BASE_PATH = Path('/kaggle/working')
RUNS_PATH = BASE_PATH / 'runs'
CROPS_PATH = BASE_PATH / 'crops'
KAGGLE_OUTPUT = BASE_PATH / 'kaggleoutput'
KAGGLE_OUTPUT.mkdir(parents=True, exist_ok=True)

print(f"Dataset: {DATASET_PATH} (exists: {DATASET_PATH.exists()})")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Verify dataset.
# Count the same image extensions the crop-extraction cell reads (*.png and
# *.jpg); counting only *.png would report 0 images for a JPG dataset.
for split in ['train', 'val', 'test']:
    image_dir = DATASET_PATH / 'images' / split
    imgs = sum(len(list(image_dir.glob(pattern))) for pattern in ('*.png', '*.jpg'))
    lbls = len(list((DATASET_PATH / 'labels' / split).glob('*.txt')))
    print(f"  {split}: {imgs} images, {lbls} labels")

In [None]:
%%writefile /kaggle/working/dataset_stage1.yaml
# =============================================================================
# Cell 2: Create YAML for Stage 1 (Detection)
# NOTE: the %%writefile magic must be the first line of the cell. These comment
# lines are written into the YAML file, where they are ordinary YAML comments.
# =============================================================================
# B.2 Stage 1 - Detection (2 Classes)
path: /kaggle/input/ffb-ripeness-detect
train: images/train
val: images/val
test: images/test

nc: 2
names: ['ripe', 'unripe']

In [None]:
# =============================================================================
# Cell 3: Install Dependencies
# =============================================================================
!pip install -q ultralytics

from ultralytics import YOLO
import pandas as pd
print("Ultralytics ready")

In [None]:
# =============================================================================
# Cell 4: Training Config
# =============================================================================
# One training run per seed, for each of the two stages.
SEEDS = [42, 123, 456, 789, 101]

# Run-name prefixes; the seed is appended to these when a run is named.
STAGE1_PREFIX, STAGE2_PREFIX = 'exp_b2_stage1', 'exp_b2_stage2'

# Dataset YAML consumed by the Stage 1 detector.
stage1_config_path = Path('/kaggle/working/dataset_stage1.yaml')

print(f"Seeds: {SEEDS} ({len(SEEDS)} runs)")
print("Pipeline: Detect -> Crop -> Classify")

In [None]:
# =============================================================================
# Cell 5: Stage 1 - Train Detector (5 seeds)
# =============================================================================
# Maps seed -> path of the best detector checkpoint trained with that seed.
stage1_models = {}

# Pin the output directory explicitly. Without `project`, Ultralytics picks the
# runs directory from its own settings / the current working directory, which
# is not guaranteed to be RUNS_PATH, and the weights lookup below would miss.
stage1_project = RUNS_PATH / 'detect'

for run_idx, seed in enumerate(SEEDS, start=1):
    print(f"\n{'='*60}")
    print(f"STAGE 1 (DETECTOR) - Seed {seed} ({run_idx}/{len(SEEDS)})")
    print(f"{'='*60}\n")

    # Seed the global RNGs in addition to passing `seed` to the trainer.
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # Start every run from the same pretrained checkpoint.
    model = YOLO('yolo11n.pt')

    run_name = f"{STAGE1_PREFIX}_seed{seed}"
    results = model.train(
        data=str(stage1_config_path),
        epochs=100,
        patience=30,
        seed=seed,
        project=str(stage1_project),
        name=run_name,
        exist_ok=True,
    )

    model_path = stage1_project / run_name / 'weights' / 'best.pt'
    if not model_path.exists():
        # Surface the problem now instead of only at evaluation time.
        print(f"WARNING: expected weights not found at {model_path}")
    stage1_models[seed] = str(model_path)

    print(f"\nSeed {seed} complete! Model: {model_path}")

In [None]:
# =============================================================================
# Cell 6: Crop Extraction Function
# =============================================================================
def extract_crops_with_margin(image_dir, label_dir, output_dir, margin=0.1,
                              class_names=('ripe', 'unripe')):
    """Crop every ground-truth box out of its image and save it by class.

    For each ``*.png`` / ``*.jpg`` image in ``image_dir`` the label file
    ``<stem>.txt`` in ``label_dir`` is read. Each line is parsed as
    ``class_id x_center y_center width height`` with coordinates normalised
    to the image size. The box is enlarged by ``margin`` (a fraction of its
    width/height, split evenly between both sides), clipped to the image and
    written to ``output_dir/<class name>/<stem>_crop<line index>.png``.

    Args:
        image_dir: Directory containing the source images.
        label_dir: Directory containing the label text files.
        output_dir: Root directory for the crops; one sub-directory per class
            is created.
        margin: Fractional enlargement of each box (0.1 = 10% wider/taller).
        class_names: Class names indexed by class id. Lines whose class id is
            outside this sequence are skipped instead of being filed under
            the last class.

    Returns:
        Dict mapping each class name to the number of crops written for it.
    """
    image_dir = Path(image_dir)
    label_dir = Path(label_dir)
    output_dir = Path(output_dir)

    # Create output dirs
    for class_name in class_names:
        (output_dir / class_name).mkdir(parents=True, exist_ok=True)

    image_files = list(image_dir.glob('*.png')) + list(image_dir.glob('*.jpg'))
    stats = {class_name: 0 for class_name in class_names}

    for img_path in tqdm(image_files, desc="Extracting crops"):
        # Check for labels first so unlabeled images are never decoded.
        label_path = label_dir / f"{img_path.stem}.txt"
        if not label_path.exists():
            continue

        img = cv2.imread(str(img_path))
        if img is None:
            # Unreadable or corrupt image.
            continue
        h, w = img.shape[:2]

        with open(label_path, 'r') as f:
            lines = f.readlines()

        for i, line in enumerate(lines):
            parts = line.split()
            if len(parts) < 5:
                # Blank or truncated line.
                continue

            class_id = int(parts[0])
            if not 0 <= class_id < len(class_names):
                # Unknown class id: skip rather than mislabel the crop.
                continue
            x_center, y_center, width, height = map(float, parts[1:5])

            # Convert to pixel coords
            x_center *= w
            y_center *= h
            width *= w
            height *= h

            # Half-extent of the enlarged box along each axis.
            half_w = (width + width * margin) / 2
            half_h = (height + height * margin) / 2
            x1 = int(max(0, x_center - half_w))
            y1 = int(max(0, y_center - half_h))
            x2 = int(min(w, x_center + half_w))
            y2 = int(min(h, y_center + half_h))

            crop = img[y1:y2, x1:x2]
            if crop.size == 0:
                # Degenerate box after clipping.
                continue

            class_name = class_names[class_id]
            crop_path = output_dir / class_name / f"{img_path.stem}_crop{i}.png"
            # cv2.imwrite reports failure through its return value; only count
            # crops that were actually written.
            if not cv2.imwrite(str(crop_path), crop):
                continue
            stats[class_name] += 1

    return stats

print("Crop extraction function defined")

In [None]:
# =============================================================================
# Cell 7: Extract Crops for All Splits
# =============================================================================
print("Extracting crops from ground truth labels...\n")

# Build ground-truth crops for every dataset split under CROPS_PATH/<split>.
separator = '=' * 40
for split in ('train', 'val', 'test'):
    print(separator)
    print(f"Processing {split.upper()}")
    print(separator)

    img_dir = DATASET_PATH / 'images' / split
    label_dir = DATASET_PATH / 'labels' / split
    output_dir = CROPS_PATH / split

    stats = extract_crops_with_margin(img_dir, label_dir, output_dir, margin=0.1)

    ripe_count, unripe_count = stats['ripe'], stats['unripe']
    print(f"  Ripe: {ripe_count}")
    print(f"  Unripe: {unripe_count}")
    print(f"  Total: {sum(stats.values())}\n")

print(f"All crops saved to: {CROPS_PATH}")

In [None]:
# =============================================================================
# Cell 8: Stage 2 - Train Classifier (5 seeds)
# =============================================================================
# Maps seed -> path of the best classifier checkpoint trained with that seed.
stage2_models = {}

# Pin the output directory explicitly. Without `project`, Ultralytics picks the
# runs directory from its own settings / the current working directory, which
# is not guaranteed to be RUNS_PATH, and the weights lookup below would miss.
stage2_project = RUNS_PATH / 'classify'

for run_idx, seed in enumerate(SEEDS, start=1):
    print(f"\n{'='*60}")
    print(f"STAGE 2 (CLASSIFIER) - Seed {seed} ({run_idx}/{len(SEEDS)})")
    print(f"{'='*60}\n")

    # Seed the global RNGs in addition to passing `seed` to the trainer.
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # Start every run from the same pretrained checkpoint.
    model = YOLO('yolo11n-cls.pt')

    run_name = f"{STAGE2_PREFIX}_seed{seed}"
    results = model.train(
        # Root folder holding the <split>/<class>/ crop directories.
        data=str(CROPS_PATH),
        epochs=100,
        patience=30,
        seed=seed,
        imgsz=224,
        project=str(stage2_project),
        name=run_name,
        exist_ok=True,
    )

    model_path = stage2_project / run_name / 'weights' / 'best.pt'
    if not model_path.exists():
        # Surface the problem now instead of only at evaluation time.
        print(f"WARNING: expected weights not found at {model_path}")
    stage2_models[seed] = str(model_path)

    print(f"\nSeed {seed} complete! Model: {model_path}")

In [None]:
# =============================================================================
# Cell 9: Evaluate Stage 1 (Detection) on Test Set
# =============================================================================
# Maps seed -> detection metrics of that seed's detector on the test split.
stage1_results = {}

header_rule = "=" * 60
print(header_rule)
print("STAGE 1 EVALUATION ON TEST SET")
print(header_rule)

for seed in SEEDS:
    model_path = stage1_models.get(seed)
    weights_available = bool(model_path) and Path(model_path).exists()
    if not weights_available:
        # Training for this seed did not leave a checkpoint behind.
        print(f"Not found: seed {seed}")
        continue

    print(f"\nSeed {seed}:")
    model = YOLO(model_path)
    metrics = model.val(data=str(stage1_config_path), split='test')
    box = metrics.box

    stage1_results[seed] = dict(zip(
        ('mAP50', 'mAP50-95', 'Precision', 'Recall'),
        (box.map50, box.map, box.mp, box.mr),
    ))

    print(f"  mAP50: {box.map50:.3f}")
    print(f"  mAP50-95: {box.map:.3f}")

In [None]:
# =============================================================================
# Cell 10: Evaluate Stage 2 (Classification) on Test Crops
# =============================================================================
# Maps seed -> accuracy of that seed's classifier on the test crops.
stage2_results = {}

header_rule = "=" * 60
print(header_rule)
print("STAGE 2 EVALUATION ON TEST CROPS")
print(header_rule)

for seed in SEEDS:
    model_path = stage2_models.get(seed)
    weights_available = bool(model_path) and Path(model_path).exists()
    if not weights_available:
        # Training for this seed did not leave a checkpoint behind.
        print(f"Not found: seed {seed}")
        continue

    print(f"\nSeed {seed}:")
    model = YOLO(model_path)
    metrics = model.val(data=str(CROPS_PATH), split='test')
    top1, top5 = metrics.top1, metrics.top5

    # NOTE(review): with only 2 classes, top-5 accuracy is presumably always
    # 1.0 and carries no information -- confirm before reporting it.
    stage2_results[seed] = dict(zip(('Top1_Acc', 'Top5_Acc'), (top1, top5)))

    print(f"  Top-1 Accuracy: {top1:.3f}")
    print(f"  Top-5 Accuracy: {top5:.3f}")

In [None]:
# =============================================================================
# Cell 11: Results Summary
# =============================================================================
# Per-seed tables (rows = seeds, columns = metrics) plus mean/std over seeds.
df_stage1 = pd.DataFrame(stage1_results).transpose()
df_stage1.index.name = 'Seed'
avg_stage1, std_stage1 = df_stage1.mean(), df_stage1.std()

df_stage2 = pd.DataFrame(stage2_results).transpose()
df_stage2.index.name = 'Seed'
avg_stage2, std_stage2 = df_stage2.mean(), df_stage2.std()

# Both stages share the same report layout, so print them in one loop.
rule = "=" * 60
for title, table, means, stds in (
    ("B.2 TWO-STAGE - STAGE 1 (DETECTION) RESULTS", df_stage1, avg_stage1, std_stage1),
    ("B.2 TWO-STAGE - STAGE 2 (CLASSIFICATION) RESULTS", df_stage2, avg_stage2, std_stage2),
):
    print("\n" + rule)
    print(title)
    print(rule + "\n")
    print(table.to_string(float_format="{:.3f}".format))
    print("\nSUMMARY (Mean ± Std):")
    for col in table.columns:
        print(f"  {col}: {means[col]:.3f} ± {stds[col]:.3f}")

In [None]:
# =============================================================================
# Cell 12: Save Results
# =============================================================================
# Plain-text report with the per-seed tables and mean/std summaries.
output_file = KAGGLE_OUTPUT / 'b2_twostage_results.txt'

# The report contains the non-ASCII "±" character, so the encoding is fixed
# explicitly instead of depending on the platform/locale default.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write("="*60 + "\n")
    f.write("B.2 Two-Stage Ripeness Classification Results\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write("Training: epochs=100, patience=30, other=default\n")
    f.write(f"Seeds: {SEEDS}\n")
    f.write("="*60 + "\n\n")

    f.write("STAGE 1 (Detection) Results:\n")
    f.write(df_stage1.to_string(float_format=lambda x: f"{x:.3f}"))
    f.write("\n\nStage 1 Summary (Mean ± Std):\n")
    for col in df_stage1.columns:
        f.write(f"  {col}: {avg_stage1[col]:.3f} ± {std_stage1[col]:.3f}\n")

    f.write("\n" + "="*60 + "\n")
    f.write("STAGE 2 (Classification) Results:\n")
    f.write(df_stage2.to_string(float_format=lambda x: f"{x:.3f}"))
    f.write("\n\nStage 2 Summary (Mean ± Std):\n")
    for col in df_stage2.columns:
        f.write(f"  {col}: {avg_stage2[col]:.3f} ± {std_stage2[col]:.3f}\n")

print(f"Results saved: {output_file}")

In [None]:
# =============================================================================
# Cell 13: Create Archives
# =============================================================================
# Zip each training-run directory that exists (one archive per stage).
for run_kind, archive_base, zip_label in (
    ('detect', '/kaggle/working/b2_stage1_runs', 'b2_stage1_runs.zip'),
    ('classify', '/kaggle/working/b2_stage2_runs', 'b2_stage2_runs.zip'),
):
    run_dir = RUNS_PATH / run_kind
    if run_dir.exists():
        shutil.make_archive(archive_base, 'zip', run_dir)
        print(f"{zip_label} created")

# The results folder is always archived.
shutil.make_archive('/kaggle/working/b2_output', 'zip', KAGGLE_OUTPUT)
print("b2_output.zip created")

print("\nDownload from Output tab")