In [None]:
def detect_image_type(img_path, edge_threshold=0.15):
    """
    Detect if image is single leaf or rice plant cluster

    Strategy:
    - Single leaf: Usually centered, edges are clear background
    - Cluster/plant: Complex, multiple objects, edges have content

    The border is the outer 10% of the image on every side (at least one
    pixel). The grayscale standard deviation of the four border strips is
    averaged and divided by the standard deviation of the remaining center
    region; a ratio below ``edge_threshold`` means a plain border around a
    busier center.

    Args:
        img_path: Path of the image file, read with cv2.imread.
        edge_threshold: Upper bound on border_std / center_std below which
            the image is reported as 'single_leaf'.

    Returns: 'single_leaf', 'cluster', or 'unknown' when the image cannot be
    read or processing raises an exception (the exception is logged).
    """
    try:
        img = cv2.imread(img_path)
        if img is None:
            return 'unknown'

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        h, w = gray.shape

        # 10% from each edge, clamped to one pixel: int(w * 0.1) is 0 for
        # images under 10 px, which would produce empty strips and NaN stds.
        edge_width = max(1, int(w * 0.1))
        edge_height = max(1, int(h * 0.1))

        center = gray[edge_height:h - edge_height, edge_width:w - edge_width]
        if center.size == 0:
            # Image is too small to have an interior to compare against;
            # keep the default classification.
            return 'cluster'

        # Extract edge regions
        border_strips = (
            gray[:edge_height, :],       # top
            gray[h - edge_height:, :],   # bottom
            gray[:, :edge_width],        # left
            gray[:, w - edge_width:],    # right
        )

        # Low std → uniform background → single leaf
        # High std → complex content → cluster
        edge_std = np.mean([np.std(strip) for strip in border_strips])
        center_std = np.std(center)

        # If edge is much simpler than center → single leaf.
        # A perfectly flat center (std == 0) gives no usable ratio and falls
        # through to 'cluster'.
        if center_std > 0 and edge_std / center_std < edge_threshold:
            return 'single_leaf'

        return 'cluster'

    except Exception as e:
        # Best-effort classifier: never abort dataset preparation over one image.
        logging.warning(f"Error detecting type for {img_path}: {e}")
        return 'unknown'

def create_pseudo_labels_for_cluster(img_path, label_id):
    """
    For cluster images, create pseudo bounding boxes using image processing

    Strategy:
    1. Use color-based segmentation to find green regions (leaves)
    2. Find contours and create bounding boxes
    3. Filter small/noisy detections

    Args:
        img_path: Path of the image file, read with cv2.imread.
        label_id: Class id attached to every returned box.

    Returns: List of (class_id, x_center, y_center, width, height) normalized
    to the image size. Whenever no usable box is produced (unreadable image,
    no region passing the filters, or an exception during processing) the
    list holds a single box covering the full image, so the result is never
    empty.
    """
    full_image_box = [(label_id, 0.5, 0.5, 1.0, 1.0)]

    try:
        img = cv2.imread(img_path)
        if img is None:
            # Same fallback as every other failure path; an empty list would
            # make the caller write an empty label file for a labeled image.
            logging.warning(f"Could not read {img_path}; using full-image box")
            return full_image_box

        h, w = img.shape[:2]

        # Convert to HSV for better green detection
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        # Define range for green color (leaves)
        lower_green = np.array([25, 30, 30])
        upper_green = np.array([90, 255, 255])
        mask = cv2.inRange(hsv, lower_green, upper_green)

        # Morphological operations to clean up: close small holes first,
        # then open to drop small specks.
        kernel = np.ones((5, 5), np.uint8)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)

        # findContours returns (contours, hierarchy) on OpenCV 4.x but
        # (image, contours, hierarchy) on 3.x; [-2] is the contour list on both.
        contours = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2]

        bboxes = []
        min_area = (w * h) * 0.01  # At least 1% of image

        for contour in contours:
            if cv2.contourArea(contour) < min_area:
                continue

            x, y, box_w, box_h = cv2.boundingRect(contour)

            # Skip very thin boxes (noise)
            aspect_ratio = box_w / box_h if box_h > 0 else 0
            if aspect_ratio < 0.1 or aspect_ratio > 10:
                continue

            # Normalize to YOLO format (center x/y, width, height in [0, 1])
            bboxes.append((
                label_id,
                (x + box_w / 2) / w,
                (y + box_h / 2) / h,
                box_w / w,
                box_h / h,
            ))

        # If no boxes found, fall back to full image
        return bboxes if bboxes else full_image_box

    except Exception as e:
        logging.warning(f"Error creating pseudo labels for {img_path}: {e}")
        return full_image_box  # Fallback

def _remove_if_exists(path):
    """Delete ``path`` if present; log (do not raise) when removal fails."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        logging.warning(f"Could not remove {path}: {e}")


def prepare_field_dataset(df, output_dir, val_size=0.15, test_size=0.15):
    """
    Prepare dataset with SMART LABELING:
    - Single leaf images: Use full-image bbox (valid assumption)
    - Cluster images: Use pseudo-labels from image processing

    The rows of ``df`` are split into train/val/test, stratified on
    'label_id'. Every image is copied to ``<output_dir>/<split>/`` as
    ``<split>_<index>.jpg`` with a matching ``.txt`` label file next to it.
    If any step fails for an image, the copied image and its label file are
    removed again so no unlabeled image is left in the split folder, and the
    image is not counted in the type statistics.

    Args:
        df: DataFrame with columns 'image_path', 'label_id' and 'label_name'.
            The index must be an integer, since it is formatted with ``:06d``.
        output_dir: Directory receiving the split folders and
            'image_type_stats.csv'.
        val_size: Fraction of the whole dataset used for validation.
        test_size: Fraction of the whole dataset used for testing.

    Returns:
        Dict with keys 'train', 'val', 'test'; each value is a list of dicts
        with 'image_path', 'label_path', 'label_name', 'label_id' and
        'image_type' for every successfully prepared image.
    """
    logging.info("\n" + "="*60)
    logging.info("SMART DATASET PREPARATION")
    logging.info("="*60)
    logging.info("Strategy:")
    logging.info("  • Single leaf images → Full-image bbox")
    logging.info("  • Cluster images → Pseudo-labels from segmentation")

    trainval_df, test_df = train_test_split(
        df, test_size=test_size, random_state=42, stratify=df['label_id']
    )
    # val_size is a fraction of the full dataset, so rescale it to the
    # fraction of what is left after removing the test split.
    train_df, val_df = train_test_split(
        trainval_df, test_size=val_size/(1-test_size), random_state=42,
        stratify=trainval_df['label_id']
    )

    logging.info(f"\nTrain: {len(train_df)} ({len(train_df)/len(df)*100:.1f}%)")
    logging.info(f"Val:   {len(val_df)} ({len(val_df)/len(df)*100:.1f}%)")
    logging.info(f"Test:  {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")

    field_data = {'train': [], 'val': [], 'test': []}
    type_stats = {'single_leaf': 0, 'cluster': 0, 'unknown': 0}

    for split, split_df in [('train', train_df), ('val', val_df), ('test', test_df)]:
        split_dir = os.path.join(output_dir, split)
        os.makedirs(split_dir, exist_ok=True)

        logging.info(f"\nProcessing {split}...")

        for idx, row in tqdm(split_df.iterrows(), total=len(split_df), desc=f"Preparing {split}"):
            src = row['image_path']
            dst = os.path.join(split_dir, f"{split}_{idx:06d}.jpg")
            label_file = os.path.join(split_dir, f"{split}_{idx:06d}.txt")

            # Only roll back files this iteration actually created; if the
            # copy itself failed (e.g. src and dst are the same file) nothing
            # must be deleted.
            copied = False
            try:
                shutil.copy2(src, dst)
                copied = True

                # Detect image type
                img_type = detect_image_type(src)

                if img_type == 'single_leaf':
                    # Single leaf: Use full-image bbox (valid for centered single leaf)
                    label_lines = [f"{row['label_id']} 0.5 0.5 1.0 1.0\n"]
                else:
                    # Cluster or unknown: Use pseudo-labels
                    bboxes = create_pseudo_labels_for_cluster(src, row['label_id'])
                    label_lines = [
                        f"{class_id} {x_c:.6f} {y_c:.6f} {box_w:.6f} {box_h:.6f}\n"
                        for class_id, x_c, y_c, box_w, box_h in bboxes
                    ]

                with open(label_file, 'w') as f:
                    f.writelines(label_lines)

                record = {
                    'image_path': dst,
                    'label_path': label_file,
                    'label_name': row['label_name'],
                    'label_id': row['label_id'],
                    'image_type': img_type
                }
            except Exception as e:
                logging.warning(f"Error preparing {src}: {e}")
                if copied:
                    # An image without a (complete) label file would be
                    # picked up by training as an unlabeled sample.
                    _remove_if_exists(label_file)
                    _remove_if_exists(dst)
                continue

            # Count and register only images that made it into the dataset.
            type_stats[img_type] = type_stats.get(img_type, 0) + 1
            field_data[split].append(record)

    logging.info(f"\n{'='*60}")
    logging.info("Image Type Statistics:")
    logging.info(f"  Single Leaf: {type_stats.get('single_leaf', 0)} "
                f"({type_stats.get('single_leaf', 0)/len(df)*100:.1f}%)")
    logging.info(f"  Cluster:     {type_stats.get('cluster', 0)} "
                f"({type_stats.get('cluster', 0)/len(df)*100:.1f}%)")
    logging.info(f"  Unknown:     {type_stats.get('unknown', 0)} "
                f"({type_stats.get('unknown', 0)/len(df)*100:.1f}%)")
    logging.info(f"{'='*60}")

    # Save statistics
    stats_df = pd.DataFrame([type_stats])
    stats_df.to_csv(os.path.join(output_dir, 'image_type_stats.csv'), index=False)

    return field_data

def create_yolo_yaml(data_root, output_path):
    """
    Write the ``data.yaml`` dataset description into ``output_path``.

    The file records the absolute path of ``data_root``, the split folder
    names ('train', 'val', 'test'), the number of classes and the class
    names taken from the module-level ``LABELS`` mapping in sorted-key order.

    Args:
        data_root: Dataset root directory stored under the 'path' key.
        output_path: Directory in which ``data.yaml`` is created.

    Returns: Path of the written ``data.yaml`` file.
    """
    import yaml

    dataset_root = str(Path(data_root).absolute())
    class_count = len(LABELS)
    # Sorted keys keep the position of each name aligned with its class id.
    class_names = [LABELS[class_id]['name'] for class_id in sorted(LABELS)]

    config = {
        'path': dataset_root,
        'train': 'train',
        'val': 'val',
        'test': 'test',
        'nc': class_count,
        'names': class_names,
    }

    yaml_path = os.path.join(output_path, "data.yaml")
    with open(yaml_path, 'w') as handle:
        yaml.dump(config, handle, default_flow_style=False)

    logging.info(f"Created data.yaml: {yaml_path}")
    return yaml_path

# Build the train/val/test image + label folders from the collected images,
# then write the data.yaml describing them into the same directory.
field_data = prepare_field_dataset(
    df=collected_df,
    output_dir=OUTPUT_DIRS["field_images"],
    val_size=CONFIG['val_size'],
    test_size=CONFIG['test_size'],
)
yaml_path = create_yolo_yaml(
    data_root=OUTPUT_DIRS["field_images"],
    output_path=OUTPUT_DIRS["field_images"],
)

 ## PHASE 3: YOLO Training