# 🔧 YOLO Dataset Restructurer

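Ce notebook permet de restructurer un dataset d'images d'aiguilles avec annotations YOLO pour l'entraînement. Les fichiers doivent commencer par un identifiant d'aiguille de la forme `AIG<numéro>` (p. ex. `AIG1_image_001.png`), et chaque fichier d'annotation `.txt` doit porter le même nom (sans extension) que son image (`.png`, `.jpg` ou `.jpeg`).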

## Structure attendue en entrée :
```
Dataset/
├── Images/          # Images des aiguilles
└── Labels/          # Fichiers .txt avec annotations YOLO
```

## Structure générée :
```
Restructured_Dataset/
├── metadata/         # Fichiers JSON avec statistiques
├── organized_by_needle/  # Organisation par aiguille
└── yolo_format/      # Format standard YOLO
    ├── train/images & labels
    ├── val/images & labels
    └── test/images & labels
```


In [None]:
# Import des bibliothèques nécessaires
import os
import json
import shutil
from pathlib import Path
from collections import defaultdict, Counter
import re
import numpy as np
from sklearn.model_selection import train_test_split

print("✅ Bibliothèques importées avec succès")


In [None]:
# Input and output locations
ORIGINAL_DATASET_PATH = "Dataset"  # existing dataset folder
OUTPUT_PATH = "Restructured_Dataset"  # folder that receives the new layout

# Split fractions (train / val / test)
TRAIN_RATIO = 0.75   # 75% for training
VAL_RATIO = 0.15     # 15% for validation
TEST_RATIO = 0.10    # 10% for testing

# Echo the active configuration, one entry per line
print(
    "📁 Configuration:",
    f"   - Dataset original: {ORIGINAL_DATASET_PATH}",
    f"   - Sortie: {OUTPUT_PATH}",
    f"   - Division: {TRAIN_RATIO:.0%}/{VAL_RATIO:.0%}/{TEST_RATIO:.0%}",
    sep="\n",
)


In [None]:
# Fonctions utilitaires
def extract_needle_id(filename):
    """Return the needle identifier that prefixes ``filename``.

    The identifier is the literal ``AIG`` followed by one or more digits and
    must sit at the very start of the name (``re.match`` anchors there).

    Args:
        filename: File name or stem, e.g. ``"AIG1_image_001"``.

    Returns:
        The matched prefix (e.g. ``"AIG1"``), or ``None`` when the name does
        not start with such a prefix.
    """
    # In a raw string the digit class needs a single backslash; a doubled
    # backslash would match a literal backslash followed by "d", never a digit.
    match = re.match(r'(AIG\d+)', filename)
    return match.group(1) if match else None

def find_corresponding_image(label_stem, images_path):
    """Locate the image that shares its stem with a label file.

    Candidates are probed in the order ``.png``, ``.jpg``, ``.jpeg``; the
    first one that exists on disk wins.

    Args:
        label_stem: Label file name without its extension.
        images_path: ``Path`` of the directory holding the images.

    Returns:
        The ``Path`` of the first existing candidate, or ``None`` if none
        of the candidates exists.
    """
    candidates = (
        images_path / f"{label_stem}{suffix}"
        for suffix in ('.png', '.jpg', '.jpeg')
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)

def parse_label_file(label_path):
    """Collect the class IDs that appear in a YOLO label file.

    Each non-blank line contributes its first whitespace-separated token,
    converted with ``int``. Reading is best-effort: any exception is
    reported with ``print`` and whatever was collected before the failure
    is returned.

    Args:
        label_path: Path of the ``.txt`` label file.

    Returns:
        A ``set`` of integer class IDs (empty for an empty or unreadable
        file).
    """
    class_ids = set()
    try:
        with open(label_path, 'r') as handle:
            for raw_line in handle.readlines():
                tokens = raw_line.split()
                if not tokens:
                    # Blank or whitespace-only line
                    continue
                class_ids.add(int(tokens[0]))
    except Exception as e:
        print(f"⚠️  Error reading {label_path}: {e}")
    return class_ids

print("✅ Fonctions utilitaires définies")


In [None]:
# Test des fonctions utilitaires (optionnel)
# Décommentez les lignes suivantes pour tester
# test_filename = "AIG1_image_001"
# needle_id = extract_needle_id(test_filename)
# print(f"Needle ID extrait: {needle_id}")


In [None]:
# Fonctions d'analyse du dataset
def analyze_dataset(original_dataset_path):
    """Scan the dataset folder and group images/labels by needle.

    Label files are read from ``<dataset>/Labels/*.txt`` and matched with
    images in ``<dataset>/Images``. A summary is printed via
    ``print_analysis_results`` before returning.

    Args:
        original_dataset_path: Root folder containing ``Images`` and
            ``Labels`` sub-folders.

    Returns:
        A tuple ``(needle_data, defect_stats)``:

        * ``needle_data`` maps needle ID to a dict with keys ``'images'``
          and ``'labels'`` (parallel lists of ``Path``), ``'defect_types'``
          (set of class IDs) and ``'visible_defects_count'`` (number of
          images whose label file is non-empty).
        * ``defect_stats`` maps a needle's "main" class ID to the list of
          needle IDs assigned to it.
    """
    print("🔍 Analyzing dataset...")

    original_path = Path(original_dataset_path)
    images_path = original_path / "Images"
    labels_path = original_path / "Labels"

    # Sorted so that needle order (and any order-based split built from it)
    # does not depend on the filesystem's listing order.
    label_files = sorted(labels_path.glob("*.txt"))

    needle_data = {}
    defect_stats = defaultdict(list)

    for label_file in label_files:
        needle_id = extract_needle_id(label_file.stem)
        if needle_id is None:
            # A None key would later break sorting and path construction,
            # so files without a recognizable needle prefix are left out.
            print(f"⚠️  Skipping {label_file.name}: no needle ID in file name")
            continue

        # The entry is created even when no image is found below, so a
        # needle whose labels all lack images still appears (with 0 images).
        entry = needle_data.setdefault(needle_id, {
            'images': [],
            'labels': [],
            'defect_types': set(),
            'visible_defects_count': 0
        })

        image_file = find_corresponding_image(label_file.stem, images_path)
        if not image_file:
            continue

        entry['images'].append(image_file)
        entry['labels'].append(label_file)

        # A non-empty set of class IDs means the image shows a defect
        defect_types = parse_label_file(label_file)
        if defect_types:
            entry['defect_types'].update(defect_types)
            entry['visible_defects_count'] += 1

    # Aggregate defect statistics
    for needle_id, data in needle_data.items():
        if data['defect_types']:
            # "Main" defect = first element in set iteration order
            main_defect = next(iter(data['defect_types']))
            defect_stats[main_defect].append(needle_id)

    print_analysis_results(needle_data, defect_stats)
    return needle_data, defect_stats

def print_analysis_results(needle_data, defect_stats):
    """Print a human-readable summary of the dataset analysis.

    Args:
        needle_data: Mapping of needle ID to a dict providing ``'images'``
            (sized collection) and ``'visible_defects_count'`` (int).
        defect_stats: Mapping of class ID to the list of needle IDs whose
            main defect is that class.

    Returns:
        None. Output goes to stdout only.
    """
    # Section headers start with a real newline so a blank line separates
    # the sections (a doubled backslash would print the two characters "\n").
    print("\n📊 Dataset Analysis Results:")
    print(f"Total needles found: {len(needle_data)}")

    total_images = sum(len(data['images']) for data in needle_data.values())
    total_with_defects = sum(data['visible_defects_count'] for data in needle_data.values())

    print(f"Total images: {total_images}")
    print(f"Images with visible defects: {total_with_defects}")
    print(f"Images without defects: {total_images - total_with_defects}")

    print("\n🏷️  Defect Distribution:")
    for defect_id, needles in sorted(defect_stats.items()):
        print(f"  Class {defect_id}: {len(needles)} needles")

    print("\n📈 Needles by visible defect count:")
    visible_counts = Counter(data['visible_defects_count'] for data in needle_data.values())
    for count, needles in sorted(visible_counts.items()):
        print(f"  {count} visible defects: {needles} needles")

print("✅ Fonctions d'analyse définies")


In [None]:
# Step 1: analyze the dataset (results are reused by every later step)
print("🔍 Analyse du dataset...")
needle_data, defect_stats = analyze_dataset(ORIGINAL_DATASET_PATH)


In [None]:
# Fonctions de division stratifiée
def _proportional_split(needles, train_ratio, val_ratio, min_val):
    """Slice ``needles`` (order preserved) into ``(train, val, test)`` lists."""
    n_train = max(1, int(len(needles) * train_ratio))
    n_val = max(min_val, int(len(needles) * val_ratio))
    return (
        needles[:n_train],
        needles[n_train:n_train + n_val],
        needles[n_train + n_val:],
    )


def create_stratified_split(needle_data, train_ratio=0.75, val_ratio=0.15, test_ratio=0.10):
    """Assign whole needles to train/val/test, grouped by main defect class.

    Needles are never split across sets: all images of a needle follow the
    needle. Needles are grouped by their "main" defect (first element of
    ``defect_types`` in set iteration order); needles without any defect
    form one extra group. Within a group, needles are taken in the
    iteration order of ``needle_data`` — no shuffling is done.

    Per-group rules:

    * defect group of exactly 2 needles: one to train, one to val;
    * any other defect group: at least 1 needle to train and (if enough
      remain) at least 1 to val, the remainder to test;
    * group without defects: at least 1 to train, val may be empty,
      the remainder to test.

    Args:
        needle_data: Mapping of needle ID to a dict with at least the key
            ``'defect_types'`` (collection of class IDs); the other keys
            are read by ``print_split_results``.
        train_ratio: Fraction of each group sent to train.
        val_ratio: Fraction of each group sent to val.
        test_ratio: Only used in the printed banner; the test set receives
            whatever is left after train and val.

    Returns:
        Dict with keys ``'train'``, ``'val'`` and ``'test'``, each a sorted
        list of needle IDs.
    """
    # Real newline: a doubled backslash would print the characters "\n".
    print(f"\n🎯 Creating stratified split ({train_ratio:.0%}/{val_ratio:.0%}/{test_ratio:.0%})...")

    # Group needles by their main defect type
    defect_groups = {}
    needles_without_defects = []
    for needle_id, data in needle_data.items():
        if data['defect_types']:
            main_defect = next(iter(data['defect_types']))
            defect_groups.setdefault(main_defect, []).append(needle_id)
        else:
            needles_without_defects.append(needle_id)

    group_splits = []
    for needles in defect_groups.values():
        if len(needles) == 2:
            # Pairs are pinned to train/val regardless of the ratios
            group_splits.append(([needles[0]], [needles[1]], []))
        else:
            # A single needle lands in train: n_train is at least 1
            group_splits.append(
                _proportional_split(needles, train_ratio, val_ratio, min_val=1)
            )

    if needles_without_defects:
        # Defect-free needles do not force a validation sample
        group_splits.append(
            _proportional_split(needles_without_defects, train_ratio, val_ratio, min_val=0)
        )

    train_needles, val_needles, test_needles = [], [], []
    for train_part, val_part, test_part in group_splits:
        train_needles.extend(train_part)
        val_needles.extend(val_part)
        test_needles.extend(test_part)

    split_assignment = {
        'train': sorted(train_needles),
        'val': sorted(val_needles),
        'test': sorted(test_needles)
    }

    print_split_results(split_assignment, needle_data)
    return split_assignment

def print_split_results(split_assignment, needle_data):
    """Print per-split totals and the main-defect distribution.

    Args:
        split_assignment: Mapping of split name to a list of needle IDs.
        needle_data: Mapping of needle ID to a dict with keys ``'images'``,
            ``'visible_defects_count'`` and ``'defect_types'``. IDs missing
            from this mapping are counted as needles but contribute no
            images or defects.

    Returns:
        None. Output goes to stdout only.
    """
    # Real newlines give a blank line before each header (a doubled
    # backslash would print the two characters "\n" instead).
    print("\n📋 Split Results:")

    for split_name, needles in split_assignment.items():
        print(f"\n{split_name.upper()}:")
        print(f"  Needles: {len(needles)}")

        total_images = 0
        total_with_defects = 0
        defect_distribution = defaultdict(int)

        for needle_id in needles:
            data = needle_data.get(needle_id)
            if data is None:
                continue

            total_images += len(data['images'])
            total_with_defects += data['visible_defects_count']

            if data['defect_types']:
                # "Main" defect = first element in set iteration order
                main_defect = next(iter(data['defect_types']))
                defect_distribution[main_defect] += 1

        print(f"  Total images: {total_images}")
        print(f"  Images with defects: {total_with_defects}")
        print(f"  Defect distribution: {dict(defect_distribution)}")

print("✅ Fonctions de division stratifiée définies")


In [None]:
# Step 2: create the stratified split (whole needles are assigned to train/val/test)
print("🎯 Création de la division stratifiée...")
split_assignment = create_stratified_split(needle_data, TRAIN_RATIO, VAL_RATIO, TEST_RATIO)


In [None]:
# Fonctions de création de structure et copie de fichiers
def create_directory_structure(output_path, needle_data):
    """Create the output folder tree (idempotent).

    Creates ``metadata/``, ``organized_by_needle/<needle>/{images,labels}``
    for every key of ``needle_data`` and
    ``yolo_format/{train,val,test}/{images,labels}`` under ``output_path``.
    Existing directories are left untouched.

    Args:
        output_path: Root of the restructured dataset (str or ``Path``).
        needle_data: Mapping whose keys are the needle IDs; the values are
            not read here.

    Returns:
        None.
    """
    # Real newline: a doubled backslash would print the characters "\n".
    print(f"\n📁 Creating directory structure at {output_path}...")

    output_path = Path(output_path)

    targets = [output_path / "metadata", output_path / "organized_by_needle"]

    # Standard YOLO layout: one images/labels pair per split
    targets.extend(
        output_path / "yolo_format" / split_name / kind
        for split_name in ("train", "val", "test")
        for kind in ("images", "labels")
    )

    # One images/labels pair per needle
    targets.extend(
        output_path / "organized_by_needle" / needle_id / kind
        for needle_id in needle_data
        for kind in ("images", "labels")
    )

    for target in targets:
        target.mkdir(parents=True, exist_ok=True)

def _copy_needle_files(data, dest_dir):
    """Copy one needle's images and labels into ``dest_dir/images`` and ``dest_dir/labels``."""
    for kind in ("images", "labels"):
        target_dir = dest_dir / kind
        # Created on demand so copying does not depend on a prior setup step
        target_dir.mkdir(parents=True, exist_ok=True)
        for source in data[kind]:
            shutil.copy2(source, target_dir / source.name)


def copy_files(output_path, needle_data, split_assignment):
    """Copy every image/label into both output layouts.

    Each needle's files are copied to
    ``organized_by_needle/<needle>/{images,labels}`` and, for every needle
    listed in ``split_assignment``, to
    ``yolo_format/<split>/{images,labels}``. Files keep their original
    names; ``shutil.copy2`` also copies file metadata. Needle IDs in
    ``split_assignment`` that are absent from ``needle_data`` are ignored.

    Args:
        output_path: Root of the restructured dataset (str or ``Path``).
        needle_data: Mapping of needle ID to a dict with ``'images'`` and
            ``'labels'`` lists of ``Path`` objects.
        split_assignment: Mapping of split name to a list of needle IDs.

    Returns:
        None.
    """
    # Real newline: a doubled backslash would print the characters "\n".
    print("\n📋 Copying files...")

    output_path = Path(output_path)

    # Per-needle layout
    for needle_id, data in needle_data.items():
        _copy_needle_files(data, output_path / "organized_by_needle" / needle_id)

    # YOLO layout
    for split_name, needles in split_assignment.items():
        split_dir = output_path / "yolo_format" / split_name
        for needle_id in needles:
            data = needle_data.get(needle_id)
            if data is not None:
                _copy_needle_files(data, split_dir)

print("✅ Fonctions de création de structure et copie définies")


In [None]:
# Step 3: create the output directory tree
print("📁 Création de la structure de répertoires...")
create_directory_structure(OUTPUT_PATH, needle_data)


In [None]:
# Step 4: copy images and labels into the new layouts
print("📋 Copie des fichiers...")
copy_files(OUTPUT_PATH, needle_data, split_assignment)


In [None]:
# Fonctions de sauvegarde des métadonnées
def save_metadata(output_path, needle_data, defect_stats, split_assignment):
    """Write the JSON metadata files and the YOLO ``data.yaml``.

    Files written under ``<output_path>/metadata``:

    * ``needle_summary.json`` — per needle: image count, number of images
      with visible defects, sorted list of class IDs;
    * ``defect_distribution.json`` — ``defect_stats`` with keys converted
      to strings;
    * ``split_assignment.json`` — ``split_assignment`` as given.

    ``data.yaml`` is then produced by ``create_data_yaml``.

    Args:
        output_path: Root of the restructured dataset (str or ``Path``).
        needle_data: Mapping of needle ID to a dict with ``'images'``,
            ``'visible_defects_count'`` and ``'defect_types'``.
        defect_stats: Mapping of class ID to a list of needle IDs.
        split_assignment: Mapping of split name to a list of needle IDs.

    Returns:
        None.
    """
    # Real newline: a doubled backslash would print the characters "\n".
    print("\n💾 Saving metadata...")

    output_path = Path(output_path)
    metadata_dir = output_path / "metadata"
    # Created on demand so saving does not depend on a prior setup step
    metadata_dir.mkdir(parents=True, exist_ok=True)

    needle_summary = {
        needle_id: {
            'image_count': len(data['images']),
            'visible_defects_count': data['visible_defects_count'],
            # Sets are not JSON-serializable; sorting keeps the file stable
            'defect_types': sorted(data['defect_types'])
        }
        for needle_id, data in needle_data.items()
    }

    payloads = {
        "needle_summary.json": needle_summary,
        "defect_distribution.json": {str(k): v for k, v in defect_stats.items()},
        "split_assignment.json": split_assignment,
    }
    for file_name, payload in payloads.items():
        with open(metadata_dir / file_name, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    # Create data.yaml for YOLO
    create_data_yaml(output_path, needle_data, split_assignment)

def create_data_yaml(output_path, needle_data, split_assignment):
    """Write ``<output_path>/yolo_format/data.yaml`` for YOLO training.

    Class names are placeholders of the form ``defect_<id>``. Because YOLO
    label files reference classes by index, ``names`` lists every index
    from 0 up to the highest class ID seen, even if some intermediate ID
    never occurs; ``nc`` is the length of that list.

    Args:
        output_path: Root of the restructured dataset (str or ``Path``).
        needle_data: Mapping of needle ID to a dict with a
            ``'defect_types'`` collection of integer class IDs.
        split_assignment: Mapping of split name to a list of needle IDs
            (only the list lengths are written).

    Returns:
        None.
    """
    output_path = Path(output_path)
    yolo_dir = output_path / "yolo_format"
    # Created on demand so this function also works on a fresh output folder
    yolo_dir.mkdir(parents=True, exist_ok=True)

    # Get all unique defect classes
    all_defects = set()
    for data in needle_data.values():
        all_defects.update(data['defect_types'])

    # Index i of the list must be class ID i, so gaps are filled too
    # (customize the names as needed).
    class_names = [f"defect_{i}" for i in range(max(all_defects) + 1)] if all_defects else []

    # Absolute root so the file does not depend on the trainer's working
    # directory or its default datasets folder.
    dataset_root = yolo_dir.resolve()

    yaml_lines = [
        "# YOLOv11 Dataset Configuration",
        "# Generated automatically",
        "",
        f"path: {dataset_root}  # dataset root dir",
        "train: train/images  # train images (relative to 'path')",
        "val: val/images      # val images (relative to 'path')",
        "test: test/images    # test images (relative to 'path')",
        "",
        "# Classes",
        f"nc: {len(class_names)}  # number of classes",
        f"names: {class_names}  # class names",
        "",
        "# Additional info",
        f"total_needles: {len(needle_data)}",
        f"train_needles: {len(split_assignment.get('train', []))}",
        f"val_needles: {len(split_assignment.get('val', []))}",
        f"test_needles: {len(split_assignment.get('test', []))}",
    ]

    with open(yolo_dir / "data.yaml", 'w', encoding='utf-8') as f:
        f.write("\n".join(yaml_lines) + "\n")

    print(f"✅ Created data.yaml with {len(class_names)} classes")

print("✅ Fonctions de sauvegarde des métadonnées définies")


In [None]:
# Step 5: save the metadata (JSON summaries and data.yaml)
print("💾 Sauvegarde des métadonnées...")
save_metadata(OUTPUT_PATH, needle_data, defect_stats, split_assignment)


In [None]:
# Fonction de vérification des résultats
def _report_presence(label, present, detail=""):
    """Print a one-line ✅/❌ status for ``label``."""
    if present:
        print(f"✅ {label}{detail}")
    else:
        print(f"❌ {label} manquant")


def check_directory_structure(base_path):
    """Print a checklist of the expected output folders and files.

    Reports, under ``base_path``: the three top-level folders, the six
    ``yolo_format/<split>/<kind>`` folders with their entry counts, the
    three metadata JSON files and ``yolo_format/data.yaml``. Nothing is
    created or modified.

    Args:
        base_path: Root of the restructured dataset.

    Returns:
        None. Returns early, after printing an error, if ``base_path``
        does not exist.
    """
    print(f"📁 Vérification de la structure dans {base_path}:")

    if not os.path.exists(base_path):
        print(f"❌ Le répertoire {base_path} n'existe pas!")
        return

    # Top-level folders
    for dir_name in ("metadata", "organized_by_needle", "yolo_format"):
        _report_presence(f"{dir_name}/", os.path.exists(os.path.join(base_path, dir_name)))

    # YOLO layout. Real newline in the header: a doubled backslash would
    # print the characters "\n" instead of a blank line.
    print("\n📋 Structure YOLO:")
    for split_name in ("train", "val", "test"):
        for kind in ("images", "labels"):
            dir_name = f"yolo_format/{split_name}/{kind}"
            dir_path = os.path.join(base_path, dir_name)
            # isdir (not exists) so a stray file at this path cannot make
            # os.listdir raise
            if os.path.isdir(dir_path):
                file_count = len(os.listdir(dir_path))
                _report_presence(f"{dir_name}/", True, f" ({file_count} fichiers)")
            else:
                _report_presence(f"{dir_name}/", False)

    # Metadata files
    print("\n📊 Fichiers de métadonnées:")
    for file_name in ("needle_summary.json", "defect_distribution.json", "split_assignment.json"):
        _report_presence(file_name, os.path.exists(os.path.join(base_path, "metadata", file_name)))

    # data.yaml
    _report_presence("data.yaml", os.path.exists(os.path.join(base_path, "yolo_format", "data.yaml")))

print("✅ Fonction de vérification définie")


In [None]:
# Step 6: verify the generated structure (read-only checklist)
print("✅ Vérification des résultats...")
check_directory_structure(OUTPUT_PATH)


## 🎉 Restructuration terminée !

Votre dataset a été restructuré avec succès. Vous pouvez maintenant utiliser le notebook `yolo11_training_analysis.ipynb` pour l'entraînement du modèle YOLO.
