In [2]:
import os
import shutil
import random
from collections import defaultdict
from pathlib import Path

# ================= CONFIGURATION =================

BASE_DIR = Path('..').resolve()
# Dossier source (votre dataset g√©ant actuel)
SOURCE_DIR = BASE_DIR / "data" / "dataset" / "garbage_dataset"

# Dossier de destination (sera cr√©√© s'il n'existe pas)
DEST_DIR = BASE_DIR / "data" / "dataset" / "garbage_dataset_100_reduction" 

# Nombre total d'images voulu (ex: 1000)
TARGET_TOTAL = 100

# R√©partition (Train / Valid / Test)
# Notez l'usage de 'valid' pour respecter votre tree
SPLIT_RATIOS = {'train': 0.8, 'valid': 0.1, 'test': 0.1}

# Extensions d'images accept√©es
IMG_EXT = {'.jpg', '.jpeg', '.png', '.bmp'}
# =================================================

def get_class_from_label(label_path):
    """Lit le premier ID de classe dans le fichier .txt YOLO."""
    if not os.path.exists(label_path):
        return None
    try:
        with open(label_path, 'r') as f:
            line = f.readline()
            if not line.strip(): return None
            # Format YOLO : class_id x y w h
            return int(line.split()[0]) 
    except:
        return None

def main():
    print(f"üîç Analyse de {SOURCE_DIR}...")
    
    files_by_class = defaultdict(list)
    image_count = 0
    
    # 1. SCAN ET INDEXATION (M√™me logique que pr√©c√©demment)
    for root, _, files in os.walk(SOURCE_DIR):
        for file in files:
            ext = os.path.splitext(file)[1].lower()
            if ext in IMG_EXT:
                img_path = os.path.join(root, file)
                base_name = os.path.splitext(file)[0]
                
                # Recherche du label (m√©thode robuste)
                label_path = os.path.join(root, base_name + ".txt")
                if not os.path.exists(label_path):
                    # Essai structure parall√®le images/labels
                    label_path = img_path.replace(f"{os.sep}images{os.sep}", f"{os.sep}labels{os.sep}")
                    label_path = os.path.splitext(label_path)[0] + ".txt"

                if os.path.exists(label_path):
                    class_id = get_class_from_label(label_path)
                    if class_id is not None:
                        files_by_class[class_id].append((img_path, label_path))
                        image_count += 1

    classes = list(files_by_class.keys())
    if not classes:
        print("‚ùå Erreur : Aucune donn√©e trouv√©e.")
        return

    print(f"‚úÖ Trouv√© {image_count} images sur {len(classes)} classes.")

    # 2. S√âLECTION √âQUILIBR√âE
    quota = TARGET_TOTAL // len(classes)
    final_selection = []
    
    print(f"üìä Objectif : ~{quota} images/classe.")

    for class_id in classes:
        pairs = files_by_class[class_id]
        random.shuffle(pairs)
        selected = pairs[:min(quota, len(pairs))]
        final_selection.extend(selected)

    random.shuffle(final_selection) # M√©lange final
    print(f"üì¶ Total s√©lectionn√© : {len(final_selection)} images.")

    # 3. DISTRIBUTION ET COPIE (Nouvelle structure)
    n_train = int(len(final_selection) * SPLIT_RATIOS['train'])
    n_valid = int(len(final_selection) * SPLIT_RATIOS['valid'])
    
    # D√©coupage de la liste
    datasets = {
        'train': final_selection[:n_train],
        'valid': final_selection[n_train:n_train+n_valid],
        'test': final_selection[n_train+n_valid:]
    }

    print(f"\nüöÄ Cr√©ation de la structure dans '{DEST_DIR}'...")

    for split_name, pairs in datasets.items():
        # Construction des chemins selon votre Tree
        # ex: garbage_dataset/train/images
        split_img_dir = os.path.join(DEST_DIR, split_name, 'images')
        # ex: garbage_dataset/train/labels
        split_lbl_dir = os.path.join(DEST_DIR, split_name, 'labels')
        
        os.makedirs(split_img_dir, exist_ok=True)
        os.makedirs(split_lbl_dir, exist_ok=True)

        for img_src, lbl_src in pairs:
            shutil.copy2(img_src, os.path.join(split_img_dir, os.path.basename(img_src)))
            shutil.copy2(lbl_src, os.path.join(split_lbl_dir, os.path.basename(lbl_src)))
            
    print(f"‚úÖ Termin√© ! L'arborescence respecte le format demand√©.")

if __name__ == "__main__":
    main()

üîç Analyse de C:\Users\jansc\OneDrive\Bureau\ECAM_local\ai_project_ma2\ia-llm-project\data\dataset\garbage_dataset...
‚úÖ Trouv√© 3846 images sur 7 classes.
üìä Objectif : ~14 images/classe.
üì¶ Total s√©lectionn√© : 98 images.

üöÄ Cr√©ation de la structure dans 'C:\Users\jansc\OneDrive\Bureau\ECAM_local\ai_project_ma2\ia-llm-project\data\dataset\garbage_dataset_100_reduction'...
‚úÖ Termin√© ! L'arborescence respecte le format demand√©.
