In [1]:
import zipfile
import os

# Location of the manually downloaded ZIP archive (relative to the notebook's
# working directory).
ZIP_PATH = r"SIMPD V1 South Indian Medicinal Plants dataset (Version 1).zip"

# Destination folder for the extracted files.
EXTRACT_PATH = r"dataset"

# Make sure the destination exists; a pre-existing folder is not an error.
os.makedirs(EXTRACT_PATH, exist_ok=True)

# Open the archive read-only (the default mode) and unpack every member
# under EXTRACT_PATH. The context manager closes the archive afterwards.
with zipfile.ZipFile(ZIP_PATH) as archive:
    archive.extractall(path=EXTRACT_PATH)

print("Dataset descomprimido en:", EXTRACT_PATH)


Dataset descomprimido en: dataset


In [2]:
import os
import shutil
import random

# ============================
# Fixed seed for reproducibility
# ============================
SEED = 42
random.seed(SEED)

# Fraction of each class assigned to train and val; whatever remains goes to
# test, so the three parts always add up to the full class.
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15

# Folder that contains one sub-folder per class — ADJUST to your local layout
DATASET_PATH = r"dataset/SIMPD V1 South Indian Medicinal Plants dataset (Version 1)"

# Output folder
OUTPUT_PATH = r"dataset_split"

# Create the output structure.
# Each split folder is wiped first: without this, re-running the cell after
# changing the seed, the ratios or the dataset would leave copies from the
# previous run in place, and the same image could end up in two splits
# (train/test leakage).
splits = ["train", "val", "test"]
for split in splits:
    split_dir = os.path.join(OUTPUT_PATH, split)
    if os.path.isdir(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir, exist_ok=True)

# List classes, keeping only directories.
# os.listdir returns entries in arbitrary order, so sort to make the sequence
# of shuffles (which share one RNG state) identical on every machine.
classes = sorted(
    c for c in os.listdir(DATASET_PATH)
    if os.path.isdir(os.path.join(DATASET_PATH, c))
)
print(f"Clases detectadas ({len(classes)}):", classes)

for cls in classes:
    cls_path = os.path.join(DATASET_PATH, cls)

    # Keep regular files only (a nested folder would make shutil.copy fail)
    # and sort them so the seeded shuffle starts from a deterministic order.
    images = sorted(
        f for f in os.listdir(cls_path)
        if os.path.isfile(os.path.join(cls_path, f))
    )

    # Reproducible per-class shuffle
    random.shuffle(images)

    n = len(images)
    n_train = int(n * TRAIN_RATIO)
    n_val = int(n * VAL_RATIO)
    n_test = n - n_train - n_val

    train_imgs = images[:n_train]
    val_imgs = images[n_train:n_train + n_val]
    test_imgs = images[n_train + n_val:]

    # Sanity check: the three parts partition the class with no loss/overlap.
    assert len(test_imgs) == n_test
    assert len(train_imgs) + len(val_imgs) + len(test_imgs) == n

    # Create the per-class folder inside each split and copy its images.
    for split, split_imgs in (("train", train_imgs), ("val", val_imgs), ("test", test_imgs)):
        dest_dir = os.path.join(OUTPUT_PATH, split, cls)
        os.makedirs(dest_dir, exist_ok=True)
        for img in split_imgs:
            shutil.copy(os.path.join(cls_path, img), os.path.join(dest_dir, img))

    print(f"[OK] {cls}: train={len(train_imgs)}, val={len(val_imgs)}, test={len(test_imgs)}")

print("Separación completada. Carpetas creadas en:", OUTPUT_PATH)


Clases detectadas (20): ['Abutilon Indicum', 'Aloe barbadensis miller', 'Calotropis gigantea', 'Canna indica', 'Cissus quadrangularis', 'Curcuma longa', 'Eclipta prostrate', 'Eichhornia Crassipes', 'Hibiscus Rosasinensis', 'Ixora coccinea', 'Justica adhatoda', 'Murraya koenigii', 'Ocimum tenuiflorum', 'Ouretlanata', 'Phyllanthus amarus', 'Ricinus communis', 'Senna Atriculata', 'Sesbania grandiflora', 'Trifolium Repens', 'Ziziphus mauritiana']
[OK] Abutilon Indicum: train=115, val=24, test=26
[OK] Aloe barbadensis miller: train=101, val=21, test=23
[OK] Calotropis gigantea: train=74, val=15, test=17
[OK] Canna indica: train=104, val=22, test=23
[OK] Cissus quadrangularis: train=93, val=20, test=21
[OK] Curcuma longa: train=77, val=16, test=17
[OK] Eclipta prostrate: train=145, val=31, test=32
[OK] Eichhornia Crassipes: train=141, val=30, test=31
[OK] Hibiscus Rosasinensis: train=79, val=16, test=18
[OK] Ixora coccinea: train=79, val=17, test=18
[OK] Justica adhatoda: train=70, val=15, t