In [10]:
from pathlib import Path
import pandas as pd
import plotly.express as px


In [11]:
def count_files_by_species(dataset_root: str):
    """
    Parcourt les sous-dossiers 'color', 'grayscale' et 'segmented' du dataset PlantVillage
    et retourne un DataFrame avec le nombre de fichiers par espèce et par modalité.
    """
    root = Path(dataset_root)
    modalities = ['color', 'grayscale', 'segmented']
    records = []

    for mod in modalities:
        mod_path = root / mod
        if not mod_path.exists():
            print(f"⚠️ Le dossier {mod_path} n'existe pas.")
            continue

        for class_dir in mod_path.iterdir():
            if class_dir.is_dir():
                species = class_dir.name.split('___')[0]
                count = sum(1 for f in class_dir.iterdir() if f.is_file())
                records.append({'species': species, 'modality': mod, 'count': count})

    df = pd.DataFrame(records)
    # Pivot si vous préférez un format espèces×modalité :
    pivot = df.pivot_table(index='species', columns='modality', values='count', fill_value=0)
    return df, pivot

    

In [12]:
from pathlib import Path
import pandas as pd
import plotly.express as px

def count_files_by_species(dataset_root):
    root_path = Path(dataset_root)
    records = []
    for modality_dir in root_path.iterdir():
        if modality_dir.is_dir():
            modality = modality_dir.name
            for species_dir in modality_dir.iterdir():
                if species_dir.is_dir():
                    species = species_dir.name
                    count = len(list(species_dir.glob('*')))
                    records.append({'modality': modality, 'species': species, 'count': count})
    df_records = pd.DataFrame(records)
    df_pivot = df_records.pivot(index='species', columns='modality', values='count').fillna(0).astype(int)
    return df_records, df_pivot

dataset_root = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage dataset'
df_records, df_pivot = count_files_by_species(dataset_root)

fig1 = px.histogram(
    df_records,
    x='count',
    color='modality',
    barmode='overlay',
    nbins=50,
    title="Distribution du nombre d'images par espèce et modalité",
    labels={'count': "Nombre d'images", 'modality': "Modalité"}
)
fig1.update_layout(
    xaxis_title="Nombre d'images par espèce",
    yaxis_title="Nombre d'espèces",
    legend_title="Modalité"
)
fig1.show()

df_pivot_reset = df_pivot.reset_index().melt(id_vars='species', var_name='modality', value_name='count')

fig2 = px.bar(
    df_pivot_reset,
    x='species',
    y='count',
    color='modality',
    title="Nombre d'images par espèce et modalité (empilé)"
)
fig2.update_layout(
    xaxis_title="Espèce",
    yaxis_title="Nombre d'images",
    legend_title="Modalité"
)
fig2.show()




In [13]:
from pathlib import Path
import pandas as pd
from PIL import Image

def list_image_paths(dataset_root):
    """
    Crée un DataFrame avec les chemins des images du dataset,
    ainsi que la modalité et l'espèce associées.
    """
    root_path = Path(dataset_root)
    records = []

    for modality_dir in root_path.iterdir():
        if modality_dir.is_dir():
            modality = modality_dir.name
            for species_dir in modality_dir.iterdir():
                if species_dir.is_dir():
                    species = species_dir.name
                    for image_file in species_dir.glob('*'):
                        if image_file.is_file():
                            records.append({
                                'modality': modality,
                                'species': species,
                                'filepath': str(image_file.resolve())
                            })

    return pd.DataFrame(records)


def resize_images_from_df(df, output_base_dir, size=(256,256)):
    output_base = Path(output_base_dir)
    for _, row in df.iterrows():
        img_path = Path(row['filepath'])
        modality = row['modality']
        species = row['species']

        dest_dir = output_base / modality / species
        dest_dir.mkdir(parents=True, exist_ok=True)

        try:
            with Image.open(img_path) as img:
                img = img.convert('RGB')
                img = img.resize(size)
                img.save(dest_dir / img_path.name)
        except Exception as e:
            print(f"Erreur avec {img_path}: {e}")


# Chemin dataset original
dataset_root = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage dataset'

# Extraction des chemins
df_paths = list_image_paths(dataset_root)

# Chemin où tu veux sauvegarder les images redimensionnées
output_dir = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'

# Resize et sauvegarde
resize_images_from_df(df_paths, output_dir)



In [6]:
from pathlib import Path
from PIL import Image

def verify_image_sizes(root_dir, expected_size):
    """
    Parcourt toutes les images dans root_dir et ses sous-dossiers,
    et affiche les images qui n'ont pas la taille expected_size.
    
    Parameters:
        root_dir (str or Path): dossier racine à vérifier
        expected_size (tuple): taille attendue (width, height)
    """
    root_path = Path(root_dir)
    mismatch_files = []

    for img_path in root_path.rglob('*'):
        if img_path.is_file() and img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.gif']:
            try:
                with Image.open(img_path) as img:
                    if img.size != expected_size:
                        mismatch_files.append((img_path, img.size))
            except Exception as e:
                print(f"Erreur en ouvrant {img_path}: {e}")

    if mismatch_files:
        print(f"Images ne correspondant pas à la taille {expected_size}:")
        for path, size in mismatch_files:
            print(f" - {path} : taille {size}")
    else:
        print(f"Toutes les images ont bien la taille {expected_size}.")

# Exemple d'utilisation
root_dir = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'
expected_size = (256, 256)
verify_image_sizes(root_dir, expected_size)


Toutes les images ont bien la taille (256, 256).


In [1]:
from pathlib import Path
from collections import Counter

def count_images_fast(folder_path):
    folder = Path(folder_path)
    counter = Counter()

    for img_path in folder.rglob('*'):
        if img_path.is_file():
            ext = img_path.suffix.lower()
            if ext in ['.png', '.bmp', '.gif', '.jpeg', '.jpg']:
                counter[ext] += 1

    total = sum(counter.values())
    print(f"\n📦 Nombre total d'images (sans vérification) : {total}")
    print("📊 Répartition par extension :")
    for ext, count in counter.items():
        print(f"  {ext} : {count}")

# Exemple d'utilisation
folder = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'
count_images_fast(folder)



📦 Nombre total d'images (sans vérification) : 192718
📊 Répartition par extension :
  .jpg : 192716
  .jpeg : 1
  .png : 1


In [2]:
from pathlib import Path

def supprimer_images_non_jpg(folder_path):
    folder = Path(folder_path)
    extensions_a_supprimer = ['.jpeg', '.png', '.gif', '.bmp']

    fichiers_supprimes = 0

    for img_path in folder.rglob('*'):
        if img_path.is_file() and img_path.suffix.lower() in extensions_a_supprimer:
            try:
                img_path.unlink()
                print(f"❌ Supprimé : {img_path}")
                fichiers_supprimes += 1
            except Exception as e:
                print(f"Erreur lors de la suppression de {img_path} : {e}")

    print(f"\n✅ Nettoyage terminé. {fichiers_supprimes} fichiers supprimés.")

# Exemple d'utilisation
folder = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'
supprimer_images_non_jpg(folder)



❌ Supprimé : /workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized/grayscale/Tomato___Late_blight/d0badc95-d04c-457b-8db8-aa6e198f4bd6___GHLB Leaf 10 Day 6.jpeg
❌ Supprimé : /workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized/grayscale/Pepper,_bell___healthy/42f083e2-272d-4f83-ad9a-573ee90e50ec___Screen Shot 2015-05-06 at 4.01.13 PM.png

✅ Nettoyage terminé. 2 fichiers supprimés.


In [3]:
from pathlib import Path

def verifier_images_non_jpg(folder_path):
    folder = Path(folder_path)
    extensions_autorisees = ['.jpg']
    extensions_image_connues = ['.jpg', '.jpeg', '.png', '.gif', '.bmp']
    fichiers_non_jpg = []

    for img_path in folder.rglob('*'):
        if img_path.is_file() and img_path.suffix.lower() in extensions_image_connues:
            if img_path.suffix.lower() not in extensions_autorisees:
                fichiers_non_jpg.append(img_path)

    total = len(fichiers_non_jpg)
    print(f"\n🔍 Fichiers image non .jpg trouvés : {total}")
    if total > 0:
        print("Liste des fichiers :")
        for f in fichiers_non_jpg:
            print(f" - {f}")
    else:
        print("✅ Aucun fichier image autre que .jpg trouvé.")

# Exemple d'utilisation
folder = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'
verifier_images_non_jpg(folder)



🔍 Fichiers image non .jpg trouvés : 0
✅ Aucun fichier image autre que .jpg trouvé.


In [4]:
import os
import random
import shutil
from pathlib import Path

def split_dataset(source_dir, output_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42):
    random.seed(seed)
    source_dir = Path(source_dir)
    output_dir = Path(output_dir)

    # Vérifie que la somme des ratios est 1
    assert abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-6, "Les ratios doivent totaliser 1.0"

    # Liste les classes (chaque sous-dossier)
    classes = [d for d in source_dir.iterdir() if d.is_dir()]
    print(f"📂 Classes détectées : {[c.name for c in classes]}")

    for class_dir in classes:
        images = list(class_dir.glob('*.jpg'))
        random.shuffle(images)

        n = len(images)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)

        splits = {
            'train': images[:n_train],
            'val': images[n_train:n_train + n_val],
            'test': images[n_train + n_val:]
        }

        for split_name, split_images in splits.items():
            target_dir = output_dir / split_name / class_dir.name
            target_dir.mkdir(parents=True, exist_ok=True)

            for img_path in split_images:
                shutil.copy2(img_path, target_dir)

        print(f"✅ {class_dir.name} : {n} images → {len(splits['train'])} train, {len(splits['val'])} val, {len(splits['test'])} test")

    print("\n🎉 Séparation terminée.")

# Exemple d'utilisation
source = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'
output = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_split'
split_dataset(source, output)


📂 Classes détectées : ['segmented', 'color', 'grayscale']
✅ segmented : 0 images → 0 train, 0 val, 0 test
✅ color : 0 images → 0 train, 0 val, 0 test
✅ grayscale : 0 images → 0 train, 0 val, 0 test

🎉 Séparation terminée.


In [5]:
from pathlib import Path

# Dossier contenant les images originales
DATA_RAW_DIR = Path("/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized")

# Dossier de sortie pour le dataset splitté (train/val/test)
DATA_SPLIT_DIR = Path("/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_split")

# Dossier pour éventuellement déplacer les images invalides ou supprimées
TRASH_DIR = Path("/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/trash")

# Extensions valides (standardisation)
IMAGE_EXTENSIONS = ['.jpg']  # On garde uniquement .jpg


In [13]:
import cv2
import numpy as np
import pandas as pd
from pathlib import Path
from skimage.feature import hog

def extract_features(image_path):
    img = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"Image non chargée: {image_path}")
    img = cv2.resize(img, (128, 128))

    # Moments de Hu
    moments = cv2.moments(img)
    hu_moments = cv2.HuMoments(moments).flatten()
    hu_moments = -np.sign(hu_moments) * np.log10(np.abs(hu_moments) + 1e-10)

    # Transformée de Fourier (magnitude moyenne)
    f = np.fft.fft2(img)
    fshift = np.fft.fftshift(f)
    magnitude_spectrum = 20 * np.log(np.abs(fshift) + 1e-10)
    fourier_mean = magnitude_spectrum.mean()

    # HOG
    hog_features = hog(img, pixels_per_cell=(16, 16), cells_per_block=(2, 2), feature_vector=True)
    hog_reduced = hog_features[:20]

    return np.concatenate([hu_moments, [fourier_mean], hog_reduced])

def build_feature_dataset(dataset_root):
    root = Path(dataset_root)
    segmented_path = root / 'segmented'
    records = []

    if not segmented_path.exists():
        print(f"⚠️ Le dossier {segmented_path} n'existe pas.")
        return pd.DataFrame()  # Vide

    print("Extraction des features en cours...")

    for class_dir in segmented_path.iterdir():
        if class_dir.is_dir():
            species = class_dir.name.split('___')[0]
            image_files = list(class_dir.glob("*.jp*g"))
            print(f"Classe '{class_dir.name}': {len(image_files)} images")

            for image_path in image_files:
                try:
                    features = extract_features(image_path)
                    record = {
                        'species': species,
                        'image_path': str(image_path)
                    }
                    for i, val in enumerate(features):
                        record[f'feat_{i}'] = val
                    records.append(record)
                except Exception as e:
                    print(f"Erreur avec {image_path}: {e}")

    df = pd.DataFrame(records)
    return df

if __name__ == "__main__":
    dataset_path = "/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized"

    df = build_feature_dataset(dataset_path)
    print(f"\nDataFrame généré avec {len(df)} lignes")

    # Sauvegarde CSV
    output_csv = Path(dataset_path) / "features_dataset_segmented.csv"
    df.to_csv(output_csv, index=False)
    print(f"Features sauvegardées dans : {output_csv}")


Extraction des features en cours...
Classe 'Tomato___Tomato_mosaic_virus': 373 images


Classe 'Grape___healthy': 423 images
Classe 'Orange___Haunglongbing_(Citrus_greening)': 5507 images
Classe 'Cherry_(including_sour)___Powdery_mildew': 1052 images
Classe 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)': 1076 images
Classe 'Squash___Powdery_mildew': 1835 images
Classe 'Pepper,_bell___Bacterial_spot': 997 images
Classe 'Tomato___Target_Spot': 1404 images
Classe 'Apple___Apple_scab': 630 images
Classe 'Tomato___Late_blight': 1909 images
Classe 'Peach___Bacterial_spot': 2297 images
Classe 'Pepper,_bell___healthy': 1478 images
Classe 'Tomato___Septoria_leaf_spot': 1771 images
Classe 'Tomato___Leaf_Mold': 952 images
Classe 'Potato___Late_blight': 1000 images
Classe 'Soybean___healthy': 5090 images
Classe 'Tomato___Tomato_Yellow_Leaf_Curl_Virus': 5357 images
Classe 'Corn_(maize)___healthy': 1162 images
Classe 'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot': 513 images
Classe 'Tomato___healthy': 1591 images
Classe 'Potato___healthy': 152 images
Classe 'Potato___Early_blight':

In [14]:
import pandas as pd

# Remplace par le chemin exact de ton fichier CSV
csv_path = "/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized/features_dataset_segmented.csv"

# Chargement du fichier CSV dans un DataFrame
df = pd.read_csv(csv_path)

# Afficher les 5 premières lignes pour un aperçu rapide
print(df.head())

# Afficher des infos sur les colonnes et types
print(df.info())


  species                                         image_path    feat_0  \
0  Tomato  /workspaces/datasciencetest_reco_plante/datase...  2.674282   
1  Tomato  /workspaces/datasciencetest_reco_plante/datase...  2.752815   
2  Tomato  /workspaces/datasciencetest_reco_plante/datase...  2.570274   
3  Tomato  /workspaces/datasciencetest_reco_plante/datase...  2.854277   
4  Tomato  /workspaces/datasciencetest_reco_plante/datase...  2.756342   

     feat_1    feat_2    feat_3  feat_4    feat_5  feat_6      feat_7  ...  \
0  6.152982  9.513265  9.426511   -10.0  9.999011   -10.0  133.005363  ...   
1  5.971158  9.434369  9.809536    10.0  9.999965    10.0  134.760998  ...   
2  5.283350  9.064645  9.234546    10.0  9.995300    10.0  134.306466  ...   
3  6.612926  9.405340  9.949012   -10.0 -9.999976   -10.0  138.707162  ...   
4  6.216011  9.268629  9.841810    10.0  9.999857    10.0  132.739661  ...   

   feat_18  feat_19  feat_20  feat_21  feat_22  feat_23  feat_24  feat_25  \
0      0.