In [10]:
from pathlib import Path
import pandas as pd
import plotly.express as px


In [11]:
def count_files_by_species(dataset_root: str):
    """
    Count the files of a PlantVillage dataset per species and per modality.

    Walks the 'color', 'grayscale' and 'segmented' sub-folders of
    ``dataset_root``. Inside each of them, every class folder name is split
    on ``'___'`` and the first part is used as the species, so several class
    folders can belong to the same species.

    Parameters:
        dataset_root (str): root folder that contains the modality folders.

    Returns:
        tuple: ``(df, pivot)`` where ``df`` has one row per class folder with
        the columns 'species', 'modality' and 'count', and ``pivot`` is a
        species x modality table holding the total number of files
        (0 where a species has no folder for a modality).
    """
    root = Path(dataset_root)
    modalities = ['color', 'grayscale', 'segmented']
    records = []

    for mod in modalities:
        mod_path = root / mod
        # is_dir() also rejects a plain file with the modality name, which
        # would otherwise make iterdir() raise.
        if not mod_path.is_dir():
            print(f"⚠️ Le dossier {mod_path} n'existe pas.")
            continue

        for class_dir in mod_path.iterdir():
            if class_dir.is_dir():
                species = class_dir.name.split('___')[0]
                count = sum(1 for f in class_dir.iterdir() if f.is_file())
                records.append({'species': species, 'modality': mod, 'count': count})

    # Explicit columns keep the frame well-formed when nothing was found.
    df = pd.DataFrame(records, columns=['species', 'modality', 'count'])
    if df.empty:
        pivot = pd.DataFrame(
            index=pd.Index([], name='species'),
            columns=pd.Index([], name='modality'),
        )
        return df, pivot

    # A species usually spans several class folders: the counts must be
    # summed (pivot_table defaults to the mean, which under-reports totals).
    pivot = df.pivot_table(
        index='species',
        columns='modality',
        values='count',
        aggfunc='sum',
        fill_value=0,
    )
    return df, pivot

    

In [12]:
from pathlib import Path
import pandas as pd
import plotly.express as px

def count_files_by_species(dataset_root):
    """
    Count the files found in every ``<modality>/<class folder>`` of a dataset.

    Every sub-directory of ``dataset_root`` is treated as a modality and
    every sub-directory of a modality as one class; the class folder name is
    stored unchanged in the 'species' column.

    Parameters:
        dataset_root (str or Path): root folder holding the modality folders.

    Returns:
        tuple: ``(df_records, df_pivot)`` where ``df_records`` has the
        columns 'modality', 'species' and 'count' (one row per class folder)
        and ``df_pivot`` is the species x modality table of integer counts,
        with 0 where a class folder is missing for a modality.
    """
    root_path = Path(dataset_root)
    records = []
    # sorted() makes the row order independent of the filesystem.
    for modality_dir in sorted(root_path.iterdir()):
        if not modality_dir.is_dir():
            continue
        for species_dir in sorted(modality_dir.iterdir()):
            if not species_dir.is_dir():
                continue
            # Count regular files only: nested folders are not images.
            count = sum(1 for entry in species_dir.iterdir() if entry.is_file())
            records.append({
                'modality': modality_dir.name,
                'species': species_dir.name,
                'count': count,
            })

    # Explicit columns keep the frame usable when no class folder exists.
    df_records = pd.DataFrame(records, columns=['modality', 'species', 'count'])
    if df_records.empty:
        df_pivot = pd.DataFrame(
            index=pd.Index([], name='species'),
            columns=pd.Index([], name='modality'),
            dtype=int,
        )
        return df_records, df_pivot

    df_pivot = (
        df_records
        .pivot(index='species', columns='modality', values='count')
        .fillna(0)
        .astype(int)
    )
    return df_records, df_pivot

# Root of the PlantVillage dataset (absolute path, specific to this workspace).
dataset_root = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage dataset'
# Long-format counts (df_records) and the species x modality table (df_pivot).
df_records, df_pivot = count_files_by_species(dataset_root)

# Figure 1: histogram of the per-folder image counts, one overlaid series
# per modality.
fig1 = px.histogram(
    df_records,
    x='count',
    color='modality',
    barmode='overlay',
    nbins=50,
    title="Distribution du nombre d'images par espèce et modalité",
    labels={'count': "Nombre d'images", 'modality': "Modalité"}
)
fig1.update_layout(
    xaxis_title="Nombre d'images par espèce",
    yaxis_title="Nombre d'espèces",
    legend_title="Modalité"
)
fig1.show()

# Back to long format from the pivot, so that combinations filled with 0 in
# the pivot also appear as rows in the bar chart data.
df_pivot_reset = df_pivot.reset_index().melt(id_vars='species', var_name='modality', value_name='count')

# Figure 2: bar chart of the image count per species, colored by modality.
fig2 = px.bar(
    df_pivot_reset,
    x='species',
    y='count',
    color='modality',
    title="Nombre d'images par espèce et modalité (empilé)"
)
fig2.update_layout(
    xaxis_title="Espèce",
    yaxis_title="Nombre d'images",
    legend_title="Modalité"
)
fig2.show()




In [13]:
from pathlib import Path
import pandas as pd
from PIL import Image

def list_image_paths(dataset_root):
    """
    Build a DataFrame listing every file of the dataset.

    The dataset is expected to be laid out as
    ``<dataset_root>/<modality>/<species>/<file>``. Each file found at that
    depth yields one row with the columns 'modality', 'species' and
    'filepath' (the resolved absolute path, as a string).
    """
    def _subdirs(parent):
        # Lazily yield the direct sub-directories of ``parent``.
        return (child for child in parent.iterdir() if child.is_dir())

    rows = [
        {
            'modality': mod_dir.name,
            'species': sp_dir.name,
            'filepath': str(entry.resolve()),
        }
        for mod_dir in _subdirs(Path(dataset_root))
        for sp_dir in _subdirs(mod_dir)
        for entry in sp_dir.glob('*')
        if entry.is_file()
    ]
    return pd.DataFrame(rows)


def resize_images_from_df(df, output_base_dir, size=(256,256)):
    """
    Resize every image listed in ``df`` and save it under ``output_base_dir``.

    Each row must provide 'filepath', 'modality' and 'species'. The image is
    converted to RGB, resized to ``size`` and written to
    ``<output_base_dir>/<modality>/<species>/<original file name>``; the
    destination folder is created when missing. Any error raised while
    opening, converting, resizing or saving an image is printed and the loop
    moves on to the next row.
    """
    out_root = Path(output_base_dir)
    for _, record in df.iterrows():
        source = Path(record['filepath'])
        target_dir = out_root / record['modality'] / record['species']
        target_dir.mkdir(parents=True, exist_ok=True)

        try:
            with Image.open(source) as opened:
                opened.convert('RGB').resize(size).save(target_dir / source.name)
        except Exception as e:
            print(f"Erreur avec {source}: {e}")


# Path of the original dataset
dataset_root = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage dataset'

# Collect the image paths
df_paths = list_image_paths(dataset_root)

# Folder where the resized images are saved
output_dir = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'

# Resize and save
resize_images_from_df(df_paths, output_dir)



In [6]:
from pathlib import Path
from PIL import Image

def verify_image_sizes(root_dir, expected_size):
    """
    Check that every image under ``root_dir`` has the expected size.

    Walks ``root_dir`` recursively, opens each file whose extension is
    .jpg, .jpeg, .png, .bmp or .gif (case-insensitive) and compares its size
    with ``expected_size``. A report is printed: the list of mismatching
    files, a success message, or a notice that nothing could be checked.
    Files that cannot be opened are reported and counted separately; they
    are never counted as having the right size.

    Parameters:
        root_dir (str or Path): root folder to scan.
        expected_size (tuple): expected (width, height); any sequence is
            accepted and converted to a tuple before comparison.

    Returns:
        list: ``(path, size)`` tuples for the images whose size differs.
    """
    root_path = Path(root_dir)
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
    # Image.size is a tuple: a list argument would never compare equal.
    expected_size = tuple(expected_size)
    mismatch_files = []
    checked = 0
    unreadable = 0

    for img_path in root_path.rglob('*'):
        if not (img_path.is_file() and img_path.suffix.lower() in image_suffixes):
            continue
        try:
            with Image.open(img_path) as img:
                actual_size = img.size
        except Exception as e:
            unreadable += 1
            print(f"Erreur en ouvrant {img_path}: {e}")
            continue
        checked += 1
        if actual_size != expected_size:
            mismatch_files.append((img_path, actual_size))

    if mismatch_files:
        print(f"Images ne correspondant pas à la taille {expected_size}:")
        for path, size in mismatch_files:
            print(f" - {path} : taille {size}")
    elif checked == 0:
        # Nothing was actually compared: do not claim that all sizes match.
        print(f"Aucune image n'a pu être vérifiée dans {root_path}.")
    else:
        print(f"Toutes les images ont bien la taille {expected_size}.")

    if unreadable:
        print(f"{unreadable} image(s) n'ont pas pu être ouvertes.")

    return mismatch_files

# Example usage
root_dir = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'
expected_size = (256, 256)
verify_image_sizes(root_dir, expected_size)


Toutes les images ont bien la taille (256, 256).


In [None]:
from pathlib import Path
from PIL import Image, ImageFile

# To avoid failing on truncated images, this flag can be enabled or left disabled as needed
ImageFile.LOAD_TRUNCATED_IMAGES = False  # Set to True to tolerate truncated images

def verify_images(folder_path):
    """
    Look for invalid or corrupted images under ``folder_path``.

    Walks the folder recursively and, for each file whose extension is
    .png, .bmp, .gif, .jpeg or .jpg (case-insensitive), runs Pillow's
    ``verify()`` and then fully decodes the image with ``load()``. Any
    exception marks the file as invalid. A summary (number of images
    checked, number of invalid files and their paths) is printed.

    Parameters:
        folder_path (str or Path): root folder to scan.

    Returns:
        list: paths of the files that failed verification or decoding.
    """
    folder = Path(folder_path)
    image_suffixes = {'.png', '.bmp', '.gif', '.jpeg', '.jpg'}
    total = 0
    invalid_files = []

    for img_path in folder.rglob('*'):
        if not (img_path.is_file() and img_path.suffix.lower() in image_suffixes):
            continue
        total += 1
        try:
            with Image.open(img_path) as img:
                img.verify()  # structural check; does not decode the pixel data
            # verify() does not decode the image data and the file must be
            # reopened afterwards; load() forces a full decode so truncated
            # files are detected.
            with Image.open(img_path) as img:
                img.load()
        except Exception as e:
            print(f"Image invalide ou corrompue : {img_path} - Erreur : {e}")
            invalid_files.append(img_path)

    print(f"\nTotal d'images vérifiées : {total}")
    print(f"Images corrompues ou invalides : {len(invalid_files)}")
    if invalid_files:
        print("Liste des fichiers problématiques :")
        for f in invalid_files:
            print(f" - {f}")

    return invalid_files

# Example usage
folder = '/workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized'
verify_images(folder)


Erreur avec /workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized/segmented/Squash___Powdery_mildew/af78d796-9021-4d74-8757-f19a8abcdfde___UMD_Powd.M 0393_final_masked.jpg: image file is truncated (19 bytes not processed)
Erreur avec /workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized/segmented/Squash___Powdery_mildew/da108f7f-40d2-4881-b3bd-4509bab41cd2___MD_Powd.M 0826_final_masked.jpg: image file is truncated (39 bytes not processed)
Erreur avec /workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized/segmented/Squash___Powdery_mildew/17a4993c-8d7e-4188-a81b-666e6b1bd244___UMD_Powd.M 0159_final_masked.jpg: image file is truncated (23 bytes not processed)
Erreur avec /workspaces/datasciencetest_reco_plante/dataset/plantvillage/data/plantvillage_resized/segmented/Squash___Powdery_mildew/5af39a25-6486-4944-a14b-382a728ec360___MD_Powd.M 0154_final_masked.jpg: image file is truncated (106 by

KeyboardInterrupt: 