In [1]:
from pathlib import Path

# 1. Locate the project root dynamically: the nearest directory, starting at the
#    current working directory and walking up its parents, that contains a .gitignore.
cwd = Path.cwd()
PROJECT_ROOT = next(
    (p for p in (cwd, *cwd.parents) if (p / ".gitignore").exists()),
    None,  # default avoids a bare, message-less StopIteration when nothing matches
)
if PROJECT_ROOT is None:
    raise FileNotFoundError(
        f"No .gitignore found in {cwd} or any of its parent directories; "
        "cannot locate the project root."
    )

print("PROJECT_ROOT =", PROJECT_ROOT)

PROJECT_ROOT = /Users/mackjb/repository/datasciencetest_reco_plante


In [14]:
from pathlib import Path
import pandas as pd
import plotly.express as px
import os
from PIL import Image, ImageFilter
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import cv2
from sklearn.cluster import DBSCAN
import albumentations as A




In [6]:
def resize_and_pad(img, size=(256, 256), fill_color=(0, 0, 0, 0)):
    """
    Resize an image while keeping its aspect ratio, then pad it to exactly `size`.

    Args:
        img: PIL image to process. It is not modified.
        size: Target (width, height) of the returned image.
        fill_color: Padding colour as an RGBA tuple. For images without
            transparency only the first three components are used
            (e.g. opaque black for JPG); images with transparency get the
            full tuple (e.g. fully transparent for PNG).

    Returns:
        A new "RGBA" image (when the input carries transparency) or "RGB"
        image (otherwise) of dimensions `size`, with the resized input
        centred on the padding colour.
    """
    src_w, src_h = img.size
    ratio = min(size[0] / src_w, size[1] / src_h)
    # Clamp to 1 px so a very elongated input never produces a zero-sized resize.
    new_size = (max(1, int(src_w * ratio)), max(1, int(src_h * ratio)))
    # Image.resize returns a new image; the result must be kept, and the target
    # must be the ratio-preserving new_size rather than the full canvas size.
    resized = img.resize(new_size, Image.Resampling.LANCZOS)

    # Create the canvas: transparent-capable for images with alpha, RGB otherwise.
    if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info):
        new_img = Image.new("RGBA", size, fill_color)
    else:
        new_img = Image.new("RGB", size, fill_color[:3])

    # Centre the resized image on the canvas.
    paste_pos = ((size[0] - new_size[0]) // 2, (size[1] - new_size[1]) // 2)
    new_img.paste(resized, paste_pos)
    return new_img



In [None]:

# # Chemin vers le dossier principal PlantVillage
# # root_dir = PROJECT_ROOT / "dataset" / "plantvillage" / "data" / "plantvillage dataset" / "segmented"
# root_dir = PROJECT_ROOT / "dataset" / "plantvillage" / "data" / "plantvillage_5images" / "segmented"


# # Le dossier destination, prefixé "clean_"
# clean_dir = PROJECT_ROOT / "dataset" / "plantvillage" / "data" / "plantvillage_clean" / "segmented"
# clean_dir.mkdir(parents=True, exist_ok=True)

# for subdir, dirs, files in os.walk(root_dir):
#     for file in files:
#         if file.lower().endswith(('.jpg', '.jpeg', '.png')):
#             src_path = Path(subdir) / file
#             # Chemin relatif par rapport à root_dir
#             rel_path = src_path.relative_to(root_dir)
#             dst_path = clean_dir / rel_path
            
#             # Crée les dossiers parents du fichier de destination si besoin
#             dst_path.parent.mkdir(parents=True, exist_ok=True)
            
#             try:
#                 with Image.open(src_path) as img:
#                     # Redimensionne et pad
#                     new_img = resize_and_pad(img, size=(256, 256), fill_color=(0,0,0,0))
#                     # Sauvegarde en conservant le format (extension)
#                     if dst_path.suffix.lower() in ['.jpg', '.jpeg']:
#                         # Convertir en RGB pour JPEG et enlever alpha
#                         if new_img.mode != 'RGB':
#                             new_img = new_img.convert('RGB')
#                         new_img.save(dst_path, quality=95)
#                     else:
#                         # PNG garde RGBA
#                         new_img.save(dst_path)
#             except Exception as e:
#                 print(f"Erreur sur {src_path}: {e}")

In [None]:
# Why a Gaussian filter: it attenuates high-frequency noise, avoiding visual artifacts and spurious edges.
# It stabilizes image descriptors (HOG, SIFT, ...), improving their consistency and their linear separation by the SVM.
# By removing irrelevant details, it reduces model variance and limits overfitting.
# It is a standard preprocessing step that makes the pipeline more robust and eases generalization.


# Augmentation pipeline: extend the dataset with different viewpoints of each image.
augmenter = A.Compose([
    A.Rotate(limit=90, p=0.5),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=0, p=0.5)
])


def _save_image(image, path):
    """Save `image` to `path`, keeping the format implied by the file extension.

    JPEG targets (.jpg / .jpeg) cannot store an alpha channel, so the image is
    converted to RGB and written with quality=95; any other extension is saved
    as-is (PNG keeps RGBA). A single helper is used for both the cleaned and the
    augmented image so the two code paths cannot drift apart.
    """
    if path.suffix.lower() in ('.jpg', '.jpeg'):
        image.convert('RGB').save(path, quality=95)
    else:
        image.save(path)


# Path to the main PlantVillage folder
root_dir = PROJECT_ROOT / "dataset" / "plantvillage" / "data" / "plantvillage_5images" / "segmented"

# Destination folder ("clean" copy of the dataset)
clean_dir = PROJECT_ROOT / "dataset" / "plantvillage" / "data" / "plantvillage_clean" / "segmented"
clean_dir.mkdir(parents=True, exist_ok=True)

for subdir, dirs, files in os.walk(root_dir):
    for file in files:
        if file.lower().endswith(('.jpg','.jpeg','.png')):
            src_path = Path(subdir)/file
            # Mirror the source tree under clean_dir
            rel_path = src_path.relative_to(root_dir)
            dst_path = clean_dir/rel_path
            dst_path.parent.mkdir(parents=True, exist_ok=True)

            try:
                with Image.open(src_path) as img:
                    # 1) Resize & pad
                    new_img = resize_and_pad(img, size=(256,256), fill_color=(0,0,0,0))
                    # 2) Gaussian filter
                    new_img = new_img.filter(ImageFilter.GaussianBlur(radius=1.0))

                    # 3) Augmentation via Albumentations (operates on numpy arrays)
                    arr = np.array(new_img)
                    aug = augmenter(image=arr)['image']
                    aug_img = Image.fromarray(aug)

                    # Pad again to guarantee a 256x256 output
                    aug_img = resize_and_pad(aug_img, size=(256,256), fill_color=(0,0,0,0))

                    # Naming: <original stem>_alb<original extension>
                    stem, ext = dst_path.stem, dst_path.suffix
                    aug_path = dst_path.parent / f"{stem}_alb{ext}"
                    _save_image(aug_img, aug_path)
                    print("Saved augmented image:", aug_path)

                    # 4) Save the cleaned original image
                    _save_image(new_img, dst_path)

            except Exception as e:
                # Best effort: report the failing file and continue with the rest
                print(f"Erreur sur {src_path}: {e}")

