In [None]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

Définitions des fonctions statistiques et graphiques des caractéristiques

In [None]:
# Load the feature table from CSV and preview its first rows.
# NOTE(review): the path is absolute and tied to one workspace layout.
csv_path = "/workspaces/datasciencetest_reco_plante/notebooks/Plant_V_Seg_all_features.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# List the column names available in the loaded frame.
df.columns

In [None]:
# Flag every row that repeats an earlier row across all columns, then count the flags.
duplicate_flags = df.duplicated()
n_dup = duplicate_flags.sum()
print(f"Il y a {n_dup} doublons exacts dans df.")

Histogrammes pour l'objectif 1

In [None]:
# --- Objective 1: per-species histograms with a feature dropdown ---

# 0. Build `plant_label` from the `plant_*` indicator columns.
#    `plant_label` itself is skipped so that re-running this cell does not
#    treat the derived column as one of the indicators.
plant_cols = [c for c in df.columns if c.startswith("plant_") and c != "plant_label"]
# A cell counts as "set" when it equals 1 (True == 1 as well). This is a
# vectorised replacement for the deprecated DataFrame.applymap.
mask = df[plant_cols].eq(1)
idx = mask.values.argmax(axis=1)
plant_names = np.array([c.replace("plant_", "") for c in plant_cols], dtype=object)
plant_labels = plant_names[idx]
# argmax returns 0 for a row where no indicator is set, which would silently
# assign the first species; leave such rows unlabeled so dropna() removes them.
plant_labels[~mask.values.any(axis=1)] = np.nan
df["plant_label"] = plant_labels

# 1. Features shown for Objective 1
features = [
    'aire', 'périmètre', 'circularité', 'excentricité', 'aspect_ratio',
    'contour_density', 'hu_1','hu_2','hu_3','hu_4','hu_5','hu_6','hu_7',
    'mean_H','mean_S','mean_V'
]

# 2. Classes (in order of first appearance) and colour palette
classes = pd.unique(df["plant_label"].dropna()).tolist()
palette = px.colors.qualitative.Plotly
n_classes = len(classes)

# 3. One pass per feature: the 2.5-97.5 percentile window used for the x-axis,
#    bin edges shared by all classes, and one bar trace per class.
#    Only the traces of the first feature start visible.
percentile_ranges = {}
fig = go.Figure()
for i, feat in enumerate(features):
    # Convert once per feature; non-numeric entries become NaN and are dropped.
    numeric = pd.to_numeric(df[feat], errors='coerce')
    vals_all = numeric.dropna().values

    low, high = np.percentile(vals_all, [2.5, 97.5])
    percentile_ranges[feat] = (low, high)

    edges = np.histogram_bin_edges(vals_all, bins=25)
    centers = (edges[:-1] + edges[1:]) / 2

    for j, cls in enumerate(classes):
        vals = numeric[df["plant_label"] == cls].dropna().values
        counts, _ = np.histogram(vals, bins=edges)

        fig.add_trace(go.Bar(
            x=centers,
            y=counts,
            name=str(cls),
            marker_color=palette[j % len(palette)],
            visible=(i == 0),
            legendgroup=str(cls)
        ))

# 4. Dropdown buttons: each one shows the traces of a single feature and
#    re-ranges the x-axis to that feature's percentile window.
#    Traces were added feature-major, so feature i owns the slice
#    [i * n_classes, (i + 1) * n_classes).
buttons = []
for i, feat in enumerate(features):
    vis = [False] * (len(features) * n_classes)
    for j in range(n_classes):
        vis[i * n_classes + j] = True
    low, high = percentile_ranges[feat]
    buttons.append(dict(
        label=feat.replace("_"," "),
        method="update",
        args=[
            {"visible": vis},
            {
                "title": f"Histogramme de {feat.replace('_',' ')}",
                "xaxis": {"title": feat.replace("_"," "), "range": [low, high]}
            }
        ]
    ))

# 5. Layout. The initial x-range matches what dropdown entry 0 applies, so the
#    first view and the view after re-selecting the first feature are identical.
first_low, first_high = percentile_ranges[features[0]]
fig.update_layout(
    updatemenus=[dict(
        active=0,
        buttons=buttons,
        x=0.1, y=1.15,
        xanchor="left", yanchor="top"
    )],
    barmode="group",
    title=f"Histogramme de {features[0].replace('_',' ')}",
    xaxis_title=features[0].replace("_"," "),
    xaxis_range=[first_low, first_high],
    yaxis_title="Count",
    legend_title="Espèce",
    margin=dict(l=50, r=50, t=100, b=50)
)

fig.show()
fig.write_html("objectif1_histos.html", include_plotlyjs='cdn')


Histogrammes pour l'objectif 2

In [None]:
def plot_feature_dists_plotly(df, features, target, n_cols=3, nbins=30,
                              html_path="objectif2_histos.html"):
    """Draw a grid of grouped histograms, one subplot per feature, split by class.

    For each feature, bin edges are computed once over all rows so that every
    class of `target` is counted on the same bins. The figure is displayed and,
    unless `html_path` is falsy, also written to an HTML file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the `target` column.
    features : sequence of str
        Column names to plot; values are coerced to numeric and NaN dropped.
    target : str
        Column whose distinct non-null values define the classes
        (kept in order of first appearance).
    n_cols : int, default 3
        Number of subplot columns in the grid.
    nbins : int, default 30
        Number of histogram bins per feature.
    html_path : str or None, default "objectif2_histos.html"
        Destination of the HTML export; pass None (or "") to skip the export.

    Returns
    -------
    None
        The figure is shown via `fig.show()`; nothing is returned, so a call
        placed last in a cell does not render the figure a second time.

    Raises
    ------
    ValueError
        If `features` is empty.
    """
    features = list(features)
    if len(features) == 0:
        raise ValueError("`features` must contain at least one column name.")

    # Classes in order of first appearance (no sorting)
    raw_cats   = pd.unique(df[target].dropna()).tolist()
    cat_labels = [str(c) for c in raw_cats]
    # Cycle through the qualitative palette if there are more classes than colours
    palette = px.colors.qualitative.Plotly
    color_map = {lbl: palette[i % len(palette)] for i, lbl in enumerate(cat_labels)}
    # Row selectors per class do not depend on the feature: build them once.
    class_masks = [(lbl, df[target] == cat) for cat, lbl in zip(raw_cats, cat_labels)]

    n_plots = len(features)
    n_rows  = math.ceil(n_plots / n_cols)

    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=[f.replace("_"," ") for f in features],
        horizontal_spacing=0.04, vertical_spacing=0.08
    )

    for idx, feat in enumerate(features):
        # Shared bin edges computed over all rows of this feature
        numeric = pd.to_numeric(df[feat], errors='coerce')
        edges   = np.histogram_bin_edges(numeric.dropna().values, bins=nbins)
        centers = (edges[:-1] + edges[1:]) / 2

        # Row-major position in the grid (plotly rows/cols are 1-based)
        row = idx // n_cols + 1
        col = idx % n_cols + 1

        for lbl, in_class in class_masks:
            vals = numeric[in_class].dropna().values
            counts, _ = np.histogram(vals, bins=edges)
            fig.add_trace(
                go.Bar(
                    x=centers,
                    y=counts,
                    name=lbl,
                    legendgroup=lbl,
                    marker_color=color_map[lbl],
                    opacity=0.8,
                    # One legend entry per class: only the first subplot contributes
                    showlegend=(idx==0)
                ),
                row=row, col=col
            )

        fig.update_xaxes(title_text=feat, row=row, col=col)
        fig.update_yaxes(title_text="Count",  row=row, col=col)

    fig.update_layout(
        title_text=f"Distribution des features par '{target}'",
        height=300 * n_rows, width=350 * n_cols,
        barmode='group',
        legend=dict(title=target, x=1.02, y=1),
        margin=dict(l=50, r=150, t=80, b=50)
    )
    fig.show()
    if html_path:
        fig.write_html(html_path, include_plotlyjs='cdn')

In [None]:
# Objective 2 histograms: selected quantitative features split by the 'Est_Saine' column.
quant_vars_Est_Saine = [
    'mean_R', 'mean_G', 'mean_B',
    'std_R', 'std_G', 'std_B',
    'contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation',
    'netteté', 'contour_density',
]
plot_feature_dists_plotly(
    df=df,
    features=quant_vars_Est_Saine,
    target='Est_Saine',
    n_cols=3,
    nbins=25,
)


Histogrammes pour l'objectif 3

In [None]:
# --- Build 'disease_label' from the `disease_*` indicator columns ---
# `disease_label` itself is skipped so that re-running this cell does not
# treat the derived column as one of the indicators.
disease_cols = [c for c in df.columns if c.startswith("disease_") and c != "disease_label"]
# A cell counts as "set" when it equals 1 (True == 1 as well). This is a
# vectorised replacement for the deprecated DataFrame.applymap.
mask_d = df[disease_cols].eq(1)
idx_d  = mask_d.values.argmax(axis=1)
disease_names  = np.array([c.replace("disease_", "") for c in disease_cols], dtype=object)
disease_labels = disease_names[idx_d]
# argmax returns 0 for a row where no indicator is set, which would silently
# assign the first disease; leave such rows unlabeled so dropna() removes them.
disease_labels[~mask_d.values.any(axis=1)] = np.nan
df["disease_label"] = disease_labels

# --- Features and classes ---
quant_vars_disease = ['contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation', 'mean_H', 'mean_S', 'mean_V', 'std_R', 'std_G', 'std_B', 'hu_1', 'hu_2', 'hu_3', 'hu_4', 'hu_5', 'hu_6', 'hu_7']

features = quant_vars_disease
classes  = pd.unique(df["disease_label"].dropna()).tolist()
palette  = px.colors.qualitative.Plotly

# --- 2.5-97.5 percentile window per feature, used to limit the x-axis ---
percentile_ranges = {
    feat: np.percentile(df[feat].dropna(), [2.5, 97.5])
    for feat in features
}

# --- Build the figure ---
fig = go.Figure()

# One bar trace per (feature, class), added feature-major. Only the traces of
# the first feature start visible, so the figure is not blank before the
# dropdown is used.
for i, feat in enumerate(features):
    edges = np.histogram_bin_edges(df[feat].dropna(), bins=25)
    centers = (edges[:-1] + edges[1:]) / 2
    for j, cls in enumerate(classes):
        vals = df.loc[df["disease_label"]==cls, feat].dropna()
        counts, _ = np.histogram(vals, bins=edges)
        fig.add_trace(go.Bar(
            x=centers, y=counts,
            name=str(cls),
            marker_color=palette[j % len(palette)],
            visible=(i == 0),
            legendgroup=str(cls)
        ))

# --- Dropdown: choose the feature ---
n_feat   = len(features)
n_class  = len(classes)
buttons  = []

for i, feat in enumerate(features):
    vis_feat = [False]*(n_feat*n_class)
    # show every class of feature i: its traces occupy [i*n_class, (i+1)*n_class)
    for j in range(n_class):
        vis_feat[i*n_class + j] = True
    low, high = percentile_ranges[feat]
    buttons.append(dict(
        method="update",
        label=feat.replace("_"," "),
        args=[{"visible": vis_feat},
              {"xaxis": {"range":[low,high], "title":feat.replace("_"," ")},
               "title": f"Histogramme de {feat} par maladie"}]
    ))

# A second dropdown (selecting a single class) is optional and not implemented here.

# Initial title / x-axis match what dropdown entry 0 applies.
first_feat = features[0]
first_low, first_high = percentile_ranges[first_feat]
fig.update_layout(
    updatemenus=[dict(active=0, buttons=buttons, x=0, y=1.2)],
    barmode="group",
    title=f"Histogramme de {first_feat} par maladie",
    xaxis=dict(range=[first_low, first_high], title=first_feat.replace("_"," ")),
    yaxis_title="Count",
    legend_title="Maladie",
    margin=dict(l=50, r=50, t=100, b=50),
    height=500, width=800
)

fig.show()
fig.write_html("objectif3_histos.html", include_plotlyjs='cdn')


Stop Histogrammes

Alternative pour l'objectif 3 : boxplots

In [None]:
# --- Build 'disease_label' from the `disease_*` indicator columns ---
# `disease_label` itself is skipped so that re-running this cell does not
# treat the derived column as one of the indicators.
disease_cols = [c for c in df.columns if c.startswith("disease_") and c != "disease_label"]
# A cell counts as "set" when it equals 1 (True == 1 as well). This is a
# vectorised replacement for the deprecated DataFrame.applymap.
mask_d = df[disease_cols].eq(1)
idx_d  = mask_d.values.argmax(axis=1)
disease_names  = np.array([c.replace("disease_", "") for c in disease_cols], dtype=object)
disease_labels = disease_names[idx_d]
# argmax returns 0 for a row where no indicator is set, which would silently
# assign the first disease; leave such rows unlabeled so dropna() removes them.
disease_labels[~mask_d.values.any(axis=1)] = np.nan
df["disease_label"] = disease_labels

quant_vars_disease = [
    'contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation',
    'mean_H', 'mean_S', 'mean_V', 'std_R', 'std_G', 'std_B',
    'hu_1', 'hu_2', 'hu_3', 'hu_4', 'hu_5', 'hu_6', 'hu_7'
]

# The variables are split into consecutive groups, one HTML file per group.
plots_per_file = 4
n_files = math.ceil(len(quant_vars_disease) / plots_per_file)
classes = pd.unique(df["disease_label"].dropna()).tolist()
palette = px.colors.qualitative.Plotly
color_map = {cls: palette[i % len(palette)] for i, cls in enumerate(classes)}

for k in range(n_files):
    vars_this = quant_vars_disease[k*plots_per_file : (k+1)*plots_per_file]
    n = len(vars_this)
    n_cols = 2
    n_rows = (n + n_cols - 1) // n_cols  # ceiling division

    # A single grid per file. An earlier make_subplots call with
    # vertical_spacing=0.25 was immediately overwritten by this one and has
    # been removed; the effective spacing (0.14) is unchanged.
    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=[v.replace("_", " ") for v in vars_this],
        vertical_spacing=0.14, horizontal_spacing=0.10
    )

    for i, feat in enumerate(vars_this):
        # Row-major position in the grid (plotly rows/cols are 1-based)
        row = i // n_cols + 1
        col = i % n_cols + 1
        for j, cls in enumerate(classes):
            y_vals = df.loc[df["disease_label"] == cls, feat]
            fig.add_trace(
                go.Box(
                    y=y_vals,
                    name=str(cls),
                    marker_color=color_map[cls],
                    boxmean='sd',
                    # Only the first subplot contributes legend entries; the
                    # shared legendgroup makes a legend click toggle the class
                    # in every subplot, not just the first one.
                    legendgroup=str(cls),
                    showlegend=(i==0)
                ),
                row=row, col=col
            )
        fig.update_yaxes(title_text=feat.replace("_", " "), row=row, col=col)
        fig.update_xaxes(title_text="Maladie", row=row, col=col)

    fig.update_layout(
        title=f"Boxplots maladies – Partie {k+1}/{n_files}",
        height=600 * n_rows,
        width=900 * n_cols,
        margin=dict(l=40, r=40, t=100, b=60),
        legend=dict(title="Maladie", orientation="v", x=1.03, y=1)
    )

    fig.show()
    html_name = f"Boxplot{k+1}.html"
    fig.write_html(html_name, include_plotlyjs='cdn')
    print(f"→ Sauvegardé : {html_name}")


In [None]:
# --- Build 'disease_label' from the `disease_*` indicator columns ---
# `disease_label` itself is skipped so that re-running this cell does not
# treat the derived column as one of the indicators.
disease_cols = [c for c in df.columns if c.startswith("disease_") and c != "disease_label"]
# A cell counts as "set" when it equals 1 (True == 1 as well). This is a
# vectorised replacement for the deprecated DataFrame.applymap.
mask_d = df[disease_cols].eq(1)
idx_d  = mask_d.values.argmax(axis=1)
disease_names  = np.array([c.replace("disease_", "") for c in disease_cols], dtype=object)
disease_labels = disease_names[idx_d]
# argmax returns 0 for a row where no indicator is set, which would silently
# assign the first disease; leave such rows unlabeled so dropna() removes them.
disease_labels[~mask_d.values.any(axis=1)] = np.nan
df["disease_label"] = disease_labels

quant_vars_disease = ['contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation', 'mean_H', 'mean_S', 'mean_V', 'std_R', 'std_G', 'std_B', 'hu_1', 'hu_2', 'hu_3', 'hu_4', 'hu_5', 'hu_6', 'hu_7']

# Diseases in order of first appearance, each with a fixed colour
classes = pd.unique(df["disease_label"].dropna()).tolist()
palette = px.colors.qualitative.Plotly
color_map = {cls: palette[i % len(palette)] for i, cls in enumerate(classes)}

n = len(quant_vars_disease)
n_cols = 2  # two columns keep the figure from becoming one very long column
n_rows = (n + n_cols - 1) // n_cols  # ceiling division

fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=[v.replace("_", " ") for v in quant_vars_disease],
    vertical_spacing=0.1, horizontal_spacing=0.07
)

for i, feat in enumerate(quant_vars_disease):
    # Row-major position in the grid (plotly rows/cols are 1-based)
    row = i // n_cols + 1
    col = i % n_cols + 1
    for j, cls in enumerate(classes):
        y_vals = df.loc[df["disease_label"] == cls, feat]
        fig.add_trace(
            go.Box(
                y=y_vals,
                name=str(cls),
                marker_color=color_map[cls],
                boxmean='sd',
                # Only the first subplot contributes legend entries; the shared
                # legendgroup makes a legend click toggle the class everywhere.
                legendgroup=str(cls),
                showlegend=(i==0),
            ),
            row=row, col=col
        )
    # Axis titles for this subplot
    fig.update_yaxes(title_text=feat.replace("_", " "), row=row, col=col)
    fig.update_xaxes(title_text="Maladie", row=row, col=col)

fig.update_layout(
    title="Distribution des variables quantitatives par maladie",
    height=350 * n_rows,
    width=850 * n_cols,
    margin=dict(l=30, r=30, t=80, b=40),
    legend=dict(title="Maladie", orientation="v", x=1.03, y=1)
)

fig.show()
fig.write_html("Boxplots_toutes_var_disease.html", include_plotlyjs='cdn')
print("→ Tous les boxplots sauvegardés dans : Boxplots_toutes_var_disease.html")

In [None]:
# Overlay the histograms of the three RGB channel-mean columns on one axes,
# drawing each column in the matching colour.
rgb_means = ['mean_R', 'mean_G', 'mean_B']
colors = ['r', 'g', 'b']

plt.figure(figsize=(8, 6))
ax_rgb = plt.gca()
for feat, col in zip(rgb_means, colors):
    # Semi-transparent bars so overlapping distributions stay readable
    ax_rgb.hist(df[feat].dropna(), bins=30, alpha=0.5, label=feat, color=col)
ax_rgb.set_title("Histograms of mean RGB values")
ax_rgb.set_xlabel("Valeur")
ax_rgb.set_ylabel("Count")
ax_rgb.legend()
plt.tight_layout()
plt.show()

In [None]:
# Boxplot of the 'netteté' column for each value of 'Est_Saine'.
plt.figure(figsize=(6, 4))
ax_net = sns.boxplot(data=df, x='Est_Saine', y='netteté')
# Positions 0 and 1 are relabelled by hand.
# NOTE(review): assumes position 0 is the diseased group and 1 the healthy one - confirm.
ax_net.set_xticks([0, 1])
ax_net.set_xticklabels(['Malade', 'Saine'])
ax_net.set_title("Boxplot de la netteté selon l'état de la feuille")
ax_net.set_ylabel("Netteté (laplacian var)")
ax_net.set_xlabel("")
plt.tight_layout()
plt.show()

In [None]:
# Histogram grid: one subplot per numeric feature column of the dataframe.

# 1. Columns left out: four fixed names plus every plant_*/disease_* column
exclude = ['ID_Image', 'Est_Saine', 'is_black', 'dimensions']
exclude += [col for col in df.columns if col.startswith(('plant_', 'disease_'))]

# 2. Numeric columns that remain
features = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in exclude
]

# 3. Grid size. At least one row is kept so that an empty feature list gives
#    a blank figure instead of an error from plt.subplots(0, ...).
n_features = len(features)
n_cols = 4
n_rows = max(1, int(np.ceil(n_features / n_cols)))

# 4. squeeze=False always returns a 2-D array of axes, whatever the grid shape
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3*n_rows), squeeze=False)
axes = axes.flatten()

# 5. One histogram per feature, filling the grid row by row
for ax, feature in zip(axes, features):
    ax.hist(df[feature].dropna(), bins=30)
    ax.set_title(feature)
    ax.set_xlabel(feature)
    ax.set_ylabel("Count")

# 6. Hide the unused trailing axes. Slicing by n_features avoids depending on
#    a loop variable that is undefined (or stale from another cell) when the
#    loop body never runs.
for ax in axes[n_features:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the numeric feature columns.

# Columns left out: four fixed names plus every plant_*/disease_* column
onehot_cols = [col for col in df.columns if col.startswith(('plant_', 'disease_'))]
exclude = ['ID_Image', 'Est_Saine', 'is_black', 'dimensions'] + onehot_cols

# Numeric columns that remain, and their pairwise correlation matrix
numeric_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in exclude
]
corr_matrix = df[numeric_cols].corr()

# Heatmap options, gathered in one place:
#   cmap  - diverging palette
#   annot - set to True to print the coefficient in each cell (formatted with fmt)
# To hide the upper triangle, additionally pass
#   mask=np.triu(np.ones_like(corr_matrix, dtype=bool))
heatmap_kwargs = dict(
    cmap='vlag',
    annot=False,
    fmt=".2f",
    cbar_kws={'label': 'Coefficient de corrélation'},
)

plt.figure(figsize=(14, 12))
ax_corr = sns.heatmap(corr_matrix, **heatmap_kwargs)
ax_corr.set_title('Heatmap de corrélation des features numériques')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()