In [None]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

Définitions des fonctions statistiques et graphiques des caractéristiques

In [None]:
# Load the features CSV into a DataFrame and preview its first rows.
features_csv = "/workspaces/datasciencetest_reco_plante/notebooks/Plant_V_Seg_all_features.csv"
df = pd.read_csv(features_csv)
df.head()

In [None]:
# Display the column index of the loaded DataFrame.
df.columns

In [None]:
# 1. Count fully duplicated rows (every column equal) across the whole DataFrame
duplicate_mask = df.duplicated()
n_dup = duplicate_mask.sum()
print(f"Il y a {n_dup} doublons exacts dans df.")

In [None]:
# ─── Build a text label for Est_Saine (True → 'sain', False → 'malade') ────────
health_names = {True: 'sain', False: 'malade'}
df['Est_Saine_label'] = df['Est_Saine'].map(health_names)

# ─── Grouped-histogram function for Objective 2, with a limited X axis ────────
def plot_and_export_grouped(
    df,
    features: list,
    target_col: str,
    out_html: str,
    n_cols: int = 3,
    nbins: int = 25
):
    """
    Plot one grouped histogram per feature, split by the classes of target_col.

    Subplots are laid out on a grid with n_cols columns. The X axis of each
    subplot is restricted to the 2.5th–97.5th percentile range of its feature.
    The figure is displayed, written to out_html, and returned.

    Parameters
    ----------
    df : DataFrame containing the feature columns and target_col.
    features : names of the columns to plot; their values are cast to float.
    target_col : column whose distinct non-null values define the groups.
    out_html : path of the HTML file to write.
    n_cols : number of subplot columns in the grid.
    nbins : number of histogram bins (bin edges are shared by all classes).

    Returns
    -------
    The Plotly figure that was displayed and exported.
    """
    # 1. Classes (raw values, used for filtering), legend labels and colours
    classes   = pd.unique(df[target_col].dropna()).tolist()
    labels    = [str(c) for c in classes]
    palette   = px.colors.qualitative.Plotly
    color_map = {lbl: palette[i % len(palette)] for i, lbl in enumerate(labels)}

    # 2. 2.5th–97.5th percentile range of each feature, used to limit the X axis
    percentile_ranges = {
        feat: np.percentile(df[feat].dropna().astype(float), [2.5, 97.5])
        for feat in features
    }

    # 3. Figure setup: one subplot per feature
    n_plots = len(features)
    n_rows  = math.ceil(n_plots / n_cols)
    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=[f.replace("_"," ") for f in features],
        horizontal_spacing=0.05, vertical_spacing=0.1
    )

    # 4. Traces + X-axis limitation
    for idx, feat in enumerate(features):
        arr = df[feat].dropna().astype(float).values
        # Bin edges come from the whole column so all classes share the same bins
        edges = np.histogram_bin_edges(arr, bins=nbins)
        centers = (edges[:-1] + edges[1:]) / 2
        low, high = percentile_ranges[feat]
        # Convert the flat index into 1-based (row, col) subplot coordinates
        row, col = divmod(idx, n_cols)
        row += 1; col += 1

        for cls, lbl in zip(classes, labels):
            # Filter on the raw class value, not its string label: comparing a
            # non-string column (e.g. booleans) with str(value) matches no rows.
            vals, _ = np.histogram(
                df.loc[df[target_col] == cls, feat]
                  .dropna().astype(float).values,
                bins=edges
            )
            fig.add_trace(
                go.Bar(
                    x=centers, y=vals,
                    name=lbl, legendgroup=lbl,
                    marker_color=color_map[lbl],
                    opacity=0.8,
                    # One legend entry per class: only the first subplot shows it
                    showlegend=(idx == 0)
                ),
                row=row, col=col
            )

        fig.update_xaxes(title_text="", row=row, col=col, range=[low, high])
        fig.update_yaxes(title_text="Count", row=row, col=col)

    # 5. Layout, display and export
    fig.update_layout(
        title_text="Objectif 2 – Distribution sain vs malade (axes limités)",
        height=300 * n_rows, width=350 * n_cols,
        barmode='group',
        legend=dict(title=target_col, x=1.02, y=1),
        margin=dict(l=50, r=150, t=80, b=50)
    )
    fig.show()
    fig.write_html(out_html, include_plotlyjs='cdn')
    print(f"→ Sauvegardé : {out_html}")
    return fig

# ─── Run Objective 2 ──────────────────────────────────────────────────────────
# Feature columns plotted against the Est_Saine_label groups
quant_vars_Est_Saine = [
    'mean_R', 'mean_G', 'mean_B',
    'std_R', 'std_G', 'std_B',
    'contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation',
    'netteté', 'contour_density',
]
fig2 = plot_and_export_grouped(
    df,
    quant_vars_Est_Saine,
    'Est_Saine_label',
    'objectif2_sain_malade_interactif.html',
    n_cols=3,
    nbins=25,
)


In [None]:

# ─── 0. Rebuild categorical labels from the one-hot columns ───────────────────

def _labels_from_one_hot(mask, prefix):
    """
    Turn a boolean one-hot mask into one label per row.

    The label is the name of the first active column with `prefix` removed.
    Rows where no column is active get NaN instead of silently receiving the
    first column's label (argmax returns 0 for an all-False row).
    """
    names  = np.array([c.replace(prefix, "") for c in mask.columns], dtype=object)
    flags  = mask.to_numpy()
    labels = names[flags.argmax(axis=1)]
    labels[~flags.any(axis=1)] = np.nan
    return labels

# Objective 1 → plant_label
# The label column itself is excluded so that re-running this cell does not
# treat "plant_label" as one of the one-hot columns.
plant_cols  = [c for c in df.columns if c.startswith("plant_") and c != "plant_label"]
# Vectorised equivalent of the element-wise test `x is True or x == 1`
mask_plant  = df[plant_cols].eq(1)
idx_plant   = mask_plant.values.argmax(axis=1)
df["plant_label"] = _labels_from_one_hot(mask_plant, "plant_")

# Objective 3 → disease_label
disease_cols = [c for c in df.columns if c.startswith("disease_") and c != "disease_label"]
mask_dis     = df[disease_cols].eq(1)
idx_dis      = mask_dis.values.argmax(axis=1)
df["disease_label"] = _labels_from_one_hot(mask_dis, "disease_")


# ─── 1. Dropdown-histogram function for Objectives 1 & 3 ──────────────────────

def build_and_export_dropdown(
    df,
    features: list,
    target_col: str,
    out_html: str,
    nbins: int = 25,
    title_prefix: str = ""
):
    """
    Build one grouped histogram per feature, switchable through a dropdown menu.

    For every feature, one bar trace per class of target_col is added to a
    single figure; the dropdown toggles which feature's traces are visible and
    sets the title, X-axis title and X-axis range (2.5th–97.5th percentile of
    the feature). The initial view uses the same settings as the first dropdown
    entry. The figure is displayed, written to out_html, and returned.

    Parameters
    ----------
    df : DataFrame containing the feature columns and target_col.
    features : names of the columns to plot; their values are cast to float.
    target_col : column whose distinct non-null values define the groups.
    out_html : path of the HTML file to write.
    nbins : number of histogram bins (bin edges are shared by all classes).
    title_prefix : text placed before the feature name in the figure title.

    Returns
    -------
    The Plotly figure that was displayed and exported.
    """
    # Classes in order of first appearance, with their legend labels
    classes = pd.unique(df[target_col].dropna()).tolist()
    labels  = [str(c) for c in classes]
    palette = px.colors.qualitative.Plotly
    n_feat  = len(features)
    n_cls   = len(classes)

    # Layout settings shown while a given feature is selected. They are built
    # once so the dropdown buttons and the initial view cannot disagree.
    feature_layouts = {}
    for feat in features:
        pretty = feat.replace("_", " ")
        low, high = np.percentile(df[feat].dropna().astype(float), [2.5, 97.5])
        feature_layouts[feat] = {
            "title":       f"{title_prefix}{pretty}",
            "xaxis.range": [float(low), float(high)],
            "xaxis.title": pretty
        }

    fig = go.Figure()

    # Add every trace; only those of the first feature start visible
    for i, feat in enumerate(features):
        data_all = df[feat].dropna().astype(float).values
        # Bin edges come from the whole column so all classes share the same bins
        edges    = np.histogram_bin_edges(data_all, bins=nbins)
        centers  = (edges[:-1] + edges[1:]) / 2
        for j, cls in enumerate(classes):
            vals, _ = np.histogram(
                df.loc[df[target_col] == cls, feat]
                  .dropna().astype(float).values,
                bins=edges
            )
            fig.add_trace(go.Bar(
                x=centers, y=vals,
                name=labels[j],
                marker_color=palette[j % len(palette)],
                visible=(i == 0),
                legendgroup=labels[j]
            ))

    # Build the dropdown buttons: traces are stored feature by feature, so
    # feature i owns the slice [i*n_cls, (i+1)*n_cls) of the visibility list
    buttons = []
    for i, feat in enumerate(features):
        vis = [False] * (n_feat * n_cls)
        for j in range(n_cls):
            vis[i*n_cls + j] = True
        buttons.append(dict(
            label=feat.replace("_", " "),
            method="update",
            args=[{"visible": vis}, dict(feature_layouts[feat])]
        ))

    # General layout; the initial title and X axis match the first button
    initial = feature_layouts[features[0]]
    fig.update_layout(
        updatemenus=[dict(active=0, buttons=buttons, x=0, y=1.15, xanchor="left")],
        barmode="group",
        title=initial["title"],
        xaxis_range=initial["xaxis.range"],
        xaxis_title=initial["xaxis.title"],
        yaxis_title="Count",
        legend_title=target_col,
        margin=dict(l=50, r=50, t=100, b=50),
        height=500, width=800
    )

    # Display and export
    fig.show()
    fig.write_html(out_html, include_plotlyjs="cdn")
    print(f"→ Sauvegardé : {out_html}")
    return fig


# ─── 2. Grouped-histogram function for Objective 2 ────────────────────────────

def plot_and_export_grouped(
    df,
    features: list,
    target_col: str,
    out_html: str,
    n_cols: int = 3,
    nbins: int = 25
):
    """
    Plot one grouped (side-by-side) histogram per feature, split by the classes
    of target_col, then display the figure and export it to HTML.

    NOTE(review): this redefines the function of the same name from an earlier
    cell; unlike that version it does not restrict the X axes to a percentile
    range.

    Parameters
    ----------
    df : DataFrame containing the feature columns and target_col.
    features : names of the columns to plot; their values are cast to float.
    target_col : column whose distinct non-null values define the groups.
    out_html : path of the HTML file to write.
    n_cols : number of subplot columns in the grid.
    nbins : number of histogram bins (bin edges are shared by all classes).

    Returns
    -------
    The Plotly figure that was displayed and exported.
    """
    # Raw class values (used for filtering), legend labels and colours
    classes = pd.unique(df[target_col].dropna()).tolist()
    labels  = [str(c) for c in classes]
    palette = px.colors.qualitative.Plotly
    color_map = {lbl: palette[i % len(palette)] for i, lbl in enumerate(labels)}

    n_plots = len(features)
    n_rows  = math.ceil(n_plots / n_cols)
    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=[f.replace("_"," ") for f in features],
        horizontal_spacing=0.04, vertical_spacing=0.08
    )

    for idx, feat in enumerate(features):
        arr     = df[feat].dropna().astype(float).values
        # Bin edges come from the whole column so all classes share the same bins
        edges   = np.histogram_bin_edges(arr, bins=nbins)
        centers = (edges[:-1] + edges[1:]) / 2
        # Convert the flat index into 1-based (row, col) subplot coordinates
        row, col = divmod(idx, n_cols)
        row += 1; col += 1

        for cls, lbl in zip(classes, labels):
            # Filter on the raw class value, not its string label: comparing a
            # non-string column (e.g. booleans) with str(value) matches no rows
            # and would leave every bar empty.
            vals, _ = np.histogram(
                df.loc[df[target_col] == cls, feat]
                  .dropna().astype(float).values,
                bins=edges
            )
            fig.add_trace(go.Bar(
                x=centers, y=vals,
                name=lbl,
                legendgroup=lbl,
                marker_color=color_map[lbl],
                opacity=0.8,
                # One legend entry per class: only the first subplot shows it
                showlegend=(idx == 0)
            ), row=row, col=col)

        fig.update_xaxes(title_text="", row=row, col=col)
        fig.update_yaxes(title_text="Count", row=row, col=col)

    fig.update_layout(
        title_text=f"Distribution des features par '{target_col}'",
        height=300 * n_rows,
        width=350 * n_cols,
        barmode="group",
        legend=dict(title=target_col, x=1.02, y=1),
        margin=dict(l=50, r=150, t=80, b=50)
    )

    fig.show()
    fig.write_html(out_html, include_plotlyjs="cdn")
    print(f"→ Sauvegardé : {out_html}")
    return fig


# ─── 3. Run the three objectives ──────────────────────────────────────────────

# Objective 1: dropdown histograms grouped by species (plant_label)
features_plant = (
    ['aire', 'périmètre', 'circularité', 'excentricité', 'aspect_ratio', 'contour_density']
    + [f'hu_{k}' for k in range(1, 8)]
    + ['mean_H', 'mean_S', 'mean_V']
)
fig1 = build_and_export_dropdown(
    df=df,
    features=features_plant,
    target_col="plant_label",
    out_html="objectif1_histos_interactif.html",
    nbins=25,
    title_prefix="Histogramme espèce – ",
)

# Objective 2: grouped histograms, healthy vs diseased
quant_vars_Est_Saine = [
    'mean_R','mean_G','mean_B','std_R','std_G','std_B',
    'contrast','energy','homogeneity','dissimilarite','Correlation',
    'netteté','contour_density'
]
# Group on the text label ('sain' / 'malade') rather than the raw Est_Saine
# column: the plotting function compares the column with string labels, so a
# non-string column would match no rows and produce empty histograms.
# NOTE(review): this writes the same HTML file as the earlier Objective 2 cell
# and therefore overwrites it — confirm that is intended.
fig2 = plot_and_export_grouped(
    df, quant_vars_Est_Saine,
    target_col="Est_Saine_label",
    out_html="objectif2_sain_malade_interactif.html",
    n_cols=3, nbins=25
)

# Objective 3: dropdown histograms grouped by disease (disease_label)
features_disease = (
    ['contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation']
    + ['mean_H', 'mean_S', 'mean_V']
    + ['std_R', 'std_G', 'std_B']
    + [f'hu_{k}' for k in range(1, 8)]
)
fig3 = build_and_export_dropdown(
    df=df,
    features=features_disease,
    target_col="disease_label",
    out_html="objectif3_histos_interactif.html",
    nbins=25,
    title_prefix="Histogramme maladie – ",
)
