In [1]:
import hashlib
import os
import shutil
from collections import Counter
from collections import defaultdict
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from PIL import Image
from scipy.stats import skew
from scipy.stats import spearmanr
from skimage.feature.texture import graycomatrix, graycoprops
from skimage.measure import label, regionprops
from skimage.measure import moments_hu
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

Définitions des fonctions statistiques et graphiques des caractéristiques

In [2]:
# Load the pre-computed per-image feature table and preview its first rows.
FEATURES_CSV = Path("/workspaces/datasciencetest_reco_plante/notebooks/plant_V_Seg_all_features.csv")
df = pd.read_csv(FEATURES_CSV)
df.head()

Unnamed: 0,ID_Image,nom_plante,nom_maladie,Est_Saine,Image_Path,dimensions,aire,p√©rim√®tre,circularit√©,excentricit√©,...,mean_S,mean_V,sharpness_laplacian_var,hu_1,hu_2,hu_3,hu_4,hu_5,hu_6,hu_7
0,1,Tomato,Tomato_mosaic_virus,False,/workspaces/datasciencetest_reco_plante/datase...,126x184,12107.0,1025.151363,0.144767,0.702026,...,46.742599,19.742691,460.229553,2.674085,6.151797,9.510154,9.419995,-10.0,9.998989,-10.0
1,2,Tomato,Tomato_mosaic_virus,False,/workspaces/datasciencetest_reco_plante/datase...,162x175,15930.5,982.364566,0.207441,0.749598,...,51.395752,35.314026,696.85199,2.752474,5.969129,9.430474,9.807913,10.0,9.999964,10.0
2,3,Tomato,Tomato_mosaic_virus,False,/workspaces/datasciencetest_reco_plante/datase...,142x184,11486.5,854.482317,0.197693,0.909607,...,43.810669,24.449112,704.671448,2.568956,5.279443,9.058425,9.229536,10.0,9.995215,10.0
3,4,Tomato,Tomato_mosaic_virus,False,/workspaces/datasciencetest_reco_plante/datase...,176x198,18785.0,1044.749342,0.21627,0.659128,...,67.08934,48.410873,1098.690247,2.853978,6.610883,9.403455,9.948396,-10.0,-9.999976,-10.0
4,5,Tomato,Tomato_mosaic_virus,False,/workspaces/datasciencetest_reco_plante/datase...,148x188,15335.0,921.210238,0.227079,0.272318,...,54.958466,31.481064,481.223633,2.755678,6.210397,9.270586,9.841353,10.0,9.999856,10.0


In [3]:
# List the column names of the loaded feature table.
df.columns

Index(['ID_Image', 'nom_plante', 'nom_maladie', 'Est_Saine', 'Image_Path',
       'dimensions', 'aire', 'p√©rim√®tre', 'circularit√©', 'excentricit√©',
       'aspect_ratio', 'mean_R', 'mean_G', 'mean_B', 'std_R', 'std_G', 'std_B',
       'contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation',
       'contour_density', 'mean_H', 'mean_S', 'mean_V',
       'sharpness_laplacian_var', 'hu_1', 'hu_2', 'hu_3', 'hu_4', 'hu_5',
       'hu_6', 'hu_7'],
      dtype='object')

In [4]:
# Statistical analysis - interactive Plotly violin plot (distribution per class)
def plot_violin_interactive(df, feature, classe='nom_plante'):
    """Display an interactive violin plot of one feature, split by class.

    Args:
        df: Table holding both the feature and the class column.
        feature: Name of the column drawn on the y axis.
        classe: Name of the column used to group the violins on the x axis.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    violin_options = dict(
        y=feature,
        x=classe,
        box=True,            # draw a box plot inside each violin
        points="all",        # overlay every individual observation
        hover_data=df.columns,
        title=f"Distribution de {feature} par {classe}",
    )
    figure = px.violin(df, **violin_options)
    # Tilt the class labels so that long names stay readable.
    figure.update_layout(xaxis_tickangle=-45)
    figure.show()

In [5]:
# Statistical analysis - interactive Plotly box plot
def plot_box_interactive(df, feature, classe='nom_plante'):
    """Display an interactive box plot of one feature, split by class.

    Args:
        df: Table holding both the feature and the class column.
        feature: Name of the column drawn on the y axis.
        classe: Name of the column used to group the boxes on the x axis.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    chart_title = f"Boxplot de {feature} par {classe}"
    figure = px.box(
        df,
        x=classe,
        y=feature,
        points="all",  # overlay every individual observation
        title=chart_title,
        hover_data=df.columns,
    )
    # Tilt the class labels so that long names stay readable.
    figure.update_layout(xaxis_tickangle=-45)
    figure.show()

In [6]:
# Statistical analysis - histogram (per class or global)
def plot_hist_interactive(df, feature, classe='Est_Saine'):
    """Display an interactive histogram of one feature, coloured by class.

    Args:
        df: Table holding both the feature and the class column.
        feature: Name of the column whose distribution is drawn on the x axis.
        classe: Name of the column used to colour the bars.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    histogram_options = {
        "x": feature,
        "color": classe,
        "marginal": "box",  # marginal plot above the histogram; "rug" is an alternative
        "hover_data": df.columns,
        "title": f"Histogramme de {feature} (par {classe})",
    }
    figure = px.histogram(df, **histogram_options)
    figure.show()

In [7]:
# Statistical analysis - interactive correlation matrix (Plotly heatmap)
def plot_corr_heatmap_interactive(root_dir_img, colonnes_features, method='spearman'):
    """Display the pairwise correlation matrix of the selected features.

    Args:
        root_dir_img: The feature table. Despite its name it is indexed with
            ``colonnes_features`` and ``.corr()`` is called on the result, so
            it is used as a DataFrame, not as a directory path.
        colonnes_features: Names of the columns to correlate with each other.
        method: Correlation method forwarded to ``DataFrame.corr``.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    selected_features = root_dir_img[colonnes_features]
    corr_matrix = selected_features.corr(method=method)

    heatmap = px.imshow(
        corr_matrix,
        text_auto=True,                      # write each coefficient in its cell
        aspect='auto',
        color_continuous_scale='RdBu_r',
        title=f"Matrice de corr√©lation ({method})",
    )
    heatmap.update_layout(width=1000, height=800)
    heatmap.show()

In [8]:
def correlation_with_target(df, colonnes_features, target):
    """Display the Spearman correlation of each feature with an encoded target.

    The target column is turned into integer codes with ``LabelEncoder`` and
    every feature is rank-correlated with those codes. The coefficients are
    shown as a one-column table sorted from highest to lowest.

    NOTE(review): the codes come from ``LabelEncoder``; for a target with more
    than two classes the coefficient depends on how the classes are ordered by
    the encoder - confirm this is meaningful for the target you pass.

    Args:
        df: Table holding the features and the target column.
        colonnes_features: Names of the feature columns to evaluate.
        target: Name of the column to encode and correlate against.

    Returns:
        None. The sorted table is rendered with ``display``.
    """
    encoded_target = LabelEncoder().fit_transform(df[target])

    # First element of the spearmanr result is the coefficient; the p-value is ignored.
    rho_by_feature = {
        name: spearmanr(df[name], encoded_target)[0]
        for name in colonnes_features
    }

    ranking = pd.DataFrame.from_dict(rho_by_feature, orient='index', columns=['Corr√©lation'])
    display(ranking.sort_values(by='Corr√©lation', ascending=False))

In [None]:
# Skewness of every numeric feature, most asymmetric first.
#
# The feature list is derived here from the column dtypes, so this cell does not
# depend on `colonnes_features`, which is only defined further down the notebook.
# Identifier columns are dropped; text columns such as 'dimensions' are excluded
# by the dtype filter because skew() needs numeric values.
skew_features = (
    df.select_dtypes(include="number")
    .columns
    .drop(["ID_Image", "Unnamed: 0"], errors="ignore")
)

# Missing values are removed column by column before computing the skewness.
skewness_values = df[skew_features].apply(lambda x: skew(x.dropna()), axis=0)

# Sorted two-column table: feature name and its skewness.
skew_df = pd.DataFrame({
    "Feature": skewness_values.index,
    "Skewness": skewness_values.values
}).sort_values(by="Skewness", ascending=False)

# Show the 10 most asymmetric features (the count now matches the header).
print("üîç Top 10 features les plus asym√©triques :")
skew_df.head(10)

In [None]:
def _count_iqr_outliers(values):
    """Count the entries of a numeric Series lying outside the 1.5 * IQR fences."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    return int(((values < lower) | (values > upper)).sum())


def stats_par_feature(df, colonnes_features, excel_path="reco_plant_features_statistiques.xlsx"):
    """Compute descriptive statistics and IQR outlier counts for the features.

    Only the numeric columns among ``colonnes_features`` are analysed: text
    columns cannot go through the quantile-based outlier count, and
    ``describe()`` already leaves them out of a mixed selection.

    Args:
        df: Table holding the feature columns.
        colonnes_features: Names of the candidate feature columns.
        excel_path: Destination of the Excel export (one sheet per table).
            Pass ``None`` to skip the export. Defaults to the file name that
            was previously hard-coded.

    Returns:
        A ``(desc_df, outliers_df)`` tuple. ``desc_df`` is the transposed
        ``describe()`` table with an extra ``range`` column (max - min);
        ``outliers_df`` has one ``Nb_outliers`` column, sorted in descending
        order.

    Raises:
        ValueError: If none of the requested columns is numeric.
    """
    numeric_cols = list(df[list(colonnes_features)].select_dtypes(include="number").columns)
    if not numeric_cols:
        raise ValueError("stats_par_feature: no numeric column found in colonnes_features")

    desc_df = df[numeric_cols].describe().T
    desc_df["range"] = desc_df["max"] - desc_df["min"]

    outlier_summary = {col: _count_iqr_outliers(df[col]) for col in numeric_cols}
    outliers_df = pd.DataFrame.from_dict(outlier_summary, orient='index', columns=['Nb_outliers'])
    outliers_df = outliers_df.sort_values(by='Nb_outliers', ascending=False)

    # Rich display of both tables in the notebook
    display(desc_df)
    display(outliers_df)

    # Excel export (skipped when excel_path is None)
    if excel_path is not None:
        with pd.ExcelWriter(excel_path) as writer:
            desc_df.to_excel(writer, sheet_name="Descriptives")
            outliers_df.to_excel(writer, sheet_name="Outliers")
    return desc_df, outliers_df


In [None]:
# Run the descriptive-statistics analysis on the image features

# Name fragments identifying the feature columns of the table.
feature_name_parts = ['dimensions', 'aire', 'p√©rim√®tre', 'circularit√©', 'excentricit√©',
       'aspect_ratio', 'mean_R', 'mean_G', 'mean_B', 'std_R', 'std_G', 'std_B',
       'contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation',
       'contour_density', 'mean_H', 'mean_S', 'mean_V',
       'sharpness_laplacian_var', 'hu_1', 'hu_2', 'hu_3', 'hu_4', 'hu_5',
       'hu_6', 'hu_7']

# Automatic selection of the *numeric* feature columns. The dtype filter matters:
# 'dimensions' holds strings such as "126x184", which are not valid input for the
# quantile-based outlier count performed by stats_par_feature.
numeric_columns = set(df.select_dtypes(include="number").columns)
colonnes_features = [
    col for col in df.columns
    if col in numeric_columns and any(part in col for part in feature_name_parts)
]

# Descriptive analysis
desc_df, outliers_df = stats_par_feature(df, colonnes_features)

# Outliers - features sorted by decreasing number of outliers, then bar chart
outliers_sorted = outliers_df.sort_values(by="Nb_outliers", ascending=False)
plt.figure(figsize=(12, 6))
plt.bar(outliers_sorted.index, outliers_sorted["Nb_outliers"], color='skyblue')
plt.xticks(rotation=45, ha='right')
plt.title("Nombre d'outliers par feature (m√©thode IQR)", fontsize=14)
plt.ylabel("Nombre d'outliers", fontsize=12)
plt.xlabel("Feature", fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Descriptive statistics
# Range (max - min): only computed here if the table does not carry it already.
if "range" not in desc_df.columns:
    desc_df["range"] = desc_df["max"] - desc_df["min"]

# Series needed for the two rankings
std_series = desc_df["std"]
range_series = desc_df["range"]

# The 10 features with the largest variation
top_std = std_series.sort_values(ascending=False).head(10)
top_range = range_series.sort_values(ascending=False).head(10)

# Draw both charts side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Standard deviation (reversed so that the largest bar ends up at the top)
axes[0].barh(top_std.index[::-1], top_std.values[::-1], color='tomato')
axes[0].set_title("Top 10 features √† plus grande variance (√©cart-type)")
axes[0].set_xlabel("√âcart-type")
axes[0].grid(axis='x', linestyle='--', alpha=0.6)

# Range (max - min)
axes[1].barh(top_range.index[::-1], top_range.values[::-1], color='steelblue')
axes[1].set_title("Top 10 features √† plus grande plage de variation")
axes[1].set_xlabel("Plage (max - min)")
axes[1].grid(axis='x', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

In [None]:
# Exploration charts

# Heat map of the plant / disease pairs.
# The cell works on `df`, the table loaded at the top of the notebook, with its
# actual column names ('nom_plante', 'nom_maladie'). It previously read a
# `reco_plant` table and 'Nom_Plante' / 'Nom_maladie' columns that this notebook
# never defines.
# NOTE(review): the earlier filter kept rows whose disease name differed from
# "Aucune"; here healthy samples are assumed to be the rows flagged by the
# boolean 'Est_Saine' column - confirm this matches the intended selection.
reco_plant_heat = df[df["Est_Saine"].eq(False)]
heatmap_data = (
    reco_plant_heat
    .groupby(["nom_plante", "nom_maladie"])
    .size()
    .reset_index(name="count")
)

fig4 = px.density_heatmap(
    heatmap_data,
    x="nom_maladie",
    y="nom_plante",
    z="count",
    color_continuous_scale="Viridis",
    title="Carte de chaleur Plante vs Maladie"
)
# Tilt the disease labels so that long names stay readable.
fig4.update_layout(xaxis_tickangle=-45)
fig4.show()