In [None]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

Définitions des fonctions statistiques et graphiques des caractéristiques

In [None]:
# Load the feature table from CSV into a DataFrame.
# NOTE(review): this is an absolute path, so the cell only runs where this
# exact directory layout exists — consider a configurable data directory.
features_csv_path = "/workspaces/datasciencetest_reco_plante/notebooks/Plant_V_Seg_all_features.csv"
df = pd.read_csv(features_csv_path)

In [None]:
# Display the column names of the loaded table.
df.columns

Tableau identifiant, pour chaque caractéristique, les maladies dont l'intervalle interquartile (IQR) chevauche celui de la classe saine

In [None]:
# Génération d'un tableau identifiant les chevauchements avec IQR

def iqr_overlap_table(df, features, label_col="disease_label", healthy_label="healthy"):
    """
    List, for each feature, the diseases whose interquartile range (IQR)
    overlaps the IQR of the healthy class.

    The IQR of a group is treated as the closed interval [Q1, Q3]. Two
    intervals overlap when they share at least one point, so IQRs that merely
    touch, or that are identical and zero-width (Q1 == Q3 in both groups),
    count as overlapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the label column and every column named in `features`.
    features : iterable of str
        Names of the numeric columns to analyse.
    label_col : str, default "disease_label"
        Column holding the class label of each row.
    healthy_label : str, default "healthy"
        Value of `label_col` identifying the reference (healthy) class.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns "feature", "overlapping_diseases"
        (list of labels) and "n_overlap" (length of that list), sorted by
        "n_overlap" in descending order. If `features` is empty, an empty
        frame with these three columns is returned.

    Notes
    -----
    A group with no rows (including a `healthy_label` that does not occur in
    `label_col`) has NaN quartiles; every comparison against NaN is False, so
    such a group is never reported as overlapping.
    """
    columns = ["feature", "overlapping_diseases", "n_overlap"]
    labels = df[label_col]

    # Rows with a missing label cannot form a group, so they are not
    # considered as a disease class.
    diseases = [d for d in labels.dropna().unique() if d != healthy_label]

    # The row masks depend only on the labels, not on the feature:
    # build them once instead of once per (feature, disease) pair.
    healthy_mask = labels == healthy_label
    disease_masks = [(disease, labels == disease) for disease in diseases]

    results = []
    for feat in features:
        q1_h, q3_h = df.loc[healthy_mask, feat].quantile([0.25, 0.75])
        overlapping = []
        for disease, mask in disease_masks:
            q1_m, q3_m = df.loc[mask, feat].quantile([0.25, 0.75])
            # Closed-interval overlap test: [q1_h, q3_h] and [q1_m, q3_m]
            # intersect iff each one starts no later than the other ends.
            if q1_h <= q3_m and q1_m <= q3_h:
                overlapping.append(disease)
        results.append({
            "feature": feat,
            "overlapping_diseases": overlapping,
            "n_overlap": len(overlapping)
        })

    # Passing `columns` keeps "n_overlap" present even when `results` is
    # empty, so the sort below cannot fail on a column-less frame.
    return pd.DataFrame(results, columns=columns).sort_values("n_overlap", ascending=False)

# --- Préparation de 'disease_label' 
# Build a single 'disease_label' column from the 'disease_*' indicator columns.
# 'disease_label' itself is excluded so that re-running this cell does not
# treat the derived column as one of the indicators.
disease_prefix = "disease_"
disease_cols = [
    c for c in df.columns
    if c.startswith(disease_prefix) and c != "disease_label"
]
if not disease_cols:
    raise ValueError("No 'disease_*' indicator columns found in df.")

# A cell counts as "set" when it equals 1 (True == 1 in Python, so boolean
# True is covered). Vectorised replacement for the deprecated
# DataFrame.applymap.
mask_d = df[disease_cols].eq(1)

# argmax returns the position of the first set flag in each row; if several
# flags are set, the first column wins.
idx_d = mask_d.values.argmax(axis=1)

# argmax also returns 0 for a row with no flag set at all, which would
# silently assign the first disease to it. Such rows get a missing label.
has_flag = mask_d.values.any(axis=1)
disease_names = np.array([c[len(disease_prefix):] for c in disease_cols], dtype=object)
df["disease_label"] = np.where(has_flag, disease_names[idx_d], None)

n_unlabeled = int((~has_flag).sum())
if n_unlabeled:
    print(f"Warning: {n_unlabeled} row(s) have no disease_* flag set; disease_label left missing.")

features = [
    'contrast', 'energy', 'homogeneity', 'dissimilarite', 'Correlation',
    'mean_H', 'mean_S', 'mean_V', 'std_R', 'std_G', 'std_B',
    'hu_1', 'hu_2', 'hu_3', 'hu_4', 'hu_5', 'hu_6', 'hu_7'
]

overlap_df = iqr_overlap_table(df, features, label_col="disease_label", healthy_label="healthy")
pd.set_option('display.max_colwidth', 120)  # Wide enough to show the full disease lists
print(overlap_df[["feature", "overlapping_diseases", "n_overlap"]])
# Export the overlap table to an Excel file in the current working directory.
overlap_df.to_excel("Results_chevauchements_IQR.xlsx", index=False)
