# Analyse Exploratoire et Détection d'Anomalies avec DBSCAN

Ce notebook présente une analyse complète des données patients et l'application de l'algorithme DBSCAN pour la détection d'anomalies médicales.


## 1. Importation des bibliothèques


In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable so that the
# `src` package imported below resolves. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
_project_root = str(Path().resolve().parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Modules du projet
from src.data_loader import load_patient_data, validate_required_columns
from src.preprocessing import clean_data, prepare_features, normalize_features
from src.dbscan_model import (
    compute_k_distance_curve,
    plot_k_distance_curve,
    suggest_eps_from_k_distance,
    apply_dbscan
)
from src.visualization import plot_clusters_2d, plot_feature_distributions, plot_cluster_statistics

# Configuration
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


## 2. Chargement des données


In [None]:
# Input file and column layout; the cells below reuse these constants.
DATA_FILE = '../data/patients.csv'
PATIENT_ID_COL = 'patient_id'
FEATURE_COLS = [
    'blood_pressure_systolic',
    'blood_pressure_diastolic',
    'temperature_c',
    'heart_rate_bpm',
]

# Load the table, then check that the id column and every feature column
# are present before going any further.
df = load_patient_data(DATA_FILE)
validate_required_columns(df, [PATIENT_ID_COL, *FEATURE_COLS])

print(f"\nShape: {df.shape}", "\nPremières lignes:", sep="\n")
# Last expression of the cell: the notebook renders the first rows.
df.head()


## 3. Analyse Exploratoire des Données (EDA)


In [None]:
# Print the section banner, then the summary emitted by df.info().
print("=== INFORMATIONS GÉNÉRALES ===")
df.info()


In [None]:
# Summary statistics restricted to the feature columns. As the last
# expression of the cell, the resulting table is displayed by the notebook.
print("=== STATISTIQUES DESCRIPTIVES ===")
df[FEATURE_COLS].describe()


In [None]:
print("=== DISTRIBUTION DES VARIABLES ===")

# Size the grid from the number of features instead of hard-coding 2x2, so
# the cell keeps working when FEATURE_COLS grows or shrinks. With the current
# four features this yields the same 2x2 grid and (14, 10) figure as before.
n_cols = 2
n_rows = max(1, (len(FEATURE_COLS) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False
)
axes = axes.flatten()

# One histogram per feature column.
for ax, col in zip(axes, FEATURE_COLS):
    # df is still the raw table here (cleaning happens in a later cell), so
    # missing values are dropped explicitly rather than relying on how the
    # installed matplotlib/numpy versions treat NaN when binning.
    ax.hist(
        df[col].dropna().values,
        bins=30,
        alpha=0.7,
        color='steelblue',
        edgecolor='black',
    )
    ax.set_title(f'Distribution: {col}', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Fréquence')
    ax.grid(True, alpha=0.3)

# Hide grid slots that have no feature to show (odd number of features).
for ax in axes[len(FEATURE_COLS):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


## 4. Nettoyage et Prétraitement


In [None]:
# Pipeline: clean the raw table, extract the feature matrix together with the
# patient ids, then normalise the features (the helper also returns the scaler).
df_clean = clean_data(df, patient_id_col=PATIENT_ID_COL)
X, patient_ids = prepare_features(df_clean, FEATURE_COLS, PATIENT_ID_COL)
X_scaled, scaler = normalize_features(X)

print(f"Features préparées: {X_scaled.shape}")
# Sanity check: per-column mean and standard deviation of the scaled matrix.
for stat_label, stat_values in (
    ("Moyenne", X_scaled.mean(axis=0)),
    ("Écart-type", X_scaled.std(axis=0)),
):
    print(f"{stat_label} après normalisation: {stat_values}")


## 5. Détermination des Paramètres DBSCAN


In [None]:
# k used for the k-distance curve; the same constant is passed to DBSCAN as
# min_samples further down.
MIN_SAMPLES = 5
# Compute the k-distance values on the scaled features and plot the curve.
k_distances = compute_k_distance_curve(X_scaled, k=MIN_SAMPLES)
plot_k_distance_curve(k_distances, k=MIN_SAMPLES)


In [None]:
# Automatic eps suggestion, requested at the 50th percentile of the
# k-distance values.
# NOTE(review): if the helper simply returns that percentile, roughly half of
# the points have a k-distance above eps; compare with the elbow of the curve
# plotted above before trusting this value — confirm against src.dbscan_model.
eps_suggested = suggest_eps_from_k_distance(k_distances, percentile=50.0)
print(f"\nValeur eps suggérée (médiane): {eps_suggested:.4f}")

# Alternative eps values to try by hand: the quartiles of the k-distances,
# computed in a single percentile call.
eps_candidates = list(np.percentile(k_distances, [25, 50, 75]))
formatted_candidates = [f'{e:.4f}' for e in eps_candidates]
print(f"\nCandidats eps (25%, 50%, 75%): {formatted_candidates}")


## 6. Application de DBSCAN


In [None]:
# eps for the final run: defaults to the suggested value; assign a
# hand-picked number here to override it.
EPS = eps_suggested

labels, dbscan_model = apply_dbscan(
    X_scaled,
    eps=EPS,
    min_samples=MIN_SAMPLES,
)

# Store the results next to the cleaned rows; rows labelled -1 are flagged
# as anomalies.
df_clean['cluster_label'] = labels
df_clean['is_anomaly'] = labels == -1


## 7. Visualisations


In [None]:
# Plot the cluster labels over the scaled features; method='pca' presumably
# selects a PCA projection to 2D — confirm in src.visualization.
plot_clusters_2d(X_scaled, labels, method='pca')


In [None]:
# Plot the feature columns of the cleaned table together with the labels
# (presumably one distribution per feature, split by cluster — confirm in
# src.visualization).
plot_feature_distributions(df_clean, FEATURE_COLS, labels)


In [None]:
# Plot summary statistics derived from the cluster labels alone.
plot_cluster_statistics(labels)
