In [None]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Make the notebook's parent directory importable so the local `utils` package resolves.
# Guarded so that re-running this cell does not append the same entry a second time.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import the project helpers by name rather than with `import *`, so the origin of
# every helper called in this notebook is explicit and nothing else leaks in.
from utils.utils import (
    analyze_dataset,
    class_ratio,
    detect_novelty_lof,
    detect_outliers_iforest,
    detect_outliers_lof,
    drop_columns,
    evaluate_supervised,
    evaluate_supervised_models_on_balanced_datasets,
    evaluate_unsupervised,
    find_threshold_iqr,
    find_threshold_kmeans,
    generate_balanced_datasets,
    load_dataset,
    normalize_columns,
    optimize_supervised,
    optimize_unsupervised,
    plot_all_pr_curves,
    plot_anomaly_scores,
    plot_class_distributions,
    split_dataset,
    split_features_target,
)

# Configuration: print up to 10000 array elements before summarising, no scientific notation.
np.set_printoptions(threshold=10000, suppress=True)
# NOTE(review): this silences every warning for the whole notebook (including
# scikit-learn feature-name and convergence warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')


# Dataset Mouse

In [None]:
# Load the "mouse" dataset: a space-separated text file read without a header row,
# with the two columns named x1 and x2.
# (The keyword arguments look like pandas.read_csv options — presumably forwarded; confirm in utils.)
df_mouse = load_dataset("../data/mouse.txt", sep=' ', header=None, names=['x1', 'x2'])

In [None]:
# Exploratory overview of the mouse data through the project helper
# (what it prints or plots is defined in utils, not visible here).
analyze_dataset(df_mouse)

In [None]:
# Run both outlier detectors on the mouse data; each helper returns a pair that is
# unpacked here as (scores, anomalies).
# NOTE(review): anomalies_if / anomalies_lof are not referenced by any later cell in this notebook.
scores_if, anomalies_if = detect_outliers_iforest(df_mouse)
scores_lof, anomalies_lof = detect_outliers_lof(df_mouse)

In [None]:
# Derive one decision threshold per detector from its score distribution with the
# IQR-based helper; method_name is the label passed for each detector.
threshold_if = find_threshold_iqr(scores_if,method_name="Isolation Forest")
threshold_lof = find_threshold_iqr(scores_lof,method_name="LOF")

In [None]:
# Same scores as the IQR cell, but thresholds come from the k-means-based helper,
# so the two thresholding strategies can be compared side by side.
threshold_if_km = find_threshold_kmeans(scores_if, method_name="Isolation Forest")
threshold_lof_km = find_threshold_kmeans(scores_lof, method_name="LOF")

In [None]:
# 2x2 comparison grid: one row per thresholding method, one column per detector
# (left: Isolation Forest, right: LOF).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

threshold_rows = [
    ("IQR", threshold_if, threshold_lof),            # row 0
    ("KMeans", threshold_if_km, threshold_lof_km),   # row 1
]
for row, (method, thr_if, thr_lof) in enumerate(threshold_rows):
    plot_anomaly_scores(scores_if, f"Isolation Forest ({method})", threshold=thr_if, ax=axes[row, 0])
    plot_anomaly_scores(scores_lof, f"LOF ({method})", threshold=thr_lof, ax=axes[row, 1])

# Figure-level title
plt.suptitle(
    "Comparaison des méthodes de seuil : IQR vs K-Means (Isolation Forest & LOF)",
    fontsize=15,
    fontweight="bold",
    color="#d46a9b",
)

# Reserve the top 3 % of the canvas so the subplots do not overlap the figure title.
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

In [None]:
# Run the LOF novelty-detection helper on the mouse data; it returns a pair unpacked
# as (scores, labels).
# NOTE(review): neither result is referenced by a later cell in this notebook.
scores_novelty, labels_novelty = detect_novelty_lof(df_mouse)

# Dataset creditcard

In [None]:
# Load the credit-card transactions from a comma-separated CSV file.
df_credit = load_dataset("../data/creditcard.csv", sep=',')

## Préparation des données

Avant de comparer les méthodes de détection d’anomalies (Isolation Forest, LOF, etc.),
il est indispensable de **préparer les données** pour les rendre exploitables par les modèles.

Cette étape consiste à :
- nettoyer les variables inutiles,
- mettre toutes les caractéristiques sur la même échelle,
- séparer les variables explicatives et la variable cible,
- et éventuellement échantillonner le jeu pour des raisons de performance.

In [None]:
# Nettoyage
df_credit = drop_columns(df_credit, ["Time"])

In [None]:
# Séparation
X_credit, y_credit = split_features_target(df_credit, "Class")

In [None]:
# Summary statistics of Amount as stored in df_credit, i.e. before the normalisation
# cell that follows is applied to X_credit.
df_credit["Amount"].describe()

In [None]:
# Normalisation
X_credit = normalize_columns(X_credit, ["Amount"], method="standard")

In [None]:
# Summary statistics of Amount after normalisation, to compare with the raw values.
X_credit["Amount"].describe()

In [None]:
# Preview the first rows of the prepared feature table.
X_credit.head()

In [None]:
# Dimensions of the prepared feature table.
X_credit.shape

In [None]:
# Class balance of the target, with display labels for classes 0 and 1.
# NOTE(review): the name `ratio` is rebound later by the KDDCup99 section.
ratio = class_ratio(y_credit, labels={0: "Transactions normales", 1: "Fraudes"})

## Séparation du dataset

In [None]:
# Hold out 30 % of the credit-card data as the test set.
# NOTE(review): no seed or stratification argument is passed here, so reproducibility and
# the fraud share of each split depend on split_dataset's defaults — confirm in utils.
X_train, X_test, y_train, y_test = split_dataset(X_credit, y_credit, test_size=0.3)

### Validation croisée choisie

Pour évaluer les modèles de détection de fraude, nous utilisons une validation croisée **stratifiée** :

- Le dataset contient **284 807 transactions**, dont seulement **0.173 %** sont des fraudes.
- Afin de **préserver la proportion de classes** dans chaque sous-échantillon,
  nous appliquons une **Stratified K-Fold Cross Validation**.
- Nous choisissons **3 plis (n_splits=3)**, ce qui assure un bon compromis entre stabilité statistique
  et temps de calcul raisonnable.

Chaque sous-échantillon contient donc environ **164 fraudes sur 95 000 transactions**,
garantissant une évaluation équilibrée et représentative.

## Isolation Forest et Local Outlier Factor

In [None]:
# Candidate Isolation Forest settings, written as (n_estimators, max_samples, contamination).
if_candidates = [
    (100, "auto", 0.001),
    (200, "auto", 0.002),
    (300, 0.8, 0.005),
    (400, 0.8, 0.002),
    (300, 0.6, 0.002),
]
param_grid_if = [
    {"n_estimators": n_trees, "max_samples": sample_size, "contamination": rate}
    for n_trees, sample_size, rate in if_candidates
]

# Try every candidate and keep the best model plus the per-candidate results.
# NOTE(review): the search receives the full X_credit / y_credit, which includes the
# rows later held out as X_test — confirm this is intended.
best_if, results_if = optimize_unsupervised(
    "Isolation Forest",
    X_credit,
    y_credit,
    param_grid_if,
    dataset_name="credit card",
)

In [None]:
# Score the selected Isolation Forest on the hold-out split.
# NOTE(review): best_if was chosen using X_credit / y_credit, which contain these
# hold-out rows, so this figure may be optimistic — confirm.
res_if = evaluate_unsupervised(best_if, X_test, y_test, model_name="Isolation Forest")

Pour LOF, nous utilisons un grand nombre de voisins (`n_neighbors`) étant donné la taille du dataset.


In [None]:
# Candidate LOF settings, written as (n_neighbors, contamination).
lof_candidates = [(200, 0.01), (500, 0.01), (700, 0.02)]
param_grid_lof = [
    {"n_neighbors": k, "contamination": rate}
    for k, rate in lof_candidates
]

# Try every candidate on the full credit-card data and keep the best model.
best_lof, results_lof = optimize_unsupervised(
    "Local Outlier Factor",
    X_credit,
    y_credit,
    param_grid_lof,
    dataset_name="credit card",
)

In [None]:
# Show which LOF configuration the search selected.
print(best_lof)

In [None]:
# Evaluate the selected LOF model; the helper receives the full data plus tail_split.
# tail_split used to be the literal 85443, which is the size of the 30 % hold-out of
# the 284 807-row dataset. It is now derived from y_test so it follows the actual split.
# NOTE(review): presumably tail_split is the number of trailing rows treated as the
# evaluation part — confirm in utils.
res_lof = evaluate_unsupervised(
    best_lof,
    X_credit,
    y_credit,
    model_name="Local Outlier Factor",
    tail_split=len(y_test)
)

## Easy Ensemble

In [None]:
# Three EasyEnsemble candidates that differ only in the number of estimators.
param_grid_easy = [
    {"n_estimators": n_estimators, "random_state": 42, "n_jobs": -1}
    for n_estimators in (6, 8, 12)
]

# Search on the training split only, with 3 cross-validation splits.
best_easy, df_easy = optimize_supervised(
    model_class=EasyEnsembleClassifier,
    model_name="EasyEnsemble",
    X=X_train,
    y=y_train,
    param_grid=param_grid_easy,
    dataset_name="credit card",
    cv_splits=3,
)

In [None]:
# Evaluate the selected EasyEnsemble model on the hold-out split.
# NOTE(review): optimize_threshold=True presumably tunes the decision threshold; if that
# tuning uses X_test / y_test themselves the reported metrics are optimistic — confirm.
res_easy = evaluate_supervised(
    best_easy, X_test, y_test,
    model_name="EasyEnsemble (credit card)",
    optimize_threshold=True
)

## Approches supervisées avec méthodes de rééquilibrage

In [None]:
# Build the rebalanced variants from the training split only (the hold-out is not passed in).
datasets_balanced = generate_balanced_datasets(X_train, y_train)

In [None]:
# Visualise the class distribution of each rebalanced training set.
plot_class_distributions(datasets_balanced)

In [None]:
# === BASE PARAMETERS ===
# Settings shared by both XGBoost candidates.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases — check
# against the installed version.
xgb_shared = {
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 1,
    "random_state": 42,
    "n_jobs": -1,
    "use_label_encoder": False,
    "eval_metric": "logloss",
}
param_grid_xgb = [
    {"n_estimators": 300, "max_depth": 4, "learning_rate": 0.1, **xgb_shared},
    {"n_estimators": 500, "max_depth": 6, "learning_rate": 0.05, **xgb_shared},
]

# Random Forest candidates: (n_estimators, max_depth, class_weight).
rf_shared = {"random_state": 42, "n_jobs": -1}
param_grid_rf = [
    {"n_estimators": n_trees, "max_depth": depth, "class_weight": weighting, **rf_shared}
    for n_trees, depth, weighting in [
        (200, 6, "balanced_subsample"),
        (400, 8, "balanced"),
    ]
]

# Logistic regression candidates differ only in the regularisation strength C.
param_grid_log = [
    {"C": strength, "max_iter": 500, "class_weight": "balanced", "random_state": 42}
    for strength in (1.0, 0.5)
]

param_grids = {
    "XGBoost": param_grid_xgb,
    "Random Forest": param_grid_rf,
    "Régression Logistique": param_grid_log,
}

# === Evaluate every supervised model on every rebalanced training set ===
results_df, results_pr_curves = evaluate_supervised_models_on_balanced_datasets(
    datasets_balanced=datasets_balanced,
    X_test=X_test,
    y_test=y_test,
    param_grids=param_grids,
    cv_splits=3,
)

In [None]:
# Plot the precision-recall curves collected by the credit-card benchmark above.
plot_all_pr_curves(results_pr_curves)

# Dataset KDDCup99

In [None]:
# Load the KDDCup99 data from a comma-separated CSV file.
df_kd = load_dataset("../data/KDDCup99.csv", sep=',')

In [None]:
# One-hot encode the three categorical columns; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
categorical_cols_kd = ['protocol_type', 'service', 'flag']
df_kd = pd.get_dummies(df_kd, columns=categorical_cols_kd, drop_first=True)

In [None]:
# Binary target: 1 when the label is not exactly the string 'normal', else 0.
# NOTE(review): this column is overwritten two cells further down by a substring-based
# version, so this cell only feeds the proportion check that follows it.
df_kd['is_attack'] = (df_kd['label'] != 'normal').astype(int)

In [None]:
# Share of each class under the exact-match definition of the target.
df_kd['is_attack'].value_counts(normalize=True)

In [None]:
# Recompute the binary target: 0 when the label contains the substring 'normal', else 1.
# Unlike the exact comparison used earlier, this also matches variants such as 'normal.'
# (trailing dot). This is the definition used by the rest of the notebook.
df_kd['is_attack'] = np.where(df_kd['label'].str.contains('normal'), 0, 1)

In [None]:
# Share of each class under the substring-based definition of the target.
df_kd['is_attack'].value_counts(normalize=True)

In [None]:
# Remove the original text label now that the binary is_attack target exists,
# so it cannot end up among the features.
df_kd = drop_columns(df_kd, ["label"])

In [None]:
# Split the frame into the feature table and the is_attack target.
X_kd, y_kd = split_features_target(df_kd, "is_attack")

In [None]:
# Number of columns per dtype — a quick check of the column types left after encoding.
df_kd.dtypes.value_counts()

In [None]:
# Preview the first rows of the KDDCup99 feature table (before scaling).
X_kd.head()

In [None]:
# Standardise every feature column (including the one-hot dummies).
# NOTE(review): the scaler is fitted on the full dataset before the train/test split, so
# hold-out rows contribute to the mean/variance estimates — confirm this is acceptable.
# NOTE(review): unless scikit-learn's set_output is configured, fit_transform returns a
# NumPy array, so X_kd loses its column names from this point on.
scaler = StandardScaler()
X_kd = scaler.fit_transform(X_kd)

In [None]:
# Preview the first values of the target.
y_kd.head()

In [None]:
# Class balance of the KDDCup99 target, with display labels for classes 0 and 1.
# NOTE(review): this rebinds `ratio`, which previously held the credit-card ratio.
ratio = class_ratio(y_kd, labels={0: "Trafic normal", 1: "Intrusions"})

In [None]:
# Hold out 30 % of the KDDCup99 data as the test set.
# NOTE(review): as with the credit-card split, seeding and stratification depend on
# split_dataset's defaults — confirm in utils.
X_train_kd, X_test_kd, y_train_kd, y_test_kd = split_dataset(X_kd, y_kd, test_size=0.3)

In [None]:
# Candidate Isolation Forest settings for KDDCup99, written as
# (n_estimators, max_samples, contamination).
if_candidates_kd = [
    (100, 0.6, 0.15),
    (200, 0.6, 0.2),
    (300, 0.8, 0.2),
    (400, 0.8, 0.25),
    (500, 0.8, 0.25),
    (700, 0.8, 0.25),
]
param_grid_if_kd = [
    {"n_estimators": n_trees, "max_samples": sample_frac, "contamination": rate}
    for n_trees, sample_frac, rate in if_candidates_kd
]

In [None]:
# Try every Isolation Forest candidate on KDDCup99 and keep the best model.
# NOTE(review): the search receives the full X_kd / y_kd, which includes the rows
# held out as X_test_kd — confirm this is intended.
best_if_kd, results_if_kd = optimize_unsupervised(
    "Isolation Forest", 
    X_kd, 
    y_kd, 
    param_grid_if_kd, 
    dataset_name="KDDCup99"
)

In [None]:
# Score the selected Isolation Forest on the KDDCup99 hold-out split.
res_if_kd = evaluate_unsupervised(
    best_if_kd, 
    X_test_kd, 
    y_test_kd, 
    model_name="Isolation Forest (KDDCup99)"
)

In [None]:
# Draw a stratified 30 % subsample of KDDCup99 for the LOF search that follows.
# train_test_split is imported with the other dependencies in the first cell rather
# than in the middle of the notebook.
X_sample, _, y_sample, _ = train_test_split(
    X_kd, y_kd,
    test_size=0.7,  # keeps 30 % (about 150,000 rows according to the original note)
    stratify=y_kd,  # preserve the attack / normal proportions in the subsample
    random_state=42
)

In [None]:
# Candidate LOF settings for KDDCup99, written as (n_neighbors, contamination).
lof_candidates_kd = [(20, 0.15), (50, 0.20), (100, 0.25)]
param_grid_lof_kd = [
    {"n_neighbors": k, "contamination": rate}
    for k, rate in lof_candidates_kd
]

In [None]:
# Try every LOF candidate on the stratified subsample (not the full dataset) and keep
# the best model.
best_lof_kd, results_lof_kd = optimize_unsupervised(
    "Local Outlier Factor",
    X_sample,    
    y_sample,
    param_grid_lof_kd,
    dataset_name="KDDCup99"
)

In [None]:
# Size of the KDDCup99 hold-out target.
y_test_kd.shape

In [None]:
# Evaluate the selected LOF model on KDDCup99; the helper receives the full data plus
# tail_split.
# tail_split used to be the literal 85443, which is the hold-out size of the
# credit-card dataset, carried over from that section. It is now taken from the
# KDDCup99 hold-out itself.
# NOTE(review): presumably tail_split is the number of trailing rows treated as the
# evaluation part — confirm in utils.
res_lof_kd = evaluate_unsupervised(
    best_lof_kd,
    X_kd,
    y_kd,
    model_name="Local Outlier Factor (KDDCup99)",
    tail_split=len(y_test_kd)
)

## Easy Ensemble

In [None]:
# Re-wrap the KDDCup99 splits as DataFrames with generic feature_<i> column names.
# Both splits are wrapped with the same names: wrapping only the training split means
# models are fitted with feature names and scored on a name-less array, a mismatch
# whose warning is hidden by the global warnings filter.
# Each frame reuses the index of its target when the target has one, so X and y stay
# row-aligned as they are in the credit-card section; otherwise pandas falls back to
# a default RangeIndex.
feature_names_kd = [f"feature_{i}" for i in range(X_train_kd.shape[1])]
X_train_kd = pd.DataFrame(
    X_train_kd, columns=feature_names_kd, index=getattr(y_train_kd, "index", None)
)
X_test_kd = pd.DataFrame(
    X_test_kd, columns=feature_names_kd, index=getattr(y_test_kd, "index", None)
)

In [None]:
# Three EasyEnsemble candidates for KDDCup99 that differ only in the number of estimators.
param_grid_easy_kd = [
    {"n_estimators": n_estimators, "random_state": 42, "n_jobs": -1}
    for n_estimators in (6, 8, 10)
]

In [None]:
# Search the EasyEnsemble candidates on the KDDCup99 training split, with 3
# cross-validation splits.
best_easy_kd, df_easy_kd = optimize_supervised(
    model_class=EasyEnsembleClassifier,
    model_name="EasyEnsemble",
    X=X_train_kd,
    y=y_train_kd,
    param_grid=param_grid_easy_kd,
    dataset_name="KDDCup99",
    cv_splits=3
)

In [None]:
# Evaluate the selected EasyEnsemble model on the KDDCup99 hold-out split.
# NOTE(review): optimize_threshold=True presumably tunes the decision threshold; if that
# tuning uses the hold-out itself the reported metrics are optimistic — confirm.
res_easy_kd = evaluate_supervised(
    best_easy_kd, X_test_kd, y_test_kd,
    model_name="EasyEnsemble (KDDCup99)",
    optimize_threshold=True
)

## Approches supervisées avec méthodes de rééquilibrage

In [None]:
# Build the rebalanced variants from the KDDCup99 training split only.
datasets_balanced_kd = generate_balanced_datasets(X_train_kd, y_train_kd)

In [None]:
# Visualise the class distribution of each rebalanced KDDCup99 training set.
plot_class_distributions(datasets_balanced_kd)

In [None]:
# === PARAMETERS ADAPTED TO THE DATASET ===

# XGBoost: large dataset -> less depth, more regularisation.
# scale_pos_weight is set to 4, described by the author as roughly the attack/normal ratio.
# NOTE(review): XGBoost's documentation suggests negatives/positives for this parameter;
# with is_attack=1 as the positive class the value may be inverted — confirm.
xgb_shared_kd = {
    "scale_pos_weight": 4,
    "random_state": 42,
    "n_jobs": -1,
    "use_label_encoder": False,
    "eval_metric": "logloss",
}
param_grid_xgb_kd = [
    {"n_estimators": 200, "max_depth": 5, "learning_rate": 0.1,
     "subsample": 0.8, "colsample_bytree": 0.8, **xgb_shared_kd},
    {"n_estimators": 400, "max_depth": 6, "learning_rate": 0.05,
     "subsample": 0.9, "colsample_bytree": 0.9, **xgb_shared_kd},
]

# Random Forest: dense dataset -> moderate max_depth.
# Candidates are written as (n_estimators, max_depth, class_weight).
rf_shared_kd = {"random_state": 42, "n_jobs": -1}
param_grid_rf_kd = [
    {"n_estimators": n_trees, "max_depth": depth, "class_weight": weighting, **rf_shared_kd}
    for n_trees, depth, weighting in [
        (200, 8, "balanced_subsample"),
        (400, 10, "balanced"),
    ]
]

# Logistic regression: simple, but useful as a baseline; candidates differ only in C.
param_grid_log_kd = [
    {"C": strength, "max_iter": 1000, "class_weight": "balanced",
     "solver": "lbfgs", "n_jobs": -1, "random_state": 42}
    for strength in (1.0, 0.5)
]

param_grids_kd = {
    "XGBoost": param_grid_xgb_kd,
    "Random Forest": param_grid_rf_kd,
    "Régression Logistique": param_grid_log_kd,
}

# Evaluate every supervised model on every rebalanced KDDCup99 training set.
results_df_kd, results_pr_curves_kd = evaluate_supervised_models_on_balanced_datasets(
    datasets_balanced_kd,
    X_test_kd,
    y_test_kd,
    param_grids=param_grids_kd,
    cv_splits=3,
)


In [None]:
# Plot the precision-recall curves of the KDDCup99 benchmark.
# This used to pass results_pr_curves, i.e. the credit-card curves, so the KDDCup99
# section re-displayed the previous dataset's results.
plot_all_pr_curves(results_pr_curves_kd)