In [54]:
# Imports 
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import multiprocessing as mp
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, BayesianEstimator
from pgmpy.inference import VariableElimination
from tqdm import tqdm
import time

# Configuration parallélisation Apple Silicon (80% des coeurs)
n_cores_total = mp.cpu_count()
n_cores_use = max(1, int(0.8 * n_cores_total))
print(f"Utilisation de {n_cores_use}/{n_cores_total} coeurs CPU")

Utilisation de 6/8 coeurs CPU


In [55]:
# Question 1 : implémentation d'un classifieur Bayésien avec apprentissage de structure

class BayesianClassifier(ClassifierMixin, BaseEstimator):
    
    def __init__(self, model=None):
        self.model = model
        
    def fit(self, X, y):
        X, y = validate_data(self, X, y)
        self.classes_ = unique_labels(y)
        
        if self.model is None:
            # Convertir en DataFrame et discrétiser
            if not hasattr(X, 'columns'):
                X = pd.DataFrame(X, columns=[f'f_{i}' for i in range(X.shape[1])])
            
            # Utiliser toutes les features
            X_full = X.copy()
            
            # Sauvegarder les bins de discrétisation
            self.discretization_bins_ = {}
            X_discretized = X_full.copy()
            
            # Discrétisation simple en 3 bins
            for col in X_full.columns:
                if X_full[col].dtype in ['float64', 'int64']:
                    X_discretized[col], bins = pd.cut(X_full[col], bins=3, labels=['low', 'mid', 'high'], retbins=True)
                    self.discretization_bins_[col] = bins
                else:
                    X_discretized[col] = X_full[col].astype(str)
            
            # Ajouter la cible
            data = pd.concat([X_discretized, pd.Series(y, name='target')], axis=1)
            
            # Apprentissage de structure Hill Climbing (optimisé pour la vitesse)
            hc = HillClimbSearch(data)
            model_structure = hc.estimate(max_iter=20, show_progress=False)  # Réduit de 50 à 20
            
            # Créer le réseau bayésien
            self.model = DiscreteBayesianNetwork(model_structure.edges())
            
            # Contourner le problème de pgmpy en utilisant une approche directe
            from pgmpy.estimators import MaximumLikelihoodEstimator
            try:
                # Essayer d'abord avec MaximumLikelihoodEstimator (généralement plus robuste)
                self.model.fit(data, estimator=MaximumLikelihoodEstimator)
            except (TypeError, AttributeError):
                # Si ça ne marche pas, utiliser l'estimation manuelle
                from pgmpy.factors.discrete import TabularCPD
                
                for node in self.model.nodes():
                    parents = list(self.model.predecessors(node))
                    
                    if len(parents) == 0:
                        # Nœud sans parents
                        value_counts = data[node].value_counts(normalize=True)
                        node_values = sorted(data[node].unique())
                        values = [value_counts.get(val, 0.001) for val in node_values]
                        # Normaliser pour s'assurer que ça somme à 1
                        total = sum(values)
                        values = [v/total for v in values]
                        cpd = TabularCPD(variable=node, variable_card=len(node_values), values=[values])
                    else:
                        # Nœud avec parents - approche simplifiée
                        node_values = sorted(data[node].unique())
                        # Utiliser une distribution uniforme conditionnelle pour simplifier
                        uniform_prob = 1.0 / len(node_values)
                        parent_cards = [len(data[p].unique()) for p in parents]
                        total_combinations = 1
                        for card in parent_cards:
                            total_combinations *= card
                        
                        values = [[uniform_prob] * total_combinations for _ in range(len(node_values))]
                        cpd = TabularCPD(variable=node, variable_card=len(node_values),
                                       values=values, evidence=parents, evidence_card=parent_cards)
                    
                    self.model.add_cpds(cpd)
            
            self.inference_ = VariableElimination(self.model)
            self.learned_structure_ = True
            self.feature_names_ = X_full.columns.tolist()
            
        else:
            # Utiliser le modèle fourni
            self.model.fit(X, y)
            self.learned_structure_ = False
        
        return self
    
    def predict(self, X):
        check_is_fitted(self)
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
    
    def predict_proba(self, X):
        check_is_fitted(self)
        
        if hasattr(self, 'learned_structure_') and self.learned_structure_:
            # Utiliser toutes les features
            if not hasattr(X, 'columns'):
                X = pd.DataFrame(X, columns=self.feature_names_)
            else:
                X = X[self.feature_names_]  # Garder seulement les features d'entraînement
            
            # Discrétiser avec les mêmes bins qu'à l'entraînement
            X_discretized = X.copy()
            for col in X.columns:
                if col in self.discretization_bins_:
                    X_discretized[col] = pd.cut(X[col], bins=self.discretization_bins_[col], 
                                              labels=['low', 'mid', 'high'], include_lowest=True)
                else:
                    X_discretized[col] = X[col].astype(str)
            
            probabilities = []
            for i in range(len(X_discretized)):
                # Créer l'évidence seulement avec les nœuds qui existent dans le graphe
                evidence = {}
                for col in X_discretized.columns:
                    if col in self.model.nodes() and str(X_discretized.iloc[i][col]) != 'nan':
                        evidence[col] = str(X_discretized.iloc[i][col])
                
                if evidence:  # Seulement si on a des preuves valides
                    result = self.inference_.query(['target'], evidence=evidence)
                    probabilities.append(result.values)
                else:
                    # Distribution uniforme si pas de preuve
                    probabilities.append([0.5, 0.5])
            
            return np.array(probabilities)
        else:
            return self.model.predict_proba(X)

In [56]:
# Question 2 : application du classifieur Bayésien au dataset 

# Wrapper pour LabelEncoder compatible avec Pipeline et valeurs inconnues
class MultiLabelEncoder(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.label_encoders = {}
    
    def fit(self, X, y=None):
        # Convertir en DataFrame si nécessaire
        if not hasattr(X, 'columns'):
            X = pd.DataFrame(X)
        
        for i, col in enumerate(X.columns):
            le = LabelEncoder()
            # Ajouter une catégorie spéciale pour les valeurs inconnues
            unique_values = list(X.iloc[:, i].astype(str).unique()) + ['__UNKNOWN__']
            le.fit(unique_values)
            self.label_encoders[i] = le
        return self
    
    def transform(self, X):
        # Convertir en DataFrame si nécessaire
        if not hasattr(X, 'columns'):
            X = pd.DataFrame(X)
        
        X_encoded = X.copy()
        for i, col in enumerate(X.columns):
            if i in self.label_encoders:
                # Remplacer les valeurs inconnues par '__UNKNOWN__'
                values = X.iloc[:, i].astype(str)
                known_values = set(self.label_encoders[i].classes_)
                values_safe = [v if v in known_values else '__UNKNOWN__' for v in values]
                X_encoded.iloc[:, i] = self.label_encoders[i].transform(values_safe)
        
        # Retourner en array numpy pour compatibilité pipeline
        return X_encoded.values
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

# Charger le dataset
df = pd.read_csv('insurance_claims.csv')

# Définir la cible
target = 'fraud_reported' if 'fraud_reported' in df.columns else df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

# Identifier les colonnes numériques et catégorielles
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Pipeline scikit-learn avec LabelEncoder (évite l'explosion combinatoire)
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', MultiLabelEncoder())
        ]), cat_cols)
    ])),
    ('classifier', BayesianClassifier())
])

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Entraînement et prédiction avec barre de progression
print(" Démarrage de l'entraînement du BayesianClassifier...")

# Barre de progression pour l'entraînement
with tqdm(total=100, desc="Entraînement", bar_format='{l_bar}{bar}| {percentage:3.0f}%') as pbar:
    pbar.set_description("📚 Préparation des données")
    pbar.update(10)
    time.sleep(0.1)
    
    pbar.set_description("🔍 Apprentissage de structure")
    pbar.update(20)
    
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    pbar.set_description("✅ Entraînement terminé")
    pbar.update(70)

print(f"⏱️  Temps d'entraînement: {train_time:.2f}s")

# Barre de progression pour la prédiction
print("\n Prédictions en cours...")
with tqdm(total=100, desc="Prédiction", bar_format='{l_bar}{bar}| {percentage:3.0f}%') as pbar:
    start_pred = time.time()
    y_pred = pipeline.predict(X_test)
    pred_time = time.time() - start_pred
    pbar.update(100)

print(f"⏱️  Temps de prédiction: {pred_time:.2f}s")

# Évaluation
print("\n Résultats:")
accuracy = accuracy_score(y_test, y_pred)
print(f" Accuracy: {accuracy:.4f}")
print("\n Rapport de classification:")
print(classification_report(y_test, y_pred))

 Démarrage de l'entraînement du BayesianClassifier...


🔍 Apprentissage de structure:  30%|███       |  30%INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'f_5': 'O', 'f_6': 'O', 'f_7': 'O', 'f_8': 'O', 'f_9': 'O', 'f_10': 'O', 'f_11': 'O', 'f_12': 'O', 'f_13': 'O', 'f_14': 'O', 'f_15': 'O', 'f_16': 'O', 'f_17': 'O', 'f_18': 'O', 'f_19': 'O', 'f_20': 'O', 'f_21': 'O', 'f_22': 'O', 'f_23': 'O', 'f_24': 'O', 'f_25': 'O', 'f_26': 'O', 'f_27': 'O', 'f_28': 'O', 'f_29': 'O', 'f_30': 'O', 'f_31': 'O', 'f_32': 'O', 'f_33': 'O', 'f_34': 'O', 'f_35': 'O', 'f_36': 'O', 'f_37': 'O', 'target': 'C'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'f_5': 'O', 'f_6': 'O', 'f_7': 'O', 'f_8': 'O', 'f_9': 'O', 'f_10': 'O', 'f_11': 'O', 'f_12': 'O', 'f_13': 'O', 'f_14': 'O', 'f_15': 'O', 'f_16': 'O', 'f_17': 'O', 'f_18': 'O', '

⏱️  Temps d'entraînement: 2.14s

 Prédictions en cours...


Prédiction: 100%|██████████| 100%

⏱️  Temps de prédiction: 2.02s

 Résultats:
 Accuracy: 0.7550

 Rapport de classification:
              precision    recall  f1-score   support

           N       0.76      1.00      0.86       151
           Y       0.00      0.00      0.00        49

    accuracy                           0.76       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.57      0.76      0.65       200






In [57]:
# Question 3 : comparaison avec RandomForest + validation croisée stratifiée

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Pipeline RandomForest avec parallélisation
rf_pipeline = Pipeline([
    ('preprocessor', pipeline.named_steps['preprocessor']),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=n_cores_use))
])

# Entraînement et test simple d'abord
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

print("=== COMPARAISON SIMPLE TRAIN/TEST ===")
print(f"BayesianClassifier: {accuracy_score(y_test, y_pred):.4f}")
print(f"RandomForest:       {accuracy_score(y_test, y_pred_rf):.4f}")

# Puis validation croisée pour évaluation robuste
print("\n=== VALIDATION CROISÉE SIMPLE (K-FOLD) ===")
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy', n_jobs=n_cores_use)
rf_scores = cross_val_score(rf_pipeline, X, y, cv=cv, scoring='accuracy', n_jobs=n_cores_use)

print(f"BayesianClassifier: {bayes_scores.mean():.4f} (+/- {bayes_scores.std()*2:.4f})")
print(f"RandomForest:       {rf_scores.mean():.4f} (+/- {rf_scores.std()*2:.4f})")
print(f"Différence:         {abs(bayes_scores.mean() - rf_scores.mean()):.4f}")

=== COMPARAISON SIMPLE TRAIN/TEST ===
BayesianClassifier: 0.7550
RandomForest:       0.8300

=== VALIDATION CROISÉE SIMPLE (K-FOLD) ===


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 661, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/var/folders/_s/qvs1vdhn51xfrr5kv9fr9ynr0000gn/T/ipykernel_7555/1237464343.py", line 85, in fit
  File "/Users/remyplastre/Library/Python/3.12/lib/python/site-packages/pgmpy/base/DAG.py", line 1235, in fit
    if not issubclass(estimator, BaseEstimator):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: issubclass() arg 1 must be a class


In [None]:
# Question 4 : analyse des erreurs avec la structure du réseau bayésien

y_pred_final = pipeline.predict(X_test)
errors_mask = y_test != y_pred_final
print(f"Total d'erreurs: {errors_mask.sum()}")

# Structure apprise ?
clf = pipeline.named_steps['classifier']
if hasattr(clf, 'learned_structure_') and clf.learned_structure_:
    print(f"\nStructure apprise - Arêtes: {list(clf.model.edges())}")

# 5 premières erreurs
error_indices = y_test[errors_mask].index[:5]
for i, idx in enumerate(error_indices):
    print(f"\nErreur {i+1}:")
    print(f"  Vraie: {y_test.loc[idx]} | Prédite: {y_pred_final[y_test.index.get_loc(idx)]}")
    print(f"  Features: {X_test.loc[idx].head(3).to_dict()}")
    if hasattr(clf, 'learned_structure_') and clf.learned_structure_:
        print("  → Structure bayésienne utilisée pour cette prédiction")