In [21]:
# Imports 
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import multiprocessing as mp
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, BayesianEstimator
from pgmpy.inference import VariableElimination

# Configuration parallélisation Apple Silicon (80% des coeurs)
n_cores_total = mp.cpu_count()
n_cores_use = max(1, int(0.8 * n_cores_total))
print(f"Utilisation de {n_cores_use}/{n_cores_total} coeurs CPU")

Utilisation de 6/8 coeurs CPU


In [22]:
# Question 1 : implémentation d'un classifieur Bayésien avec apprentissage de structure

class BayesianClassifier(ClassifierMixin, BaseEstimator):
    
    def __init__(self, model=None):
        self.model = model
        
    def fit(self, X, y):
        X, y = validate_data(self, X, y)
        self.classes_ = unique_labels(y)
        
        if self.model is None:
            # Convertir en DataFrame et discrétiser
            if not hasattr(X, 'columns'):
                X = pd.DataFrame(X, columns=[f'f_{i}' for i in range(X.shape[1])])
            
            # Prendre toutes les features
            X_small = X.copy()
            
            # Sauvegarder les bins de discrétisation
            self.discretization_bins_ = {}
            X_discretized = X_small.copy()
            
            # Discrétisation simple en 3 bins
            for col in X_small.columns:
                if X_small[col].dtype in ['float64', 'int64']:
                    X_discretized[col], bins = pd.cut(X_small[col], bins=3, labels=['low', 'mid', 'high'], retbins=True)
                    self.discretization_bins_[col] = bins
                else:
                    X_discretized[col] = X_small[col].astype(str)
            
            # Ajouter la cible
            data = pd.concat([X_discretized, pd.Series(y, name='target')], axis=1)
            
            # Apprentissage de structure Hill Climbing
            hc = HillClimbSearch(data)
            model_structure = hc.estimate(max_iter=50)
            
            # Créer le réseau bayésien
            self.model = DiscreteBayesianNetwork(model_structure.edges())
            estimator = BayesianEstimator(self.model, data)
            for node in self.model.nodes():
                self.model.add_cpds(estimator.estimate_cpd(node))
            
            self.inference_ = VariableElimination(self.model)
            self.learned_structure_ = True
            self.feature_names_ = X_small.columns.tolist()
            
        else:
            # Utiliser le modèle fourni
            self.model.fit(X, y)
            self.learned_structure_ = False
        
        return self
    
    def predict(self, X):
        check_is_fitted(self)
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
    
    def predict_proba(self, X):
        check_is_fitted(self)
        
        if hasattr(self, 'learned_structure_') and self.learned_structure_:
            # Utiliser toutes les features
            if not hasattr(X, 'columns'):
                X = pd.DataFrame(X, columns=self.feature_names_)
            else:
                X = X[self.feature_names_]  # Garder seulement les features d'entraînement
            
            # Discrétiser avec les mêmes bins qu'à l'entraînement
            X_discretized = X.copy()
            for col in X.columns:
                if col in self.discretization_bins_:
                    X_discretized[col] = pd.cut(X[col], bins=self.discretization_bins_[col], 
                                              labels=['low', 'mid', 'high'], include_lowest=True)
                else:
                    X_discretized[col] = X[col].astype(str)
            
            probabilities = []
            for i in range(len(X_discretized)):
                # Créer l'évidence seulement avec les nœuds qui existent dans le graphe
                evidence = {}
                for col in X_discretized.columns:
                    if col in self.model.nodes() and str(X_discretized.iloc[i][col]) != 'nan':
                        evidence[col] = str(X_discretized.iloc[i][col])
                
                if evidence:  # Seulement si on a des preuves valides
                    result = self.inference_.query(['target'], evidence=evidence)
                    probabilities.append(result.values)
                else:
                    # Distribution uniforme si pas de preuve
                    probabilities.append([0.5, 0.5])
            
            return np.array(probabilities)
        else:
            return self.model.predict_proba(X)

In [None]:
# Question 2 : application du classifieur Bayésien au dataset 

# Charger le dataset
df = pd.read_csv('insurance_claims.csv')

# Définir la cible
target = 'fraud_reported' if 'fraud_reported' in df.columns else df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

# Identifier les colonnes numériques et catégorielles
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Pipeline scikit-learn 
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
        ]), cat_cols)
    ])),
    ('classifier', BayesianClassifier())
])

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Entraînement et prédiction
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Évaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nRapport de classification:")
print(classification_report(y_test, y_pred))

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'f_5': 'O', 'f_6': 'O', 'f_7': 'O', 'f_8': 'O', 'f_9': 'O', 'f_10': 'O', 'f_11': 'O', 'f_12': 'O', 'f_13': 'O', 'f_14': 'O', 'f_15': 'O', 'f_16': 'O', 'f_17': 'O', 'f_18': 'O', 'f_19': 'O', 'f_20': 'O', 'f_21': 'O', 'f_22': 'O', 'f_23': 'O', 'f_24': 'O', 'f_25': 'O', 'f_26': 'O', 'f_27': 'O', 'f_28': 'O', 'f_29': 'O', 'f_30': 'O', 'f_31': 'O', 'f_32': 'O', 'f_33': 'O', 'f_34': 'O', 'f_35': 'O', 'f_36': 'O', 'f_37': 'O', 'f_38': 'O', 'f_39': 'O', 'f_40': 'O', 'f_41': 'O', 'f_42': 'O', 'f_43': 'O', 'f_44': 'O', 'f_45': 'O', 'f_46': 'O', 'f_47': 'O', 'f_48': 'O', 'f_49': 'O', 'f_50': 'O', 'f_51': 'O', 'f_52': 'O', 'f_53': 'O', 'f_54': 'O', 'f_55': 'O', 'f_56': 'O', 'f_57': 'O', 'f_58': 'O', 'f_59': 'O', 'f_60': 'O', 'f_61': 'O', 'f_62': 'O', 'f_63': 'O', 'f_64': 'O', 'f_65': 'O', 'f_66': 'O', 'f_67': 'O', 'f_68': 'O', 'f_69':

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Question 3 : comparaison avec RandomForest + validation croisée stratifiée

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Pipeline RandomForest avec parallélisation
rf_pipeline = Pipeline([
    ('preprocessor', pipeline.named_steps['preprocessor']),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=n_cores_use))
])

# Validation croisée stratifiée (5-fold) avec parallélisation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

bayes_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy', n_jobs=n_cores_use)
rf_scores = cross_val_score(rf_pipeline, X, y, cv=cv, scoring='accuracy', n_jobs=n_cores_use)

print("=== VALIDATION CROISÉE STRATIFIÉE ===")
print(f"BayesianClassifier: {bayes_scores.mean():.4f} (+/- {bayes_scores.std()*2:.4f})")
print(f"RandomForest:       {rf_scores.mean():.4f} (+/- {rf_scores.std()*2:.4f})")
print(f"Différence:         {abs(bayes_scores.mean() - rf_scores.mean()):.4f}")

=== VALIDATION CROISÉE STRATIFIÉE ===
BayesianClassifier: 0.3030 (+/- 0.0174)
RandomForest:       0.7510 (+/- 0.0075)
Différence:         0.4480


In [None]:
# Question 4 : analyse des erreurs avec la structure du réseau bayésien

y_pred_final = pipeline.predict(X_test)
errors_mask = y_test != y_pred_final
print(f"Total d'erreurs: {errors_mask.sum()}")

# Structure apprise ?
clf = pipeline.named_steps['classifier']
if hasattr(clf, 'learned_structure_') and clf.learned_structure_:
    print(f"\nStructure apprise - Arêtes: {list(clf.model.edges())}")

# 5 premières erreurs
error_indices = y_test[errors_mask].index[:5]
for i, idx in enumerate(error_indices):
    print(f"\nErreur {i+1}:")
    print(f"  Vraie: {y_test.loc[idx]} | Prédite: {y_pred_final[y_test.index.get_loc(idx)]}")
    print(f"  Features: {X_test.loc[idx].head(3).to_dict()}")
    if hasattr(clf, 'learned_structure_') and clf.learned_structure_:
        print("  → Structure bayésienne utilisée pour cette prédiction")