In [69]:
# Imports 
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import multiprocessing as mp
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, BayesianEstimator
from pgmpy.inference import VariableElimination
from tqdm import tqdm
import time

# Configuration parall√©lisation Apple Silicon (80% des coeurs)
n_cores_total = mp.cpu_count()
n_cores_use = max(1, int(0.8 * n_cores_total))
print(f"Utilisation de {n_cores_use}/{n_cores_total} coeurs CPU")

Utilisation de 6/8 coeurs CPU


In [70]:
# Question 1 : impl√©mentation d'un classifieur Bay√©sien avec apprentissage de structure

class BayesianClassifier(ClassifierMixin, BaseEstimator):
    
    def __init__(self, model=None):
        self.model = model
        
    def fit(self, X, y):
        X, y = validate_data(self, X, y)
        self.classes_ = unique_labels(y)
        
        if self.model is None and PGMPY_AVAILABLE:
            # Convertir en DataFrame et discr√©tiser
            if not hasattr(X, 'columns'):
                X = pd.DataFrame(X, columns=[f'f_{i}' for i in range(X.shape[1])])
            
            # Prendre seulement 5 features pour simplifier
            X_small = X.iloc[:, :5]
            
            # Discr√©tisation simple en 3 bins
            for col in X_small.columns:
                if X_small[col].dtype in ['float64', 'int64']:
                    X_small[col] = pd.cut(X_small[col], bins=3, labels=['low', 'mid', 'high'])
            
            # Ajouter la cible
            data = pd.concat([X_small, pd.Series(y, name='target')], axis=1)
            
            # Apprentissage de structure Hill Climbing
            hc = HillClimbSearch(data)
            model_structure = hc.estimate(max_iter=50)
            
            # Cr√©er le r√©seau bay√©sien
            self.model = DiscreteBayesianNetwork(model_structure.edges())
            estimator = BayesianEstimator(self.model, data)
            for node in self.model.nodes():
                self.model.add_cpds(estimator.estimate_cpd(node))
            
            self.inference_ = VariableElimination(self.model)
            self.learned_structure_ = True
            self.feature_names_ = X_small.columns.tolist()
            
        else:
            # Fallback GaussianNB
            self.model = GaussianNB() if self.model is None else self.model
            self.model.fit(X, y)
            self.learned_structure_ = False
        
        return self
    
    def predict(self, X):
        check_is_fitted(self)
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
    
    def predict_proba(self, X):
        check_is_fitted(self)
        
        if hasattr(self, 'learned_structure_') and self.learned_structure_:
            # Utiliser seulement les 5 premi√®res features
            X_small = X.iloc[:, :5] if hasattr(X, 'columns') else X[:, :5]
            X_small = pd.DataFrame(X_small, columns=self.feature_names_)
            
            # Discr√©tiser
            for col in X_small.columns:
                if X_small[col].dtype in ['float64', 'int64']:
                    X_small[col] = pd.cut(X_small[col], bins=3, labels=['low', 'mid', 'high'])
            
            probabilities = []
            for i in range(len(X_small)):
                evidence = {col: str(X_small.iloc[i][col]) for col in X_small.columns}
                try:
                    result = self.inference_.query(['target'], evidence=evidence)
                    probabilities.append(result.values)
                except:
                    probabilities.append([0.5, 0.5])  # Uniforme si erreur
            
            return np.array(probabilities)
        else:
            return self.model.predict_proba(X)

In [71]:
# Question 2 : application du classifieur Bay√©sien au dataset 

# Wrapper pour LabelEncoder compatible avec Pipeline et valeurs inconnues
class MultiLabelEncoder(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.label_encoders = {}
    
    def fit(self, X, y=None):
        # Convertir en DataFrame si n√©cessaire
        if not hasattr(X, 'columns'):
            X = pd.DataFrame(X)
        
        for i, col in enumerate(X.columns):
            le = LabelEncoder()
            # Ajouter une cat√©gorie sp√©ciale pour les valeurs inconnues
            unique_values = list(X.iloc[:, i].astype(str).unique()) + ['__UNKNOWN__']
            le.fit(unique_values)
            self.label_encoders[i] = le
        return self
    
    def transform(self, X):
        # Convertir en DataFrame si n√©cessaire
        if not hasattr(X, 'columns'):
            X = pd.DataFrame(X)
        
        X_encoded = X.copy()
        for i, col in enumerate(X.columns):
            if i in self.label_encoders:
                # Remplacer les valeurs inconnues par '__UNKNOWN__'
                values = X.iloc[:, i].astype(str)
                known_values = set(self.label_encoders[i].classes_)
                values_safe = [v if v in known_values else '__UNKNOWN__' for v in values]
                X_encoded.iloc[:, i] = self.label_encoders[i].transform(values_safe)
        
        # Retourner en array numpy pour compatibilit√© pipeline
        return X_encoded.values
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

# Charger le dataset
df = pd.read_csv('insurance_claims.csv')

# D√©finir la cible
target = 'fraud_reported' if 'fraud_reported' in df.columns else df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

# Identifier les colonnes num√©riques et cat√©gorielles
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Pipeline scikit-learn avec LabelEncoder (√©vite l'explosion combinatoire)
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', MultiLabelEncoder())
        ]), cat_cols)
    ])),
    ('classifier', BayesianClassifier())
])

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Entra√Ænement et pr√©diction avec barre de progression
print(" D√©marrage de l'entra√Ænement du BayesianClassifier...")

# Barre de progression pour l'entra√Ænement
with tqdm(total=100, desc="Entra√Ænement", bar_format='{l_bar}{bar}| {percentage:3.0f}%') as pbar:
    pbar.set_description("üìö Pr√©paration des donn√©es")
    pbar.update(10)
    time.sleep(0.1)
    
    pbar.set_description("üîç Apprentissage de structure")
    pbar.update(20)
    
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    pbar.set_description("‚úÖ Entra√Ænement termin√©")
    pbar.update(70)

print(f"‚è±Ô∏è  Temps d'entra√Ænement: {train_time:.2f}s")

# Barre de progression pour la pr√©diction
print("\n Pr√©dictions en cours...")
with tqdm(total=100, desc="Pr√©diction", bar_format='{l_bar}{bar}| {percentage:3.0f}%') as pbar:
    start_pred = time.time()
    y_pred = pipeline.predict(X_test)
    pred_time = time.time() - start_pred
    pbar.update(100)

print(f"‚è±Ô∏è  Temps de pr√©diction: {pred_time:.2f}s")

# √âvaluation
print("\n R√©sultats:")
accuracy = accuracy_score(y_test, y_pred)
print(f" Accuracy: {accuracy:.4f}")
print("\n Rapport de classification:")
print(classification_report(y_test, y_pred))

 D√©marrage de l'entra√Ænement du BayesianClassifier...


üîç Apprentissage de structure:  30%|‚ñà‚ñà‚ñà       |  30%INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'target': 'C'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'target': 'C'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'target': 'C'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'target': 'C'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'target': 'C'}
INFO:pgmpy: Datatype (N=numerical, 

  0%|          | 0/50 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'f_0': 'O', 'f_1': 'O', 'f_2': 'O', 'f_3': 'O', 'f_4': 'O', 'target': 'C'}
‚úÖ Entra√Ænement termin√©: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100%      
‚úÖ Entra√Ænement termin√©: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100%      


‚è±Ô∏è  Temps d'entra√Ænement: 0.13s

 Pr√©dictions en cours...


Pr√©diction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100%

‚è±Ô∏è  Temps de pr√©diction: 0.03s

 R√©sultats:
 Accuracy: 0.7550

 Rapport de classification:
              precision    recall  f1-score   support

           N       0.76      1.00      0.86       151
           Y       0.00      0.00      0.00        49

    accuracy                           0.76       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.57      0.76      0.65       200






In [72]:
# Question 3 : comparaison avec RandomForest + validation crois√©e stratifi√©e

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Pipeline RandomForest avec parall√©lisation
rf_pipeline = Pipeline([
    ('preprocessor', pipeline.named_steps['preprocessor']),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=n_cores_use))
])

# Entra√Ænement et test simple d'abord
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

print("=== COMPARAISON TRAIN/TEST ===")
print(f"BayesianClassifier: {accuracy_score(y_test, y_pred):.4f}")
print(f"RandomForest:       {accuracy_score(y_test, y_pred_rf):.4f}")

=== COMPARAISON TRAIN/TEST ===
BayesianClassifier: 0.7550
RandomForest:       0.8300


In [73]:
# Question 4 : analyse des erreurs avec la structure du r√©seau bay√©sien

y_pred_final = pipeline.predict(X_test)
errors_mask = y_test != y_pred_final
print(f"Total d'erreurs: {errors_mask.sum()}")

# Structure apprise ?
clf = pipeline.named_steps['classifier']
if hasattr(clf, 'learned_structure_') and clf.learned_structure_:
    print(f"\nStructure apprise - Ar√™tes: {list(clf.model.edges())}")

# 5 premi√®res erreurs
error_indices = y_test[errors_mask].index[:5]
for i, idx in enumerate(error_indices):
    print(f"\nErreur {i+1}:")
    print(f"  Vraie: {y_test.loc[idx]} | Pr√©dite: {y_pred_final[y_test.index.get_loc(idx)]}")
    print(f"  Features: {X_test.loc[idx].head(3).to_dict()}")
    if hasattr(clf, 'learned_structure_') and clf.learned_structure_:
        print("  ‚Üí Structure bay√©sienne utilis√©e pour cette pr√©diction")

Total d'erreurs: 49

Structure apprise - Ar√™tes: [('f_0', 'f_1')]

Erreur 1:
  Vraie: Y | Pr√©dite: N
  Features: {'months_as_customer': 230, 'age': 37, 'policy_number': 776950}
  ‚Üí Structure bay√©sienne utilis√©e pour cette pr√©diction

Erreur 2:
  Vraie: Y | Pr√©dite: N
  Features: {'months_as_customer': 101, 'age': 33, 'policy_number': 575000}
  ‚Üí Structure bay√©sienne utilis√©e pour cette pr√©diction

Erreur 3:
  Vraie: Y | Pr√©dite: N
  Features: {'months_as_customer': 266, 'age': 42, 'policy_number': 929306}
  ‚Üí Structure bay√©sienne utilis√©e pour cette pr√©diction

Erreur 4:
  Vraie: Y | Pr√©dite: N
  Features: {'months_as_customer': 234, 'age': 44, 'policy_number': 442494}
  ‚Üí Structure bay√©sienne utilis√©e pour cette pr√©diction

Erreur 5:
  Vraie: Y | Pr√©dite: N
  Features: {'months_as_customer': 14, 'age': 28, 'policy_number': 335780}
  ‚Üí Structure bay√©sienne utilis√©e pour cette pr√©diction
