In [76]:
# Imports minimaux nécessaires
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from pgmpy.estimators import HillClimbSearch, BayesianEstimator
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.inference import VariableElimination
import logging
logging.getLogger("pgmpy").setLevel(logging.CRITICAL)
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.calibration import LabelEncoder
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [83]:
# Question 1: implementation of a Bayesian-network classifier

class BayesianClassifier(ClassifierMixin, BaseEstimator):
    """Scikit-learn compatible classifier backed by a pgmpy discrete Bayesian network.

    Features are exposed to pgmpy as columns named ``feature_0 .. feature_{n-1}``
    and the label as a column named ``target``. All inputs are treated as
    discrete states, so continuous features should be discretised upstream.

    Parameters
    ----------
    model : pgmpy network or None, default=None
        Pre-defined network structure whose nodes use the column names above
        and include ``'target'``. When ``None`` the structure is learned with
        ``HillClimbSearch``. The object passed here is never modified; a copy
        is fitted and stored in ``model_``.
    scoring_method : str, default="bic-d"
        Scoring method forwarded to ``HillClimbSearch.estimate``.
    max_indegree : int, default=4
        Maximum number of parents per node during structure learning.
    max_iter : int, default=10000
        Maximum number of hill-climbing iterations.
    equivalent_sample_size : int, default=10
        Equivalent sample size of the BDeu prior used for parameter learning.

    Attributes
    ----------
    classes_ : ndarray
        Class labels seen during ``fit``.
    feature_names_ : list of str
        Column names given to the features inside the network.
    model_ : pgmpy network
        The fitted network (structure + CPDs).
    feature_states_ : dict
        For every feature that is a node of ``model_``, the set of values
        observed during ``fit``.
    inference : VariableElimination
        Inference engine built on ``model_``.
    """

    def __init__(self, model=None, scoring_method="bic-d", max_indegree=4,
                 max_iter=10000, equivalent_sample_size=10):
        # Constructor parameters are stored verbatim (scikit-learn convention);
        # anything learned from data goes into trailing-underscore attributes.
        self.model = model
        self.scoring_method = scoring_method
        self.max_indegree = max_indegree
        self.max_iter = max_iter
        self.equivalent_sample_size = equivalent_sample_size

    def fit(self, X, y):
        """Learn the network structure (if needed) and its parameters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Discrete (or already discretised) feature values.
        y : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self : BayesianClassifier

        Raises
        ------
        ValueError
            If a user-supplied ``model`` has no ``'target'`` node.
        """
        # Check that X and y have correct shape, set n_features_in_, etc.
        X, y = validate_data(self, X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
        self.feature_names_ = [f"feature_{i}" for i in range(X.shape[1])]

        # Training frame for pgmpy. It is built unconditionally because both
        # structure learning and parameter learning need it (it used to be
        # created only when no model was supplied, which made fit() fail with
        # a NameError for user-provided models).
        data_df = pd.DataFrame(X, columns=self.feature_names_)
        data_df['target'] = y

        if self.model is None:
            # Learn the structure using Hill Climb Search
            search = HillClimbSearch(data=data_df)
            learned_dag = search.estimate(
                scoring_method=self.scoring_method,
                max_indegree=self.max_indegree,
                max_iter=int(self.max_iter),
            )
            network = DiscreteBayesianNetwork(learned_dag.edges())
            # Building the network from edges drops isolated nodes. If no edge
            # touches the label, add it back so it can still be queried (the
            # prediction then falls back to the class prior).
            if 'target' not in network.nodes():
                network.add_node('target')
        else:
            # Fit a copy so the object passed to __init__ stays untouched.
            network = self.model.copy()
            if 'target' not in network.nodes():
                raise ValueError(
                    "The provided model must contain a node named 'target'."
                )

        # Fit the model parameters using Bayesian Estimator
        network.fit(
            data_df,
            estimator=BayesianEstimator,
            prior_type='BDeu',
            equivalent_sample_size=self.equivalent_sample_size,
        )

        self.model_ = network
        # Values observed for each feature that takes part in the network;
        # predict_proba uses this to avoid passing unseen states as evidence.
        self.feature_states_ = {
            name: set(np.unique(X[:, idx]).tolist())
            for idx, name in enumerate(self.feature_names_)
            if name in network.nodes()
        }
        self.inference = VariableElimination(network)

        # Return the classifier
        return self

    def _align_to_classes(self, factor):
        """Return the factor's probabilities ordered like ``self.classes_``."""
        states = list(factor.state_names['target'])
        order = [states.index(label) for label in self.classes_]
        return np.asarray(factor.values)[order]

    def predict_proba(self, X):
        """Return class probabilities for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Posterior of ``target`` given the row's evidence; columns follow
            the order of ``classes_``.
        """
        # Check if fit has been called
        check_is_fitted(self)

        # Input validation
        X = validate_data(self, X, reset=False)

        probabilities = []
        for row in X:
            # Only features that are nodes of the network can act as evidence.
            # A value never observed during fit has no corresponding state in
            # the fitted CPDs, so that feature is left out (i.e. marginalised)
            # instead of being handed to the inference engine.
            evidence = {
                name: row[idx]
                for idx, name in enumerate(self.feature_names_)
                if name in self.feature_states_
                and row[idx] in self.feature_states_[name]
            }
            posterior = self.inference.query(variables=['target'], evidence=evidence)
            probabilities.append(self._align_to_classes(posterior))
        return np.array(probabilities)

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
        """
        # predict_proba performs the fitted check and the input validation.
        proba = self.predict_proba(X)

        # Return just result (without the probability)
        return self.classes_[np.argmax(proba, axis=1)]

In [84]:
# Question 2: apply the Bayesian classifier to the dataset

# Load the dataset and expose the label column under the name 'target'
df = pd.read_csv('insurance_claims.csv').rename(columns={'fraud_reported': 'target'})

# Features (without entirely-empty columns) and label
X = df.drop(columns=['target']).dropna(axis=1, how='all')
y = df['target']

# Split the feature columns into numerical and categorical groups
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = list(X.select_dtypes(include=['object', 'category', 'bool']).columns)

# Numerical branch: median imputation -> 5 uniform bins -> standardisation
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('discretizer', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')),
    ('scaler', StandardScaler()),
])

# Categorical branch: constant imputation -> ordinal codes (-1 for unknown categories)
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

# Full pipeline: preprocessing followed by the Bayesian classifier
clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', BayesianClassifier()),
])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train, then predict on the held-out rows
clf_pipeline.fit(X_train, y_train)
y_pred = clf_pipeline.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

  0%|          | 12/10000 [00:08<1:51:25,  1.49it/s]


Accuracy: 0.78
              precision    recall  f1-score   support

           N       0.86      0.83      0.85       145
           Y       0.59      0.65      0.62        55

    accuracy                           0.78       200
   macro avg       0.73      0.74      0.73       200
weighted avg       0.79      0.78      0.78       200



In [89]:
# Question 3: comparison with other classifiers

# Same split as before (same seed), repeated so this cell is self-contained
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate classifiers, keyed by the label printed in the report
models = dict(
    Bayesian=BayesianClassifier(),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression=LogisticRegression(max_iter=10000, random_state=42),
)

for name, model in models.items():
    # Every candidate goes through the same preprocessing step
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    # Pipeline.fit returns the pipeline itself, so fit and predict can be chained
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

  0%|          | 12/10000 [00:08<1:57:55,  1.41it/s]


=== Bayesian ===
Accuracy: 0.78
              precision    recall  f1-score   support

           N       0.86      0.83      0.85       145
           Y       0.59      0.65      0.62        55

    accuracy                           0.78       200
   macro avg       0.73      0.74      0.73       200
weighted avg       0.79      0.78      0.78       200

=== RandomForest ===
Accuracy: 0.765
              precision    recall  f1-score   support

           N       0.83      0.86      0.84       145
           Y       0.58      0.53      0.55        55

    accuracy                           0.77       200
   macro avg       0.70      0.69      0.70       200
weighted avg       0.76      0.77      0.76       200

=== LogisticRegression ===
Accuracy: 0.72
              precision    recall  f1-score   support

           N       0.77      0.87      0.82       145
           Y       0.49      0.33      0.39        55

    accuracy                           0.72       200
   macro avg     