In [1]:
# Imports minimaux nécessaires
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.naive_bayes import GaussianNB  # Fallback simple
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

In [2]:
# Question 1 : implémentation d'un classifieur Bayésien 

class BayesianClassifier(ClassifierMixin, BaseEstimator):
    """Classifieur Bayésien minimal pour scikit-learn"""
    
    def __init__(self, model=None):
        self.model = model
    
    def fit(self, X, y):
        # Validation sklearn standard
        X, y = validate_data(self, X, y)
        self.classes_ = unique_labels(y)
        
        # Si pas de modèle fourni, utiliser Naive Bayes simple
        if self.model is None:
            self.model = GaussianNB()
        
        # Entraîner le modèle
        self.model.fit(X, y)
        return self
    
    def predict(self, X):
        check_is_fitted(self)
        X = validate_data(self, X, reset=False)
        return self.model.predict(X)
    
    def predict_proba(self, X):
        check_is_fitted(self)
        X = validate_data(self, X, reset=False)
        return self.model.predict_proba(X)

In [3]:
# Question 2 : application du classifieur Bayésien au dataset 

# Charger le dataset
df = pd.read_csv('insurance_claims.csv')

# Définir la cible
target = 'fraud_reported' if 'fraud_reported' in df.columns else df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

# Identifier les colonnes numériques et catégorielles
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Pipeline scikit-learn 
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
        ]), cat_cols)
    ])),
    ('classifier', BayesianClassifier())
])

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Entraînement et prédiction
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Évaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nRapport de classification:")
print(classification_report(y_test, y_pred))

Accuracy: 0.3000

Rapport de classification:
              precision    recall  f1-score   support

           N       0.82      0.09      0.17       151
           Y       0.25      0.94      0.40        49

    accuracy                           0.30       200
   macro avg       0.54      0.52      0.28       200
weighted avg       0.68      0.30      0.22       200



