In [2]:
# Imports minimaux nécessaires
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.naive_bayes import GaussianNB  # Fallback simple
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

In [3]:
# Question 1 : implémentation d'un classifieur Bayésien 

class TemplateClassifier1(ClassifierMixin, BaseEstimator):
    """Classifieur Bayésien minimal pour scikit-learn"""
    
    def __init__(self, model=None):
        self.model = model
    
    def fit(self, X, y):
        # Validation sklearn standard
        X, y = validate_data(self, X, y)
        self.classes_ = unique_labels(y)
        
        # Si pas de modèle fourni, utiliser Naive Bayes simple
        if self.model is None:
            self.model = GaussianNB()
        
        # Entraîner le modèle
        self.model.fit(X, y)
        return self
    
    def predict(self, X):
        check_is_fitted(self)
        X = validate_data(self, X, reset=False)
        return self.model.predict(X)
    
    def predict_proba(self, X):
        check_is_fitted(self)
        X = validate_data(self, X, reset=False)
        return self.model.predict_proba(X)

In [4]:
# Question 2 : application du classifieur Bayésien au dataset 

# Charger le dataset
df = pd.read_csv('insurance_claims.csv')

# Définir la cible
target = 'fraud_reported' if 'fraud_reported' in df.columns else df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

# Identifier les colonnes numériques et catégorielles
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Pipeline scikit-learn 
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
        ]), cat_cols)
    ])),
    ('classifier', TemplateClassifier1())
])

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Entraînement et prédiction
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Évaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nRapport de classification:")
print(classification_report(y_test, y_pred))

Accuracy: 0.3000

Rapport de classification:
              precision    recall  f1-score   support

           N       0.82      0.09      0.17       151
           Y       0.25      0.94      0.40        49

    accuracy                           0.30       200
   macro avg       0.54      0.52      0.28       200
weighted avg       0.68      0.30      0.22       200





In [4]:
pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-1.0.0-py3-none-any.whl.metadata (9.4 kB)
Collecting networkx (from pgmpy)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting torch (from pgmpy)
  Downloading torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting statsmodels (from pgmpy)
  Downloading statsmodels-0.14.5-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Collecting opt-einsum (from pgmpy)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting pyro-ppl (from pgmpy)
  Downloading pyro_ppl-1.9.1-py3-none-any.whl.metadata (7.8 kB)
Collecting pyro-api>=0.1.1 (from pyro-ppl->pgmpy)
  Downloading pyro_api-0.1.2-py3-none-any.whl.metadata (2.5 kB)
Collecting filelock (from torch->pgmpy)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch->pgmpy)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting fsspec (from torch->pgmpy)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metada


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.inference import VariableElimination
from pgmpy.estimators import HillClimbSearch, BayesianEstimator

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
# Question 1 : implémentation d'un classifieur Bayésien 

class TemplateClassifier(ClassifierMixin, BaseEstimator):

    def __init__(self, model=None):
        self.model = model

    def fit(self, X, y):

        # Check that X and y have correct shape, set n_features_in_, etc.
        X, y = validate_data(self, X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
        
        # If bayesian network model not provided at init, perform structure learning
        if self.model is None:
            # Combine X and y into a single DataFrame for pgmpy
            data = pd.DataFrame(X)
            data['target'] = y
            
            # Learn the structure using Hill Climb Search
            est = HillClimbSearch(data=data)
            estimated_model = est.estimate(scoring_method="bic-d", max_indegree=4, max_iter=int(1e4))
            
            # Create Bayesian Network model
            self.model = DiscreteBayesianNetwork(estimated_model.edges())

        # Estimate parameters from data

        # Fit the model parameters using Bayesian Estimator
        self.model.fit(data, estimator=BayesianEstimator, prior_type='BDeu', equivalent_sample_size=10)
        self.inference = VariableElimination(self.model)

        # Return the classifier
        return self

    def predict_proba(self, X):
        # Check if fit has been called
        check_is_fitted(self)

        # Input validation
        X = validate_data(self, X, reset=False)

        # Implement prediction here, return classes with probabilities
        predictions_proba = []
        for _, row in pd.DataFrame(X).iterrows():
            evidence = row.to_dict()
            query_result = self.inference.query(variables=['target'], evidence=evidence)
            proba = query_result.values
            predictions_proba.append(proba)
        return np.array(predictions_proba)

    def predict(self, X):

        # Check if fit has been called
        check_is_fitted(self)

        # Input validation
        X = validate_data(self, X, reset=False)
       
        # Perform prediction here with predict_proba
        proba = self.predict_proba(X)
        result = self.classes_[np.argmax(proba, axis=1)]

        # Return just result (without the probability)
        return result

In [35]:
# Question 2 : application du classifieur Bayésien au dataset 

# Charger le dataset
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('insurance_claims.csv')
df = df.rename(columns={'fraud_reported': 'target'})

# Définir la cible
X = df.drop(columns=['target'])
y = df['target']

X = X.dropna(axis=1, how='all')

# Identifier les colonnes numériques et catégorielles
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

X_encoded = X.copy()
for col in X_encoded.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])

# Idem pour y si c'est catégoriel
if y.dtype == 'object':
    le_y = LabelEncoder()
    y_encoded = le_y.fit_transform(y)
else:
    y_encoded = y

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

# Entraînement et prédiction
clf = TemplateClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Évaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {0: 'N', 1: 'N', 2: 'N', 3: 'N', 4: 'N', 5: 'N', 6: 'N', 7: 'N', 8: 'N', 9: 'N', 10: 'N', 11: 'N', 12: 'N', 13: 'N', 14: 'N', 15: 'N', 16: 'N', 17: 'N', 18: 'N', 19: 'N', 20: 'N', 21: 'N', 22: 'N', 23: 'N', 24: 'N', 25: 'N', 26: 'N', 27: 'N', 28: 'N', 29: 'N', 30: 'N', 31: 'N', 32: 'N', 33: 'N', 34: 'N', 35: 'N', 36: 'N', 37: 'N', 'target': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {0: 'N', 1: 'N', 2: 'N', 3: 'N', 4: 'N', 5: 'N', 6: 'N', 7: 'N', 8: 'N', 9: 'N', 10: 'N', 11: 'N', 12: 'N', 13: 'N', 14: 'N', 15: 'N', 16: 'N', 17: 'N', 18: 'N', 19: 'N', 20: 'N', 21: 'N', 22: 'N', 23: 'N', 24: 'N', 25: 'N', 26: 'N', 27: 'N', 28: 'N', 29: 'N', 30: 'N', 31: 'N', 32: 'N', 33: 'N', 34: 'N', 35: 'N', 36: 'N', 37: 'N', 'target': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred fr

ValueError: Node 0 not in graph

In [22]:
import pgmpy.estimators as est
print(dir(est))


['AIC', 'AICCondGauss', 'AICGauss', 'BDeu', 'BDs', 'BIC', 'BICCondGauss', 'BICGauss', 'BaseEstimator', 'BayesianEstimator', 'CITests', 'EM', 'ExhaustiveSearch', 'ExpectationMaximization', 'ExpertInLoop', 'ExpertKnowledge', 'GES', 'HillClimbSearch', 'IVEstimator', 'K2', 'LinearModel', 'LogLikelihoodCondGauss', 'LogLikelihoodGauss', 'MLE', 'MarginalEstimator', 'MaximumLikelihoodEstimator', 'MirrorDescentEstimator', 'MmhcEstimator', 'PC', 'ParameterEstimator', 'SEMEstimator', 'StructureEstimator', 'StructureScore', 'TreeSearch', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'base', 'expert', 'get_scoring_method']
