In [3]:
import pandas as pd
import numpy as np
import pymc3 as pm
import theano
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Load the breast cancer dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Preprocess the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Build the Bayesian logistic regression model using PyMC3
# Build the Bayesian logistic regression model using PyMC3
with pm.Model() as model:
    # Define the priors
    alpha = pm.Normal("alpha", mu=0, sd=10)
    beta = pm.Normal("beta", mu=0, sd=10, shape=X_train.shape[1])

    # Create a shared variable for the input data
    X_data = theano.shared(X_train)

    # Define the likelihood
    mu = alpha + pm.math.dot(beta, X_data.T)
    theta = pm.Deterministic("theta", pm.math.sigmoid(mu))
    y_obs = pm.Bernoulli("y_obs", p=theta, observed=y_train)

    # Perform inference
    trace = pm.sample(2000, tune=1000, target_accept=0.95)





Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 24862 seconds.


In [4]:
# Evaluate the model on the testing set
with model:
    X_data.set_value(X_test)  # Update the shared variable with test data
    ppc = pm.sample_posterior_predictive(trace, var_names=["theta"])
    y_test_pred = ppc["theta"].mean(axis=0) > 0.5
    accuracy = (y_test_pred == y_test).mean()
    print(
        f"Accuracy with PyMC3 Bayesian Logistic Regression: {accuracy * 100:.2f}%")


Accuracy with PyMC3 Bayesian Logistic Regression: 96.49%
