In [None]:
import numpy as np 
import pandas as pd
#Data visualization package for Python
import arviz as az
import matplotlib.pyplot as plt 
#Bayesian package for Python
import pymc3             as pm



In [None]:
# Apply ArviZ's "arviz-darkgrid" style sheet to the figures created in this notebook.
az.style.use("arviz-darkgrid")

Generate synthetic data from the population regression equation $y = 1 + 2x + \varepsilon$, with $\varepsilon \sim N(0, 1)$

In [None]:
# Simulate a sample from the population model y = 1 + 2*x + eps, eps ~ N(0, 1).
SEED = 42   # fixed seed so the simulated data set is the same on every run
size = 200  # number of observations

# Local generator: reproducible noise without touching NumPy's global RNG state.
rng = np.random.default_rng(SEED)

x = np.linspace(0, 1, size)              # evenly spaced regressor on [0, 1]
xb = 1 + 2 * x                           # noise-free population regression line
y = xb + rng.normal(scale=1, size=size)  # observed response = line + Gaussian noise

data = pd.DataFrame(dict(x=x, y=y))

Plot the simulated data together with the population regression line

In [None]:
# Scatter the simulated sample and overlay the noise-free population line.
fig = plt.figure(figsize=(7, 7))
data_ax = fig.gca()

data_ax.scatter(x, y, label="data")
data_ax.plot(
    x,
    xb,
    color='red',
    linewidth=5,
    linestyle='dashed',
    label="population regression line",
)

data_ax.set_xlabel('x')
data_ax.set_ylabel('y')
data_ax.legend()

plt.show()

Baseline: frequentist OLS regression

In [None]:
import statsmodels.api as sm

# Design matrix: the regressor plus a constant column for the intercept.
X = sm.add_constant(data['x'])
Y = data['y']

# Fit y ~ const + x by ordinary least squares.
reg = sm.OLS(Y, X)
result = reg.fit()

print(result.summary())

# In-sample fitted values as a column vector. Using -1 lets NumPy infer the
# number of rows from the data instead of hard-coding the sample size.
Y_pred = result.predict().reshape(-1, 1)

Plot the frequentist OLS fit

In [None]:
# Scatter the sample and draw the fitted OLS line on top of it.
fig1 = plt.figure()
ols_ax = fig1.gca()

regressor = X['x']
ols_ax.scatter(regressor, Y, color='blue', label='data')
ols_ax.plot(
    regressor,
    Y_pred,
    color='orange',
    linewidth=5.0,
    label='predicted regression equation',
)

ols_ax.set_xlabel('x')
ols_ax.set_ylabel('y')
ols_ax.legend()

plt.show()


# Bayesian regression


Define the priors and the likelihood, then sample from the posterior


In [None]:
# Bayesian linear regression: y ~ Normal(Intercept + x_coeff * x, sigma), fitted by MCMC.
with pm.Model() as model:  # PyMC3 model specifications are wrapped in a with-statement
    
    # Define priors
    # Noise scale: HalfCauchy prior with beta=10; testval=1.0 supplies its test (starting) value.
    # More info: https://docs.pymc.io/en/latest/api/distributions/generated/pymc.HalfCauchy.html
    sigma = pm.HalfCauchy("sigma", beta=10, testval=1.0)
    # Intercept and slope: Normal priors centred at 0 with standard deviation 20.
    # The variable names "Intercept" and "x" are the labels under which they appear in the trace.
    intercept = pm.Normal("Intercept", 0, sigma=20)
    x_coeff = pm.Normal("x", 0, sigma=20) # https://docs.pymc.io/en/latest/api/distributions/generated/pymc.Normal.html

    # Define likelihood
    # Observed y is Normal around the regression line, with the sampled noise scale.
    likelihood = pm.Normal("y", mu=intercept + x_coeff * x, sigma=sigma, observed=y)

    # Inference
    # Draw 10000 samples and return them as an ArviZ InferenceData object.
    # NOTE(review): cores=16 is hard-coded; confirm the host actually has that many
    # cores, and consider passing random_seed so the sampling is reproducible.
    trace = pm.sample(10000, cores=16, return_inferencedata=True)

Check the posterior distribution and the sampled values of each parameter

In [None]:
# Plot, for each parameter, its distribution (histogram or kernel density estimate)
# together with the sampled values (or a rank plot).
# If divergence data is available in sample_stats, the locations of the divergences
# are drawn as dashed vertical lines.
az.plot_trace(trace, figsize=(10, 7))



Bayesian regression plot


In [None]:
# Sample, posterior predictive regression lines and the true population line
# on one set of axes.
bayes_ax = plt.figure(figsize=(7, 5)).gca()

bayes_ax.scatter(x, y, c='blue', label="data")
# Called right after the figure is created, so it is still pyplot's current
# figure when the lines are drawn.
pm.plot_posterior_predictive_glm(
    trace,
    samples=100,
    label="posterior predictive regression lines",
)
bayes_ax.plot(
    x,
    xb,
    color="r",
    linewidth=3.0,
    linestyle='dashed',
    label="population regression line",
)

bayes_ax.set_xlabel('x')
bayes_ax.set_ylabel('y')
bayes_ax.set_ylim(-2, 7)
bayes_ax.legend()

plt.show()

OLS vs. Bayesian regression: side-by-side comparison

In [None]:
# Display the InferenceData object returned by pm.sample (last expression of the cell).
trace

In [None]:
# Stacked comparison of the Bayesian fit (top) and the OLS fit (bottom).
# The figure and both panels are created in a single call so that no extra
# full-figure Axes is left behind the panels, and `fig` is a real Figure.
fig, (ax_bayes, ax_ols) = plt.subplots(2, 1, figsize=(8, 6))

# Top panel: data, posterior predictive regression lines, population line.
plt.sca(ax_bayes)  # make this panel current before the PyMC3 helper draws its lines
ax_bayes.scatter(x, y, c='dodgerblue')
pm.plot_posterior_predictive_glm(trace, samples=100, label="posterior predictive regression lines")
ax_bayes.plot(x, xb, label="population regression line", lw=3.0, c="r", linestyle='dashed')
ax_bayes.set_title("Bayesian regression")
ax_bayes.set_ylim(-2.5, 5.5)
ax_bayes.legend()

# Bottom panel: data, fitted OLS line, population line.
ax_ols.scatter(x, y, c='dodgerblue')
ax_ols.plot(X['x'], Y_pred, label='OLS regression equation', c='k', linewidth=5.0)
ax_ols.plot(x, xb, label="population regression line", lw=5.0, c="r", linestyle='dashed')
ax_ols.set_title("OLS regression")
ax_ols.set_ylim(-2.5, 5.5)
ax_ols.legend()

plt.show()