# Simple linear regression via Bayes

In [None]:
import numpy as np
import pymc3 as pm
import pandas as pd
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
# Load the climate data and rescale the CO2 predictor in place.
climate = pd.read_csv('../data/climate-change-2016.csv')
center = climate['CO2ppm'].mean()
# Subtract the sample mean and divide by 100, so the predictor is centered
# and expressed in units of 100 ppm.
climate['CO2ppm'] = climate['CO2ppm'].sub(center).div(100)
climate.head()

### Prior sampling

Three choices of priors on $\alpha$ and $\beta$ (the second parameter of each Normal is its standard deviation):
$$
\begin{align*}
\alpha &\sim \mathrm{Normal}(0, 5) \\
\beta &\sim \mathrm{Normal}(0, 10) \\
\end{align*}
$$
(really vague, diffuse priors)
$$
\begin{align*}
\alpha &\sim \mathrm{Normal}(0, 0.5) \\
\beta &\sim \mathrm{Normal}(0, 1) \\
\end{align*}
$$
(weakly informative)
$$
\begin{align*}
\alpha &\sim \mathrm{Normal}(0, 0.5) \\
\beta &\sim \mathrm{LogNormal}(0, 1) \\
\end{align*}
$$
(more informative, enforces positive relationship)

In [None]:
# Prior sampling for the vague priors: alpha ~ Normal(0, 5), beta ~ Normal(0, 10).
n_lines = 100
α = sp.stats.norm.rvs(loc=0, scale=5, size=n_lines)
β = sp.stats.norm.rvs(loc=0, scale=10, size=n_lines)

# Range of the rescaled predictor.
xmin = climate['CO2ppm'].min()
xmax = climate['CO2ppm'].max()

# Undo the centering/scaling so the x axis is in ppm.
co2_ppm = climate['CO2ppm'] * 100 + center

plt.figure(figsize=(12, 9))
# One regression line per (intercept, slope) pair drawn from the priors.
for intercept, slope in zip(α, β):
    plt.plot(co2_ppm, intercept + slope * climate['CO2ppm'], 'r-', alpha=0.2)
plt.xlabel('CO2 (ppm)', size=14)
plt.ylabel('Global temperature (centered, degrees C)', size=14)
plt.xticks(size=14)
plt.yticks(size=14)
plt.show()

In [None]:
# Same prior lines as the previous cell, now with two reference temperature
# levels drawn across the observed CO2 range.
co2_ppm = climate['CO2ppm'] * 100 + center
ppm_lo = xmin * 100 + center
ppm_hi = xmax * 100 + center

plt.figure(figsize=(12, 9))
for intercept, slope in zip(α, β):
    plt.plot(co2_ppm, intercept + slope * climate['CO2ppm'], 'r-', alpha=0.2)

# (level, colour, legend label) for each horizontal reference line.
reference_levels = (
    (4, 'b', '"Worst case" by 2100'),
    (-6, 'g', 'Last Glacial Maximum (ice age)'),
)
for level, colour, text in reference_levels:
    plt.hlines(level, xmin=ppm_lo, xmax=ppm_hi, color=colour, linestyle='--', label=text)

plt.xlabel('CO2 (ppm)', size=14)
plt.ylabel('Global temperature (centered, degrees C)', size=14)
plt.xticks(size=14)
plt.yticks(size=14)
plt.legend()
plt.show()

In [None]:
# Prior sampling for the weakly informative priors:
#   alpha ~ Normal(0, 0.5), beta ~ Normal(0, 1)
n_lines = 100
α = sp.stats.norm.rvs(0, 0.5, n_lines)
# Scale 1 (not 2) so the simulated slopes follow the same beta ~ Normal(0, 1)
# prior that the fitted model uses via pm.Normal('beta', 0, 1).
β = sp.stats.norm.rvs(0, 1, n_lines)

# Range of the rescaled predictor, used for the reference lines below.
xmin = climate['CO2ppm'].min()
xmax = climate['CO2ppm'].max()

plt.figure(figsize=(12, 9))
# One regression line per prior draw, plotted against CO2 in ppm.
for i in range(n_lines):
    plt.plot(climate['CO2ppm'] * 100 + center, α[i] + β[i] * climate['CO2ppm'], 'r-', alpha=0.2)
# Reference temperature levels spanning the observed CO2 range.
plt.hlines(4, xmin=xmin * 100 + center, xmax=xmax * 100 + center, color='b', linestyle='--', label='"Worst case" by 2100')
plt.hlines(-6, xmin=xmin * 100 + center, xmax=xmax * 100 + center, color='g', linestyle='--', label='Last Glacial Maximum (ice age)')
plt.xlabel('CO2 (ppm)', size = 14)
plt.ylabel('Global temperature (centered, degrees C)', size = 14)
plt.xticks(size=14)
plt.yticks(size=14)
plt.legend()
plt.show()

In [None]:
# Prior sampling for alpha ~ Normal(0, 0.5) and a log-normal slope.
n_lines = 100
α = sp.stats.norm.rvs(loc=0, scale=0.5, size=n_lines)
# β holds Normal(0, 1) draws; exponentiating them below gives strictly
# positive slopes.
β = sp.stats.norm.rvs(loc=0, scale=1, size=n_lines)

xmin = climate['CO2ppm'].min()
xmax = climate['CO2ppm'].max()

co2_ppm = climate['CO2ppm'] * 100 + center
ppm_lo = xmin * 100 + center
ppm_hi = xmax * 100 + center

plt.figure(figsize=(12, 9))
for intercept, log_slope in zip(α, β):
    plt.plot(co2_ppm, intercept + np.exp(log_slope) * climate['CO2ppm'], 'r-', alpha=0.2)

# (level, colour, legend label) for each horizontal reference line.
reference_levels = (
    (4, 'b', '"Worst case" by 2100'),
    (-6, 'g', 'Last Glacial Maximum (ice age)'),
)
for level, colour, text in reference_levels:
    plt.hlines(level, xmin=ppm_lo, xmax=ppm_hi, color=colour, linestyle='--', label=text)

plt.xlabel('CO2 (ppm)', size=14)
plt.ylabel('Global temperature (centered, degrees C)', size=14)
plt.xticks(size=14)
plt.yticks(size=14)
plt.legend()
plt.show()

### Adding the data

In [None]:
# Scatter plot of the observations, with CO2 converted back to ppm.
observed_ppm = climate['CO2ppm'] * 100 + center
observed_temp = climate['global_temp_anomaly']

plt.figure(figsize=(12, 9))
plt.plot(observed_ppm, observed_temp, 'bo', label='Observed values')
plt.xlabel('CO2 (ppm)', size=14)
plt.ylabel('Global temperature (centered, degrees C)', size=14)
plt.xticks(size=14)
plt.yticks(size=14)
plt.legend()
plt.show()

In [None]:
with pm.Model() as linear_model:
    # Priors: Normal intercept and slope, half-Cauchy noise scale.
    alpha = pm.Normal('alpha', mu=0, sigma=0.5)
    beta = pm.Normal('beta', mu=0, sigma=1)
    sigma = pm.HalfCauchy('sigma', beta=1)

    # Linear predictor on the rescaled CO2 column; registered as a
    # Deterministic so its draws are stored in the trace under 'mu'.
    mu = pm.Deterministic('mu', alpha + beta * climate['CO2ppm'])

    # Normal likelihood for the observed temperature anomalies.
    y_ = pm.Normal(
        'y',
        mu=mu,
        sigma=sigma,
        observed=climate['global_temp_anomaly'],
    )

    # Posterior draws first, then the MAP point estimate.
    trace = pm.sample(draws=2000, tune=1000, target_accept=0.9)
    mp = pm.find_MAP()

In [None]:
# Posterior summary table for the scalar parameters only; 'mu' is left out.
with linear_model:
    summary = pm.summary(
        trace,
        var_names=['alpha', 'beta', 'sigma'],
    )

In [None]:
# Display the posterior summary table as the cell output.
summary

In [None]:
# Observed data with the regression line at the posterior means of alpha and beta.
co2_scaled = climate['CO2ppm']           # centered, in units of 100 ppm (model scale)
co2_ppm = co2_scaled * 100 + center      # back in ppm, so the axis matches its label
post_alpha = summary.loc['alpha', 'mean']
post_beta = summary.loc['beta', 'mean']

plt.figure(figsize=(12, 9))
plt.plot(co2_ppm, climate['global_temp_anomaly'], 'bo', label = 'Observed values')
# The line is evaluated on the model scale but drawn against ppm.
plt.plot(co2_ppm, post_alpha + post_beta * co2_scaled, 'r-', label = 'Posterior mean line')

plt.xlabel('CO2 (ppm)', size = 14)
plt.ylabel('Global temperature (centered, degrees C)', size = 14)
plt.xticks(size=14)
plt.yticks(size=14)
plt.legend()
plt.show()

### Visualizing uncertainty

In [None]:
# Observed data, the posterior mean line, and the first n_lines posterior
# draws of the regression line to show uncertainty.
co2_scaled = climate['CO2ppm']           # centered, in units of 100 ppm (model scale)
co2_ppm = co2_scaled * 100 + center      # back in ppm, so the axis matches its label
post_alpha = summary.loc['alpha', 'mean']
post_beta = summary.loc['beta', 'mean']
mu_draws = trace['mu']                   # looked up once instead of on every iteration

plt.figure(figsize=(12, 9))
plt.plot(co2_ppm, climate['global_temp_anomaly'], 'bo', label = 'Observed values')
plt.plot(co2_ppm, post_alpha + post_beta * co2_scaled, 'k-', label = 'Posterior mean line')
# Each row of mu_draws is one posterior draw of the fitted values.
for i in range(n_lines):
    plt.plot(co2_ppm, mu_draws[i], 'r-', alpha=0.05)

plt.xlabel('CO2 (ppm)', size = 14)
plt.ylabel('Global temperature (centered, degrees C)', size = 14)
plt.xticks(size=14)
plt.yticks(size=14)
plt.legend()
plt.show()

### Logistic regression for the bioassay experiment

In [None]:
# Bioassay data: one row per dose level, with the group size `n` and the
# number of deaths observed in that group.
bioassay = pd.DataFrame(
    {
        'dose': [-0.86, -0.3, -0.05, 0.73],
        'n': [5] * 4,
        'deaths': [0, 1, 3, 5],
    }
)
bioassay

In [None]:
with pm.Model() as logistic_model:
    # Normal priors on the intercept and the dose slope.
    # Flat-prior alternative kept for reference:
    #   alpha = pm.Flat('alpha'); beta = pm.Flat('beta')
    alpha = pm.Normal('alpha', mu=0, sigma=1)
    beta = pm.Normal('beta', mu=0, sigma=5)

    # Death probability at each dose through the inverse-logit link; stored
    # in the trace under 'theta'.
    linear_predictor = alpha + beta * bioassay['dose']
    theta = pm.Deterministic('theta', pm.math.invlogit(linear_predictor))

    # Binomial likelihood for the death counts out of n at each dose.
    y_ = pm.Binomial(
        'y',
        n=bioassay['n'],
        p=theta,
        observed=bioassay['deaths'],
    )

    # Posterior draws with the default sampler settings, then the MAP estimate.
    trace = pm.sample()
    mp = pm.find_MAP()

In [None]:
# Posterior summary table for the intercept and slope; 'theta' is left out.
# The table is also shown as the cell output.
summary = pm.summary(
    trace,
    var_names=['alpha', 'beta'],
)
summary

In [None]:
# Dose-response curve at the posterior means, against the observed death rates.
grid = np.linspace(-2, 2, num=100)
mean_alpha = summary.loc['alpha', 'mean']
mean_beta = summary.loc['beta', 'mean']
death_rate = bioassay['deaths'] / bioassay['n']

plt.figure(figsize=(12, 9))
plt.plot(grid, sp.special.expit(mean_alpha + mean_beta * grid), 'r-', label='Posterior mean fit')
plt.plot(bioassay['dose'], death_rate, 'bo', label='Observed data')

plt.legend()
plt.show()

In [None]:
# Dose-response curves: posterior mean fit, the first 100 posterior draws,
# the observed death rates, and the MAP fit.
grid = np.linspace(-2, 2, num=100)
mean_alpha = summary.loc['alpha', 'mean']
mean_beta = summary.loc['beta', 'mean']
alpha_draws = trace['alpha']
beta_draws = trace['beta']
death_rate = bioassay['deaths'] / bioassay['n']

plt.figure(figsize=(12, 9))
plt.plot(grid, sp.special.expit(mean_alpha + mean_beta * grid), 'b-', label='Posterior mean fit')
# One faint curve per posterior draw of (alpha, beta).
for k in range(100):
    plt.plot(grid, sp.special.expit(alpha_draws[k] + beta_draws[k] * grid), 'r-', alpha=0.1)
plt.plot(bioassay['dose'], death_rate, 'bo', label='Observed data')
plt.plot(grid, sp.special.expit(mp['alpha'] + mp['beta'] * grid), 'k-', label='MAP fit')

plt.legend()
plt.show()