In [18]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy.stats as ss
from scipy.interpolate import interp1d
from scipy.special import expit
from fractions import Fraction
from empiricaldist import Pmf, Cdf
import matplotlib.pyplot as plt
from collections import Counter
import statsmodels.formula.api as smfa

In [3]:
def prob(o):
    """Convert odds in favor of an event into the matching probability.

    `o` may be a plain number or a NumPy array; arrays are converted
    elementwise.
    """
    return o / (1 + o)

def odds(p):
    """Convert a probability into odds in favor, i.e. p : (1 - p)."""
    complement = 1 - p
    return p / complement

In [4]:
# Raw-file URL of challenger_data.csv; split across lines for readability,
# the pieces concatenate to exactly one URL string.
link = (
    'https://raw.githubusercontent.com/CamDavidsonPilon/'
    'Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/'
    'master/Chapter2_MorePyMC/data/challenger_data.csv'
)

In [11]:
# Load the launch records and tidy them in a single chain; every step returns
# a new frame, so re-running the cell always starts from the raw CSV.
data = (
    pd.read_csv(link, parse_dates=['Date'])
    # Use a column name without a space.
    .rename(columns={'Damage Incident': 'Damage'})
    # NOTE(review): rows labelled 3 and 24 are presumably unusable for the
    # fit (they are removed before the integer cast) -- confirm against the CSV.
    .drop(labels=[3, 24])
    .astype({'Damage': int})
)
data.head(), data.shape

(        Date  Temperature  Damage
 0 1981-04-12           66       0
 1 1981-11-12           70       1
 2 1982-03-22           69       0
 4 1982-01-11           68       0
 5 1983-04-04           67       0,
 (23, 3))

In [10]:
# Centering constant for the predictor: the mean launch temperature,
# rounded to one decimal place.
mean_temperature = data['Temperature'].mean()
offset = round(mean_temperature, 1)
offset

69.6

In [12]:
# x: temperature centered on `offset`; y: the 0/1 damage indicator.
# These are the column names the regression formula refers to.
data['x'] = data['Temperature'].sub(offset)
data['y'] = data['Damage']

In [14]:
# Logistic regression of damage (y) on centered temperature (x).
formula = 'y ~ x'
logit_model = smfa.logit(formula, data=data)
results = logit_model.fit(disp=False)  # disp=False silences the optimizer log
results.params

Intercept   -1.115625
x           -0.232163
dtype: float64

In [15]:
# Pull the fitted coefficients out of the result by label.
inter, slope = (results.params[label] for label in ('Intercept', 'x'))
# Integer temperatures 53..82, expressed on the centered scale used by the model.
xs = np.subtract(np.arange(53, 83), offset)

In [16]:
# Linear predictor of the fitted model, evaluated at each centered temperature.
log_odds = slope * xs + inter
log_odds

array([ 2.7382762 ,  2.50611346,  2.27395072,  2.04178797,  1.80962523,
        1.57746248,  1.34529974,  1.11313699,  0.88097425,  0.64881151,
        0.41664876,  0.18448602, -0.04767673, -0.27983947, -0.51200221,
       -0.74416496, -0.9763277 , -1.20849045, -1.44065319, -1.67281594,
       -1.90497868, -2.13714142, -2.36930417, -2.60146691, -2.83362966,
       -3.0657924 , -3.29795515, -3.53011789, -3.76228063, -3.99444338])

In [17]:
# Convert log-odds -> odds -> probability.
# The intermediate array is called `odds_of_damage` rather than `odds` so that
# it does not overwrite the `odds()` helper function defined earlier in the
# notebook (rebinding `odds` to an array makes any later `odds(p)` call fail).
odds_of_damage = np.exp(log_odds)
ps = prob(odds_of_damage)
ps

array([0.93924781, 0.92456929, 0.90669655, 0.88511521, 0.85931657,
       0.82884484, 0.79336013, 0.75271348, 0.70702407, 0.65674259,
       0.60268105, 0.54599114, 0.48808308, 0.43049313, 0.37472428,
       0.32209405, 0.27362105, 0.22996826, 0.19144422, 0.1580491 ,
       0.12954602, 0.10553894, 0.08554356, 0.06904407, 0.05553372,
       0.04454055, 0.03564141, 0.02846733, 0.02270329, 0.01808462])

In [20]:
# Same probabilities in one step: expit is the inverse of the logit
# (log-odds) transform, so it maps the linear predictor straight to p.
ps = expit(slope * xs + inter)
ps

array([0.93924781, 0.92456929, 0.90669655, 0.88511521, 0.85931657,
       0.82884484, 0.79336013, 0.75271348, 0.70702407, 0.65674259,
       0.60268105, 0.54599114, 0.48808308, 0.43049313, 0.37472428,
       0.32209405, 0.27362105, 0.22996826, 0.19144422, 0.1580491 ,
       0.12954602, 0.10553894, 0.08554356, 0.06904407, 0.05553372,
       0.04454055, 0.03564141, 0.02846733, 0.02270329, 0.01808462])

In [21]:
def make_uniform(qs, name=None, **options):
    """Build a normalized Pmf that gives every quantity in `qs` equal weight.

    qs: the quantities of the distribution.
    name: optional label stored as the name of the Pmf's index; a falsy
        value leaves the index unnamed.
    options: extra keyword arguments forwarded to the Pmf constructor.

    Returns the normalized Pmf.
    """
    uniform = Pmf(1.0, qs, **options)
    uniform.normalize()
    if not name:
        return uniform
    uniform.index.name = name
    return uniform

In [22]:
# Parameter grids (101 points each) and a uniform prior over each grid.
qs_inter, qs_slope = np.linspace(-5, 1, 101), np.linspace(-0.8, 0.1, 101)
prior_inter = make_uniform(qs_inter, name='Intercept')
prior_slope = make_uniform(qs_slope, name='Slope')

In [23]:
def make_joint(pmf1, pmf2):
    """Joint distribution of two independent marginals as a DataFrame.

    The columns are labelled with the quantities of `pmf1` and the rows with
    the quantities of `pmf2`; each cell is the product of the corresponding
    two probabilities.
    """
    # Outer product: entry [i, j] is pmf2[i] * pmf1[j].
    products = np.outer(pmf2, pmf1)
    return pd.DataFrame(products, index=pmf2.qs, columns=pmf1.qs)

In [24]:
# Joint prior: intercept values along the columns, slope values along the rows.
joint = make_joint(pmf1=prior_inter, pmf2=prior_slope)

In [25]:
# Flatten the 2-D joint prior into one Pmf whose index holds
# (row label, column label) pairs, i.e. (slope, intercept).
stacked_prior = joint.stack()
joint_pmf = Pmf(stacked_prior)

In [26]:
# For each distinct centered temperature: number of launches ('count')
# and number of launches with damage ('sum' of the 0/1 indicator).
outcomes_by_x = data.groupby('x')['y']
grouped = outcomes_by_x.agg(['count', 'sum'])

In [27]:
# ns: trials per temperature; ks: damage incidents per temperature.
ns, ks = grouped['count'], grouped['sum']

In [28]:
# Damage probability predicted by the fitted coefficients at each
# observed (centered) temperature.
xs = grouped.index
ps = expit(slope * xs + inter)

In [29]:
# Binomial likelihood of seeing ks incidents in ns launches at each temperature.
likes = ss.binom.pmf(ks, n=ns, p=ps)

In [30]:
# Likelihood of the observed data under every (slope, intercept) pair on the
# prior grid. Starting from a copy of the prior gives a Pmf with the right
# index; every entry is overwritten in the loop.
likelihood = joint_pmf.copy()
# The loop-local names are deliberately distinct from `slope`, `inter`, `ps`
# and `likes`: reusing those names would overwrite the fitted regression
# coefficients, so re-running any earlier cell that reads them would silently
# use the last grid point instead.
for grid_slope, grid_inter in joint_pmf.index:
    grid_ps = expit(grid_inter + grid_slope * xs)
    grid_likes = ss.binom.pmf(ks, ns, grid_ps)
    # Temperatures are treated as independent, so their likelihoods multiply.
    likelihood[grid_slope, grid_inter] = grid_likes.prod()

In [31]:
# Show the likelihood for each (slope, intercept) pair; values are unnormalized.
likelihood

Unnamed: 0,Unnamed: 1,probs
-0.8,-5.00,1.036161e-08
-0.8,-4.94,1.249270e-08
-0.8,-4.88,1.503102e-08
-0.8,-4.82,1.804693e-08
-0.8,-4.76,2.162121e-08
...,...,...
0.1,0.76,4.786765e-11
0.1,0.82,2.878786e-11
0.1,0.88,1.703372e-11
0.1,0.94,9.919219e-12


In [32]:
# Bayesian update: prior times likelihood, then rescale to sum to 1.
posterior_pmf = joint_pmf * likelihood
# normalize() rescales in place; the value it returns is displayed below.
normalizing_constant = posterior_pmf.normalize()
normalizing_constant

3.5089887132383696e-05

In [34]:
# Back to a 2-D table: one row per slope, one column per intercept.
joint_posterior = posterior_pmf.unstack()
joint_posterior.head(n=5)

Unnamed: 0,-5.00,-4.94,-4.88,-4.82,-4.76,-4.70,-4.64,-4.58,-4.52,-4.46,...,0.46,0.52,0.58,0.64,0.70,0.76,0.82,0.88,0.94,1.00
-0.8,2.894694e-08,3.490051e-08,4.199173e-08,5.041718e-08,6.040255e-08,7.220584e-08,8.61207e-08,1.024797e-07,1.216575e-07,1.440743e-07,...,1.684845e-08,1.259112e-08,9.347097e-09,6.893193e-09,5.050334e-09,3.676222e-09,2.658839e-09,1.910814e-09,1.364614e-09,9.684889e-10
-0.791,2.997096e-08,3.618641e-08,4.360092e-08,5.242412e-08,6.289716e-08,7.529631e-08,8.99367e-08,1.07176e-07,1.274184e-07,1.511177e-07,...,1.957195e-08,1.462265e-08,1.085185e-08,8.000014e-09,5.858842e-09,4.262778e-09,3.081488e-09,2.213313e-09,1.579676e-09,1.12038e-09
-0.782,3.099052e-08,3.747093e-08,4.52135e-08,5.444144e-08,6.541209e-08,7.842088e-08,9.380555e-08,1.119503e-07,1.332903e-07,1.583154e-07,...,2.272323e-08,1.697252e-08,1.259173e-08,9.27921e-09,6.792791e-09,4.939955e-09,3.569129e-09,2.562086e-09,1.827451e-09,1.295232e-09
-0.773,3.200181e-08,3.874949e-08,4.682399e-08,5.646265e-08,6.793971e-08,8.157067e-08,9.771699e-08,1.167908e-07,1.3926e-07,1.656525e-07,...,2.636699e-08,1.96887e-08,1.460203e-08,1.075654e-08,7.870837e-09,5.721154e-09,4.131311e-09,2.963886e-09,2.112676e-09,1.496343e-09
-0.764,3.300075e-08,4.00172e-08,4.842651e-08,5.848076e-08,7.047174e-08,8.473594e-08,1.016597e-07,1.216844e-07,1.453124e-07,1.731121e-07,...,3.057717e-08,2.282601e-08,1.692306e-08,1.246142e-08,9.114248e-09,6.621637e-09,4.7789e-09,3.426387e-09,2.440727e-09,1.72745e-09


In [36]:
def transform(pmf, func):
    """Return a new Pmf whose quantities are `func` applied to those of `pmf`.

    The probabilities are carried over unchanged; `copy=True` is passed to
    the Pmf constructor.
    """
    return Pmf(pmf.ps, func(pmf.qs), copy=True)

In [41]:
# Draw 101 (slope, intercept) pairs from the posterior.
# Seed first so the draw -- and the predictive bands computed from it -- is
# the same on every Restart & Run All.
# NOTE(review): assumes Pmf.choice samples through NumPy's global RNG
# (np.random); confirm against the empiricaldist source.
np.random.seed(17)
sample = posterior_pmf.choice(101)

In [42]:
# Prediction grid: integer temperatures 31..82, then centered like the
# training predictor.
temps = np.arange(31, 83)
xs = np.subtract(temps, offset)

In [44]:
# One predicted-probability curve per sampled (slope, intercept) pair:
# row i of `pred` is the curve for sample[i] over the temperature grid `xs`.
pred = np.empty((len(sample), len(xs)))
# Loop names are kept distinct from `slope`/`inter` so the fitted regression
# coefficients held in those globals are not overwritten by the last sample.
for i, (sample_slope, sample_inter) in enumerate(sample):
    pred[i] = expit(sample_inter + sample_slope * xs)

In [45]:
# Pointwise 5th / 50th / 95th percentiles across the sampled curves
# (axis 0 runs over samples), giving a 90% band and its median per temperature.
low, median, high = np.percentile(pred, q=[5, 50, 95], axis=0)