### Приложение: множественные сравнения

Байесовский подход 

$$
\begin{aligned}
P(\mathrm{data} \mid p) &= \mathrm{Binom}(s;\, N, p) \\
P(p) &= \mathrm{Uniform}(0, 1)
\end{aligned}
$$

$$
\begin{aligned}
P(p \mid \mathrm{data}) &= \frac{P(\mathrm{data} \mid p)\, P(p)}{P(\mathrm{data})} = \mathrm{Beta}(p;\, \alpha, \beta) \\
\alpha &= s + 1, \qquad \beta = N - s + 1
\end{aligned}
$$

In [None]:
import pandas as pd
import numpy as np
np.random.seed(7)

from collections import namedtuple

import scipy.stats as stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
def posterior_for_binom_and_uniform_prior(n_success, n_trials):
    alpha_prior = 1
    beta_prior = 1
    alpha_post = alpha_prior + n_success
    beta_post = beta_prior + (n_trials - n_success)
    return stats.beta(alpha_post, beta_post)

# Ground-truth success rates of the three simulated groups.
pa_exact, pb_exact, pc_exact = 0.100, 0.105, 0.110

# Number of trials in every group.
N = 1000

# One binomial draw per group (in the order A, B, C): successes out of N trials.
sa = stats.binom.rvs(p=pa_exact, n=N)
sb = stats.binom.rvs(p=pb_exact, n=N)
sc = stats.binom.rvs(p=pc_exact, n=N)
print(sa, sb, sc)

# Beta posterior of each group's rate under the uniform prior.
a_post_dist = posterior_for_binom_and_uniform_prior(sa, N)
b_post_dist = posterior_for_binom_and_uniform_prior(sb, N)
c_post_dist = posterior_for_binom_and_uniform_prior(sc, N)

# Evaluate the three posterior densities on a common grid over [0, 1].
x = np.linspace(0, 1, 1001)
fig = go.Figure(
    data=[
        go.Scatter(x=x, y=dist.pdf(x), mode='lines', name=label)
        for label, dist in (("A", a_post_dist), ("B", b_post_dist), ("C", c_post_dist))
    ]
)
fig.update_layout(
    title='Posterior Distributions',
    xaxis_title='p',
    yaxis_title='Prob Density',
    xaxis_range=[0, 0.3],  # zoom in on the region where the posteriors have visible mass
    hovermode="x",
    height=450,
)
fig.show()

In [None]:
# Monte Carlo sample size per group.
n_sample = 50000
# Draw from each group's posterior, in the order A, B, C.
a_post_sample = a_post_dist.rvs(size=n_sample)
b_post_sample = b_post_dist.rvs(size=n_sample)
c_post_sample = c_post_dist.rvs(size=n_sample)

# Overlaid, density-normalised histograms of the posterior draws.
fig = go.Figure(
    data=[
        go.Histogram(x=draws, histnorm='probability density', name=label, opacity=0.6)
        for label, draws in (
            ('A', a_post_sample),
            ('B', b_post_sample),
            ('C', c_post_sample),
        )
    ]
)
fig.update_layout(
    barmode='overlay',
    title='Posterior Samples',
    # The histogram values are draws of the rate p, so the axis is labelled 'p'.
    xaxis_title='p',
    yaxis_title='Prob Density',
    hovermode="x",
    height=450,
)
fig.show()

In [None]:
# Element-wise differences between the posterior draws of every ordered pair.
diff_a_b = a_post_sample - b_post_sample
diff_b_a = b_post_sample - a_post_sample

diff_a_c = a_post_sample - c_post_sample
diff_c_a = c_post_sample - a_post_sample

diff_b_c = b_post_sample - c_post_sample
diff_c_b = c_post_sample - b_post_sample

# Share of joint draws in which one group beats both others at once.
prob_a_gt_b_and_c = np.count_nonzero((diff_a_b > 0) & (diff_a_c > 0)) / len(a_post_sample)
prob_b_gt_a_and_c = np.count_nonzero((diff_b_a > 0) & (diff_b_c > 0)) / len(b_post_sample)
prob_c_gt_b_and_a = np.count_nonzero((diff_c_a > 0) & (diff_c_b > 0)) / len(c_post_sample)

print(f'Pr(Group A is the best) = Pr(p_a > p_b & p_a > p_c): {prob_a_gt_b_and_c}')
print(f'Pr(Group B is the best) = Pr(p_b > p_a & p_b > p_c): {prob_b_gt_a_and_c}')
print(f'Pr(Group C is the best) = Pr(p_c > p_b & p_c > p_a): {prob_c_gt_b_and_a}')

Видно, что вероятность того, что C лучше и A, и B одновременно, ниже, чем каждая из вероятностей того, что C лучше в попарном сравнении.

$$
P(p_c > p_a, p_b) < \min(P(p_c > p_a), P(p_c > p_b))
$$

Это и есть проблема множественного сравнения.

In [None]:
# Differences of C's posterior draws against A and against B.
diff_c_a = c_post_sample - a_post_sample
diff_c_b = c_post_sample - b_post_sample
n_draws = len(c_post_sample)

# Pairwise comparison: C against A only.
prob_c_gt_a = np.count_nonzero(diff_c_a > 0) / n_draws
print(f'Pr(p_c > p_a): {prob_c_gt_a}')

# Pairwise comparison: C against B only.
prob_c_gt_b = np.count_nonzero(diff_c_b > 0) / n_draws
print(f'Pr(p_c > p_b): {prob_c_gt_b}')

# Joint comparison: C against A and B in the same draw.
prob_c_gt_b_and_a = np.count_nonzero((diff_c_b > 0) & (diff_c_a > 0)) / n_draws
print(f'Pr(p_c > p_b & p_c > p_a): {prob_c_gt_b_and_a}')

In [None]:
def p_best_binomial(df_gr_n_s):
    """Estimate, per group, the posterior probability of having the highest rate.

    For every group a Beta posterior is built with
    ``posterior_for_binom_and_uniform_prior`` and sampled; the result is the
    share of joint draws in which that group's sampled rate is the largest.

    Args:
        df_gr_n_s: DataFrame indexed by group label with columns ``'N'``
            (number of trials) and ``'S'`` (number of successes).

    Returns:
        ``pandas.Series`` indexed by group label, in order of first appearance
        in the input index. Groups that are never the best in any draw get
        ``0.0`` instead of being left out of the result.
    """
    groups = df_gr_n_s.index.unique()
    # The number of Monte Carlo draws grows with the number of groups compared.
    n_sample = 50000 * len(groups)
    draws = {
        g: posterior_for_binom_and_uniform_prior(
            df_gr_n_s['S'][g], df_gr_n_s['N'][g]
        ).rvs(size=n_sample)
        for g in groups
    }
    # Label of the winning group in every joint draw.
    winners = pd.DataFrame(draws).idxmax(axis=1)
    # value_counts() only lists labels that occurred; reindex restores the rest as 0.
    return winners.value_counts(normalize=True).reindex(groups, fill_value=0.0)


# One row per group: number of trials N and observed successes S.
df = pd.DataFrame({'gr': ['A', 'B', 'C'], 'N': N, 'S': [sa, sb, sc]})
# p_best_binomial returns a Series keyed by group label, whereas df has a
# default integer index. Assigning the Series directly would align on the index
# and fill the column with NaN, so the values are looked up through the 'gr'
# column instead. A group missing from the result never won a draw, hence 0.0.
df['p_best'] = df['gr'].map(p_best_binomial(df.set_index('gr')[['N', 'S']])).fillna(0.0)
df

In [None]:
import numpy as np
import scipy.stats as stats
import plotly.graph_objects as go

# Baseline conversion rate; groups B and C are set 10% and 20% above it.
prior_conv = 0.1

# Conversion counts are rounded to integers: a product such as
# 1000 * (1.1 * 0.1) is not exact in floating point (110.00000000000001) and
# would otherwise leak into the Beta parameters and the legend text.
a_total = 1000
a_conv = round(a_total * prior_conv)

b_total = 1000
b_conv = round(b_total * (1.1 * prior_conv))

c_total = 1000
c_conv = round(c_total * (1.2 * prior_conv))

x = np.linspace(0, 1, 1001)
fig = go.Figure()


# Uniform prior
beta_prior = 1
alpha_prior = 1

fig.add_trace(go.Scatter(x=x, y=stats.beta.pdf(x, alpha_prior, beta_prior), mode='lines',
                         name=f"alpha_prior={alpha_prior}, beta_prior={beta_prior}"))

# One posterior curve per group: Beta(alpha_prior + conv, beta_prior + total - conv).
for group, total, conv in (("A", a_total, a_conv),
                           ("B", b_total, b_conv),
                           ("C", c_total, c_conv)):
    alpha_post = alpha_prior + conv
    beta_post = beta_prior + (total - conv)
    fig.add_trace(go.Scatter(x=x, y=stats.beta.pdf(x, alpha_post, beta_post), mode='lines',
                             name=f"<br>{group}: total = {total}, conv = {conv}<br>alpha_prior={alpha_prior}, beta_prior={beta_prior},<br>alpha_post={alpha_post}, beta_post={beta_post}"))


fig.update_layout(title='Posterior and Prior Distributions',
                  xaxis_title='p',
                  yaxis_title='Prob Density',
                  hovermode="x",
                  height=550)

fig.show()

In [None]:
# Monte Carlo sample size per group.
n_sample = 50000
alpha_prior = 1
beta_prior = 1
# NOTE: here `a` and `b` are the Beta shape parameters (alpha, beta) of the
# posterior being sampled, not the groups A and B.
a = alpha_prior + a_conv
b = beta_prior + (a_total - a_conv)
post_sample_a = np.random.beta(a, b, n_sample)
a = alpha_prior + b_conv
b = beta_prior + (b_total - b_conv)
post_sample_b = np.random.beta(a, b, n_sample)
a = alpha_prior + c_conv
b = beta_prior + (c_total - c_conv)
post_sample_c = np.random.beta(a, b, n_sample)

# Overlaid, density-normalised histograms of the posterior draws.
fig = go.Figure(
    data=[
        go.Histogram(x=draws, histnorm='probability density', name=label, opacity=0.6)
        for label, draws in (
            ('A', post_sample_a),
            ('B', post_sample_b),
            ('C', post_sample_c),
        )
    ]
)
fig.update_layout(
    barmode='overlay',
    title='Posterior Sample',
    # The histogram values are draws of the rate p, so the axis is labelled 'p'.
    xaxis_title='p',
    yaxis_title='Prob Density',
    hovermode="x",
)
fig.show()

In [None]:
# Differences of C's posterior draws against A and against B.
diff_c_a = post_sample_c - post_sample_a
diff_c_b = post_sample_c - post_sample_b
n_draws = len(post_sample_c)

# Pairwise comparison: C against A only (strict inequality, despite the "_ge_" name).
prob_c_ge_a = np.count_nonzero(diff_c_a > 0) / n_draws
print(f'Pr(p_c > p_a): {prob_c_ge_a}')

# Pairwise comparison: C against B only.
prob_c_ge_b = np.count_nonzero(diff_c_b > 0) / n_draws
print(f'Pr(p_c > p_b): {prob_c_ge_b}')

# Joint comparison: C against A and B in the same draw.
prob_c_ge_b_and_a = np.count_nonzero((diff_c_b > 0) & (diff_c_a > 0)) / n_draws
print(f'Pr(p_c > p_b & p_c > p_a): {prob_c_ge_b_and_a}')

Видно, что вероятность того, что C лучше и A, и B одновременно, ниже, чем каждая из вероятностей того, что C лучше в попарном сравнении.
$$
P(p_c > p_a, p_b) < \min(P(p_c > p_a), P(p_c > p_b))
$$
Это и есть проблема множественного сравнения.

С вероятностью $P(p_c > p_a, p_b)$ можно делать то же, что и с $P(p_c > p_a)$ и $P(p_c > p_b)$:  
моделировать, сколько нужно данных для достижения определённого уровня уверенности, или записать функцию потерь и моделировать её минимизацию. 

### Средние

In [None]:
from collections import namedtuple

# Immutable record of the conjugate-model parameters that the update and
# marginal-distribution helpers in this notebook read: mu, sigma, k, a, b.
ConjugateNormalParams = namedtuple(
    'ConjugateNormalParams',
    ['mu', 'sigma', 'k', 'a', 'b'],
)

def initial_parameters(mu, sigma, k=1/25, a=2, b=1):
    """Bundle the starting parameters into a ``ConjugateNormalParams`` record.

    Args:
        mu: Initial value of the ``mu`` field.
        sigma: Initial value of the ``sigma`` field.
        k: Initial value of the ``k`` field (default ``1/25``).
        a: Initial value of the ``a`` field (default ``2``).
        b: Initial value of the ``b`` field (default ``1``).

    Returns:
        ``ConjugateNormalParams`` holding the given values unchanged.
    """
    return ConjugateNormalParams(mu, sigma, k, a, b)

def update_conj_parameters(x, conj_norm_pars):
    """Fold a single observation ``x`` into the conjugate parameters.

    Args:
        x: The new observation.
        conj_norm_pars: Current ``ConjugateNormalParams``.

    Returns:
        A new ``ConjugateNormalParams``: ``mu`` becomes the ``k``-weighted
        average of the old ``mu`` and ``x``, ``k`` grows by 1, ``a`` grows by
        1/2, ``b`` grows by the scaled squared deviation of ``x`` from the old
        ``mu``, and ``sigma`` is carried over unchanged.
    """
    prior = conj_norm_pars
    k_p = prior.k + 1
    mu_p = (x + prior.k * prior.mu) / k_p
    # Squared deviation from the old mu, weighted by k / (k + 1) and expressed
    # in units of 2 * sigma**2.
    b_p = prior.b + prior.k / k_p * (x - prior.mu)**2 / (2 * prior.sigma**2)
    return ConjugateNormalParams(mu_p, prior.sigma, k_p, prior.a + 1/2, b_p)

def compute_posterior_parameters(sample, n_split):
    """Build the sequence of conjugate parameters from batch means of ``sample``.

    The sample is cut into consecutive batches of ``n_split`` observations. The
    first batch seeds the parameters (its mean as ``mu``, its standard
    deviation divided by ``sqrt(n_split)`` as ``sigma``); every further batch
    mean is then folded in with ``update_conj_parameters``.

    Args:
        sample: Array of observations; must support slicing and ``.std()``.
        n_split: Number of observations per batch.

    Returns:
        List of ``ConjugateNormalParams``, one per batch; the last element
        reflects all batches.
    """
    batch_means = reshape_and_compute_means(sample, n_split)
    first_batch = sample[0:n_split]
    history = [
        initial_parameters(
            mu=batch_means[0],
            sigma=first_batch.std() / np.sqrt(n_split),
        )
    ]
    for batch_mean in batch_means[1:]:
        history.append(update_conj_parameters(batch_mean, history[-1]))
    return history

def reshape_and_compute_means(sample, n_split):
    """Average ``sample`` over consecutive batches of ``n_split`` observations.

    Trailing observations that do not fill a complete batch are dropped.

    Args:
        sample: 1-D array-like of observations.
        n_split: Batch size, i.e. number of consecutive observations per mean.

    Returns:
        1-D ``numpy.ndarray`` with ``len(sample) // n_split`` batch means
        (empty when the sample is shorter than one batch).
    """
    values = np.asarray(sample)
    n_means = len(values) // n_split
    batches = values[: n_means * n_split].reshape(n_means, n_split)
    # A single vectorized reduction replaces the Python-level loop over rows.
    return batches.mean(axis=1)

def mu_marginal_distrib(conj_norm_pars):
    """Return the Student-t distribution for the mean implied by the parameters.

    Args:
        conj_norm_pars: ``ConjugateNormalParams`` (any object exposing
            ``mu``, ``sigma``, ``k``, ``a`` and ``b`` attributes).

    Returns:
        Frozen ``scipy.stats.t`` with ``df = 2 * a``, ``loc = mu`` and
        ``scale = sqrt(sigma**2 / k * b / a)``.
    """
    p = conj_norm_pars
    scale_squared = p.sigma**2 / p.k * p.b / p.a
    return stats.t(df=2 * p.a, loc=p.mu, scale=np.sqrt(scale_squared))

In [None]:
def p_best_means(df_gr_samp):
    """Estimate, per group, the posterior probability of having the highest mean.

    For every group the conjugate parameters are computed from batch means
    (batches of 25 observations), the resulting distribution of the mean is
    sampled, and the share of joint draws in which the group's sampled mean is
    the largest is returned.

    Args:
        df_gr_samp: DataFrame indexed by group label with a ``'samp'`` column
            holding the individual observations (one row per observation).

    Returns:
        ``pandas.Series`` indexed by group label, in order of first appearance
        in the input index. Groups that are never the best in any draw get
        ``0.0`` instead of being left out of the result.
    """
    groups = df_gr_samp.index.unique()
    n_split = 25
    # The number of Monte Carlo draws grows with the number of groups compared.
    n_sample = 30000 * len(groups)
    draws = {}
    for g in groups:
        pars = compute_posterior_parameters(np.array(df_gr_samp['samp'][g]), n_split)
        # Only the final parameters, which reflect every batch, are sampled.
        draws[g] = mu_marginal_distrib(pars[-1]).rvs(size=n_sample)
    # Label of the winning group in every joint draw.
    winners = pd.DataFrame(draws).idxmax(axis=1)
    # value_counts() only lists labels that occurred; reindex restores the rest as 0.
    return winners.value_counts(normalize=True).reindex(groups, fill_value=0.0)


# Observations per group.
n = 1000

# Group A: Lomax distribution with shape parameter c = 1.5.
c_a_exact = 1.5
exact_dist_a = stats.lomax(c=c_a_exact)
samp_a = exact_dist_a.rvs(size=n)

# Group B: shape parameter 5% smaller than A's
# (an alternative value of 2.8 was left commented out in the original cell).
c_b_exact = c_a_exact * 0.95
exact_dist_b = stats.lomax(c=c_b_exact)
samp_b = exact_dist_b.rvs(size=n)

# Long-format table: one row per observation, tagged with its group label.
df = pd.concat([
    pd.DataFrame({'gr': label, 'samp': samp})
    for label, samp in (('A', samp_a), ('B', samp_b))
])

# Sample mean per group next to the estimated probability of being the best.
df_res = df.groupby('gr')['samp'].mean().to_frame('means')
df_res['p_best'] = p_best_means(df.set_index('gr'))
df_res

# Ссылки

В проверке статистических гипотез при комбинировании результатов нескольких статистических тестов обычно применяют разного рода поправки [[MultipleComp](https://en.wikipedia.org/wiki/Multiple_comparisons_problem), [FWER](https://en.wikipedia.org/wiki/Family-wise_error_rate), [Bonf](https://en.wikipedia.org/wiki/Bonferroni_correction)]. 

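[[MultipleComp](https://en.wikipedia.org/wiki/Multiple_comparisons_problem)] - «Multiple comparisons problem», Wikipedia.  
[[FWER](https://en.wikipedia.org/wiki/Family-wise_error_rate)] - «Family-wise error rate», Wikipedia.  
[[Bonf](https://en.wikipedia.org/wiki/Bonferroni_correction)] - «Bonferroni correction», Wikipedia.  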