# 통계적 가설검정

In [47]:
import numpy as np
import pandas as pd
from scipy import stats

# Display floating-point cell outputs with three decimal places.
%precision 3
# Seed NumPy's global random state so random draws made later in the
# notebook are reproducible from a fresh kernel.
np.random.seed(1111)

## 2표본 문제에 관한 가설검정

### 대응비교 t검정

In [48]:
# Load the paired data set (one '전' and one '후' score per row) and
# preview its size and first rows.
training_rel = pd.read_csv('../data/ch11_training_rel.csv')
print(training_rel.shape)  # (rows, columns)
training_rel.head(5)

(20, 2)


Unnamed: 0,전,후
0,59,41
1,52,63
2,55,68
3,61,59
4,59,84


In [49]:
# Per-row change: '후' minus '전', stored in a new '차' column.
training_rel = training_rel.assign(
    **{'차': training_rel['후'].sub(training_rel['전'])}
)
training_rel.head(5)

Unnamed: 0,전,후,차
0,59,41,-18
1,52,63,11
2,55,68,13
3,61,59,-2
4,59,84,25


In [50]:
# One-sample t-test of the per-row differences against a population mean of 0.
t, p = stats.ttest_1samp(a=training_rel['차'], popmean=0)
p

0.040

In [51]:
# Paired (related-samples) t-test applied to the two score columns directly.
t, p = stats.ttest_rel(a=training_rel['후'], b=training_rel['전'])
p

0.040

### 독립비교 t검정

In [52]:
# Load the two-group data set (columns 'A' and 'B') and preview its size
# and first rows.
training_ind = pd.read_csv('../data/ch11_training_ind.csv')
print(training_ind.shape)  # (rows, columns)
training_ind.head(5)

(20, 2)


Unnamed: 0,A,B
0,47,49
1,50,52
2,37,54
3,60,48
4,39,51


In [53]:
# Two-sample t-test between columns A and B; equal_var=False selects the
# Welch variant, which does not assume equal population variances.
t, p = stats.ttest_ind(
    a=training_ind['A'],
    b=training_ind['B'],
    equal_var=False,
)
p

0.087

### 윌콕슨의 부호순위검정

In [54]:
# Reload the paired data from disk and keep an independent copy of the
# first six rows as a small worked example.
training_rel = pd.read_csv('../data/ch11_training_rel.csv')
toy_df = training_rel.head(6).copy()
toy_df

Unnamed: 0,전,후
0,59,41
1,52,63
2,55,68
3,61,59
4,59,84
5,45,37


In [55]:
# Signed difference ('후' minus '전') for each row of the toy example,
# kept both as a standalone Series and as a '차' column.
diff = toy_df['후'].sub(toy_df['전'])
toy_df = toy_df.assign(**{'차': diff})
toy_df

Unnamed: 0,전,후,차
0,59,41,-18
1,52,63,11
2,55,68,13
3,61,59,-2
4,59,84,25
5,45,37,-8


In [56]:
# Rank the differences by absolute size (1 = smallest magnitude).
# NOTE: rankdata returns floats; the cast to int is lossless here only
# because no two magnitudes are tied (tied values would get fractional ranks).
rank = stats.rankdata(diff.abs()).astype(int)
toy_df['순위'] = rank
toy_df

Unnamed: 0,전,후,차,순위
0,59,41,-18,5
1,52,63,11,3
2,55,68,13,4
3,61,59,-2,1
4,59,84,25,6
5,45,37,-8,2


In [57]:
# Signed rank sums: total rank of the negative differences and total rank
# of the positive differences.
r_minus = rank[(diff < 0).to_numpy()].sum()
r_plus = rank[(diff > 0).to_numpy()].sum()

r_minus, r_plus

(8, 13)

In [58]:
# Rebuild the toy example so that every '후' score exceeds '전' by 1..6,
# i.e. all differences are positive, then recompute differences and ranks.
toy_df['후'] = toy_df['전'].add(np.arange(1, 7))
diff = toy_df['후'].sub(toy_df['전'])
toy_df['차'] = diff
rank = stats.rankdata(diff.abs()).astype(int)
toy_df['순위'] = rank
toy_df

Unnamed: 0,전,후,차,순위
0,59,60,1,1
1,52,54,2,2
2,55,58,3,3
3,61,65,4,4
4,59,64,5,5
5,45,51,6,6


In [59]:
# Signed rank sums for the current toy example: total rank of the negative
# differences and total rank of the positive differences.
r_minus = rank[(diff < 0).to_numpy()].sum()
r_plus = rank[(diff > 0).to_numpy()].sum()

r_minus, r_plus

(0, 21)

In [60]:
# Rebuild the toy example with differences of mixed sign
# (+1, -2, -3, +4, +5, -6), then recompute differences and ranks.
toy_df['후'] = toy_df['전'].add([1, -2, -3, 4, 5, -6])
diff = toy_df['후'].sub(toy_df['전'])
toy_df['차'] = diff
rank = stats.rankdata(diff.abs()).astype(int)
toy_df['순위'] = rank
toy_df

Unnamed: 0,전,후,차,순위
0,59,60,1,1
1,52,50,-2,2
2,55,52,-3,3
3,61,65,4,4
4,59,64,5,5
5,45,39,-6,6


In [61]:
# Signed rank sums for the current toy example: total rank of the negative
# differences and total rank of the positive differences.
r_minus = rank[(diff < 0).to_numpy()].sum()
r_plus = rank[(diff > 0).to_numpy()].sum()

r_minus, r_plus

(11, 10)

In [62]:
# Wilcoxon signed-rank test on the full paired data, passing the two
# score columns as separate samples.
T, p = stats.wilcoxon(x=training_rel['전'], y=training_rel['후'])
p

0.036

In [63]:
# Wilcoxon signed-rank test on the full paired data, passing the
# precomputed differences ('후' minus '전') as a single sample.
T, p = stats.wilcoxon(training_rel['후'].sub(training_rel['전']))
p

0.040

In [64]:
# Simulate n samples of 20 differences each, drawn from a normal
# distribution with mean 3 and standard deviation 4 and rounded to the
# nearest whole number. One row of `diffs` is one simulated sample.
n = 10000
diffs = stats.norm(loc=3, scale=4).rvs(size=(n, 20)).round()

In [65]:
# Estimate the power of the one-sample t-test at the 5% level: the share of
# simulated samples (rows of `diffs`) for which H0 "mean difference = 0"
# is rejected.
alpha = 0.05
# axis=1 runs the test on every row in a single call. This replaces a
# Python loop whose loop variable overwrote the notebook-level `diff`
# (and whose body overwrote `t` and `p`) on every iteration.
t_stats, p_values = stats.ttest_1samp(diffs, 0, axis=1)
cnt = int(np.count_nonzero(p_values < alpha))
# Divide by the actual number of simulated samples rather than by `n`,
# which is rebound to a different value further down the notebook.
cnt / len(diffs)

0.883

In [66]:
# Estimate the power of the Wilcoxon signed-rank test at the 5% level: the
# share of simulated samples (rows of `diffs`) for which the test rejects.
alpha = 0.05
# The generator keeps its loop variable local, so the notebook-level
# `diff`, `T` and `p` are no longer overwritten on every iteration.
# Index [1] of the wilcoxon result is the p-value.
cnt = sum(1 for sample in diffs if stats.wilcoxon(sample)[1] < alpha)
# Divide by the actual number of simulated samples rather than by `n`,
# which is rebound to a different value further down the notebook.
cnt / len(diffs)



0.871

### 만・위트니의 U검정

In [67]:
# Reload the two-group data and keep an independent copy of the first five
# rows as a small worked example.
training_ind = pd.read_csv('../data/ch11_training_ind.csv')
toy_df = training_ind.head(5).copy()
toy_df

Unnamed: 0,A,B
0,47,49
1,50,52
2,37,54
3,60,48
4,39,51


In [68]:
# Rank all ten toy observations together (A's values followed by B's),
# then split the ranks back into one column per group.
rank = stats.rankdata(
    np.concatenate((toy_df['A'].to_numpy(), toy_df['B'].to_numpy()))
)
rank_df = pd.DataFrame(
    np.column_stack((rank[:5], rank[5:10])),
    columns=['A', 'B'],
).astype(int)
rank_df

Unnamed: 0,A,B
0,3,5
1,6,8
2,1,9
3,10,4
4,2,7


In [69]:
# U statistic for group A: its rank sum minus the smallest rank sum that
# n1 observations can have, n1 * (n1 + 1) / 2.
n1 = rank_df['A'].size
u = rank_df['A'].sum() - n1 * (n1 + 1) / 2
u

7.000

In [70]:
# Extreme case: group A holds the five lowest ranks (1-5) and group B the
# five highest (6-10).
rank_df = pd.DataFrame({
    'A': np.arange(1, 6),
    'B': np.arange(6, 11),
})
rank_df

Unnamed: 0,A,B
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [71]:
# U statistic for group A of the current rank_df: its rank sum minus the
# smallest possible rank sum n1 * (n1 + 1) / 2.
# n1 is recomputed from the current frame instead of reusing the value left
# behind by an earlier cell, so this cell does not depend on stale state.
n1 = len(rank_df['A'])
u = rank_df['A'].sum() - (n1 * (n1 + 1)) / 2
u

0.000

In [72]:
# Opposite extreme: group A holds the five highest ranks (6-10) and group B
# the five lowest (1-5).
rank_df = pd.DataFrame({
    'A': np.arange(6, 11),
    'B': np.arange(1, 6),
})
rank_df

Unnamed: 0,A,B
0,6,1
1,7,2
2,8,3
3,9,4
4,10,5


In [73]:
# U statistic for group A of the current rank_df: its rank sum minus the
# smallest possible rank sum n1 * (n1 + 1) / 2.
# n1 is recomputed from the current frame instead of reusing the value left
# behind by an earlier cell, so this cell does not depend on stale state.
n1 = len(rank_df['A'])
u = rank_df['A'].sum() - (n1 * (n1 + 1)) / 2
u

25.000

In [74]:
# Two-sided Mann-Whitney U test on the full two-group data.
u, p = stats.mannwhitneyu(
    x=training_ind['A'],
    y=training_ind['B'],
    alternative='two-sided',
)
p

0.059

### 카이제곱검정

In [75]:
# Load the advertising data (one row per observation: which ad, and whether
# a purchase was made) and record the number of rows in n.
ad_df = pd.read_csv('../data/ch11_ad.csv')
n = ad_df.shape[0]
print(n)
ad_df.head(5)

1000


Unnamed: 0,광고,구입
0,B,하지 않았다
1,B,하지 않았다
2,A,했다
3,A,했다
4,B,하지 않았다


In [76]:
# Contingency table of observed counts: rows are the ad shown ('광고'),
# columns are the purchase outcome ('구입').
ad_cross = pd.crosstab(index=ad_df['광고'], columns=ad_df['구입'])
ad_cross

구입,하지 않았다,했다
광고,Unnamed: 1_level_1,Unnamed: 2_level_1
A,351,49
B,549,51


In [77]:
# Purchase rate per ad: purchases divided by (purchases + non-purchases).
ad_cross['했다'].div(ad_cross['했다'].add(ad_cross['하지 않았다']))

광고
A    0.1225
B    0.0850
dtype: float64

In [78]:
# Column (purchase-outcome) totals of the contingency table.
# Each total is looked up by its column label rather than unpacked by
# position, so the values cannot be silently swapped if the crosstab's
# column order ever changes.
n_not = int(ad_cross['하지 않았다'].sum())
n_yes = int(ad_cross['했다'].sum())
n_not, n_yes

(900, 100)

In [79]:
# Row (ad) totals of the contingency table.
# Each total is looked up by its row label rather than unpacked by
# position, so the values cannot be silently swapped if the crosstab's
# row order ever changes.
n_adA = int(ad_cross.loc['A'].sum())
n_adB = int(ad_cross.loc['B'].sum())
n_adA, n_adB

(400, 600)

In [80]:
# Expected frequencies under independence: (row total * column total) / n
# for each ad (rows A, B) and each purchase outcome (columns).
ad_ef = pd.DataFrame(
    {
        outcome: [n_adA * outcome_total / n, n_adB * outcome_total / n]
        for outcome, outcome_total in (('했다', n_yes), ('하지 않았다', n_not))
    },
    index=['A', 'B'],
)
ad_ef

Unnamed: 0,했다,하지 않았다
A,40.0,360.0
B,60.0,540.0


In [81]:
# Chi-square statistic: sum over all cells of
# (observed - expected)^2 / expected. pandas aligns the two tables by
# row and column label before the arithmetic.
squared_dev = (ad_cross - ad_ef).pow(2)
y = squared_dev.div(ad_ef).sum().sum()
y

3.750

In [82]:
# p-value of the statistic y under a chi-square distribution with
# 1 degree of freedom ((2 - 1) * (2 - 1) for the 2x2 table).
rv = stats.chi2(1)
# sf(y) is the upper-tail probability 1 - cdf(y), evaluated directly so that
# no precision is lost to cancellation when cdf(y) is close to 1.
rv.sf(y)

0.053

In [83]:
# Run the same chi-square test of independence with SciPy.
# correction=False turns off Yates' continuity correction.
chi2, p, dof, ef = stats.chi2_contingency(observed=ad_cross, correction=False)
chi2, p, dof

(3.750, 0.053, 1)

In [84]:
# Expected-frequency array returned by chi2_contingency, laid out in the same
# row/column order as ad_cross.
ef

array([[360.,  40.],
       [540.,  60.]])