In [1]:
import pandas as pd
import numpy as np
import itertools
import scipy.stats
import statsmodels.stats.weightstats

## 4.1 標準誤差

### 表4.2 母数の集計

In [2]:
# Population used for Table 4.2 and the sample size drawn from it in later cells.
x1 = np.array([165, 166, 171, 180], dtype=np.float64)
N1 = x1.size  # population size, derived from the data so it cannot drift from x1
n1 = 2        # sample size

mu = np.mean(x1)              # population mean
hensa = x1 - mu               # deviation of each value from the mean
hensa2 = np.power(hensa, 2)   # squared deviations
sigma2 = np.sum(hensa2) / N1  # population variance (divisor N1, not N1 - 1)
sigma = np.sqrt(sigma2)       # population standard deviation

In [3]:
# Show the mean of x1.
mu

170.5

In [4]:
# Show the standard deviation of x1 (computed with divisor N1).
sigma

5.937171043518958

### 表4.4 標本サイズ2の全ての標本の抽出

In [5]:
# Enumerate every sample of size n1 drawn without replacement from x1 (Table 4.4)
# and compute the mean and standard deviation of the resulting sample means.
# NOTE: uses x1 and n1 from the population cell above; n1 is rebound further down
# the notebook, so this cell must be run in top-to-bottom order.
xs = np.array(list(itertools.combinations(x1, n1)))  # one sample per row
xbars = xs.mean(axis=1)                  # mean of each sample
mub = np.mean(xbars)                     # mean of the sample means
hensab = xbars - mub                     # deviations of the sample means
hensa2b = np.power(hensab, 2)            # squared deviations
sigma2b = np.sum(hensa2b) / len(xbars)   # divide by the number of samples, not a literal 6
sigmab = np.sqrt(sigma2b)                # standard deviation of the sample means

In [6]:
# Show the enumerated samples, one sample per row.
xs

array([[165., 166.],
       [165., 171.],
       [165., 180.],
       [166., 171.],
       [166., 180.],
       [171., 180.]])

In [7]:
# Show the mean of the sample means.
mub

170.5

In [8]:
# Show the mean of each enumerated sample.
xbars

array([165.5, 168. , 172.5, 168.5, 173. , 175.5])

In [9]:
# Show the standard deviation of the sample means over all enumerated samples.
sigmab

3.427827300200522

### 表4.5 標準誤差の計算

In [10]:
# Standard error of the sample mean when sampling without replacement (Table 4.5).
se0 = sigma / np.sqrt(n1)  # uncorrected standard error: sigma / sqrt(n1)
correct = np.sqrt((N1 - n1)/(N1 - 1))  # finite population correction factor
se1 = se0 * correct  # corrected standard error

In [11]:
# Show the corrected standard error (se0 multiplied by the correction factor).
se1

3.427827300200522

## 4.2 信頼区間

### t分布のq分位点の確認 (4.14)

In [12]:
# 0.975 quantile (upper 2.5% point) of the t distribution with 49 degrees of freedom.
scipy.stats.t.ppf(1-0.025, 49)

2.009575234489209

#### ファイル読み込み

In [13]:
# Load the data set; the path is relative to the current working directory.
data04 = pd.read_csv('./causality/data04.csv')
data04

Unnamed: 0,x1,y3,t1,y0,y1,y0t,y1t
0,74,76,1,,76.0,68,76
1,82,75,0,75.0,,75,84
2,72,75,1,65.0,,65,75
3,96,84,0,84.0,,84,97
4,83,75,0,,84.0,75,84
5,72,74,1,65.0,,65,74
6,85,76,0,,87.0,76,87
7,87,77,0,77.0,,77,89
8,86,77,0,77.0,,77,87
9,77,80,1,,80.0,70,80


#### 要約統計量

In [14]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data04.describe()

Unnamed: 0,x1,y3,t1,y0,y1,y0t,y1t
count,20.0,20.0,20.0,8.0,12.0,20.0,20.0
mean,81.95,76.6,0.3,73.125,84.083333,73.8,83.85
std,8.999854,5.245549,0.470162,6.812541,9.129954,7.898034,8.430989
min,58.0,61.0,0.0,65.0,61.0,52.0,61.0
25%,76.25,75.0,0.0,66.5,81.5,69.5,79.25
50%,83.5,76.5,0.0,75.0,85.5,75.0,84.5
75%,87.25,80.0,1.0,77.0,89.25,78.5,89.0
max,96.0,87.0,1.0,84.0,96.0,87.0,97.0


### 4.2.3 信頼区間による対応のある場合の2標本t検定

In [15]:
# Paired comparison: work with the per-row difference y1t - y0t.
# NOTE: this rebinds the notebook-global n1 that was set in an earlier cell.
data04['diff'] = data04['y1t'] - data04['y0t']
n1 = data04.shape[0]                                  # number of pairs (rows)
m1 = data04['diff'].mean()                            # mean of the differences
s1 = data04['diff'].std(ddof=1)                       # SD of the differences, n - 1 divisor
talpha = scipy.stats.t.ppf(q=1-0.025, df=n1 - 1)      # 0.975 quantile of t with n1 - 1 df

In [16]:
# Upper limit of the two-sided 95% confidence interval for the mean difference.
m1 + talpha*s1/np.sqrt(n1)

10.666325491505752

In [17]:
# Lower limit of the two-sided 95% confidence interval for the mean difference.
m1 - talpha*s1/np.sqrt(n1)

9.43367450849425

In [18]:
# Cross-check with statsmodels: one-sample t-test and confidence interval
# computed on the 'diff' column.
dsw = statsmodels.stats.weightstats.DescrStatsW(data04['diff'])
_t_test_result = dsw.ttest_mean(alternative='two-sided')
_ttest_confint = dsw.tconfint_mean(alternative='two-sided')
print(_t_test_result, _ttest_confint, sep='\n')

# Collect the five numbers into a single labelled column for display.
pd.DataFrame(
    {'t-test result': [*_t_test_result, *_ttest_confint]},
    index=['t-statistic', 'p-value', 'df', 'confint_low', 'confint_high'],
)

(34.12951766024227, 1.6291445414422036e-18, 19.0)
(9.43367450849425, 10.666325491505752)


Unnamed: 0,t-test result
t-statistic,34.12952
p-value,1.629145e-18
df,19.0
confint_low,9.433675
confint_high,10.66633


### 4.2.4 信頼区間による対応のない場合の2標本t検定

In [19]:
# Unpaired comparison with unequal variances, using the non-missing y1 and y0 values.
y1obs = data04['y1'].dropna()
y0obs = data04['y0'].dropna()
n1, n0 = len(y1obs), len(y0obs)
s1, s0 = y1obs.std(ddof=1), y0obs.std(ddof=1)

# s^2 / n for each group; both the degrees of freedom and the standard error use them.
_v1 = s1**2/n1
_v0 = s0**2/n0

# Welch-Satterthwaite approximation of the degrees of freedom.
num = (_v1 + _v0)**2
denom = (_v1**2)/(n1-1) + (_v0**2)/(n0-1)
df1 = num/denom

xbar = y1obs.mean() - y0obs.mean()        # difference of the two group means
se1 = np.sqrt(_v1 + _v0)                  # standard error of that difference
talpha = scipy.stats.t.ppf(1-0.025, df1)  # 0.975 quantile of t with df1 degrees of freedom

In [20]:
# Upper limit of the two-sided 95% confidence interval for the difference in means.
xbar + talpha * se1

18.469369847637893

In [21]:
# Lower limit of the two-sided 95% confidence interval for the difference in means.
xbar - talpha * se1

3.4472968190287663

In [22]:
# Cross-check with statsmodels: two-sample t-test and confidence interval for
# y1obs vs. y0obs without assuming equal variances.
cm = statsmodels.stats.weightstats.CompareMeans.from_data(y1obs, y0obs)
_t_test_result = cm.ttest_ind(alternative='two-sided', usevar='unequal')
_ttest_confint = cm.tconfint_diff(alternative='two-sided', usevar='unequal')
print(_t_test_result, _ttest_confint, sep='\n')

# Collect the five numbers into a single labelled column for display.
pd.DataFrame(
    {'welch-test result': [*_t_test_result, *_ttest_confint]},
    index=['t-statistic', 'p-value', 'df', 'confint_low', 'confint_high'],
)

(3.0692267168379623, 0.006714432861111201, 17.674073174398252)
(3.447296819028768, 18.46936984763789)


Unnamed: 0,welch-test result
t-statistic,3.069227
p-value,0.006714
df,17.674073
confint_low,3.447297
confint_high,18.46937
