# 第 3 章　使用 Python 进行数据分析｜用 Python 动手学统计学

## 第 7 节　参数估计

### 2. 环境准备

In [18]:
# Libraries for numerical computation
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# Libraries for plotting
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

# Set the number of digits shown when floats are printed
%precision 3
# Render figures inline in the Jupyter Notebook
%matplotlib inline

In [19]:
# Load the fish-length data: read the remote CSV and keep only its
# "length" column as a Series, then display it.
URL = "https://raw.githubusercontent.com/pineapple-666/Learn-Statistics-with-Python/main/data/3-7-1-fish_length.csv"
fish = pd.read_csv(URL).loc[:, "length"]
fish

Unnamed: 0,length
0,4.352982
1,3.735304
2,5.944617
3,3.798326
4,4.087688
5,5.265985
6,3.272614
7,3.526691
8,4.150083
9,3.736104


### 4. 实现：点估计

In [20]:
# Point estimate of the population mean: the sample mean of `fish`
mu = fish.mean()
mu

np.float64(4.187039324504523)

In [21]:
# Point estimate of the population variance: the unbiased sample
# variance (ddof = 1 divides by n - 1)
sigma_2 = fish.var(ddof = 1)
sigma_2

0.680

### 9. 实现：区间估计

In [22]:
# Degrees of freedom: sample size minus one
df = fish.size - 1
df

9

In [23]:
# Standard error of the mean: unbiased sample standard deviation
# divided by the square root of the sample size
sigma = fish.std(ddof = 1)
se = sigma / np.sqrt(fish.size)
se

np.float64(0.2608259396768776)

In [24]:
# Interval estimate: 95% confidence interval from a t distribution
# centred on the sample mean and scaled by the standard error
interval = stats.t(df = df, loc = mu, scale = se).interval(
    confidence = 0.95)
interval

(np.float64(3.5970100568063232), np.float64(4.777068592202723))

In [25]:
# Show the documentation of stats.t.interval (IPython `?` help syntax)
stats.t.interval?

### 10. 补充：置信区间的求解细节

In [26]:
# 97.5% quantile of the t distribution with `df` degrees of freedom
t_975 = stats.t(df = df).ppf(0.975)
t_975

np.float64(2.2621571628540993)

In [27]:
# Lower confidence limit: sample mean minus (standard error x t quantile)
lower = mu - se * t_975
lower

np.float64(3.5970100568063232)

In [28]:
# Upper confidence limit: sample mean plus (standard error x t quantile)
upper = mu + se * t_975
upper

np.float64(4.777068592202723)

### 11. 决定置信区间大小的因素

In [31]:
# The larger the sample spread, the wider the confidence interval:
# here the sample standard deviation is inflated tenfold
se2 = 10 * sigma / np.sqrt(fish.size)
stats.t(df = df, loc = mu, scale = se2).interval(confidence = 0.95)

(np.float64(-1.7132533524774765), np.float64(10.087332001486523))

In [34]:
# The larger the sample size, the narrower the confidence interval:
# here the sample size is treated as ten times larger
df2 = 10 * fish.size - 1
se3 = sigma / np.sqrt(10 * fish.size)
stats.t(df = df2, loc = mu, scale = se3).interval(confidence = 0.95)

(np.float64(4.0233803082774395), np.float64(4.350698340731607))

In [36]:
# 99% confidence interval (same estimates, higher confidence level)
stats.t(df = df, loc = mu, scale = se).interval(confidence = 0.99)

(np.float64(3.339397915573443), np.float64(5.034680733435604))

### 12. 区间估计结果的解读

In [37]:
# One boolean flag per simulated trial, all starting as False; a flag
# becomes True when that trial's confidence interval contains the
# population mean (4)
be_included_array = np.full(20000, False)
be_included_array

array([False, False, False, ..., False, False, False])

In [39]:
# Repeat the 95% confidence-interval estimation once per slot of
# be_included_array (20,000 trials) and record, for each trial, whether
# the interval contains the population mean.
np.random.seed(1)  # fixed seed so the simulation is reproducible
population_mean = 4
norm_dist = stats.norm(loc = population_mean, scale = 0.8)
for i in range(len(be_included_array)):
    sample = norm_dist.rvs(size = 10)
    # Trial-specific names keep the fish-sample estimates computed earlier
    # in the notebook (df, mu, se, interval, lower, upper) from being
    # overwritten by the last simulated sample.
    sample_df = len(sample) - 1
    sample_mu = np.mean(sample)
    sample_std = np.std(sample, ddof = 1)
    sample_se = sample_std / np.sqrt(len(sample))
    ci_lower, ci_upper = stats.t.interval(
        confidence = 0.95, df = sample_df, loc = sample_mu, scale = sample_se)
    # True when the interval covers the population mean, False otherwise
    be_included_array[i] = ci_lower <= population_mean <= ci_upper

In [40]:
# Fraction of simulated confidence intervals that contain the population
# mean. The mean of a boolean array is the share of True entries, and it
# is computed inside NumPy rather than by iterating element by element.
be_included_array.mean()

np.float64(0.948)