# 第 3 章　使用 Python 进行数据分析｜用 Python 动手学统计学

## 第 2 节　使用 Python 进行描述统计：多变量

### 4. 多变量数据的管理

In [87]:
# 用于数值计算的库
import pandas as pd
import scipy as sp
import numpy as np

# 设置浮点数打印精度
%precision 3

'%.3f'

### 5. 实现：求各分组的统计量

In [88]:
# Load the fish dataset (columns: species, length) from the hosted CSV and show it
url = "https://raw.githubusercontent.com/pineapple-666/Learn-Statistics-with-Python/main/data/3-2-1-fish_multi.csv"
fish_multi = pd.read_csv(url)
print(fish_multi)

  species  length
0       A       2
1       A       3
2       A       4
3       B       6
4       B       8
5       B      10


In [89]:
# Group the rows by fish species, then print the mean length of each group
group = fish_multi.groupby("species")
print(group.mean())

         length
species        
A           3.0
B           8.0


In [90]:
# Per-species standard deviation with ddof=1 (divides by n - 1)
print(group.std(ddof = 1))

         length
species        
A           1.0
B           2.0


In [91]:
# Summary statistics (count, mean, std, min, quartiles, max) for each species
group.describe()

        length
         count mean  std  min  25%  50%  75%   max
species
A          3.0  3.0  1.0  2.0  2.5  3.0  3.5   4.0
B          3.0  8.0  2.0  6.0  7.0  8.0  9.0  10.0


### 6. 实现：列联表

In [92]:
# Load the shoe sales dataset (columns: store, color, sales) from the hosted CSV and show it
url = "https://raw.githubusercontent.com/pineapple-666/Learn-Statistics-with-Python/main/data/3-2-2-shoes.csv"
shoes = pd.read_csv(url)
print(shoes)

   store color  sales
0  tokyo  blue     10
1  tokyo   red     15
2  osaka  blue     13
3  osaka   red      9


In [93]:
# Contingency table: total sales with one row per store and one column per color
cross = shoes.pivot_table(
    values="sales",
    index="store",
    columns="color",
    aggfunc="sum",
)
print(cross)

color  blue  red
store           
osaka    13    9
tokyo    10   15


### 9. 实现：协方差

In [94]:
# Load the paired observations (columns: x, y) from the hosted CSV and show them
url = "https://raw.githubusercontent.com/pineapple-666/Learn-Statistics-with-Python/main/data/3-2-3-cov.csv"
cov_data = pd.read_csv(url)
print(cov_data)

      x   y
0  18.5  34
1  18.7  39
2  19.1  41
3  19.7  38
4  21.5  45
5  21.7  41
6  21.8  52
7  22.0  44
8  23.4  44
9  23.8  49


In [95]:
# Pull the two columns out of the table as separate Series
x, y = cov_data["x"], cov_data["y"]

# Sample size is the number of rows in the table
N = cov_data.shape[0]

# Mean of each variable
mu_x, mu_y = x.mean(), y.mean()

In [96]:
# Sample covariance (biased): sum of the products of deviations, divided by N
cov_sample = sum((x - mu_x) * (y - mu_y)) / N
cov_sample

6.906

In [97]:
# Unbiased covariance: sum of the products of deviations, divided by N - 1
cov = sum((x - mu_x) * (y - mu_y)) / (N - 1)
cov

7.673

### 10. 实现：协方差矩阵

In [98]:
# Sample (biased) variance-covariance matrix: ddof=0 normalizes by N
np.cov(x, y, ddof = 0)

array([[ 3.282,  6.906],
       [ 6.906, 25.21 ]])

In [99]:
# Unbiased variance-covariance matrix: ddof=1 normalizes by N - 1
np.cov(x, y, ddof = 1)

array([[ 3.646,  7.673],
       [ 7.673, 28.011]])

### 13. 实现：皮尔逊积矩相关系数

In [100]:
# Unbiased variance of each variable (ddof=1)
sigma_2_x = np.var(x, ddof = 1)
sigma_2_y = np.var(y, ddof = 1)

# Pearson correlation coefficient: covariance divided by the square root
# of the product of the two variances
rho = cov / np.sqrt(sigma_2_x * sigma_2_y)
rho

np.float64(0.7592719041137088)

In [102]:
# Sample (biased) variance of each variable (ddof=0)
sigma_2_x_sample = np.var(x, ddof = 0)
sigma_2_y_sample = np.var(y, ddof = 0)

# Correlation coefficient from the biased quantities; the normalizing
# factors cancel, so the value agrees with the unbiased version up to
# floating-point rounding
cov_sample / np.sqrt(sigma_2_x_sample * sigma_2_y_sample)

np.float64(0.7592719041137087)

In [103]:
# Correlation matrix of x and y
np.corrcoef(x, y)

array([[1.   , 0.759],
       [0.759, 1.   ]])