# 第 2 章　Python 与 Jupyter Notebook 基础｜用 Python 动手学统计学

## 第 4 节　认识 numpy 与 pandas

### 1. 导入用于分析的功能

In [27]:
import numpy as np
import pandas as pd

### 3. 实现：列表

In [28]:
# A plain Python list holding the integers 1 through 5.
sample_list = [1, 2, 3, 4, 5]
sample_list

[1, 2, 3, 4, 5]

In [75]:
# Multiplying a list by an integer repeats its contents; it does not scale the elements.
sample_list *2

[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]

### 5. 实现：数组

In [29]:
# Build a one-dimensional NumPy array from a Python list.
sample_array = np.array([1, 2, 3, 4, 5])
sample_array

array([1, 2, 3, 4, 5])

In [30]:
# Arithmetic on an array is applied to every element: each value is increased by 2.
sample_array + 2

array([3, 4, 5, 6, 7])

In [31]:
# Unlike list repetition, multiplying an array doubles each element.
sample_array * 2

array([ 2,  4,  6,  8, 10])

In [32]:
# Mixing numbers and a string in one array: every element is stored as a string.
np.array([1 ,2, "A"])

array(['1', '2', 'A'], dtype='<U21')

In [33]:
# A nested list (one inner list per row) produces a two-dimensional array, i.e. a matrix.
sample_array_2 = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
])
sample_array_2

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [34]:
# Get the number of rows and columns as a (rows, columns) tuple.
sample_array_2.shape

(2, 5)

### 6. 实现：生成等差数列的方法

In [35]:
# Arithmetic sequence from 1 up to, but not including, 6 in steps of 1.
np.arange(start = 1, stop = 6, step = 1)

array([1, 2, 3, 4, 5])

In [36]:
# Sequence starting at 0.1 with step 0.2; the stop value 0.8 itself is not included.
# NOTE(review): NumPy's documentation suggests np.linspace for non-integer steps,
# because floating-point rounding can affect the number of elements -- confirm
# whether that caveat should be mentioned here.
np.arange(start = 0.1, stop = 0.8, step = 0.2)

array([0.1, 0.3, 0.5, 0.7])

In [37]:
# Same call with positional arguments (start, stop, step).
np.arange(0.1, 0.8, 0.2)

array([0.1, 0.3, 0.5, 0.7])

### 7. 实现：多种生成数组的方式

In [38]:
# Array with the same element repeated: "A" five times.
np.tile("A", 5)

array(['A', 'A', 'A', 'A', 'A'], dtype='<U1')

In [39]:
# Array holding four 0s.
np.tile(0, 4)

array([0, 0, 0, 0])

In [40]:
# Array containing only zeros (shown as floats: 0.).
np.zeros(4)

array([0., 0., 0., 0.])

In [41]:
# Two-dimensional array of zeros with 2 rows and 3 columns.
np.zeros([2,3])

array([[0., 0., 0.],
       [0., 0., 0.]])

In [42]:
# Array containing only ones.
np.ones(3)

array([1., 1., 1.])

### 8. 实现：切片

In [43]:
# One-dimensional array used for the slicing examples.
d1_array = np.array([1, 2, 3, 4, 5])
d1_array

array([1, 2, 3, 4, 5])

In [76]:
# Get the first element (indexing starts at 0).
d1_array[0]

np.int64(1)

In [45]:
# Get the elements at index 1 and index 2 (the end of the slice, 3, is excluded).
d1_array[1:3]

array([2, 3])

In [46]:
# Two-dimensional array (2 rows, 5 columns) used for the slicing examples.
d2_array = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
])
d2_array

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [47]:
# Element at row index 0, column index 3.
d2_array[0, 3]

np.int64(4)

In [48]:
# Row index 1, column indices 2 and 3 (the slice end, 4, is excluded).
d2_array[1, 2:4]

array([8, 9])

### 9. 实现：数据帧

In [77]:
# Show the array that will be used as a DataFrame column.
print(sample_array)

[1 2 3 4 5]


In [49]:
# Build a DataFrame from a dict: each key becomes a column name and each value
# supplies that column's data (an array, a derived array, and a list of strings).
sample_df = pd.DataFrame({
    "col1": sample_array,
    "col2": sample_array * 2,
    "col3": ["A", "B", "C", "D", "E"],
})
print(sample_df)

   col1  col2 col3
0     1     2    A
1     2     4    B
2     3     6    C
3     4     8    D
4     5    10    E


In [50]:
# Evaluating the DataFrame as the last expression of a cell displays it as a table.
sample_df

   col1  col2 col3
0     1     2    A
1     2     4    B
2     3     6    C
3     4     8    D
4     5    10    E


### 10. 实现：读取文件中的数据

In [51]:
# Load a CSV file into a DataFrame; the relative path means the file is looked up
# from the current working directory.
file_data = pd.read_csv("2-4-1-sample_data.csv")
print(file_data)

   col1 col2
0     1    A
1     2    A
2     3    B
3     4    B
4     5    C
5     6    C


In [53]:
# Check the type of the object returned by read_csv.
type(file_data)

pandas.core.frame.DataFrame

### 11. 实现：连接数据帧

In [54]:
# Two small DataFrames with identical column names, used to demonstrate concatenation.
df_1 = pd.DataFrame({
    "col1": np.array([1, 2, 3]),
    "col2": np.array(["A", "B", "C"]),
})
df_2 = pd.DataFrame({
    "col1": np.array([4, 5, 6]),
    "col2": np.array(["D", "E", "F"]),
})

In [55]:
# Concatenate vertically (stack rows); each frame keeps its original index labels.
print(pd.concat([df_1, df_2]))

   col1 col2
0     1    A
1     2    B
2     3    C
0     4    D
1     5    E
2     6    F


In [56]:
# Concatenate horizontally (side by side); the column names appear twice in the result.
print(pd.concat([df_1, df_2], axis = 1))

   col1 col2  col1 col2
0     1    A     4    D
1     2    B     5    E
2     3    C     6    F


### 12. 实现：数据帧的列操作

In [57]:
# The data used in the examples below.
print(sample_df)

   col1  col2 col3
0     1     2    A
1     2     4    B
2     3     6    C
3     4     8    D
4     5    10    E


In [58]:
# Get a column by name using attribute access.
print(sample_df.col2)

0     2
1     4
2     6
3     8
4    10
Name: col2, dtype: int64


In [59]:
# Get the same column using bracket notation with the column name.
print(sample_df["col2"])

0     2
1     4
2     6
3     8
4    10
Name: col2, dtype: int64


In [60]:
# Pass a list of column names to get several columns at once.
print(sample_df[["col2", "col3"]])

   col2 col3
0     2    A
1     4    B
2     6    C
3     8    D
4    10    E


In [61]:
# Drop the specified column (axis = 1 refers to columns); the result is printed, not assigned.
print(sample_df.drop("col1", axis = 1))

   col2 col3
0     2    A
1     4    B
2     6    C
3     8    D
4    10    E


### 13. 实现：数据帧的行操作

In [62]:
# Get the first 3 rows.
print(sample_df.head(n = 3))

   col1  col2 col3
0     1     2    A
1     2     4    B
2     3     6    C


In [63]:
# Get the first row, i.e. the row whose index label is 0.
print(sample_df.query('index == 0'))

   col1  col2 col3
0     1     2    A


In [64]:
# Select rows by a condition on a column value.
print(sample_df.query('col3 == "A"'))

   col1  col2 col3
0     1     2    A


In [65]:
# Select rows matching either condition (OR).
print(sample_df.query('col3 == "A" | col3 == "D"'))

   col1  col2 col3
0     1     2    A
3     4     8    D


In [66]:
# Select rows matching both conditions (AND); no row satisfies both here, so the result is empty.
print(sample_df.query('col3 == "A" & col1 == 3'))

Empty DataFrame
Columns: [col1, col2, col3]
Index: []


In [67]:
# Filter rows with query, then pick columns from the result.
print(sample_df.query('col3 == "A"')[["col2", "col3"]])

   col2 col3
0     2    A


### 14. 补充：序列

In [68]:
# Type of the whole table.
type(sample_df)

pandas.core.frame.DataFrame

In [69]:
# Type of a single column taken from the DataFrame.
type(sample_df.col1)

pandas.core.series.Series

In [70]:
# Convert the column to a NumPy array and check its type.
type(np.array(sample_df.col1))

numpy.ndarray

In [71]:
# The .values attribute of a column also yields a NumPy array.
# NOTE(review): pandas' documentation recommends Series.to_numpy() over .values --
# consider whether that alternative should be shown here.
type(sample_df.col1.values)

numpy.ndarray

### 15. 补充：函数文档

In [72]:
# Show the built-in documentation for the query method.
help(sample_df.query)

Help on method query in module pandas.core.frame:

query(expr: 'str', *, inplace: 'bool' = False, **kwargs) -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Query the columns of a DataFrame with a boolean expression.

    Parameters
    ----------
    expr : str
        The query string to evaluate.

        You can refer to variables
        in the environment by prefixing them with an '@' character like
        ``@a + b``.

        You can refer to column names that are not valid Python variable names
        by surrounding them in backticks. Thus, column names containing spaces
        or punctuations (besides underscores) or starting with digits must be
        surrounded by backticks. (For example, a column named "Area (cm^2)" would
        be referenced as ```Area (cm^2)```). Column names which are Python keywords
        (like "list", "for", "import", etc) cannot be used.

        For example, if one of your columns is called ``a a`` and you want
        