In [1]:
import pandas as pd
import numpy as np

# One-column frame mixing ints, a float and a NaN; the column is stored as float64.
df = pd.DataFrame({"numbers": [5, 27.3, np.nan, -16]})
df

Unnamed: 0,numbers
0,5.0
1,27.3
2,
3,-16.0


In [2]:
# pd.NA, np.nan and None are three different spellings of "missing";
# all of them render as empty cells in the displayed frame.
numbers = pd.DataFrame(
    data=[pd.NA, 27.3, np.nan, -16, None],
    columns=["numbers"],
)
numbers

Unnamed: 0,numbers
0,
1,27.3
2,
3,-16.0
4,


In [3]:
# Same three missing-value markers, this time mixed into a string column.
fruits = pd.DataFrame(
    data=["orange", np.nan, "apple", None, "banana", pd.NA],
    columns=["fruit"],
)
fruits

Unnamed: 0,fruit
0,orange
1,
2,apple
3,
4,banana
5,


In [4]:
# Boolean mask of missing entries: np.nan, None and pd.NA all come back True.
fruits.isnull()

Unnamed: 0,fruit
0,False
1,True
2,False
3,True
4,False
5,True


# 处理缺失值

In [5]:
# Small 4x4 frame used by the cells below; every column holds at least one
# missing value and column C is missing in every row.
nan_df = pd.DataFrame(
    data=[
        [np.nan, 2, None, 0],
        [3, 4, np.nan, 1],
        [5, np.nan, np.nan, pd.NA],
        [np.nan, 3, np.nan, 4],
    ],
    columns=["A", "B", "C", "D"],
)

nan_df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,5.0,,,
3,,3.0,,4.0


In [6]:
# Replace every missing entry in the frame with the same constant.
nan_df.fillna(value=0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,5.0,0.0,0.0,0
3,0.0,3.0,0.0,4


In [7]:
# Per-column fill values: the mapping keys are column labels.
nan_df.fillna(value=dict(A=0, B=1, C=2, D=3))

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,5.0,1.0,2.0,3
3,0.0,3.0,2.0,4


In [8]:
# Forward fill: carry the last valid value down each column.
# A leading NaN (nothing above it) and an all-NaN column stay missing.
# `DataFrame.ffill()` is used because `fillna(method="ffill")` has been
# deprecated since pandas 2.1.
nan_df.ffill()

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,5.0,4.0,,1
3,5.0,3.0,,4


In [9]:
# Backward fill: pull the next valid value up each column.
# A trailing NaN (nothing below it) and an all-NaN column stay missing.
# `DataFrame.bfill()` is used because `fillna(method="bfill")` has been
# deprecated since pandas 2.1.
nan_df.bfill()

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0
1,3.0,4.0,,1
2,5.0,3.0,,4
3,,3.0,,4


In [10]:
# Cap how many missing values get filled: with limit=1 only the first NaN
# in each column is replaced; later ones are left as they are.
nan_df.fillna(
    value={"A": 0, "B": 1, "C": 2, "D": 3},
    limit=1,
)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,,1
2,5.0,1.0,,3
3,,3.0,,4


# 直接过滤掉缺失值

In [11]:
# dropna on a single column (a Series): keeps only the non-missing entries.
nan_df.loc[:, "A"].dropna()

1    3.0
2    5.0
Name: A, dtype: float64

In [12]:
# Drop every column that contains a missing value. Here each column has at
# least one NaN, so no data is left - only the row index remains.
nan_df.dropna(axis="columns")

0
1
2
3


In [13]:
# Drop only rows in which *every* value is missing. No row of nan_df is
# entirely empty, so the frame comes back unchanged.
nan_df.dropna(axis="index", how="all")

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,5.0,,,
3,,3.0,,4.0


In [14]:
# Integer frame in which -99 appears in place of some values.
# NOTE(review): presumably -99 is a Stata-style missing-value code - confirm.
stata_df = pd.DataFrame(
    data=[
        [3, 4, 5],
        [-7, 4, -99],
        [-99, 6, 5],
    ],
    columns=["A", "B", "C"],
)
stata_df

Unnamed: 0,A,B,C
0,3,4,5
1,-7,4,-99
2,-99,6,5


In [15]:
# Swap every -99 in the frame for a proper missing value.
stata_df.replace(to_replace={-99: pd.NA})

Unnamed: 0,A,B,C
0,3.0,4,5.0
1,-7.0,4,
2,,6,5.0


# 隐式缺失值

In [16]:
# Quarterly prices with two different kinds of gap:
#   - 2020 Q4 is *explicitly* missing: the row exists but its price is NaN.
#   - 2021 Q1 is *implicitly* missing: there is no row for it at all.
stocks = pd.DataFrame(
    {
        "year": [2020, 2020, 2020, 2020, 2021, 2021, 2021],
        "qtr": [1, 2, 3, 4, 2, 3, 4],
        "price": [1.88, 0.59, 0.35, np.nan, 0.92, 0.17, 2.66],
    }
)
stocks

Unnamed: 0,year,qtr,price
0,2020,1,1.88
1,2020,2,0.59
2,2020,3,0.35
3,2020,4,
4,2021,2,0.92
5,2021,3,0.17
6,2021,4,2.66


## Pivoting

In [17]:
# Reshape to one row per year and one column per quarter. The quarter with
# no source row (2021 Q1) now shows up as an explicit NaN cell.
stocks.pivot(
    index="year",
    columns="qtr",
    values="price",
)

qtr,1,2,3,4
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020,1.88,0.59,0.35,
2021,,0.92,0.17,2.66


## Categorical 类型变量的缺失

In [18]:
# Toy health survey; "smoker" is converted to a categorical column so that
# its set of categories is remembered independently of the rows present.
health = pd.DataFrame(
    data=dict(
        name=["Ikaia", "Oletta", "Leriah", "Dashay", "Tresaun"],
        smoker=["no", "no", "previously", "no", "yes"],
        age=[34, 88, 75, 47, 56],
    )
)
health["smoker"] = health["smoker"].astype("category")

In [20]:
# Manually drop the last row, which leaves no rows with smoker == "yes".
health_cut = health.iloc[:-1]
health_cut

Unnamed: 0,name,smoker,age
0,Ikaia,no,34
1,Oletta,no,88
2,Leriah,previously,75
3,Dashay,no,47


In [21]:
# value_counts on a categorical column: the unobserved "yes" category is
# still listed, with a count of 0.
health_cut.loc[:, "smoker"].value_counts()

smoker
no            3
previously    1
yes           0
Name: count, dtype: int64

In [22]:
# Or with groupby. `observed=False` is passed explicitly so the unobserved
# "yes" category is kept in the result (with a NaN mean). Relying on the
# default emits a FutureWarning on pandas >= 2.1, and the default is
# scheduled to change to observed=True, which would drop that row.
health_cut.groupby("smoker", observed=False)["age"].mean()

smoker
no            56.333333
previously    75.000000
yes                 NaN
Name: age, dtype: float64