## pandasやNumPyにおける欠損値のデータ型
<table>
    <thead>
        <tr>
            <th>表記</th>
            <th>データ型</th>
            <th>クラス</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>NaN</td>
            <td><m-b>float型</m-b></td>
            <td><m-b>numpy.nan</m-b></td>
        </tr>
        <tr>
            <td>NA</td>
            <td><m-b>int型</m-b> (Int64などのnullable型。boolean型・string型でも使用)</td>
            <td><m-b>pandas.NA</m-b></td>
        </tr>
        <tr>
            <td>NaT</td>
            <td><m-b>datetime</m-b></td>
            <td><m-b>pandas.NaT</m-b></td>
        </tr>
    </tbody>
</table>

In [3]:
# Series containing NaN.
# The four random floats are stored under the even labels 0, 2, 4, 6;
# reindexing to 0..3 asks for labels 1 and 3, which do not exist, so those
# positions are filled with NaN.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
float_ser = (
    pd.Series(data=rng.random(4), index=range(0, 8, 2))
    .reindex(index=range(4))
)
float_ser

0    0.511822
1         NaN
2    0.950464
3         NaN
dtype: float64

In [5]:
# Series containing NaT.
# Four consecutive days are stored under labels 0, 2, 4, 6; after reindexing
# to 0..3 the unmatched labels 1 and 3 hold NaT, the datetime missing marker.
dt_ser = pd.Series(
    data=pd.date_range(start="2025-02-12", periods=4, freq="D"),
    index=range(0, 8, 2),
).reindex(index=range(4))
dt_ser

0   2025-02-12
1          NaT
2   2025-02-13
3          NaT
dtype: datetime64[ns]

In [10]:
# Series containing NA.
# The nullable "Int64" extension dtype keeps the data as integers; the labels
# 1 and 3 introduced by the reindex are filled with <NA> instead of forcing a
# conversion to float.
rng = np.random.default_rng(1)
int_ser = (
    pd.Series(
        data=rng.integers(low=0, high=10, size=4),
        index=range(0, 8, 2),
        dtype="Int64",
    )
    .reindex(index=range(4))
)
int_ser

0       4
1    <NA>
2       5
3    <NA>
dtype: Int64

In [12]:
# Show the dtype object of int_ser: the pandas nullable-integer extension
# dtype (Int64Dtype), not NumPy's int64.
int_ser.dtype

Int64Dtype()

In [17]:
# Type conversion in pandas: how a None inside the input list is represented
# depends on the other values in that list.
from IPython.display import display

display(
    # integers + None -> float64, None becomes NaN
    pd.Series([1, None, 3]),
    # booleans + None -> object, None is kept as-is
    pd.Series([True, None, False]),
    # floats + None -> float64, None becomes NaN
    pd.Series([1.0, None, 3.0]),
    # strings + None -> object, None is kept as-is
    pd.Series(["a", None, "c"]),
    # timestamps + None -> datetime64[ns], None becomes NaT
    pd.Series(
        [pd.to_datetime("2025-12-01"), None, pd.to_datetime("2025-12-03")]
    ),
)

0    1.0
1    NaN
2    3.0
dtype: float64

0     True
1     None
2    False
dtype: object

0    1.0
1    NaN
2    3.0
dtype: float64

0       a
1    None
2       c
dtype: object

0   2025-12-01
1          NaT
2   2025-12-03
dtype: datetime64[ns]

In [21]:
# Evaluating comparisons between missing-value markers.

# NaN: ==, > and < are all False; only != is True.
display(
    np.nan == np.nan,
    np.nan > np.nan,
    np.nan < np.nan,
    np.nan != np.nan,
)

# NaT behaves like NaN: == is False, != is True.
display(pd.NaT == pd.NaT, pd.NaT != pd.NaT)

# NA propagates: both comparisons evaluate to <NA> rather than a boolean.
display(pd.NA == pd.NA, pd.NA != pd.NA)

False

False

False

True

False

True

<NA>

<NA>

In [23]:
# The original Series followed by four spellings of the same missing-value
# mask: the top-level functions pd.isna / pd.isnull and the Series methods
# .isna() / .isnull() all mark rows 1 and 3 as True.
display(
    float_ser,
    pd.isna(float_ser),
    pd.isnull(float_ser),
    float_ser.isna(),
    float_ser.isnull(),
)

0    0.511822
1         NaN
2    0.950464
3         NaN
dtype: float64

0    False
1     True
2    False
3     True
dtype: bool

0    False
1     True
2    False
3     True
dtype: bool

0    False
1     True
2    False
3     True
dtype: bool

0    False
1     True
2    False
3     True
dtype: bool

In [24]:
# The same mask works for the nullable-integer Series: <NA> rows are True.
int_ser.isna()

0    False
1     True
2    False
3     True
dtype: bool

In [26]:
# ...and for the datetime Series: NaT rows are True.
dt_ser.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [34]:
# Calculations on data that contains missing values.
# Each display() call shows a label followed by the corresponding result.
display(int_ser)

# size counts every element, missing ones included.
display("size", int_ser.size)
# count() counts only the non-missing elements.
display("count()", int_ser.count())
# sum() skips <NA> by default ...
display("sum()", int_ser.sum())
# ... and returns <NA> as soon as skipping is turned off.
display("sum(skipna=False)", int_ser.sum(skipna=False))
# mean() is taken over the non-missing elements only.
display("mean()", int_ser.mean())
# cumsum() leaves <NA> in place but keeps accumulating after it.
display("cumsum()", int_ser.cumsum())
# With skipna=False everything from the first <NA> onwards becomes <NA>.
display("cumprod(skipna=False)", int_ser.cumprod(skipna=False))

0       4
1    <NA>
2       5
3    <NA>
dtype: Int64

'size'

4

'count()'

np.int64(2)

'sum()'

np.int64(9)

'sum(skipna=False)'

<NA>

'mean()'

np.float64(4.5)

'cumsum()'

0       4
1    <NA>
2       9
3    <NA>
dtype: Int64

'cumprod(skipna=False)'

0       4
1    <NA>
2    <NA>
3    <NA>
dtype: Int64

## 欠損値が発生するパターン

### <m-b>MCAR</m-b> (<m-b>Missing Completely At Random</m-b>): 完全にランダムに発生する場合

ある列の欠損値の発生が、他の列の値にも、その列自身の(欠損した)値にも依存しない場合。
欠損値は無視できる(欠損のある行を削除しても偏りは生じないが、サンプルサイズは減る)。

### <m-b>MAR</m-b> (<m-b>Missing At Random</m-b>): 条件はあるがランダムに発生する場合
欠損の発生が、観測されている他の列の値(条件)には依存するが、欠損した値そのものには依存しない場合。
その条件で層別して考える場合は欠損を無視できるが、そうでない場合は無視できない。

### <m-b>NMAR</m-b> (<m-b>Not Missing At Random</m-b>): ランダムには発生しない場合
欠損の発生が欠損した値そのものに依存するなど、系統的に欠損値が発生する場合。欠損値を無視できない。MNAR (Missing Not At Random) とも呼ばれる。

In [37]:
# Check for missing values: load the penguins table from parquet and count
# the missing entries in each column.
df = pd.read_parquet("../result/penguins.parquet")
df.isna().sum()

Species             0
Island              0
Individual_ID       0
Date_Egg            0
Culmen_Length       2
Culmen_Depth        2
Flipper_Length      2
Body_Mass           2
Sex                11
Comments          290
Species_Short       0
dtype: int64

In [39]:
# (rows, columns) of the full table, as a baseline for the dropna() calls.
df.shape

(344, 11)

In [38]:
# Pairwise deletion: drop the rows where any of the target columns is
# missing. Here the only target column is Culmen_Length.
df.dropna(subset=["Culmen_Length"]).shape

(342, 11)

In [40]:
# Pairwise deletion on two target columns: a row is dropped when either
# Culmen_Length or Sex is missing.
df.dropna(subset=["Culmen_Length", "Sex"]).shape

(333, 11)

In [42]:
# Complete-case (listwise) deletion: drop every row that has a missing value
# in any column.
df.dropna().shape

(43, 11)

In [45]:
# Drop the columns (axis=1) that contain a missing value; all rows are kept.
df.dropna(axis=1).shape

(344, 5)

In [47]:
# how="any" vs. how="all":
#   "any" drops a row when at least one of the subset columns is missing,
#   "all" drops it only when every subset column is missing.
print(df.dropna(subset=["Culmen_Length", "Sex"], how="any").shape)
print(df.dropna(subset=["Culmen_Length", "Sex"], how="all").shape)

(333, 11)
(342, 11)


In [50]:
# dropna() on a Series: the missing entries are removed and the remaining
# values keep their original index labels.
df.loc[:, "Comments"].dropna()

0                         Not enough blood for isotopes.
3                                     Adult not sampled.
6                  Nest never observed with full clutch.
7                  Nest never observed with full clutch.
8                              No blood sample obtained.
9                   No blood sample obtained for sexing.
10                  No blood sample obtained for sexing.
11                             No blood sample obtained.
12                        Not enough blood for isotopes.
13                        Not enough blood for isotopes.
15                        Not enough blood for isotopes.
28                 Nest never observed with full clutch.
29                 Nest never observed with full clutch.
38                 Nest never observed with full clutch.
39     Nest never observed with full clutch. Not enou...
41                        Not enough blood for isotopes.
46                        Not enough blood for isotopes.
47     Sexing primers did not a

In [None]:
# Missing-value imputation (univariate imputation).
# NOTE(review): the lambda is the identity, so this transform returns
# Body_Mass unchanged and nothing is imputed (row 3 is still NaN in the
# output). Presumably a "before" view for the per-species mean imputation in
# the following cell -- confirm it is not an unfinished placeholder.
df.groupby("Species_Short", observed=True)["Body_Mass"].transform(
    lambda x: x
)

0      3750.0
1      3800.0
2      3250.0
3         NaN
4      3450.0
        ...  
339    4000.0
340    3400.0
341    3775.0
342    4100.0
343    3775.0
Name: Body_Mass, Length: 344, dtype: float64

In [56]:
# Fill each missing Body_Mass with the mean Body_Mass of the same species:
# transform() hands the lambda one species group at a time and returns the
# results aligned to the original row order.
(
    df.groupby("Species_Short", observed=True)["Body_Mass"]
    .transform(lambda group_mass: group_mass.fillna(group_mass.mean()))
)

0      3750.000000
1      3800.000000
2      3250.000000
3      3700.662252
4      3450.000000
          ...     
339    4000.000000
340    3400.000000
341    3775.000000
342    4100.000000
343    3775.000000
Name: Body_Mass, Length: 344, dtype: float64

In [59]:
# Imputation when the rows have an order.
# First look at the raw values: row 3 is missing.
df.loc[:, "Body_Mass"].head()

0    3750.0
1    3800.0
2    3250.0
3       NaN
4    3450.0
Name: Body_Mass, dtype: float64

In [60]:
# Forward fill: the missing row takes the value of the previous row.
df.loc[:, "Body_Mass"].ffill().head()

0    3750.0
1    3800.0
2    3250.0
3    3250.0
4    3450.0
Name: Body_Mass, dtype: float64

In [62]:
# Backward fill: the missing row takes the value of the next row.
df.loc[:, "Body_Mass"].bfill().head()

0    3750.0
1    3800.0
2    3250.0
3    3450.0
4    3450.0
Name: Body_Mass, dtype: float64

In [64]:
# Linear interpolation: the missing row becomes the midpoint of its two
# neighbours (3250.0 and 3450.0 -> 3350.0).
df.loc[:, "Body_Mass"].interpolate(method="linear").head()

0    3750.0
1    3800.0
2    3250.0
3    3350.0
4    3450.0
Name: Body_Mass, dtype: float64

In [65]:
# Default interpolation is linear and ignores the index labels: the values
# are treated as equally spaced, so the gap is filled with the midpoint 5.5.
pd.Series([1, None, 10], index=[1, 10, 100]).interpolate()

1       1.0
10      5.5
100    10.0
dtype: float64

In [66]:
# method="index" uses the numeric index labels as the x-axis: label 10 lies
# 9/99 of the way from 1 to 100, giving 1 + 9 * 9/99 = 1.818182.
pd.Series([1, None, 10], index=[1, 10, 100]).interpolate(method="index")

1       1.000000
10      1.818182
100    10.000000
dtype: float64

In [67]:
# method="values" produces the same result as method="index" here.
pd.Series([1, None, 10], index=[1, 10, 100]).interpolate(method="values")

1       1.000000
10      1.818182
100    10.000000
dtype: float64

In [68]:
# method="time" weights by the elapsed time between the timestamps in the
# index: 02-05 is 4 of the 27 days from 02-01 to 02-28, giving
# 1 + 9 * 4/27 = 2.333333.
pd.Series(
    [1, None, 10],
    index=[
        pd.Timestamp("2025-02-01"),
        pd.Timestamp("2025-02-05"),
        pd.Timestamp("2025-02-28"),
    ],
).interpolate(method="time")

2025-02-01     1.000000
2025-02-05     2.333333
2025-02-28    10.000000
dtype: float64