# Missing data in NumPy

In [1]:
import numpy as np

In [3]:
# np.nan: the floating-point "not a number" value (used below as a missing-data marker)
# np.inf: floating-point positive infinity
np.nan, np.inf

(nan, inf)

In [4]:
# np.nan is a float, so the whole array becomes float: the ints display as 1., 2., ...
a1 = np.array([1, 2, 3, np.nan, 5, 6])
a1

array([ 1.,  2.,  3., nan,  5.,  6.])

In [5]:
# Element-wise test: True only where the entry is NaN
np.isnan(a1)

array([False, False, False,  True, False, False])

In [6]:
# Same values as a1, but with infinity instead of NaN at index 3
a2 = np.array([1, 2, 3, np.inf, 5, 6])
a2

array([ 1.,  2.,  3., inf,  5.,  6.])

In [8]:
# isfinite is False for the inf entry and True for the ordinary numbers
np.isfinite(a2)

array([ True,  True,  True, False,  True,  True])

In [9]:
# isfinite is also False for NaN, so it flags a1's missing entry too
np.isfinite(a1)

array([ True,  True,  True, False,  True,  True])

In [26]:
# Array containing both kinds of non-finite value: NaN and inf
a3 = np.array([1, np.nan, 3, np.inf, 5])
a3

array([ 1., nan,  3., inf,  5.])

In [11]:
# Hand-written boolean mask with one flag per element of a3 (True = keep)
mask_arr = np.array([True, False, True, False, True])
mask_arr

array([ True, False,  True, False,  True])

In [13]:
# Boolean indexing keeps only the positions marked True; a plain Python list
# of booleans and a NumPy boolean array select the same elements
a3[[True, False, True, False, True]], a3[mask_arr]

(array([1., 3., 5.]), array([1., 3., 5.]))

In [14]:
# Build the mask from the data: True where the entry is neither NaN nor inf
np.isfinite(a3)

array([ True, False,  True, False,  True])

In [15]:
# Use the isfinite mask to keep only the finite values
a3[np.isfinite(a3)]

array([1., 3., 5.])

In [17]:
# ~ inverts the boolean mask: True now marks the NaN and inf entries
~np.isfinite(a3)

array([False,  True, False,  True, False])

In [18]:
# Select only the non-finite entries (NaN and inf)
a3[~np.isfinite(a3)]

array([nan, inf])

In [19]:
# Small array with a single missing value
a4 = np.array([1, 2, np.nan, 4])
a4

array([ 1.,  2., nan,  4.])

In [21]:
# Locate the missing value
np.isnan(a4)

array([False, False,  True, False])

In [23]:
# Assigning through the boolean mask overwrites the NaN entries in place
a4[np.isnan(a4)] = 1000
a4

array([   1.,    2., 1000.,    4.])

In [25]:
# isnan matches NaN only, so the inf entry in a3 is left untouched
a3[np.isnan(a3)] = 1000
a3

array([   1., 1000.,    3.,   inf,    5.])

In [27]:
# Reset a3 to its original contents (it was modified in place above)
a3 = np.array([1, np.nan, 3, np.inf, 5])
a3

array([ 1., nan,  3., inf,  5.])

In [30]:
# ~isfinite matches both NaN and inf, so both entries are replaced
a3[~np.isfinite(a3)] = 1000
a3

array([   1., 1000.,    3., 1000.,    5.])

In [31]:
# 2-D example: a 3x3 array with two missing values
a5 = np.array([
    [1, 2, np.nan],
    [4, np.nan, 6],
    [7, 8, 9]
])
a5

array([[ 1.,  2., nan],
       [ 4., nan,  6.],
       [ 7.,  8.,  9.]])

In [32]:
# Boolean-mask indexing on a 2-D array returns the selected values as a flat 1-D array
a5[np.isnan(a5)]

array([nan, nan])

In [33]:
# Called with only a condition, np.where returns a tuple of index arrays:
# (row indices, column indices) of the True entries
np.where(np.isnan(a5))

(array([0, 1], dtype=int64), array([2, 1], dtype=int64))

In [46]:
# Element [1] of the np.where result holds the column index of each NaN
np.where(np.isnan(a5))[1]

array([2, 1], dtype=int64)

- The first NaN is at index (0, 2): row 0, column 2
- The second NaN is at index (1, 1): row 1, column 1

In [34]:
a5

array([[ 1.,  2., nan],
       [ 4., nan,  6.],
       [ 7.,  8.,  9.]])

In [35]:
# A plain mean propagates NaN: the missing values make the result NaN
np.mean(a5)

nan

In [44]:
# nanmean ignores NaN entries when averaging.
# axis=0 gives one mean per column, axis=1 gives one mean per row.
col_mean, row_mean = np.nanmean(a5, axis=0), np.nanmean(a5, axis=1)
col_mean, row_mean

(array([4. , 5. , 7.5]), array([1.5, 5. , 8. ]))

- axis=0: aggregate down the rows, giving one result per column (column-wise)
- axis=1: aggregate across the columns, giving one result per row (row-wise)

In [48]:
# Column index of each NaN, used next to look up the matching column mean
np.where(np.isnan(a5))[1]

array([2, 1], dtype=int64)

In [47]:
# For each NaN, pick the mean of the column it sits in
np.take(col_mean, np.where(np.isnan(a5))[1])

array([7.5, 5. ])

In [49]:
a5

array([[ 1.,  2., nan],
       [ 4., nan,  6.],
       [ 7.,  8.,  9.]])

In [50]:
# Impute each NaN with the mean of its column.
# The NaN mask is computed once and reused: on the left it selects the NaN
# cells, on the right it supplies their column indices. Both sides visit the
# NaN cells in the same row-major order, so the values line up one-to-one.
nan_mask = np.isnan(a5)
a5[nan_mask] = np.take(col_mean, np.where(nan_mask)[1])

In [51]:
a5

array([[1. , 2. , 7.5],
       [4. , 5. , 6. ],
       [7. , 8. , 9. ]])

In [56]:
# Rebuild a5 with its NaNs so the row-mean imputation below starts from the raw data
a5 = np.array([
    [1, 2, np.nan],
    [4, np.nan, 6],
    [7, 8, 9]
])
a5

array([[ 1.,  2., nan],
       [ 4., nan,  6.],
       [ 7.,  8.,  9.]])

In [57]:
row_mean

array([1.5, 5. , 8. ])

In [58]:
# The two NaN cells that are about to be filled in
a5[np.isnan(a5)]

array([nan, nan])

In [59]:
# For each NaN, pick the mean of the row it sits in
# (element [0] of the np.where result holds the row indices)
np.take(row_mean, np.where(np.isnan(a5))[0])

array([1.5, 5. ])

In [None]:
# Impute each NaN with the mean of its row.
# The NaN mask is computed once and reused: on the left it selects the NaN
# cells, on the right it supplies their row indices. Both sides visit the
# NaN cells in the same row-major order, so the values line up one-to-one.
# In effect, the two selected cells [nan, nan] receive the values [1.5, 5.].
nan_mask = np.isnan(a5)
a5[nan_mask] = np.take(row_mean, np.where(nan_mask)[0])
a5

array([[1. , 2. , 1.5],
       [4. , 5. , 6. ],
       [7. , 8. , 9. ]])