# Handling Missing Data

## 1. `None`

In [1]:
import numpy as np
import pandas as pd

In [3]:
# A Python None cannot be stored in a native numeric array, so NumPy
# infers dtype=object for this array.
val1 = np.array([1, None, 3, 4])
val1

array([1, None, 3, 4], dtype=object)

In [4]:
for dtype in ['object', 'int']:
    print("dtype=", dtype)
    %timeit np.arange(1E6, dtype= dtype).sum()
    print()

dtype= object
365 ms ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

dtype= int
16.6 ms ± 4.54 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [6]:
# Aggregations such as sum() or max() fail on an object array that contains
# None, because Python cannot add an int and None.
try:
    val1.sum()
except TypeError as exc:
    # The error is the point of this cell: display it without aborting
    # a "Restart Kernel -> Run All".
    print(f"{type(exc).__name__}: {exc}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

## 2. `NaN`: Missing Numerical Data

In [8]:
# NaN is itself a floating-point value, so mixing it with integers yields a
# native float64 array rather than an object array.
val2 = np.array([1, np.nan, 3, 4])
val2.dtype

dtype('float64')

In [9]:
# Display the array: the integers were upcast to floats alongside NaN.
val2

array([ 1., nan,  3.,  4.])

In [10]:
# Arithmetic with NaN yields NaN.
1 + np.nan

nan

In [11]:
# Even multiplying by zero does not clear a NaN.
0*np.nan

nan

In [12]:
# Division involving NaN is NaN as well.
np.nan/3

nan

In [13]:
# NaN raised to the power NaN is still NaN.
np.nan**np.nan

nan

In [14]:
# Unlike the None case, the aggregate does not raise: a single NaN makes
# the whole sum NaN.
val2.sum()

nan

In [20]:
# pandas accepts None and NaN in the same Series and stores both as NaN,
# giving a float64 Series.
val3 = pd.Series([1, np.nan, 2, None])
val3

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

### Operating on Null Values

In [23]:
val3.isnull()  # True where a value is missing, False otherwise

0    False
1     True
2    False
3     True
dtype: bool

In [24]:
# Opposite mask of isnull(): True where a value is present.
val3.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [26]:
val3.dropna()     # returns a new Series without the missing entries

0    1.0
2    2.0
dtype: float64

In [27]:
# val3 still contains its NaNs: dropna() did not modify it in place.
val3

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [30]:
val3.fillna(4)     # replace each missing value with 4

0    1.0
1    4.0
2    2.0
3    4.0
dtype: float64

In [31]:
# Small 3x3 frame with a few missing entries, used by the DataFrame examples
# that follow.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [3, 4, 5],
        [np.nan, np.nan, 7],
    ]
)

In [33]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,0,1,2
0,1.0,,2
1,3.0,4.0,5
2,,,7


In [34]:
# Element-wise mask: True wherever an entry is missing.
df.isnull()

Unnamed: 0,0,1,2
0,False,True,False
1,False,False,False
2,True,True,False


In [37]:
# Drop every column that contains at least one missing value.
df.dropna(axis="columns")

Unnamed: 0,2
0,2
1,5
2,7


In [38]:
# Drop every row that contains at least one missing value.
df.dropna(axis="index")

Unnamed: 0,0,1,2
1,3.0,4.0,5


In [46]:
# Add a column labelled 3 filled entirely with NaN. This assignment mutates
# df in place.
df[3] = np.nan

In [47]:
# NOTE(review): the saved output below lists an extra "(3, 5)" column that no
# visible cell creates - presumably left over from a deleted cell (execution
# counts jump from 38 to 46). Restart the kernel and re-run to confirm.
df

Unnamed: 0,0,1,2,3,"(3, 5)"
0,1.0,,2.0,,
1,3.0,4.0,5.0,,
2,,,7.0,,


In [48]:
# Replace every missing entry with 0; the result is a new frame.
df.fillna(0)

Unnamed: 0,0,1,2,3,"(3, 5)"
0,1.0,0.0,2.0,0.0,0.0
1,3.0,4.0,5.0,0.0,0.0
2,0.0,0.0,7.0,0.0,0.0


In [49]:
# df itself still contains its NaNs: fillna(0) did not modify it in place.
df

Unnamed: 0,0,1,2,3,"(3, 5)"
0,1.0,,2.0,,
1,3.0,4.0,5.0,,
2,,,7.0,,


In [50]:
# Forward fill: carry the last valid value down each column. Entries with no
# valid value above them stay NaN.
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated in pandas 2.1+.
df.ffill()

Unnamed: 0,0,1,2,3,"(3, 5)"
0,1.0,,2.0,,
1,3.0,4.0,5.0,,
2,3.0,4.0,7.0,,


In [51]:
# Backward fill: pull the next valid value up each column. Entries with no
# valid value below them stay NaN.
# DataFrame.bfill() replaces fillna(method='bfill'); the `method` argument of
# fillna is deprecated in pandas 2.1+.
df.bfill()

Unnamed: 0,0,1,2,3,"(3, 5)"
0,1.0,4.0,2.0,,
1,3.0,4.0,5.0,,
2,,,7.0,,
