### 7.1 Handling missing data

In [231]:
from IPython.core.interactiveshell import InteractiveShell
# Display setting for the IPython shell: with 'none', no top-level expression
# in a cell has its value echoed automatically.
InteractiveShell.ast_node_interactivity = 'none'

In [232]:
import pandas as pd
import numpy as np

##### Scenario 1

In [233]:
# Echo the value of every top-level expression in a cell (not only the last
# one), so each cell below shows its results without explicit print/display.
InteractiveShell.ast_node_interactivity = 'all'

In [234]:
# Four entries, the third of which is missing (NaN)
raw_words = ['aardward', 'artichoke', np.nan, 'avocado']
string_data = pd.Series(raw_words)
string_data

0     aardward
1    artichoke
2          NaN
3      avocado
dtype: object

In [235]:
# Element-wise missing-value mask (isna is the same check as isnull)
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [236]:
# Python's built-in None is treated as a missing (NA) value as well: after
# this assignment, isnull() reports position 0 as missing alongside the NaN
# already stored at position 2.
string_data[0] = None
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [237]:
# Re-check the mask: both the None at 0 and the NaN at 2 count as missing
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

#### 7.1.1 Filtering out missing data

In [238]:
from numpy import nan as NA

In [239]:
## Missing values in a pandas Series: dropna() returns only the non-NA entries
values_with_gaps = [1, NA, 3.5, NA, 7]
data = pd.Series(values_with_gaps)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

 > The same result can also be obtained with boolean indexing:

In [240]:
# Boolean mask that is True wherever a value is present, used to index `data`
has_value = data.notna()
data[has_value]

0    1.0
2    3.5
4    7.0
dtype: float64

In [241]:
## Missing values in a pandas DataFrame: a 4x3 frame with scattered NAs,
## plus an extra column (labelled 4) that is entirely NA
rows_with_gaps = [
    [1, 6.5, 3],
    [1, NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3],
]
data_df = pd.DataFrame(rows_with_gaps)
data_df[4] = NA
data_df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [242]:
# Defaults spelled out: drop every row holding at least one NA. Column 4 is
# entirely NA, so no row survives and the result is empty.
cleaned_df_i = data_df.dropna(axis=0, how='any')
cleaned_df_i

Unnamed: 0,0,1,2,4


In [243]:
## how='all' drops only the rows in which every value is NA (row 2 here)
cleaned_df_ii = data_df.dropna(axis=0, how='all')
cleaned_df_ii

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
3,,6.5,3.0,


In [244]:
## Column-wise variant: drop only the columns that are entirely NA (column 4)
cleaned_df_iii = data_df.dropna(axis='columns', how='all')
cleaned_df_iii

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [245]:
## Column-wise with how='any': drop every column containing at least one NA.
## Each column of data_df has an NA, so only the row index remains.
cleaned_df_iv = data_df.dropna(axis='columns', how='any')
cleaned_df_iv

0
1
2
3


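> A related way to filter out DataFrame rows tends to concern time series data: keep only the rows that contain at least a given number of observations (the `thresh` argument of `dropna`).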

In [246]:
# Draw the 7x3 standard-normal sample from a seeded, local generator so the
# frame is identical on every Restart & Run All; a local generator also
# leaves NumPy's global random state untouched.
rng = np.random.default_rng(42)
time_series_df = pd.DataFrame(rng.standard_normal((7, 3)))
time_series_df

Unnamed: 0,0,1,2
0,-0.27163,0.147173,1.03076
1,-0.4785,0.1113,0.274277
2,1.073974,0.701161,-1.246204
3,-0.34609,-1.605487,0.473288
4,-2.496468,0.652347,-0.316954
5,2.317377,0.261494,-0.054233
6,0.382525,1.725468,-0.119406


In [247]:
# Blank out part of the frame in place (positional indexing):
# rows 0-3 of column 1, then rows 0-1 of column 2.
time_series_df.iloc[0:4, 1] = NA
time_series_df.iloc[0:2, 2] = NA
time_series_df

Unnamed: 0,0,1,2
0,-0.27163,,
1,-0.4785,,
2,1.073974,,-1.246204
3,-0.34609,,0.473288
4,-2.496468,0.652347,-0.316954
5,2.317377,0.261494,-0.054233
6,0.382525,1.725468,-0.119406


In [248]:
# Default behaviour made explicit: drop each row that has any missing value
time_series_df.dropna(axis=0, how='any')

Unnamed: 0,0,1,2
4,-2.496468,0.652347,-0.316954
5,2.317377,0.261494,-0.054233
6,0.382525,1.725468,-0.119406


In [249]:
## dropna works row-wise (axis=0) by default; thresh=2 keeps only the rows
## that contain at least 2 non-NA values, so rows 0 and 1 (one value each)
## are dropped while rows 2 and 3 (two values each) are kept.
time_series_df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,1.073974,,-1.246204
3,-0.34609,,0.473288
4,-2.496468,0.652347,-0.316954
5,2.317377,0.261494,-0.054233
6,0.382525,1.725468,-0.119406
