### 7.1 Handling missing data

In [125]:
from IPython.core.interactiveshell import InteractiveShell
# 'none' turns off IPython's automatic display of expression results, so the
# setup cells that follow produce no output. It is set back to 'all' before
# the first example.
InteractiveShell.ast_node_interactivity = 'none'

In [126]:
import pandas as pd
import numpy as np

##### Scenario 1

In [127]:
# Re-enable automatic output: with 'all', IPython displays the value of every
# expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [128]:
# Series of strings with a single missing entry (np.nan) at position 2
string_data = pd.Series(
    data=['aardward', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardward
1    artichoke
2          NaN
3      avocado
dtype: object

In [129]:
# Boolean mask: True wherever the Series holds a missing value
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [130]:
# Python's built-in None is also treated as a missing (NA) value by pandas.
# Note: this overwrites the first element in place, so `string_data` no longer
# matches the Series displayed by the earlier cell.
string_data[0] = None
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [131]:
# Position 0 (now None) is reported as missing alongside the original NaN
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

#### 7.1.1 Filtering out missing data

In [132]:
from numpy import nan as NA

In [133]:
## Missing values in a pandas Series:
## dropna() returns a new Series holding only the observed values
data = pd.Series(data=[1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

 > The same result can also be obtained with boolean indexing:

In [134]:
# Select the observed values with a boolean mask instead of dropna()
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [135]:
## Missing data values in Pandas Dataframe object
data_df = pd.DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3]])
data_df[4] = NA
data_df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [136]:
# Defaults spelled out: drop every row that contains at least one missing value.
# Column 4 is missing everywhere, so no row survives.
cleaned_df_i = data_df.dropna(axis=0, how='any')
cleaned_df_i

Unnamed: 0,0,1,2,4


In [137]:
## how='all' drops a row only when every value in it is missing
cleaned_df_ii = data_df.dropna(axis='index', how='all')
cleaned_df_ii

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
3,,6.5,3.0,


In [138]:
## Column-wise dropna with how='all': remove only the columns that are entirely missing
cleaned_df_iii = data_df.dropna(axis='columns', how='all')
cleaned_df_iii

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [139]:
## Column-wise dropna with how='any': remove every column that has at least one missing value
cleaned_df_iv = data_df.dropna(axis='columns', how='any')
cleaned_df_iv

0
1
2
3


> A related way to filter out DataFrame rows tends to concern time series data: keep only the rows that contain at least a given number of observations.

In [140]:
# 7x3 frame of standard-normal draws used by the dropna/fillna examples below.
# A seeded local generator makes the values (and every output derived from
# them) identical on each Restart & Run All; the seed value itself is arbitrary.
time_series_df = pd.DataFrame(np.random.default_rng(0).standard_normal((7, 3)))
time_series_df

Unnamed: 0,0,1,2
0,2.168387,-0.46152,0.864721
1,-2.680354,0.196821,-0.332535
2,-0.138909,-0.095041,-1.573011
3,-0.064755,0.068935,0.296041
4,-1.887263,0.577991,0.723275
5,0.922961,-0.378781,0.02157
6,1.159377,0.629359,-1.319896


In [141]:
# Blank out the first four rows of column 1 and the first two rows of column 2,
# then keep an independent copy for the fillna examples.
time_series_df.iloc[0:4, 1] = np.nan
time_series_df.iloc[0:2, 2] = np.nan
time_series_df_ii = time_series_df.copy()
time_series_df

Unnamed: 0,0,1,2
0,2.168387,,
1,-2.680354,,
2,-0.138909,,-1.573011
3,-0.064755,,0.296041
4,-1.887263,0.577991,0.723275
5,0.922961,-0.378781,0.02157
6,1.159377,0.629359,-1.319896


In [142]:
# Keep only the rows in which every column has a value
time_series_df.dropna(axis=0, how='any')

Unnamed: 0,0,1,2
4,-1.887263,0.577991,0.723275
5,0.922961,-0.378781,0.02157
6,1.159377,0.629359,-1.319896


In [143]:
## DataFrame.dropna defaults to axis=0 and how='any'.
## dropna(thresh=2) keeps only the rows that have at least 2 non-NA values
## (rows 0 and 1 hold a single observed value each, so they are dropped).
time_series_df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.138909,,-1.573011
3,-0.064755,,0.296041
4,-1.887263,0.577991,0.723275
5,0.922961,-0.378781,0.02157
6,1.159377,0.629359,-1.319896


#### 7.1.2 Filling in missing data

In [144]:
# Display the copy of the NaN-injected frame made earlier; the fillna
# examples that follow start from it.
time_series_df_ii

Unnamed: 0,0,1,2
0,2.168387,,
1,-2.680354,,
2,-0.138909,,-1.573011
3,-0.064755,,0.296041
4,-1.887263,0.577991,0.723275
5,0.922961,-0.378781,0.02157
6,1.159377,0.629359,-1.319896


In [145]:
# Replace every missing value with 0; returns a new frame, the source is untouched
time_series_df_ii.fillna(0)

Unnamed: 0,0,1,2
0,2.168387,0.0,0.0
1,-2.680354,0.0,0.0
2,-0.138909,0.0,-1.573011
3,-0.064755,0.0,0.296041
4,-1.887263,0.577991,0.723275
5,0.922961,-0.378781,0.02157
6,1.159377,0.629359,-1.319896


In [146]:
# Per-column fill values: column 1 -> 0.5, column 2 -> 0; returns a new frame
time_series_df_ii.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,2.168387,0.5,0.0
1,-2.680354,0.5,0.0
2,-0.138909,0.5,-1.573011
3,-0.064755,0.5,0.296041
4,-1.887263,0.577991,0.723275
5,0.922961,-0.378781,0.02157
6,1.159377,0.629359,-1.319896


In [152]:
## Same interpolation methods available for reindexing can be used with fillna
# 6x3 frame of standard-normal draws; a seeded local generator keeps the example
# reproducible across kernel restarts (the seed value itself is arbitrary).
random_df_ii = pd.DataFrame(np.random.default_rng(1).standard_normal((6, 3)))
# Leave trailing gaps: column 1 is missing from row 2 on, column 2 from row 4 on
random_df_ii.iloc[2:, 1] = NA
random_df_ii.iloc[4:, 2] = NA
random_df_ii

Unnamed: 0,0,1,2
0,0.422906,1.514798,0.681579
1,0.110767,-0.018314,-0.003011
2,0.359667,,2.658296
3,-0.113994,,2.311931
4,-0.580803,,
5,0.136022,,


In [153]:
# Forward-fill: propagate the last observed value in each column down into the
# gaps below it. DataFrame.ffill() replaces fillna(method='ffill'), whose
# `method` keyword is deprecated in recent pandas releases.
random_df_ii.ffill()

Unnamed: 0,0,1,2
0,0.422906,1.514798,0.681579
1,0.110767,-0.018314,-0.003011
2,0.359667,-0.018314,2.658296
3,-0.113994,-0.018314,2.311931
4,-0.580803,-0.018314,2.311931
5,0.136022,-0.018314,2.311931
