## Data Preparation

In [1]:
import numpy as np
import pandas as pd

In [6]:
# real-world data is frequently incomplete; NaN marks the missing entry here
d = ['A', 'B', np.nan, 'D']
t = pd.Series(data=d)
# isnull() / isna() test each entry for being missing
t.isnull()
t.isna()
# notnull() / notna() test each entry for being present
t.notnull()
t.notna()  # only this last expression is rendered as the cell output

0     True
1     True
2    False
3     True
dtype: bool

In [8]:
# strategy 1 for NaN: replace it with a chosen placeholder value
# inplace=True mutates t itself instead of returning a new Series,
# so keep a copy of the original data if it is still needed
t.fillna(value='Q', inplace=True)
t

0    A
1    B
2    Q
3    D
dtype: object

In [14]:
# strategy 2 for NaN: discard the missing entries altogether
t.loc[2] = np.nan  # put a NaN back at label 2 for this demonstration
t.dropna(inplace=True)  # every NaN entry is removed from t itself
t

0    A
1    B
3    D
dtype: object

### Handling missing values in a Dataframe

In [40]:
# Seed the generator so the random frame, and every table or statistic
# derived from it, is identical on each Restart & Run All.
rng = np.random.default_rng(seed=42)
# 10 rows x 3 columns of standard-normal samples
df = pd.DataFrame(rng.standard_normal((10, 3)))
df

Unnamed: 0,0,1,2
0,0.842984,0.634378,-0.044482
1,1.59788,1.136194,-0.520279
2,0.516458,-0.325058,1.392601
3,0.437308,0.342863,0.594628
4,0.072029,-1.495749,-0.971898
5,-0.268345,-0.17024,0.927823
6,-0.985438,-0.43546,-0.653627
7,-1.586353,-1.665284,2.165871
8,1.159981,0.444252,0.205474
9,0.24532,-1.647461,0.465678


In [41]:
# knock out a few cells so there is missing data to work with
df.iloc[:4, 1] = np.nan   # rows 0-3 of the second column
df.iloc[1:3, 2] = np.nan  # rows 1-2 of the third column
df

Unnamed: 0,0,1,2
0,0.842984,,-0.044482
1,1.59788,,
2,0.516458,,
3,0.437308,,0.594628
4,0.072029,-1.495749,-0.971898
5,-0.268345,-0.17024,0.927823
6,-0.985438,-0.43546,-0.653627
7,-1.586353,-1.665284,2.165871
8,1.159981,0.444252,0.205474
9,0.24532,-1.647461,0.465678


In [47]:
df.fillna(0) # returns a new frame; the original is only mutated with inplace=True
# alternatively we can propagate neighbouring values and limit the replacements
# (ffill() replaces the deprecated fillna(method='ffill') spelling)
df.ffill() # forward fill down each column (default axis=0)
df.ffill(axis=1) # forward fill across each row, left to right
df.ffill(axis=1, limit=1) # as above, but fill at most one consecutive NaN per gap
# often a good idea to use a meaningful mean as a fill value

Unnamed: 0,0,1,2
0,0.842984,0.842984,-0.044482
1,1.59788,1.59788,
2,0.516458,0.516458,
3,0.437308,0.437308,0.594628
4,0.072029,-1.495749,-0.971898
5,-0.268345,-0.17024,0.927823
6,-0.985438,-0.43546,-0.653627
7,-1.586353,-1.665284,2.165871
8,1.159981,0.444252,0.205474
9,0.24532,-1.647461,0.465678


### Statistical Methods

In [48]:
# summary statistics per column; count only includes the non-missing values
df.describe()

Unnamed: 0,0,1,2
count,10.0,6.0,8.0
mean,0.203182,-0.828324,0.336183
std,0.959138,0.897085,0.973517
min,-1.586353,-1.665284,-0.971898
25%,-0.183252,-1.609533,-0.196768
50%,0.341314,-0.965604,0.335576
75%,0.761353,-0.236545,0.677927
max,1.59788,0.444252,2.165871


In [51]:
# column-wise totals; missing values are skipped rather than propagated
df.sum() # or mean, count, min, max, std...

0    2.031824
1   -4.969941
2    2.689466
dtype: float64

In [52]:
# we can use a statistical mean as a fill value:
# each gap is filled with the mean of its own column
column_means = df.mean()
df.fillna(column_means)

Unnamed: 0,0,1,2
0,0.842984,-0.828324,-0.044482
1,1.59788,-0.828324,0.336183
2,0.516458,-0.828324,0.336183
3,0.437308,-0.828324,0.594628
4,0.072029,-1.495749,-0.971898
5,-0.268345,-0.17024,0.927823
6,-0.985438,-0.43546,-0.653627
7,-1.586353,-1.665284,2.165871
8,1.159981,0.444252,0.205474
9,0.24532,-1.647461,0.465678


In [54]:
# or pick a single value we consider important and fill every gap with it
col1_mean = df.iloc[:, 1].mean()  # mean of the second column (NaN skipped)
df.fillna(value=col1_mean)  # the same scalar is used in every column

Unnamed: 0,0,1,2
0,0.842984,-0.828324,-0.044482
1,1.59788,-0.828324,-0.828324
2,0.516458,-0.828324,-0.828324
3,0.437308,-0.828324,0.594628
4,0.072029,-1.495749,-0.971898
5,-0.268345,-0.17024,0.927823
6,-0.985438,-0.43546,-0.653627
7,-1.586353,-1.665284,2.165871
8,1.159981,0.444252,0.205474
9,0.24532,-1.647461,0.465678
