## Data Cleaning

In [1]:
import numpy as np
import pandas as pd

In [6]:
# Build a small Series with one missing entry (NaN) to experiment with.
d = ['A', 'B', np.nan, 'D']
y = pd.Series(data=d)
# isnull() and isna() are aliases: both mark missing entries with True.
y.isnull()
y.isna()
# notnull() (alias: notna()) is the inverse mask; as the last expression
# in the cell it is the only result that gets displayed.
y.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [13]:
# Replace every missing entry with the placeholder 'Q'.
# Careful: inplace=True modifies y itself rather than returning a filled
# copy, so the original (with its NaN) is permanently altered.
y.fillna(value='Q', inplace=True)
y

0    A
1    B
2    Q
3    D
dtype: object

In [17]:
# Alternative strategy: discard missing entries instead of filling them.
y.loc[2] = np.nan  # put the missing value back at index label 2
y.dropna(inplace=True)  # removes the NaN entry from y itself
y

0    A
1    B
3    D
dtype: object

### Dealing with missing data

In [33]:
# we need a fresh Data Frame: 10 rows x 3 columns of standard-normal noise
# A seeded Generator makes the values reproducible on every Restart & Run All.
# NOTE: the outputs stored in this notebook were produced by an unseeded run,
# so the displayed numbers will change the next time the cells are executed.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.standard_normal((10, 3)))
df

Unnamed: 0,0,1,2
0,1.712254,-0.778728,-0.133787
1,0.550387,-0.390722,-0.821818
2,0.496294,1.356833,-1.906342
3,-1.210811,-1.003817,1.578137
4,-0.977566,0.438075,0.231316
5,-0.252998,0.698888,1.425739
6,-0.106238,0.6019,-0.496458
7,-0.031155,0.551449,-0.629961
8,-0.288486,1.209465,1.126861
9,0.46336,-0.396923,0.585565


In [34]:
# Knock out a few cells so there is missing data to work with.
df.iloc[1:4, 1] = np.nan  # rows 1-3 of column 1
df.iloc[0:2, 2] = np.nan  # rows 0-1 of column 2
df

Unnamed: 0,0,1,2
0,1.712254,-0.778728,
1,0.550387,,
2,0.496294,,-1.906342
3,-1.210811,,1.578137
4,-0.977566,0.438075,0.231316
5,-0.252998,0.698888,1.425739
6,-0.106238,0.6019,-0.496458
7,-0.031155,0.551449,-0.629961
8,-0.288486,1.209465,1.126861
9,0.46336,-0.396923,0.585565


In [41]:
# there are several strategies to deal with the problem
df.fillna(0) # one solution is to fill missing values with a known value
# fillna(method='ffill') is deprecated since pandas 2.1; ffill() is the
# direct replacement and accepts the same axis / limit arguments
df.ffill() # forward-fill values down each column (axis=0 is the default)
df.ffill(axis=1) # fill across the rows (axis=1)
df # note - the changes do NOT persist: each call above returns a new DataFrame
# we can limit how many consecutive missing values get filled
df.ffill(axis=1, limit=1)

Unnamed: 0,0,1,2
0,1.712254,-0.778728,-0.778728
1,0.550387,0.550387,
2,0.496294,0.496294,-1.906342
3,-1.210811,-1.210811,1.578137
4,-0.977566,0.438075,0.231316
5,-0.252998,0.698888,1.425739
6,-0.106238,0.6019,-0.496458
7,-0.031155,0.551449,-0.629961
8,-0.288486,1.209465,1.126861
9,0.46336,-0.396923,0.585565


### Statistical Methods and Strategies

In [42]:
# Built-in summary statistics per column (quartiles spelled out; these are
# the defaults). The count row only includes non-missing values.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,0,1,2
count,10.0,7.0,8.0
mean,0.035504,0.332018,0.239357
std,0.834288,0.683356,1.195741
min,-1.210811,-0.778728,-1.906342
25%,-0.279614,0.020576,-0.529834
50%,-0.068696,0.551449,0.408441
75%,0.488061,0.650394,1.20158
max,1.712254,1.209465,1.578137


In [45]:
# Column-wise reductions (axis=0); missing values are skipped by default.
df.max(axis=0)
df.min(axis=0)
df.mean(axis=0)  # last expression, so this is the result that is displayed

0    0.035504
1    0.332018
2    0.239357
dtype: float64

In [47]:
# Fill each column's missing values with the statistical mean of that
# column's remaining values (df.mean() yields one mean per column).
df.fillna(value=df.mean())

Unnamed: 0,0,1,2
0,1.712254,-0.778728,0.239357
1,0.550387,0.332018,0.239357
2,0.496294,0.332018,-1.906342
3,-1.210811,0.332018,1.578137
4,-0.977566,0.438075,0.231316
5,-0.252998,0.698888,1.425739
6,-0.106238,0.6019,-0.496458
7,-0.031155,0.551449,-0.629961
8,-0.288486,1.209465,1.126861
9,0.46336,-0.396923,0.585565


In [49]:
# Alternatively, compute one significant value and use it as the replacement.
# Here that value is the mean of column 1 (NaN entries are ignored by mean()).
col1_mean = df.iloc[:, 1].mean()
col1_mean
# A scalar fill value is applied to every column, so the gaps in column 2
# also receive column 1's mean.
df.fillna(value=col1_mean)

Unnamed: 0,0,1,2
0,1.712254,-0.778728,0.332018
1,0.550387,0.332018,0.332018
2,0.496294,0.332018,-1.906342
3,-1.210811,0.332018,1.578137
4,-0.977566,0.438075,0.231316
5,-0.252998,0.698888,1.425739
6,-0.106238,0.6019,-0.496458
7,-0.031155,0.551449,-0.629961
8,-0.288486,1.209465,1.126861
9,0.46336,-0.396923,0.585565
