# Preparing and Cleaning Data

In [1]:
import numpy as np
import pandas as pd

In [6]:
# A tiny Series with one missing entry (np.nan at position 2)
d = ["A", "B", np.nan, "D"]
y = pd.Series(data=d)  # could equally be built in a single expression
y.isnull()   # boolean mask: True wherever a value is missing (not displayed - only the last expression is)
y.notnull()  # the inverse mask: True wherever a value is present

0     True
1     True
2    False
3     True
dtype: bool

In [12]:
# Replace the missing entry with a placeholder value.
# fillna returns a new Series by default, so the change only sticks on y
# because of inplace=True (rebinding with y = y.fillna(...) is the alternative).
y.fillna(value='Z', inplace=True)
y

0    A
1    B
2    Z
3    D
dtype: object

In [15]:
# Dropping missing data
y.loc[2] = np.nan  # re-introduce the missing value at index label 2
# dropna is non-persistent: it returns a new Series without the missing
# entries and leaves y itself unchanged
x = y.dropna()
x

0    A
1    B
3    D
dtype: object

## Strategies for replacing missing data

In [38]:
# A simple 10x3 DataFrame of standard-normal values to work with.
# The generator is seeded so that Restart & Run All reproduces the same
# numbers (and therefore the same means used for filling further down).
df = pd.DataFrame(np.random.default_rng(seed=42).standard_normal((10, 3)))
df.head()

Unnamed: 0,0,1,2
0,-0.168228,-1.308364,-1.698867
1,2.828216,-2.204205,0.124133
2,-1.806593,-0.464398,-0.116629
3,-1.305852,-0.770174,-0.389876
4,-0.066852,0.380923,-0.809339


In [39]:
# Overwrite a few entries with NaN so there is missing data to clean up.
# Assigning through .iloc writes into df itself, so these changes persist.
df.iloc[0:4, 1] = np.nan  # first four rows of the column at position 1
df.iloc[0:2, 2] = np.nan  # first two rows of the column at position 2
df.head()  # head() shows the first 5 rows by default

Unnamed: 0,0,1,2
0,-0.168228,,
1,2.828216,,
2,-1.806593,,-0.116629
3,-1.305852,,-0.389876
4,-0.066852,0.380923,-0.809339


In [51]:
# There are several approaches to solving this problem. None of the calls
# below modify df (no persistence), and only the last expression is displayed.
df.ffill(axis=1)  # forward-fill across each row (replaces the deprecated fillna(method='ffill'))
df.fillna({1: 0.5, 2: 0.8})  # separate fill value per column, given as a dict of column label -> value
df.dropna(thresh=2)  # keep only rows that have at least two non-NaN values
df.ffill(axis=1, limit=1)  # forward-fill across rows, but at most one consecutive NaN
# perhaps a better strategy is to estimate the missing values from the other values
# for small proportions of data this may be acceptable
df.fillna(df.mean())  # fill NaN with the mean of that column; NaN is ignored in the mean calculation

Unnamed: 0,0,1,2
0,-0.168228,-0.373034,-0.217447
1,2.828216,-0.373034,-0.217447
2,-1.806593,-0.373034,-0.116629
3,-1.305852,-0.373034,-0.389876
4,-0.066852,0.380923,-0.809339
5,-0.949886,-1.32117,0.947123
6,0.356295,-2.908152,0.793289
7,-1.025723,1.026871,-1.33027
8,-0.431662,-0.060493,-0.825608
9,-2.251427,0.643817,-0.008267


In [52]:
# One possible clean-up for this data set: use the mean of the column at
# position 1 as the fill value.
c1_mean = df.iloc[:, 1].mean()
# NOTE(review): a scalar fill value is applied to every column, so the NaNs in
# column 2 also receive column 1's mean. Pass df.mean() instead if each column
# should be filled with its own mean - confirm which was intended.
df.fillna(value=c1_mean, inplace=True)  # careful - inplace=True overwrites df, so we may lose the original data
df

Unnamed: 0,0,1,2
0,-0.168228,-0.373034,-0.373034
1,2.828216,-0.373034,-0.373034
2,-1.806593,-0.373034,-0.116629
3,-1.305852,-0.373034,-0.389876
4,-0.066852,0.380923,-0.809339
5,-0.949886,-1.32117,0.947123
6,0.356295,-2.908152,0.793289
7,-1.025723,1.026871,-1.33027
8,-0.431662,-0.060493,-0.825608
9,-2.251427,0.643817,-0.008267
