## Cleaning Data

In [2]:
import numpy as np
import pandas as pd

In [11]:
# Real-world data is often incomplete; here np.nan stands in for a missing entry.
d = ["A", "B", np.nan, "D"]
y = pd.Series(data=d)
# Boolean masks over the Series: isnull() and isna() flag the missing entries ...
y.isnull()
y.isna()
# ... while notna() and notnull() flag the entries that are present.
y.notna()
y.notnull()  # only this final expression is rendered as the cell's output

0     True
1     True
2    False
3     True
dtype: bool

In [15]:
# fillna() hands back a new Series; this call leaves y itself untouched
# (most pandas operations do NOT mutate the original structure).
y.fillna(value='Q')
# With inplace=True the replacement is written into y directly instead.
y.fillna(value='Q', inplace=True)
y

0    A
1    B
2    Q
3    D
dtype: object

In [21]:
# Another approach is to ignore missing values altogether.
y.loc[2] = np.nan  # put a missing value back at label 2
y
y.dropna(inplace=True)  # completely remove the missing entries from y itself
y

0    A
1    B
3    D
dtype: object

### Handling Missing Values in a DataFrame

In [24]:
# Draw the sample from a seeded generator so that Restart Kernel -> Run All
# rebuilds exactly the same 10x3 frame (the unseeded global np.random state
# produced different numbers on every run).
rng = np.random.default_rng(42)  # fixed, arbitrary seed for reproducibility
df = pd.DataFrame(rng.standard_normal((10, 3)))
df  # rows and columns get default integer labels

Unnamed: 0,0,1,2
0,0.009677,-0.348117,-0.963492
1,-0.46129,0.492645,-0.200994
2,1.870393,-0.211207,-0.181809
3,0.338563,-0.821811,0.010285
4,0.749232,1.967595,0.66501
5,1.843452,0.622885,0.493104
6,-2.241966,0.576459,-0.642578
7,0.407063,0.609616,-0.939494
8,0.793409,-0.602349,0.23422
9,0.668387,0.205581,-0.918211


In [29]:
# Blank out a few entries to simulate missing data. Positional slices are
# half-open (start:stop stops before `stop`), so the first assignment hits
# rows 0-3 of column 1 and the second hits rows 1-2 of column 2.
df.iloc[:4, 1] = np.nan
df.iloc[1:3, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.009677,,-0.963492
1,-0.46129,,
2,1.870393,,
3,0.338563,,0.010285
4,0.749232,1.967595,0.66501
5,1.843452,0.622885,0.493104
6,-2.241966,0.576459,-0.642578
7,0.407063,0.609616,-0.939494
8,0.793409,-0.602349,0.23422
9,0.668387,0.205581,-0.918211


In [31]:
# There are several strategies to deal with missing members
df.fillna(0) # one option is to fill missing members
df # remember the original structure is NOT alters


Unnamed: 0,0,1,2
0,0.009677,,-0.963492
1,-0.46129,,
2,1.870393,,
3,0.338563,,0.010285
4,0.749232,1.967595,0.66501
5,1.843452,0.622885,0.493104
6,-2.241966,0.576459,-0.642578
7,0.407063,0.609616,-0.939494
8,0.793409,-0.602349,0.23422
9,0.668387,0.205581,-0.918211
