## Preparing and Cleaning Data

In [1]:
import pandas as pd
import numpy as np

In [7]:
y = pd.Series(['AAA', 'BBB', np.nan, 'DDD'])
y.isnull()

y[0] = None
y.notnull()
y.fillna('new Value') # we can replace values with our own alternatives

0    new Value
1          BBB
2    new Value
3          DDD
dtype: object

## filter out missing data

In [13]:
x = y.dropna() # any NaN or None values are removed from the output (does not persist)
x

1    BBB
3    DDD
dtype: object

In [15]:
# we CAN force the changes to persist
y.dropna(inplace=True) # the changes persist in the original structure
y

1    BBB
3    DDD
dtype: object

## Fill in missing values

In [47]:
# simple series to work with
df = pd.DataFrame(np.random.randn(10,3))
# inject some missing values
df.iloc[:4,1] = np.nan
df.iloc[:2,2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.229359,,
1,0.996537,,
2,0.940707,,-0.350989
3,-0.707323,,-0.240707
4,-0.050891,-1.43838,-0.978631
5,0.235573,-0.274778,0.001522
6,-0.075725,-0.980495,0.293504
7,-2.125656,-0.636884,-0.200232
8,-1.960446,-0.007234,-2.719418
9,1.015953,-1.317787,1.187072


In [51]:
# use fillna
c1mean = df.iloc[:, 1].mean()
df.fillna(c1mean, inplace=True)
df

Unnamed: 0,0,1,2
0,-0.229359,-0.775926,-0.775926
1,0.996537,-0.775926,-0.775926
2,0.940707,-0.775926,-0.350989
3,-0.707323,-0.775926,-0.240707
4,-0.050891,-1.43838,-0.978631
5,0.235573,-0.274778,0.001522
6,-0.075725,-0.980495,0.293504
7,-2.125656,-0.636884,-0.200232
8,-1.960446,-0.007234,-2.719418
9,1.015953,-1.317787,1.187072


## Remove Duplicates

In [62]:
df = pd.DataFrame({
    'A': ['I', 'II','I','II','I','II','I','II','III'],
    'B': [1,    2,   3,  4,   1,  2,   3,  4,   3]
})
df.duplicated() # True for any rows which are duplicates of earlier rows
df.drop_duplicates(inplace=True)
df

Unnamed: 0,A,B
0,I,1
1,II,2
2,I,3
3,II,4
8,III,3


## Replacing Values

In [63]:
df.replace(4,22, inplace=True)
df

Unnamed: 0,A,B
0,I,1
1,II,2
2,I,3
3,II,22
8,III,3
