## Dealing with incomplete (missing) data values

In [8]:
# Build a small Series in which one entry (np.nan) is missing.
d = ['A', 'B', np.nan, 'D']
y = pd.Series(data=d)
# isna()/notna() are the same methods as isnull()/notnull() under another name.
y.isna()   # True where a value is missing (not shown: it is not the last expression)
y.notna()  # True where a value is present; this is the result the cell displays

0     True
1     True
2    False
3     True
dtype: bool

In [15]:
# One way to handle NaN: substitute a placeholder value.
# fillna() hands back a new Series; y itself only changes if inplace=True is passed.
y.fillna(value='Z')

0    A
1    B
2    Z
3    D
dtype: object

In [21]:
# Make sure label 2 holds NaN again (it still does unless the earlier fill was
# run with inplace=True).
y.loc[2] = np.nan
# dropna() returns a new Series without the NaN entries; the surviving
# entries keep their original index labels.
z = y.dropna()
z

0    A
1    B
3    D
dtype: object

### Strategies for dealing with missing data

In [24]:
# Seed the generator so the random frame (and every output derived from it)
# is identical on each Restart & Run All.
rng = np.random.default_rng(42)
# 10 rows x 3 columns drawn from the standard normal distribution
df = pd.DataFrame(rng.standard_normal((10, 3)))
df

Unnamed: 0,0,1,2
0,0.387291,1.212205,-0.985311
1,0.710327,0.212245,-1.496423
2,-1.3475,0.405347,0.823658
3,-2.588114,1.426408,0.057494
4,0.132214,0.790627,-0.922519
5,0.680132,-0.096506,0.056333
6,0.189512,-0.77198,-0.074373
7,-0.74706,0.673265,2.085774
8,0.8221,-0.730361,1.15456
9,-0.065887,0.368034,-0.156443


In [36]:
# Blank out a few cells so there is missing data to experiment with.
df.iloc[0:4, 1] = np.nan  # column 1: first four rows (positions 0-3)
df.iloc[0:2, 2] = np.nan  # column 2: first two rows (positions 0-1)
df.head()

Unnamed: 0,0,1,2
0,0.387291,,
1,0.710327,,
2,-1.3475,,0.823658
3,-2.588114,,0.057494
4,0.132214,0.790627,-0.922519


In [42]:
# Several ways to handle the NaN values. Every call below returns a new frame
# and leaves df untouched; only the last expression is displayed.
# ffill() is used instead of fillna(method='ffill'), which recent pandas
# versions deprecate.
df.fillna(0)  # replace every NaN with 0
df.ffill(axis=1)  # forward fill along each row (left to right)
df.fillna({1: 0.5, 2: 0.8})  # a specific fill value per column label
df.dropna(thresh=2)  # keep only rows that have at least two non-NaN values
df.ffill(axis=1, limit=1)  # forward fill at most one consecutive NaN per gap in a row

Unnamed: 0,0,1,2
0,0.387291,,
1,0.710327,,
2,-1.3475,,0.823658
3,-2.588114,,0.057494
4,0.132214,0.790627,-0.922519
5,0.680132,-0.096506,0.056333
6,0.189512,-0.77198,-0.074373
7,-0.74706,0.673265,2.085774
8,0.8221,-0.730361,1.15456
9,-0.065887,0.368034,-0.156443


In [48]:
# Option 1: fill each column's NaN with that column's own mean (df.mean() is
# computed per column). The filled frame is discarded here; df keeps its NaNs.
df.fillna(df.mean())
# Option 2: derive a single representative number - here the mean of the
# column at position 1 - and use it for every NaN in the frame.
c1_mean = df.iloc[:, 1].mean()
# careful: inplace=True overwrites df itself, so the original data
# (including where the NaNs were) is lost
df.fillna(value=c1_mean, inplace=True)
df

Unnamed: 0,0,1,2
0,0.387291,0.038847,0.038847
1,0.710327,0.038847,0.038847
2,-1.3475,0.038847,0.823658
3,-2.588114,0.038847,0.057494
4,0.132214,0.790627,-0.922519
5,0.680132,-0.096506,0.056333
6,0.189512,-0.77198,-0.074373
7,-0.74706,0.673265,2.085774
8,0.8221,-0.730361,1.15456
9,-0.065887,0.368034,-0.156443
