## Cleaning Data

In [1]:
import numpy as np
import pandas as pd

In [6]:
# we sometimes need to handle incomplete data
d = ['A', 'B', np.nan, 'D']
y = pd.Series(d)
y.isnull()
y.isna()
y.notna()
y.notnull()
y

0      A
1      B
2    NaN
3      D
dtype: object

In [9]:
y.fillna('Q') # most operations do NOT mutate the original structure
# a good idea is to keep an original data set in case you need it!
y.fillna('Q', inplace=True) # mutate the original structure
y

0    A
1    B
2    Q
3    D
dtype: object

In [11]:
# Another approach is to ignore missing values
y[2] = np.nan
y
y.dropna(inplace=True) # completely remove missing values
y

0    A
1    B
3    D
dtype: object

### Handling Missing Values in a DataFrame

In [37]:
df = pd.DataFrame(np.random.randn(10,3))
df # we have default numbers for column and row names

Unnamed: 0,0,1,2
0,0.306208,0.262878,1.101778
1,0.695931,-1.62823,0.964918
2,-0.381231,-2.226511,-0.524983
3,0.602854,2.015498,0.467988
4,-1.012172,-1.717569,2.512003
5,-1.20954,0.310654,-0.045502
6,0.826998,-0.937561,0.202977
7,-1.329425,-0.634511,1.491468
8,-0.377988,-0.585688,0.29057
9,-1.31849,1.632071,-0.451307


In [38]:
df.iloc[0:4,1] = np.nan
df.iloc[1:3,2] = np.nan # start:stop-before
df

Unnamed: 0,0,1,2
0,0.306208,,1.101778
1,0.695931,,
2,-0.381231,,
3,0.602854,,0.467988
4,-1.012172,-1.717569,2.512003
5,-1.20954,0.310654,-0.045502
6,0.826998,-0.937561,0.202977
7,-1.329425,-0.634511,1.491468
8,-0.377988,-0.585688,0.29057
9,-1.31849,1.632071,-0.451307


In [39]:
# There are several strategies to deal with missing members
df.fillna(0) # one option is to fill missing members
df # remember the original structure is NOT alters


Unnamed: 0,0,1,2
0,0.306208,,1.101778
1,0.695931,,
2,-0.381231,,
3,0.602854,,0.467988
4,-1.012172,-1.717569,2.512003
5,-1.20954,0.310654,-0.045502
6,0.826998,-0.937561,0.202977
7,-1.329425,-0.634511,1.491468
8,-0.377988,-0.585688,0.29057
9,-1.31849,1.632071,-0.451307


In [42]:
# one solution is fill with a value
df.fillna(method='ffill')
df.fillna(method='ffill', axis=1) # work across axes
df.fillna(method='ffill', axis=1, limit=1)

Unnamed: 0,0,1,2
0,0.306208,0.306208,1.101778
1,0.695931,0.695931,
2,-0.381231,-0.381231,
3,0.602854,0.602854,0.467988
4,-1.012172,-1.717569,2.512003
5,-1.20954,0.310654,-0.045502
6,0.826998,-0.937561,0.202977
7,-1.329425,-0.634511,1.491468
8,-0.377988,-0.585688,0.29057
9,-1.31849,1.632071,-0.451307


### Statistical Methods

In [43]:
df.describe() # count will ignore NaN

Unnamed: 0,0,1,2
count,10.0,6.0,8.0
mean,-0.319685,-0.322101,0.696247
std,0.874638,1.158652,0.95774
min,-1.329425,-1.717569,-0.451307
25%,-1.160198,-0.861799,0.140857
50%,-0.37961,-0.6101,0.379279
75%,0.528692,0.086569,1.199201
max,0.826998,1.632071,2.512003


In [47]:
df.min()
df.max()
df.count()
df.mean()

0   -0.319685
1   -0.322101
2    0.696247
dtype: float64

In [48]:
# we can use statistical values with fillNA
df.fillna( df.mean() )

Unnamed: 0,0,1,2
0,0.306208,-0.322101,1.101778
1,0.695931,-0.322101,0.696247
2,-0.381231,-0.322101,0.696247
3,0.602854,-0.322101,0.467988
4,-1.012172,-1.717569,2.512003
5,-1.20954,0.310654,-0.045502
6,0.826998,-0.937561,0.202977
7,-1.329425,-0.634511,1.491468
8,-0.377988,-0.585688,0.29057
9,-1.31849,1.632071,-0.451307


In [51]:
# we could use a significant value
col1_mean = df.iloc[:, 1].mean()
col1_mean
df.fillna(col1_mean)
# /tmp/ipykernel_1957829/1193302488.py:1: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.306208,,1.101778
1,0.695931,,1.101778
2,-0.381231,,1.101778
3,0.602854,,0.467988
4,-1.012172,-1.717569,2.512003
5,-1.20954,0.310654,-0.045502
6,0.826998,-0.937561,0.202977
7,-1.329425,-0.634511,1.491468
8,-0.377988,-0.585688,0.29057
9,-1.31849,1.632071,-0.451307
