### Cleaning Data and using Numerical Analysis

In [2]:
import numpy as np
import pandas as pd

In [14]:
# Real-world data is frequently incomplete: here the third entry is missing
d = [
    'A',
    'B',
    np.nan,
    'D',
]
y = pd.Series(data=d)
# Two spellings of the same "is this value missing?" check - NaN counts as null
y.isnull()
y.isna()
# ...and their negations; being the last expression, y.notna() is what the cell displays
y.notnull()
y.notna()

0     True
1     True
2    False
3     True
dtype: bool

In [24]:
# One of several strategies for missing data: substitute a placeholder value
y.fillna(value='Q')  # returns a filled result only - y itself is left untouched
# careful - the in-place form overwrites y, so the original NaN is lost
y.fillna(value='Q', inplace=True)  # this time the change persists
y

0    A
1    B
2    Q
3    D
dtype: object

In [34]:
# Put a missing value back at label 2, then remove that row for good.
# dropna(inplace=True) mutates y itself; the surviving rows keep their
# original labels (0, 1, 3) - the index is not renumbered.
y.loc[2] = np.nan
y.dropna(inplace=True)
y

0    A
1    B
3    D
dtype: object

### Handling Missing Data in a DataFrame

In [None]:
# Build a 10x3 frame of standard-normal samples. A locally seeded generator
# keeps the values identical on every Restart & Run All.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((10, 3)))
# Knock out some entries so there is something to clean
df.iloc[0:4, 1] = np.nan  # rows 0-3 of column 1
df.iloc[1:3, 2] = np.nan  # rows 1-2 of column 2
# we can fill NaN like this
df.fillna(0)  # returns a new frame; does not alter the original structure
# axis=1 lets us work across the df: each NaN takes the nearest valid value
# to its left in the same row.
# fillna(method='ffill') is deprecated - ffill() is the supported spelling.
df.ffill(axis=1)

### Statistical Methods

In [64]:
# Per-column summary: count, mean, std, min, quartiles and max
df.describe() # gives useful statistics for the DataFrame
# these ignore NaN - the count row reports only the non-missing entries of each column

Unnamed: 0,0,1,2
count,10.0,6.0,8.0
mean,0.297425,-0.485277,0.430349
std,0.682437,0.663325,0.824252
min,-1.066677,-1.424335,-1.406486
25%,0.139035,-0.91165,0.292426
50%,0.207016,-0.437324,0.760807
75%,0.488197,-0.000171,0.955498
max,1.608524,0.311536,1.017329


In [78]:
# Column-wise reductions. Only the result of the last expression in a cell
# is displayed, so of these four just df.mean() appears in the output.
df.min()
df.max()
df.count() # they all ignore NaN
df.mean()

0    0.297425
1   -0.485277
2    0.430349
dtype: float64

In [84]:
# we can use this to fillna: each NaN is replaced by the mean of its own column
# (a new frame is returned; df itself is unchanged)
# careful - this will affect other stats
df.fillna(value=df.mean())

Unnamed: 0,0,1,2
0,-0.12184,-0.485277,-0.038893
1,0.120382,-0.485277,0.430349
2,0.85021,-0.485277,0.430349
3,0.489082,-0.485277,1.008542
4,0.200977,-1.424335,1.017329
5,-1.066677,-1.034438,-1.406486
6,1.608524,-0.543287,0.402866
7,0.194993,-0.331361,0.937817
8,0.485541,0.110225,0.815822
9,0.213056,0.311536,0.705793


In [88]:
# Mean of the column at position 1
col1_mean = df.iloc[:,1].mean()
col1_mean # the mean value of every significant member of col1 (not displayed - only the cell's last expression is)
# A scalar fill is applied to every NaN in the frame, not only column 1:
# in the output the missing entries of column 2 also receive column 1's mean.
# NOTE(review): if only column 1 should be filled, df.fillna({1: col1_mean}) looks like the targeted form - confirm intent
df.fillna(col1_mean) # use any reasonable significant value


Unnamed: 0,0,1,2
0,-0.12184,-0.485277,-0.038893
1,0.120382,-0.485277,-0.485277
2,0.85021,-0.485277,-0.485277
3,0.489082,-0.485277,1.008542
4,0.200977,-1.424335,1.017329
5,-1.066677,-1.034438,-1.406486
6,1.608524,-0.543287,0.402866
7,0.194993,-0.331361,0.937817
8,0.485541,0.110225,0.815822
9,0.213056,0.311536,0.705793
