# 4. 누락된 데이터 처리하기

#### * pandas에서는 누락된 데이터를 NaN(Not a Number)으로 취급 -> 문자열 데이터도 마찬가지

In [5]:
from pandas import Series, DataFrame

import numpy as np
import pandas as pd

In [6]:
string_data = Series(['apple', 'bananan', np.nan, 'grape'])

In [7]:
string_data

0      apple
1    bananan
2        NaN
3      grape
dtype: object

In [8]:
# Python's None is also treated as a missing value (see the isnull() result below).
string_data[0] =  None

In [9]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### NA 처리 method
* dropna : 누락된 데이터가 있는 축(row, column)을 제외 시킨다.
* fillna : 누락된 데이터를 지정한 값으로 채우거나 'ffill', 'bfill' 같은 보간 메서드로 채운다.
* isnull : 각 값이 누락(NA)되었는지를 나타내는 불리언 객체 반환
* notnull : isnull과 반대

## 4.1 누락된 데이터 골라내기

In [11]:
from numpy import nan as NA

data = Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [12]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [16]:
# DataFrame: with no arguments, dropna() drops every row that contains
# at least one NA, so only the fully populated row survives in `cleaned`.
data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])

cleaned = data.dropna()

In [17]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [19]:
# how='all' drops only the rows in which every value is NA.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [20]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
# thresh option: keep only the rows that hold at least `thresh` non-NA values.
# Start from a 7x3 frame of random normal values to demonstrate it.
df = DataFrame(np.random.randn(7,3))

In [23]:
df

Unnamed: 0,0,1,2
0,-0.866903,-0.449903,-0.357818
1,-1.218773,-0.231147,1.864712
2,0.539903,-0.455512,0.705001
3,0.028176,-0.061069,0.580474
4,-1.591547,-0.536868,-0.498249
5,-0.878292,-0.876356,-0.177811
6,-1.054761,0.014993,0.038097


In [24]:
# .ix was deprecated and later removed from pandas; .loc is its label-based
# replacement. Label slices include the end label, so rows 0-4 of column 1
# and rows 0-2 of column 2 become NA.
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA


In [25]:
df

Unnamed: 0,0,1,2
0,-0.866903,,
1,-1.218773,,
2,0.539903,,
3,0.028176,,0.580474
4,-1.591547,,-0.498249
5,-0.878292,-0.876356,-0.177811
6,-1.054761,0.014993,0.038097


In [29]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
3,0.028176,,0.580474
4,-1.591547,,-0.498249
5,-0.878292,-0.876356,-0.177811
6,-1.054761,0.014993,0.038097


## 4.2 누락된 값 채우기

In [31]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.866903,0.0,0.0
1,-1.218773,0.0,0.0
2,0.539903,0.0,0.0
3,0.028176,0.0,0.580474
4,-1.591547,0.0,-0.498249
5,-0.878292,-0.876356,-0.177811
6,-1.054761,0.014993,0.038097


In [33]:
df.fillna({1:0.5, 2:-1})

Unnamed: 0,0,1,2
0,-0.866903,0.5,-1.0
1,-1.218773,0.5,-1.0
2,0.539903,0.5,-1.0
3,0.028176,0.5,0.580474
4,-1.591547,0.5,-0.498249
5,-0.878292,-0.876356,-0.177811
6,-1.054761,0.014993,0.038097


In [34]:
# With inplace=True, fillna modifies df directly instead of handing back a
# filled copy, so there is no result worth binding to a name.
df.fillna(0, inplace=True)

In [35]:
df

Unnamed: 0,0,1,2
0,-0.866903,0.0,0.0
1,-1.218773,0.0,0.0
2,0.539903,0.0,0.0
3,0.028176,0.0,0.580474
4,-1.591547,0.0,-0.498249
5,-0.878292,-0.876356,-0.177811
6,-1.054761,0.014993,0.038097


In [36]:
# Interpolation methods: build a 6x3 random frame, then blank out the tail
# of columns 1 and 2 so there are gaps to fill.
df = DataFrame(np.random.randn(6,3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.618007,0.431896,0.384354
1,0.779253,-0.520722,1.134516
2,0.870811,,-1.987916
3,0.72686,,0.0133
4,-1.322952,,
5,1.086585,,


In [37]:
# Forward-fill: propagate the last valid value down each column.
# Same result as fillna(method='ffill'), whose `method` keyword is
# deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,0.618007,0.431896,0.384354
1,0.779253,-0.520722,1.134516
2,0.870811,-0.520722,-1.987916
3,0.72686,-0.520722,0.0133
4,-1.322952,-0.520722,0.0133
5,1.086585,-0.520722,0.0133


In [38]:
# limit caps how many consecutive NA values are forward-filled in each column.
# ffill(limit=1) replaces fillna(method='ffill', limit=1), whose `method`
# keyword is deprecated in recent pandas releases.
df.ffill(limit=1)

Unnamed: 0,0,1,2
0,0.618007,0.431896,0.384354
1,0.779253,-0.520722,1.134516
2,0.870811,-0.520722,-1.987916
3,0.72686,,0.0133
4,-1.322952,,0.0133
5,1.086585,,


In [39]:
data = Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### fillna의 함수인자
* value : 비어있는 값을 채울 스칼라값이나 사전 형식의 객체
* method : 보간 방식('ffill', 'bfill' 등), 기본값은 None (지정하지 않으면 value로 채운다)
* axis : 기본값 0
* inplace : 복사본을 생성하지 않고 호출한 객체를 변경, 기본값 False
* limit : 값을 앞 혹은 뒤에서부터 몇 개까지 채울지 지정한다.