# Pandas Basics IV

In [1]:
# Pandas Basics
from IPython.display import Image

import numpy as np
from numpy import nan as NA
import pandas as pd
from pandas import Series, DataFrame

### 4. Handling missing data
> 누락된 데이터 처리하기

- pandas의 설계 목표 중 하나는 누락 데이터를 가능한 한 쉽게 처리할 수 있도록 하는 것이다.
- pandas는 실수형(float) 데이터든 아니든 누락된 값을 모두 부동소수점 값 NaN(Not a Number)으로 표현한다.
- 그래서 누락된 값을 쉽게 찾을 수 있다.

In [2]:
# Series of strings with one missing entry (np.nan) at label 2.
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# Boolean mask: True wherever an entry of string_data is missing.
pd.isnull(string_data)

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# Python's None is also reported as missing by isnull().
string_data[0] = None
pd.isnull(string_data)

0     True
1    False
2     True
3    False
dtype: bool

### Filtering out missing data
> 누락된 데이터 골라내기

- Series에 dropna 메소드를 적용하면, 누락되지 않은 실제 데이터와 그 색인값만 담은 새 Series를 반환한다.

In [5]:
# NA (alias for numpy.nan) is imported in the notebook's top import cell.
# Series with two missing entries; dropna() keeps only the non-missing
# values together with their original index labels.
data = Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
# Same selection as dropna(), written as explicit boolean indexing.
data[~data.isnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
# 4x3 frame mixing complete, partially missing, and fully missing rows.
data = pd.DataFrame(
    [
        [1., 6.5, 3.],
        [1., np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [10]:
# Default dropna behaviour spelled out: drop every row containing any NaN.
cleaned = data.dropna(axis=0, how='any')
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
# how='all' drops only the rows in which every value is NA.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [12]:
# Add a new column labelled 4 that contains only NaN.
data[4] = np.nan

In [13]:
# Display the frame again; column 4 assigned in the previous cell holds only NaN.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [14]:
# Drop only the columns in which every value is NA (here: column 4).
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
# 7x3 frame of standard-normal samples (values differ on every run).
df = pd.DataFrame(data=np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.239685,-0.840354,0.507486
1,0.610204,-1.447994,0.539701
2,0.133834,0.09379,0.246034
3,1.351323,0.322845,0.730635
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


In [16]:
# Blank out the first four rows of column 1 and the first two rows of column 2.
df.iloc[0:4, 1] = np.nan
df.iloc[0:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.239685,,
1,0.610204,,
2,0.133834,,0.246034
3,1.351323,,0.730635
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


In [17]:
# thresh: keep only the rows that contain at least this many non-NA values.
df.dropna(axis=0, thresh=3)

Unnamed: 0,0,1,2
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


In [18]:
# Keep rows with at least two non-NA values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,0.133834,,0.246034
3,1.351323,,0.730635
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


In [19]:
# Keep rows with at least one non-NA value.
df.dropna(axis=0, thresh=1)

Unnamed: 0,0,1,2
0,0.239685,,
1,0.610204,,
2,0.133834,,0.246034
3,1.351323,,0.730635
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


### Filling in missing data
> 누락된 값 채우기

In [20]:
# Display df before filling: it still has the NaN values assigned earlier in columns 1 and 2.
df

Unnamed: 0,0,1,2
0,0.239685,,
1,0.610204,,
2,0.133834,,0.246034
3,1.351323,,0.730635
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


In [21]:
# Return a copy of df in which every NaN is replaced by 0 (df itself is unchanged).
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.239685,0.0,0.0
1,0.610204,0.0,0.0
2,0.133834,0.0,0.246034
3,1.351323,0.0,0.730635
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


In [22]:
# A dict gives each column its own fill value: NaN in column 1 becomes 0.5.
# NOTE(review): key 3 matches no column (df has columns 0-2), so column 2
# keeps its NaN values -- possibly key 2 was intended; confirm.
df.fillna(value={1: 0.5, 3: -1})

Unnamed: 0,0,1,2
0,0.239685,0.5,
1,0.610204,0.5,
2,0.133834,0.5,0.246034
3,1.351323,0.5,0.730635
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


In [23]:
# inplace=True writes the zeros into df itself instead of returning a new frame.
df.fillna(value=0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.239685,0.0,0.0
1,0.610204,0.0,0.0
2,0.133834,0.0,0.246034
3,1.351323,0.0,0.730635
4,0.1343,-0.886549,3.288666
5,-0.414705,0.361732,-0.235195
6,0.994256,-0.073413,0.87289


In [33]:
# 6x3 random frame; column 1 is NaN from row 2 on, column 2 from row 4 on.
df = pd.DataFrame(data=np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.802631,0.074146,1.008342
1,1.220098,-0.970185,-0.023775
2,-0.9403,,1.319269
3,0.621869,,0.598882
4,0.49189,,
5,1.124791,,


In [34]:
# Forward-fill down each column (the default axis). The `method=` keyword of
# fillna is deprecated in recent pandas, so the dedicated ffill() is used.
#
# The NaN at (2, 1) is filled from (1, 1), the value above it in the same
# column -- not from (2, 0), the value to its left.
#
# Q) What if it should be filled from (2, 0) instead?
# A) Transpose the frame with .T, forward-fill, then transpose back with .T
#    (or pass axis=1 to ffill).
df.ffill()

Unnamed: 0,0,1,2
0,-0.802631,0.074146,1.008342
1,1.220098,-0.970185,-0.023775
2,-0.9403,-0.970185,1.319269
3,0.621869,-0.970185,0.598882
4,0.49189,-0.970185,0.598882
5,1.124791,-0.970185,0.598882


In [28]:
# IPython help syntax: shows the documentation of df.fillna.
?df.fillna

In [30]:
# limit=2: forward-fill at most two consecutive NaN values per gap.
# ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.208073,0.702382,2.285916
1,-0.331176,-0.304657,-3.636535
2,-0.517874,-0.304657,0.923605
3,0.186768,-0.304657,-0.865628
4,0.11241,,-0.865628
5,0.48242,,-0.865628


In [31]:
# Replace the missing entries with the mean of the non-missing values.
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### 요온연습

In [35]:
# Fresh 6x3 random frame for the exercise, with the same NaN layout as before:
# column 1 missing from row 2 on, column 2 missing from row 4 on.
df = pd.DataFrame(data=np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,1.61641,-0.039057,0.468421
1,1.217106,0.996838,0.231059
2,0.354475,,-0.622454
3,0.590053,,-0.310647
4,-0.745349,,
5,-0.894728,,


In [37]:
# Transpose so that each original row becomes a column.
df2 = df.transpose()
df2

Unnamed: 0,0,1,2,3,4,5
0,1.61641,1.217106,0.354475,0.590053,-0.745349,-0.894728
1,-0.039057,0.996838,,,,
2,0.468421,0.231059,-0.622454,-0.310647,,


In [41]:
# Forward-fill down the columns of the transposed frame, i.e. along the rows
# of the original df. ffill() replaces the deprecated fillna(method='ffill').
df2 = df2.ffill()
df2

Unnamed: 0,0,1,2,3,4,5
0,1.61641,1.217106,0.354475,0.590053,-0.745349,-0.894728
1,-0.039057,0.996838,0.354475,0.590053,-0.745349,-0.894728
2,0.468421,0.231059,-0.622454,-0.310647,-0.745349,-0.894728


In [42]:
# Transpose back to the original orientation; each NaN is now filled from the
# value to its left in the same row.
df1 = df2.transpose()
df1

Unnamed: 0,0,1,2
0,1.61641,-0.039057,0.468421
1,1.217106,0.996838,0.231059
2,0.354475,0.354475,-0.622454
3,0.590053,0.590053,-0.310647
4,-0.745349,-0.745349,-0.745349
5,-0.894728,-0.894728,-0.894728
