## 결측치 표현
### NaN / None으로 표현(Nan: float64, None: 파이썬 내장 객체)

In [78]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## 결측치 표현 및 확인

In [79]:
float_data = Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [80]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [81]:
float_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

## 결측치 골라내기(dropna)

In [82]:
data = Series([1, np.nan, 3.5, np.nan, 7])

In [83]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [84]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [85]:
data[data.notna()]  # 불리언 객체를 이용한 색인

0    1.0
2    3.5
4    7.0
dtype: float64

In [86]:
data = DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                  [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [87]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [88]:
data.dropna()   # default: how='any' axis='index'

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


### how

In [89]:
data.dropna(how="all")  # 모든 데이터가 결측치일 때만 삭제

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


### axis

In [90]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [91]:
data.dropna(axis="columns", how="any")

0
1
2
3


### thresh

In [92]:
df = DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

In [93]:
df

Unnamed: 0,0,1,2
0,-2.131876,,
1,0.373859,,
2,0.210625,,-0.963008
3,0.452729,,-0.510939
4,-0.388396,-0.308349,1.076168
5,1.195809,1.074041,-0.796302
6,0.418461,0.458323,-0.39843


In [94]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.388396,-0.308349,1.076168
5,1.195809,1.074041,-0.796302
6,0.418461,0.458323,-0.39843


In [95]:
df.dropna(thresh=2) # 결측치가 thresh의 개수보다 적은 개수를 반환

Unnamed: 0,0,1,2
2,0.210625,,-0.963008
3,0.452729,,-0.510939
4,-0.388396,-0.308349,1.076168
5,1.195809,1.074041,-0.796302
6,0.418461,0.458323,-0.39843


## 결측치 채우기(fillna)

In [96]:
df

Unnamed: 0,0,1,2
0,-2.131876,,
1,0.373859,,
2,0.210625,,-0.963008
3,0.452729,,-0.510939
4,-0.388396,-0.308349,1.076168
5,1.195809,1.074041,-0.796302
6,0.418461,0.458323,-0.39843


In [97]:
df.fillna(0)    # 결측치를 0으로 채워줌

Unnamed: 0,0,1,2
0,-2.131876,0.0,0.0
1,0.373859,0.0,0.0
2,0.210625,0.0,-0.963008
3,0.452729,0.0,-0.510939
4,-0.388396,-0.308349,1.076168
5,1.195809,1.074041,-0.796302
6,0.418461,0.458323,-0.39843


### 딕셔너리를 통한 결측치 채우기

In [98]:
df.fillna({1: 0.5, 2: 0})   # feature 별로 다른 결측치

Unnamed: 0,0,1,2
0,-2.131876,0.5,0.0
1,0.373859,0.5,0.0
2,0.210625,0.5,-0.963008
3,0.452729,0.5,-0.510939
4,-0.388396,-0.308349,1.076168
5,1.195809,1.074041,-0.796302
6,0.418461,0.458323,-0.39843


### method
reindex 할 때 결측치가 많이 발생함 -> 결측치를 채우기 위해서 method 사용 가능한 것처럼
DataFrame에서도 사용 가능

In [99]:
df = DataFrame(np.random.standard_normal((6, 3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan

In [100]:
df

Unnamed: 0,0,1,2
0,-1.121953,-0.44231,-1.031795
1,1.015185,-0.052421,-0.897845
2,0.499262,,0.341926
3,1.972816,,0.225129
4,1.738309,,
5,0.782236,,


In [101]:
df.fillna(method="ffill")   ## 전의 값으로 채워줌

  df.fillna(method="ffill")   ## 전의 값으로 채워줌


Unnamed: 0,0,1,2
0,-1.121953,-0.44231,-1.031795
1,1.015185,-0.052421,-0.897845
2,0.499262,-0.052421,0.341926
3,1.972816,-0.052421,0.225129
4,1.738309,-0.052421,0.225129
5,0.782236,-0.052421,0.225129


In [102]:
df.fillna(method="ffill", limit=2)  # limit: 결측치를 채울 개수 조정 가능

  df.fillna(method="ffill", limit=2)  # limit: 결측치를 채울 개수 조정 가능


Unnamed: 0,0,1,2
0,-1.121953,-0.44231,-1.031795
1,1.015185,-0.052421,-0.897845
2,0.499262,-0.052421,0.341926
3,1.972816,-0.052421,0.225129
4,1.738309,,0.225129
5,0.782236,,0.225129


In [103]:
data = Series([1., np.nan, 3.5, np.nan, 7.])

In [104]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [105]:
data.fillna(data.mean())    # 결측치를 평균으로 채워줌

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# 데이터 변형
## 함수나 매핑을 이용하여 데이터 변형

In [106]:
data = DataFrame({"food": ["bacon", "pulled pork", "bacon",
                           "pastrami", "corned beef", "bacon",
                           "pastrami", "honey ham", "nova lox"],
                    "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [107]:
data    # DataFrame 요소 별 적용: applymap

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [108]:
# 매핑 할 딕셔너리 생성
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [109]:
# data["food"]만 가져왔으므로 Series 형태 -> map 이용
data["animal"] = data["food"].map(meat_to_animal)

In [110]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [111]:
def get_animal(x):
    return meat_to_animal[x]

In [112]:
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [113]:
df = DataFrame(np.random.standard_normal((7, 3)))

In [114]:
df

Unnamed: 0,0,1,2
0,0.688404,-1.273165,-0.278524
1,-1.622964,-0.343717,0.91331
2,-0.407396,0.142303,1.231099
3,0.258965,1.095978,-0.312721
4,1.669944,1.678714,0.096034
5,-1.405448,-1.043324,-0.714856
6,0.299999,-1.622535,-0.756338


In [115]:
df.applymap(lambda x: x**2)

  df.applymap(lambda x: x**2)


Unnamed: 0,0,1,2
0,0.4739,1.620948,0.077576
1,2.634014,0.118141,0.834135
2,0.165971,0.02025,1.515604
3,0.067063,1.201168,0.097794
4,2.788712,2.818079,0.009223
5,1.975283,1.088526,0.511019
6,0.089999,2.63262,0.572047


## 값 치환하기

### replace

In [116]:
data = Series([1., -999., 2., -999., -1000., 3.])

In [117]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [118]:
data.replace(-999, np.nan)  # replace(바꾸고 싶은 값, 바꿀 값)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [119]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## 축 색인 이름 바꾸기
- map이나 rename 이용하여 변경 가능

### map

In [120]:
data = DataFrame(np.arange(12).reshape((3, 4)),
                    index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])

In [121]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [122]:
def transform(x):
    return x[:4].upper()

In [123]:
data.index

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

In [124]:
data.index = data.index.map(transform)

In [125]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### rename

In [126]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [127]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [128]:
data.rename(index={"OHIO": "INDIANA"},
            columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## 이산화
- 연속적인 데이터를 이산적 데이터로 분할
- 분석을 위해 그룹으로 나눔

### cut, qcut < 중요!!
- pd.cut(x, bins, labels, right)
- x: 이산화 하기 위한 Series/리스트
- bins: 범위값을 나타냄
- labels: 이산화 된 데이터를 부를 이름
- right: 오른쪽에 포함/왼쪽에 포함

### cut

In [129]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [130]:
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [131]:
pd.cut(ages, bins, right=False) # 우측 값 포함 x -> 다음 그룹으로 넘어감

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [132]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

### qcut

In [133]:
data = np.random.standard_normal(1000)

In [134]:
data

array([-1.33988264e+00,  1.33048641e+00, -5.95455371e-01, -1.14459124e+00,
       -7.35313334e-02, -1.14573625e+00,  5.06095132e-01,  9.31011563e-01,
        2.68019673e-01,  5.54619356e-01,  8.76232129e-01,  1.01926057e+00,
        1.77989515e-01, -1.23395997e+00, -4.00597609e-01,  1.14464978e-01,
        6.21471870e-01, -1.42374514e-01,  2.31422128e-02, -1.31677711e+00,
       -2.21531534e+00,  1.10973505e-01, -1.93327011e+00,  1.24099070e-01,
       -7.75630391e-01, -5.68495803e-01, -1.65347972e+00,  4.90477513e-01,
       -1.03646898e-01, -4.83469797e-01, -1.03445238e+00, -2.17328581e+00,
       -5.59956657e-01,  1.26625332e+00,  1.18877104e+00, -2.12581032e+00,
       -2.41622065e+00, -1.44987458e+00, -6.49325695e-01, -1.07279401e+00,
       -5.86253052e-01,  1.77248439e+00, -2.14316726e-01, -3.93190736e-01,
        3.43054234e-01,  2.51543698e+00, -1.09615257e+00, -6.57542904e-01,
        4.16028781e-01, -3.04346265e-01,  3.40476774e-01,  2.08007165e-01,
        1.02278011e+00,  

In [135]:
# 균등한 길이의 그룹을 계산
quartiles = pd.qcut(data, 4, precision=2)   # precision: 정밀도, 소숫점 아래 자리
quartiles

[(-3.63, -0.7], (0.58, 3.62], (-0.7, -0.051], (-3.63, -0.7], (-0.7, -0.051], ..., (-0.051, 0.58], (-0.051, 0.58], (-0.051, 0.58], (-3.63, -0.7], (-0.051, 0.58]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.63, -0.7] < (-0.7, -0.051] < (-0.051, 0.58] < (0.58, 3.62]]

In [136]:
pd.value_counts(quartiles)

  pd.value_counts(quartiles)


(-3.63, -0.7]     250
(-0.7, -0.051]    250
(-0.051, 0.58]    250
(0.58, 3.62]      250
Name: count, dtype: int64

In [137]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.622, -1.321]     100
(-1.321, -0.0514]    400
(-0.0514, 1.306]     400
(1.306, 3.621]       100
Name: count, dtype: int64

## 이상치를 찾고 제외하기

### 이상치(outlier)
- 전체적인 데이터 분포에서 크게 어긋나는 값

In [138]:
data = DataFrame(np.random.standard_normal((1000, 4)))

In [139]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.008367,-0.050018,0.016833,-0.011631
std,1.017422,0.948604,0.985848,1.045213
min,-3.561115,-3.202571,-2.63362,-3.510026
25%,-0.700796,-0.675833,-0.656762,-0.677406
50%,0.051877,-0.022888,-0.001628,-0.026374
75%,0.651626,0.626474,0.657255,0.672256
max,2.862284,3.069227,2.858819,3.602073


In [140]:
data.abs() > 3

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


In [141]:
(data.abs() > 3).any(axis="columns")    # 이상치가 하나라도 있는 data 호출

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [142]:
np.sum((data.abs() > 3).any(axis="columns"))

8

In [143]:
# 이상치가 아닌 데이터만 호출
data.loc[~(data.abs() > 3).any(axis="columns"), :]

Unnamed: 0,0,1,2,3
0,-0.566773,-1.074308,1.504595,1.341501
1,1.863393,0.474213,0.834386,-1.120961
2,-1.299195,-0.707784,0.363613,0.119621
3,1.883813,-1.259140,0.080922,2.442519
4,0.820041,0.169372,0.816713,0.000934
...,...,...,...,...
995,-0.507047,-0.319113,-0.065834,0.525657
996,0.236625,1.142104,0.478477,0.255124
997,-0.179900,0.750669,0.008043,0.169301
998,0.709674,-0.299589,-0.547514,0.472684


In [144]:
data.index[(data.abs() > 3).any(axis="columns")]

Index([60, 84, 103, 489, 514, 720, 942, 993], dtype='int64')

In [145]:
# 이상치가 있는 데이터 제거
data.drop(data.loc[(data.abs() > 3).any(axis="columns")])

Unnamed: 0,0,1,2,3
4,0.820041,0.169372,0.816713,0.000934
5,2.139228,0.937811,-1.975914,0.565577
6,-1.610146,-0.342366,-0.359429,-0.849230
7,-0.436366,-0.836716,-0.098202,-1.805743
8,1.082947,1.806027,-0.091976,-0.740057
...,...,...,...,...
995,-0.507047,-0.319113,-0.065834,0.525657
996,0.236625,1.142104,0.478477,0.255124
997,-0.179900,0.750669,0.008043,0.169301
998,0.709674,-0.299589,-0.547514,0.472684


In [148]:
np.sign(data) * 3

Unnamed: 0,0,1,2,3
0,-3.0,-3.0,3.0,3.0
1,3.0,3.0,3.0,-3.0
2,-3.0,-3.0,3.0,3.0
3,3.0,-3.0,3.0,3.0
4,3.0,3.0,3.0,3.0
...,...,...,...,...
995,-3.0,-3.0,-3.0,3.0
996,3.0,3.0,3.0,3.0
997,-3.0,3.0,3.0,3.0
998,3.0,-3.0,-3.0,3.0


In [146]:
data[data.abs() > 3] = np.sign(data) * 3

In [147]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.007646,-0.049824,0.016833,-0.012262
std,1.015128,0.947534,0.985848,1.040012
min,-3.0,-3.0,-2.63362,-3.0
25%,-0.700796,-0.675833,-0.656762,-0.677406
50%,0.051877,-0.022888,-0.001628,-0.026374
75%,0.651626,0.626474,0.657255,0.672256
max,2.862284,3.0,2.858819,3.0


## 표시자, 더미 변수 계산하기
- 더미 변수: 0, 1로 구성된 데이터
- 분류값 -> 더미, 표시자 행렬로 전환
- 통계 모델이나 머신러닝 애플리케이션 개발 적합
- k개의 feature로 확장, 분류값에 따라 0, 1로 나타냄

In [149]:
df = DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                "data1": range(6)})

In [150]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [151]:
# Series에 적용 가능
pd.get_dummies(df["key"], dtype=float)

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [154]:
# default 값은 boolean
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [152]:
# prefix로 feature 이름 조정 가능
dummies = pd.get_dummies(df["key"], prefix="key", dtype=float)

In [153]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [155]:
# concat: 데이터를 이어붙임
df_with_dummy = pd.concat([df[["data1"]], dummies], axis=1)

In [156]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0.0,1.0,0.0
1,1,0.0,1.0,0.0
2,2,1.0,0.0,0.0
3,3,0.0,0.0,1.0
4,4,1.0,0.0,0.0
5,5,0.0,1.0,0.0


In [157]:
values = np.random.uniform(size=10)

In [158]:
values

array([0.77700179, 0.7535326 , 0.9927688 , 0.98009925, 0.45512746,
       0.07617168, 0.50747085, 0.11292624, 0.61765845, 0.82026705])

In [159]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.]

In [160]:
pd.cut(values, bins, labels=['a', 'b', 'c', 'd', 'e'])

['d', 'd', 'e', 'e', 'c', 'a', 'c', 'a', 'd', 'e']
Categories (5, object): ['a' < 'b' < 'c' < 'd' < 'e']

In [161]:
pd.get_dummies(pd.cut(values, bins, labels=['a', 'b', 'c', 'd', 'e']))

Unnamed: 0,a,b,c,d,e
0,False,False,False,True,False
1,False,False,False,True,False
2,False,False,False,False,True
3,False,False,False,False,True
4,False,False,True,False,False
5,True,False,False,False,False
6,False,False,True,False,False
7,True,False,False,False,False
8,False,False,False,True,False
9,False,False,False,False,True


In [162]:
pd.get_dummies(pd.cut(values, bins, labels=['a', 'b', 'c', 'd', 'e']), dtype=int)

Unnamed: 0,a,b,c,d,e
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,1,0,0
5,1,0,0,0,0
6,0,0,1,0,0
7,1,0,0,0,0
8,0,0,0,1,0
9,0,0,0,0,1
