# 데이터 인코딩
## 레이블 인코딩 (Label encoding)
- sklearn

In [1]:
# sklearn
from sklearn.preprocessing import LabelEncoder

# Sample categorical data: animal names, some of them repeated
items = ['사자', '호랑이', '사자', '강아지', '고양이', '고양이', '사슴', '고양이']

# Create the LabelEncoder
encoder = LabelEncoder()

# fit: learn the distinct classes present in items
encoder.fit(items)

# transform: replace each string with its integer class label
labels = encoder.transform(items)

# Last expression of the cell, echoed by the notebook as the cell output
labels

array([3, 4, 3, 0, 1, 1, 2, 1])

In [2]:
# Classes learned by fit(); a class's position in this array is the integer
# label that transform() assigns to it (e.g. index 3 -> '사자')
encoder.classes_

array(['강아지', '고양이', '사슴', '사자', '호랑이'], dtype='<U3')

In [3]:
# Integer labels -> original strings (inverse of transform)
encoder.inverse_transform([3, 4, 3, 0, 1, 1, 2, 1])

array(['사자', '호랑이', '사자', '강아지', '고양이', '고양이', '사슴', '고양이'], dtype='<U3')

In [4]:
# fit and transform in a single call (re-fits the encoder on items, then encodes them)
encoder.fit_transform(items)

array([3, 4, 3, 0, 1, 1, 2, 1], dtype=int64)

## one-hot encoding
- sklearn 
- pandas

### sklearn

In [7]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# Build the sample animal names directly as a column vector:
# the 1-D sequence is reshaped to 2-D with shape (n_samples, 1)
items = np.array(
    ['사자', '호랑이', '사자', '강아지', '고양이', '고양이', '사슴', '고양이']
).reshape(-1, 1)
items

array([['사자'],
       ['호랑이'],
       ['사자'],
       ['강아지'],
       ['고양이'],
       ['고양이'],
       ['사슴'],
       ['고양이']], dtype='<U3')

In [8]:
# Create the one-hot encoder
oh_encoder = OneHotEncoder()

# fit: learn the distinct categories of the single input column
oh_encoder.fit(items)

# transform
# The result is a sparse matrix
# : a matrix whose values are mostly 0
# : the opposite is a dense matrix
oh_labels = oh_encoder.transform(items)
oh_labels

<8x5 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [9]:
# Convert the sparse one-hot result to a dense ndarray so every cell is displayed
oh_labels.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [10]:
# Shape of the one-hot result: (number of samples, number of distinct categories)
oh_labels.shape

(8, 5)

In [11]:
# Concrete type of the one-hot result (a SciPy sparse matrix class)
type(oh_labels)

scipy.sparse._csr.csr_matrix

### pandas

In [12]:
import pandas as pd

In [14]:
# Single-column frame of animal names; the column label is 'items'
df = pd.DataFrame(
    data=['사자', '호랑이', '사자', '강아지', '고양이', '고양이', '사슴', '고양이'],
    columns=['items'],
)

# One-hot encode with pandas: one indicator column per distinct value
pd.get_dummies(df)

Unnamed: 0,items_강아지,items_고양이,items_사슴,items_사자,items_호랑이
0,False,False,False,True,False
1,False,False,False,False,True
2,False,False,False,True,False
3,True,False,False,False,False
4,False,True,False,False,False
5,False,True,False,False,False
6,False,False,True,False,False
7,False,True,False,False,False


# 피처 스케일링
- StandardScaler : 평균이 0이고, 분산이 1인 정규 분포 형태로 변환
- MinMaxScaler : 데이터 값을 0~1 범위 값으로 변환

## StandardScaler

In [16]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and wrap its feature matrix in a DataFrame
# whose columns carry the dataset's feature names
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

# Preview the first two rows
iris_df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2


In [17]:
# Per-feature mean of the raw (unscaled) data
iris_df.mean()

sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64

In [18]:
# Per-feature variance of the raw (unscaled) data
iris_df.var()

sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64

In [19]:
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler
scaler = StandardScaler()

# [Option 1] run fit and transform separately
# fit: learn each feature's mean and standard deviation
scaler.fit(iris_df)

# transform: apply the learned statistics to the data
iris_scaler = scaler.transform(iris_df)

# [Option 2] fit and transform in a single call
# Equivalent to Option 1; left disabled so the scaler is not fitted twice
# and the result above is not silently overwritten.
# iris_scaler = scaler.fit_transform(iris_df)

# Standardized values as an ndarray (one row per sample, one column per feature)
iris_scaler

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

In [20]:
# Wrap the standardized ndarray in a DataFrame, restoring the feature names as column labels
iris_scaler_df = pd.DataFrame(iris_scaler, columns=iris.feature_names)
iris_scaler_df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444


In [21]:
# Per-feature mean after standardization: on the order of 1e-15,
# i.e. zero up to floating-point rounding
iris_scaler_df.mean()

sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64

In [22]:
# Per-feature variance after standardization.
# NOTE: pandas' var() defaults to the sample variance (ddof=1), whereas
# StandardScaler scales by the population standard deviation (ddof=0).
# The value is therefore n / (n - 1) rather than exactly 1
# (1.006711 matches 150 / 149 -- presumably the 150-row iris set);
# iris_scaler_df.var(ddof=0) would report 1.0.
iris_scaler_df.var()

sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64

## MinMaxScaler
- default : 최솟값 0, 최댓값 1

In [23]:
from sklearn.preprocessing import MinMaxScaler

# Create the MinMaxScaler
scaler = MinMaxScaler()

# [Option 1] run fit and transform separately
# fit: learn each feature's minimum and maximum
scaler.fit(iris_df)

# transform: rescale every feature using the learned min/max
iris_scaler_min_max = scaler.transform(iris_df)

# [Option 2] fit and transform in a single call (equivalent alternative, left disabled)
# iris_scaler_min_max = scaler.fit_transform(iris_df)

# Rescaled values as an ndarray
iris_scaler_min_max

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

In [24]:
# Wrap the min-max scaled ndarray in a DataFrame, restoring the feature names as column labels
iris_scaler_min_max_df = pd.DataFrame(iris_scaler_min_max, columns=iris.feature_names)
iris_scaler_min_max_df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667


In [25]:
# Per-feature mean after min-max scaling
iris_scaler_min_max_df.mean()

sepal length (cm)    0.428704
sepal width (cm)     0.440556
petal length (cm)    0.467458
petal width (cm)     0.458056
dtype: float64

In [26]:
# Per-feature variance after min-max scaling
iris_scaler_min_max_df.var()

sepal length (cm)    0.052908
sepal width (cm)     0.032983
petal length (cm)    0.089522
petal width (cm)     0.100869
dtype: float64

## scaler를 이용한 fit, transform, fit_transform 적용 시 주의사항!

In [27]:
# Toy data as single-feature column vectors:
#   training data covers 0..10, test data covers only 0..5
train_data = np.arange(11)[:, np.newaxis]
test_data = np.arange(6)[:, np.newaxis]

train_data

array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10]])

In [28]:
# Scale the training data
# Create the MinMaxScaler
# default: rescales so the minimum becomes 0 and the maximum becomes 1
scaler = MinMaxScaler()

# [Option 1] run fit and transform separately
# fit(): learn min/max from the training data
scaler.fit(train_data)

# transform(): rescale the training data with the learned min/max
train_scale = scaler.transform(train_data)

# [Option 2] fit and transform in a single call
# Equivalent to Option 1; left disabled so the scaler is not fitted twice
# and the result above is not silently overwritten.
# train_scale = scaler.fit_transform(train_data)

# Scaled train_data, flattened to 1-D for display
train_scale.reshape(-1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [29]:
# Original (unscaled) train_data, flattened to 1-D for comparison with the scaled values
train_data.reshape(-1)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [30]:
# INCORRECT way to scale the test data (kept on purpose as a counter-example).
# Calling fit() on test_data replaces the min/max learned from train_data, so the
# test value 5 is mapped to 1.0 here although the training scale maps 5 to 0.5.

# [Option 1]
scaler.fit(test_data)
test_scale = scaler.transform(test_data)

# [Option 2]
# test_scale = scaler.fit_transform(test_data)

# Scaled test data, flattened to 1-D for display
test_scale.reshape(-1)

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

In [32]:
# Correct way to scale the test data:
# reuse the statistics learned from the training data instead of re-fitting

# 1. Create the scaler
scaler = MinMaxScaler()

# 2-1. fit: on the training data only
scaler.fit(train_data)

# 2-2. transform the training data
train_scale = scaler.transform(train_data)

# 3. transform the test data with the same fitted scaler (no second fit),
#    so train and test values share one scale (test value 5 -> 0.5)
test_scale = scaler.transform(test_data)

# Scaled test data, flattened to 1-D for display
test_scale.reshape(-1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5])

In [None]:
# Original (unscaled) test_data, flattened to 1-D for comparison with the scaled values
test_data.reshape(-1)