# 데이터 전처리

In [21]:
import numpy as np
import pandas as pd

## LabelEncoder

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
# Toy product table for the LabelEncoder demo: one categorical column
# (pName, which contains a repeated value) and one numeric column (price).
product_names = ['냉장고', 'TV', '에어컨', '냉장고', '노트북', '공기청정기']
prices = [100, 80, 150, 100, 50, 40]
df = pd.DataFrame({'pName': product_names, 'price': prices})
df

Unnamed: 0,pName,price
0,냉장고,100
1,TV,80
2,에어컨,150
3,냉장고,100
4,노트북,50
5,공기청정기,40


In [24]:
# Fit the encoder so it learns the mapping between each distinct product
# name and an integer code.
le = LabelEncoder().fit(df['pName'])
# Encode the names with the learned mapping
# (le.fit_transform(df['pName']) would do both steps in a single call).
le.transform(df['pName'])

array([2, 0, 4, 2, 3, 1])

In [25]:
# Store the integer code of every product name in a new 'pid' column,
# reusing the encoder that has already been fitted.
encoded_ids = le.transform(df['pName'])
df['pid'] = encoded_ids
df

Unnamed: 0,pName,price,pid
0,냉장고,100,2
1,TV,80,0
2,에어컨,150,4
3,냉장고,100,2
4,노트북,50,3
5,공기청정기,40,1


In [26]:
# classes_ lists the original labels; a label's position in this array is
# the integer code the encoder assigns to it.
label_text = 'Encoding Class : '
print(label_text, le.classes_)

Encoding Class :  ['TV' '공기청정기' '냉장고' '노트북' '에어컨']


In [27]:
# Map integer codes back to the original class labels.
codes = [3, 1, 2, 3, 2, 0, 1, 2]
le.inverse_transform(codes)

array(['노트북', '공기청정기', '냉장고', '노트북', '냉장고', 'TV', '공기청정기', '냉장고'],
      dtype=object)

## OneHotEncoder

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
# Rebuild the product table from scratch for the one-hot encoding demo.
rows = list(zip(
    ['냉장고', 'TV', '에어컨', '냉장고', '노트북', '공기청정기'],
    [100, 80, 150, 100, 50, 40],
))
df = pd.DataFrame(rows, columns=['pName', 'price'])
df

Unnamed: 0,pName,price
0,냉장고,100
1,TV,80
2,에어컨,150
3,냉장고,100
4,노트북,50
5,공기청정기,40


In [30]:
# One-hot encode the product names.
# OneHotEncoder expects 2-D input (n_samples, n_features), so the single
# column is turned into a NumPy array and reshaped to (n_samples, 1).
# to_numpy() is used instead of .values because .values may return a
# pandas extension array rather than an ndarray, depending on the dtype.
ohe = OneHotEncoder()
name_column = df['pName'].to_numpy().reshape(-1, 1)
# fit_transform returns a sparse matrix by default; toarray() makes it
# dense so the encoded rows can be displayed.
ohe.fit_transform(name_column).toarray()

array([[0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.]])

## LabelBinarizer : OneHotEncoding을 대신 할 수 있다

In [31]:
from sklearn.preprocessing import LabelBinarizer

In [32]:
# LabelBinarizer accepts the 1-D column directly and, for this data,
# yields the same indicator matrix as the OneHotEncoder example.
lb = LabelBinarizer()
lb.fit_transform(df['pName'])

array([[0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0]])

In [33]:
# pandas' get_dummies can do the same one-hot encoding, and is more
# convenient than the scikit-learn encoders: it takes the whole DataFrame,
# encodes the string column(s), and leaves numeric columns such as `price`
# untouched.
# dtype=int pins the indicator columns to 0/1 integers; without it, newer
# pandas releases (2.0+) return True/False booleans instead.
pd.get_dummies(df, dtype=int)

Unnamed: 0,price,pName_TV,pName_공기청정기,pName_냉장고,pName_노트북,pName_에어컨
0,100,0,0,1,0,0
1,80,1,0,0,0,0
2,150,0,0,0,0,1
3,100,0,0,1,0,0
4,50,0,0,0,1,0
5,40,0,1,0,0,0


# 데이터 스케일링

## StandardScaler

In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

In [37]:
# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns carry the feature names.
iris = load_iris()
iris_df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [38]:
# Summary statistics of the raw features, as a baseline to compare against
# the scaled data.
raw_summary = iris_df.describe()
raw_summary

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [40]:
# Standardize every feature (subtract the column mean, divide by the
# column standard deviation).
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)
# fit_transform returns a plain NumPy array, so re-attach the column names.
iris_scaled_df = pd.DataFrame(data=iris_scaled, columns=iris['feature_names'])
iris_scaled_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
5,-0.537178,1.939791,-1.169714,-1.052180
6,-1.506521,0.788808,-1.340227,-1.183812
7,-1.021849,0.788808,-1.283389,-1.315444
8,-1.748856,-0.362176,-1.340227,-1.315444
9,-1.143017,0.098217,-1.283389,-1.447076


In [41]:
# Check the result of standardization: the means should be ~0.
# The reported std is slightly above 1 because describe() uses the sample
# standard deviation (ddof=1), while StandardScaler scales by the
# population standard deviation (ddof=0).
scaled_summary = iris_scaled_df.describe()
scaled_summary

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.690315e-15,-1.84297e-15,-1.698641e-15,-1.409243e-15
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


## MinMaxScaler
- 각 피처를 기본적으로 0과 1 사이의 값으로 변환 (원본 데이터에 음수가 있어도 기본 범위는 [0, 1]이며, -1과 1 사이로 변환하려면 `feature_range=(-1, 1)`을 지정하거나 `MaxAbsScaler`를 사용)

In [43]:
from sklearn.preprocessing import MinMaxScaler

In [46]:
# Rescale every feature with MinMaxScaler's default settings, which map
# each column's minimum to 0 and its maximum to 1.
mms = MinMaxScaler()
iris_scaled2 = mms.fit_transform(iris_df)
# fit_transform returns a plain NumPy array, so re-attach the column names.
iris_scaled2_df = pd.DataFrame(data=iris_scaled2, columns=iris['feature_names'])
iris_scaled2_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
5,0.305556,0.791667,0.118644,0.125000
6,0.083333,0.583333,0.067797,0.083333
7,0.194444,0.583333,0.084746,0.041667
8,0.027778,0.375000,0.067797,0.041667
9,0.166667,0.458333,0.084746,0.000000
