# 03.모델링을 위한 전처리(Dummy Variable, Scaling)

### 라이브러리 로딩 & 데이터 준비

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load and Prepare the data

In [2]:
# Read the diamonds CSV straight from GitHub into a DataFrame.
# skipinitialspace=True ignores blanks that directly follow a delimiter.
data = pd.read_csv(
    'https://raw.githubusercontent.com/DA4BAM/dataset/master/diamonds.csv',
    sep=',',
    skipinitialspace=True,
)

# Preview the first five rows.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


### 1. Dummy Variable

diamonds 데이터에서 범주형 데이터는 무엇인가요?

In [3]:
# Turn a categorical (nominal) column into dummy variables so they can be
# appended to the data afterwards.
# pd.get_dummies does the encoding; prefix='cut' names the new columns
# 'cut_<level>' and drop_first=False keeps one column per level.
dummies = pd.get_dummies(data['cut'], prefix='cut', drop_first=False)
dummies.head()

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good
0,0,0,1,0,0
1,0,0,0,1,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,1,0,0,0


In [4]:
# Attach the dummy columns to the right-hand side of the original frame
# (pd.concat along the column axis).
data = pd.concat([data, dummies], axis='columns')

data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0,0,1,0,0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,0,0,0,1,0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0,1,0,0,0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,0,0,0,1,0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0,1,0,0,0


그럼 나머지 범주형 변수들도 직접 작업해 봅시다.

In [5]:
# Same encoding for 'color': one indicator column per level, named 'color_<level>'.
dummies = pd.get_dummies(data['color'], prefix='color', drop_first=False)
dummies.head()

Unnamed: 0,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,0,1


In [6]:
# Append the 'color' dummy columns to the frame, column-wise.
data = pd.concat([data, dummies], axis='columns')
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,...,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,...,1,0,0,0,1,0,0,0,0,0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,...,0,1,0,0,1,0,0,0,0,0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,...,0,0,0,0,1,0,0,0,0,0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,...,0,1,0,0,0,0,0,0,1,0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Both steps for 'clarity' in one cell: build the dummies, then append them.
dummies = pd.get_dummies(data['clarity'], prefix='clarity', drop_first=False)
data = pd.concat([data, dummies], axis='columns')
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,...,0,0,0,0,0,1,0,0,0,0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,...,0,0,0,0,1,0,0,0,0,0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,...,0,0,0,0,0,0,1,0,0,0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,...,1,0,0,0,0,0,0,1,0,0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,...,0,1,0,0,0,1,0,0,0,0


In [8]:
# The original categorical columns are now redundant; remove them in place.
data.drop(columns=['cut', 'color', 'clarity'], inplace=True)

In [9]:
# Check the result: only numeric columns and the dummy columns remain.
data.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.23,56.9,65.0,327,4.05,4.07,2.31,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.31,63.3,58.0,335,4.34,4.35,2.75,0,1,0,...,0,1,0,0,0,1,0,0,0,0


그런데, 이 작업(여러 범주형 변수를 각각 가변수화하는 것)을 한 번에 처리하는 코드를 작성해 봅시다.

In [None]:
# Load the data again so the one-pass version starts from the raw columns.
data = pd.read_csv(
    'https://raw.githubusercontent.com/DA4BAM/dataset/master/diamonds.csv',
    sep=',',
    skipinitialspace=True,
)

data.head()

In [None]:
# Collect the dtype of every column; the last line displays the type of that result.
#data._get_numeric_data()
a = data.dtypes
type(a)

In [None]:
# Boolean mask over the columns: True where the column dtype is 'object'.
a == 'object'

In [None]:
# Keep only the entries where the mask is True and list their column names.
a[a == 'object'].index.tolist()

In [None]:
# Store the names of the object-dtype columns; they are used as the categorical variables.
cat_vars = a[a == 'object'].index.tolist()
cat_vars

In [None]:
# Encode every categorical column in a single pass with a for loop.

# Columns whose dtype is 'object' are taken as the categorical ones.
# NOTE(review): newer pandas versions may infer text columns with a dedicated
# string dtype instead of object — verify this mask still finds them.
a = data.dtypes
cat_vars = list(a[a == 'object'].index)

for each in cat_vars:
    # One indicator column per level, named '<column>_<level>'.
    dummies = pd.get_dummies(data[each], prefix=each, drop_first=False)
    data = pd.concat([data, dummies], axis='columns')

data.head()

In [None]:
# Inspect the column dtypes after the dummy columns were appended.
data.dtypes

불필요한 칼럼 제거

In [None]:
# Drop columns that are no longer needed: here the original categorical
# columns (ids or serial numbers would be removed the same way).
data = data.drop(columns=cat_vars)
data.head()

In [None]:
# Summary statistics per column; useful for comparing the scales of the variables.
data.describe()

### 2. Scaling

#### 2.1 Standardization (mean, std)

In [10]:
# Toy example for standardization, one of the scaling methods.
# The ten values are stored as a single column (shape (10, 1)).
standSc = np.array([1, 2, 2, 3, 4, 3, 2, 4, 5, 7])[:, np.newaxis]
standSc

array([[1],
       [2],
       [2],
       [3],
       [4],
       [3],
       [2],
       [4],
       [5],
       [7]])

In [11]:
# Mean and standard deviation of the toy column
# (numpy's std uses ddof=0 by default, i.e. the population formula).
s_mean = np.mean(standSc)
s_std = np.std(standSc)
print(s_mean, s_std)

3.3 1.676305461424021


In [12]:
# Standardize: z = (x - mean) / std, kept as a single column.
stdanSc_ss = np.reshape((standSc - s_mean) / s_std, (-1, 1))
stdanSc_ss

array([[-1.37206497],
       [-0.77551498],
       [-0.77551498],
       [-0.178965  ],
       [ 0.41758499],
       [-0.178965  ],
       [-0.77551498],
       [ 0.41758499],
       [ 1.01413498],
       [ 2.20723495]])

In [13]:
# Sanity check: the standardized values should have mean ~0 and std 1.
print(stdanSc_ss.mean(), stdanSc_ss.std())

1.3322676295501878e-16 1.0


In [14]:
# Put the original and the standardized values side by side for comparison.

np.concatenate((standSc, stdanSc_ss), axis=1)

array([[ 1.        , -1.37206497],
       [ 2.        , -0.77551498],
       [ 2.        , -0.77551498],
       [ 3.        , -0.178965  ],
       [ 4.        ,  0.41758499],
       [ 3.        , -0.178965  ],
       [ 2.        , -0.77551498],
       [ 4.        ,  0.41758499],
       [ 5.        ,  1.01413498],
       [ 7.        ,  2.20723495]])

In [None]:
# Map the standardized values back to the original scale: x = z * std + mean.

standSc_rt = s_mean + s_std * stdanSc_ss
# Columns: original, standardized, restored.
np.concatenate((standSc, stdanSc_ss, standSc_rt), axis=1)

#### 2.2 Normalization (min, max)

In [15]:
# Toy example for min-max normalization, another scaling method.
# The ten values are stored as a single column (shape (10, 1)).
normalSc = np.expand_dims(np.array([1, 2, 2, 3, 4, 3, 2, 4, 5, 7]), axis=1)
normalSc

array([[1],
       [2],
       [2],
       [3],
       [4],
       [3],
       [2],
       [4],
       [5],
       [7]])

In [16]:
# Smallest and largest value of the toy column.
s_min = np.min(normalSc)
s_max = np.max(normalSc)
print(s_min, s_max)

1 7


In [17]:
# Min-max normalize: (x - min) / (max - min), kept as a single column.
normalSc_ss = np.reshape((normalSc - s_min) / (s_max - s_min), (-1, 1))
normalSc_ss

array([[0.        ],
       [0.16666667],
       [0.16666667],
       [0.33333333],
       [0.5       ],
       [0.33333333],
       [0.16666667],
       [0.5       ],
       [0.66666667],
       [1.        ]])

In [18]:
# Sanity check: the normalized values should span exactly 0 to 1.
print(np.min(normalSc_ss), np.max(normalSc_ss))

0.0 1.0


In [None]:
# Put the original and the min-max normalized values side by side for comparison.

np.concatenate((normalSc, normalSc_ss), axis=1)

정규화한 값을 원래대로 돌려 놓아봅시다. (원래값 = 정규화값 × (max − min) + min)

#### 2.3 숫자형 변수를 한꺼번에 Scaling 

In [None]:
# Look at the current frame before scaling the numeric columns.
data.head()

In [None]:
# Standardize the listed numeric columns; the dummy columns are left as they are.
# NOTE(review): 'carat' is numeric as well but is not listed — confirm that is intended.
quant_features = ['depth', 'table','price','x','y','z']

# Keep each column's [mean, std] so a test set can be scaled with the same
# statistics, or scaled values can be mapped back to the original units.
scaled_features = {}

for each in quant_features:
    # Series.std() is the sample standard deviation (ddof=1), unlike numpy's
    # ndarray.std(), which defaults to ddof=0.
    mean, std = data[each].mean(), data[each].std()
    scaled_features[each] = [mean, std]
    # Assign a new column instead of writing through .loc[:, each]: .loc sets
    # the values in place, and the float result does not fit an integer column
    # such as 'price' (FutureWarning in pandas 2.1+, slated to become an error).
    data[each] = (data[each] - mean) / std

data.head()

#### 실전에서는 패키지를 이용해서 scaling을 수행합니다.
여기서는 코드만 살펴보고, 실습 때 직접 수행해 봅니다.

In [None]:
# Load the scaler class
from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import StandardScaler

# Create the scaler object
minmax = MinMaxScaler()

# Fit on the training data only ==> learns the min and max of every variable,
# i.e. derives the scaling formula. In other words, this builds the "model"
# that performs the scaling.
# NOTE(review): train_features / test_features are not defined in the visible
# part of this notebook; they must exist before this cell can run.
minmax.fit(train_features) # train min, max

# Apply the fitted scaling formula (transform) to both sets
train_features_sc = minmax.transform(train_features)
test_features_sc = minmax.transform(test_features)