In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mglearn
import warnings
import os

# Plot configuration: use the 'Malgun Gothic' font family for figure text and
# draw negative numbers with an ASCII hyphen rather than the Unicode minus sign.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Install a blanket "ignore" filter for all warnings.
# NOTE(review): this also hides deprecation warnings raised by pandas and
# scikit-learn calls further down.
warnings.filterwarnings(action='ignore')

# 범주형 변수

In [2]:
# Directory of the Adult dataset on the UCI Machine Learning Repository.
# NOTE(review): no cell visible here reads this variable; the next cell loads
# adult.data from mglearn.datasets.DATA_PATH instead.
adult_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult'

In [3]:
# Load the copy of the Adult dataset found under mglearn's data directory.
# header=None because column names are supplied explicitly via `names`;
# index_col=False keeps pandas from using the first column as the index.
# The sixth column is spelled 'marital-status' (not 'martial-status').
adult_data = pd.read_csv(
    os.path.join(mglearn.datasets.DATA_PATH, "adult.data"),
    header=None,
    index_col=False,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
)

## 원-핫-인코딩(가변수)

범주형 변수를 표현하는 데 가장 널리 쓰이는 방법은 원-핫-인코딩(one-hot-encoding)이다.<br>
이를 one-out-of-N encoding 혹은 가변수(dummy variable)라고도 한다.<br>
가변수는 범주형 변수를 0 또는 1 값을 가진 하나 이상의 새로운 특성으로 바꾼 것이다.<br>
0과 1로 표현된 변수는 선형 이진 분류 공식에 적용할 수 있어서, 다음과 같이 개수에 상관없이 범주마다 하나의 특성으로 표현한다.



In [4]:
# Keep only a handful of columns for this example.
selected_columns = ['age', 'workclass', 'education', 'gender',
                    'hours-per-week', 'occupation', 'income']
data = adult_data[selected_columns]

In [5]:
# Preview the first ten rows of the selected columns.
data.head(n=10)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K
5,37,Private,Masters,Female,40,Exec-managerial,<=50K
6,49,Private,9th,Female,16,Other-service,<=50K
7,52,Self-emp-not-inc,HS-grad,Male,45,Exec-managerial,>50K
8,31,Private,Masters,Female,50,Prof-specialty,>50K
9,42,Private,Bachelors,Male,40,Exec-managerial,>50K


In [6]:
# Count how often each distinct value occurs in the gender column.
data['gender'].value_counts()

 Male      21790
 Female    10771
Name: gender, dtype: int64

In [7]:
# One-hot encode the string-valued columns of `data`.
data_dummies = pd.get_dummies(data)

# Show the column names before and after the encoding.
print(f"original Feature: {list(data.columns)} \n")
print(f"get_dummies 후의 특성: \n{list(data_dummies.columns)}")

original Feature: ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 

get_dummies 후의 특성: 
['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'oc

In [8]:
# Count how often each distinct value occurs in the workclass column.
data['workclass'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [14]:
# Feature columns: a .loc label slice includes both end labels, so this takes
# every column from 'age' through 'occupation_ Transport-moving'.
# The dummy column labels contain a space after the underscore, as generated.
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
# Extract NumPy arrays
X = features.values
# Target: the indicator column for the ' >50K' income class.
y = data_dummies['income_ >50K'].values

# Report the array shapes (label typo "X.shpae" corrected to "X.shape").
print(f"X.shape: {X.shape}, y.shape:{y.shape}")

X.shpae: (32561, 44), y.shape:(32561,)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split into train and test partitions; random_state=0 pins the split.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0)

# Build and fit in one expression; fit() returns the estimator itself.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the fitted model on the held-out partition.
print(f"Test Score: {logreg.score(X_test, y_test): .2f}")

Test Score:  0.81


In [16]:
# Toy frame with one integer column and one string (categorical) column,
# built row by row instead of column by column.
demo_df = pd.DataFrame(
    data=[(0, '양말'), (1, '여우'), (2, '양말'), (1, '상자')],
    columns=['숫자 특성', '범주형 특성'],
)

In [17]:
# With default arguments the integer column is left as a single numeric column;
# only the string column is expanded into indicator columns.
pd.get_dummies(data=demo_df)

Unnamed: 0,숫자 특성,범주형 특성_상자,범주형 특성_양말,범주형 특성_여우
0,0,0,1,0
1,1,0,0,1
2,2,0,1,0
3,1,1,0,0


In [21]:
# Cast the integer column to strings, then name both columns explicitly so
# that each of them is expanded into one indicator column per distinct value.
numeric_as_text = demo_df['숫자 특성'].astype(str)
demo_df['숫자 특성'] = numeric_as_text
pd.get_dummies(demo_df, columns=['숫자 특성', '범주형 특성'])

Unnamed: 0,숫자 특성_0,숫자 특성_1,숫자 특성_2,범주형 특성_상자,범주형 특성_양말,범주형 특성_여우
0,1,0,0,0,1,0
1,0,1,0,0,0,1
2,0,0,1,0,1,0
3,0,1,0,1,0,0


## OneHotEncoder와 ColumnTransformer: scikit-learn으로 범주형 변수 다루기

In [22]:
from sklearn.preprocessing import OneHotEncoder
# Ask the encoder for a dense NumPy array instead of a SciPy sparse matrix.
# scikit-learn 1.2 renamed the keyword from `sparse` to `sparse_output`, and
# 1.4 removed the old spelling, so try the new name first and fall back.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    ohe = OneHotEncoder(sparse=False)
print(ohe.fit_transform(demo_df))

[[1. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0.]]


In [23]:
# Names of the encoded output columns, in output order.
encoded_feature_names = ohe.get_feature_names_out()
print(encoded_feature_names)

['숫자 특성_0' '숫자 특성_1' '숫자 특성_2' '범주형 특성_상자' '범주형 특성_양말' '범주형 특성_여우']


In [24]:
# Preview the first five rows (head's default count, spelled out).
data.head(n=5)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


ColumnTransformer로 OneHotEncoding과 Scaling까지 동시에 할 수 있다?!

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

def _dense_one_hot_encoder():
    """Return a OneHotEncoder that produces dense arrays.

    scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
    removed the old name, so the new spelling is tried first and the old one
    is used only when the installed release rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:  # scikit-learn < 1.2
        return OneHotEncoder(sparse=False)


# Standardize the two numeric columns and one-hot encode the four categorical
# columns in a single transformer.
ct = ColumnTransformer(
    [("scaling", StandardScaler(), ['age', 'hours-per-week']),
     ("onehot", _dense_one_hot_encoder(),
      ['workclass', 'education', 'gender', 'occupation'])]
)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Keep every column except the income target.
data_features = data.drop(columns='income')

In [31]:
# Split the feature frame and the income labels with a fixed seed.
income_labels = data['income']
X_train, X_test, y_train, y_test = train_test_split(
    data_features, income_labels, random_state=0
)

# Fit the transformer on the training rows, then transform those same rows.
X_train_trans = ct.fit(X_train).transform(X_train)
print(X_train_trans.shape)

(24420, 44)


하나의 변환기로 모든 전처리 단계를 캡슐화하면 장점이 더 있다고 한다..
뭘까?

In [32]:
# Transform the held-out rows with the already-fitted ColumnTransformer.
X_test_trans = ct.transform(X_test)

# Build and fit in one expression; fit() returns the estimator itself.
logreg = LogisticRegression(max_iter=1000).fit(X_train_trans, y_train)

print(f"Test Score: {logreg.score(X_test_trans, y_test): .2f}")

Test Score:  0.81


In [33]:
# Look up the fitted encoder by the name it was registered under ("onehot").
ct.named_transformers_['onehot']

OneHotEncoder(sparse=False)

## make_column_transformer로 간편하게 ColumnTransformer 만들기

In [None]:
from sklearn.compose import make_column_transformer
ct = make_columns_tras