In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mglearn
import warnings
import os

# Plot setup: draw negative tick labels with an ASCII hyphen so the minus sign
# does not depend on the selected font providing the Unicode minus glyph.
plt.rcParams.update({'axes.unicode_minus': False})
# Use a font that can render the Korean labels in this notebook.
# NOTE(review): 'Malgun Gothic' normally ships with Windows only - confirm the
# font exists on the machine that runs this notebook.
plt.rc('font', **{'family': 'Malgun Gothic'})

# Suppress every warning for the rest of the session (applies to all categories).
warnings.filterwarnings(action='ignore')

# 범주형 변수

In [6]:
# Location of the Adult dataset directory on the UCI repository.
# The cells visible in this part of the notebook do not read from this URL.
adult_data_url = (
    'https://archive.ics.uci.edu/ml/'
    'machine-learning-databases/adult'
)

In [17]:
# Load the Adult dataset from the copy bundled under mglearn's data directory.
# NOTE(review): 'martial-status' looks like a typo for 'marital-status'; it is
# left unchanged here because other cells may refer to the column by this name.
adult_data = pd.read_csv(
    os.path.join(mglearn.datasets.DATA_PATH, "adult.data"),
    names=[
        'age', 'workclass', 'fnlwgt',
        'education', 'education-num', 'martial-status',
        'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week',
        'native-country', 'income',
    ],
    header=None,      # no row of the file is treated as a header; `names` labels the columns
    index_col=False,  # do not use the first column as the index
)

## 원-핫-인코딩(가변수)

범주형 변수를 표현하는 데 가장 널리 쓰이는 방법은 원-핫-인코딩(one-hot-encoding)이다.<br>
이를 one-out-of-N encoding 혹은 가변수(dummy variable)라고도 한다.<br>
가변수는 범주형 변수를 0 또는 1 값을 가진 하나 이상의 새로운 특성으로 바꾼 것이다.<br>
0과 1로 표현된 변수는 선형 이진 분류 공식에 적용할 수 있으므로, 다음과 같이 범주의 개수에 상관없이 범주마다 하나의 특성으로 표현한다.



In [18]:
# Keep only a handful of columns for this example.
data = adult_data[[
    'age',
    'workclass',
    'education',
    'gender',
    'hours-per-week',
    'occupation',
    'income',
]]

In [19]:
# Preview the first ten rows of the selected columns.
data.head(n=10)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K
5,37,Private,Masters,Female,40,Exec-managerial,<=50K
6,49,Private,9th,Female,16,Other-service,<=50K
7,52,Self-emp-not-inc,HS-grad,Male,45,Exec-managerial,>50K
8,31,Private,Masters,Female,50,Prof-specialty,>50K
9,42,Private,Bachelors,Male,40,Exec-managerial,>50K


In [21]:
# Count how often each label occurs in the 'gender' column.
# The labels in the displayed result carry a leading space (' Male', ' Female').
data['gender'].value_counts()

 Male      21790
 Female    10771
Name: gender, dtype: int64

In [25]:
# Apply pd.get_dummies to the selected columns; in the printed result the string
# columns appear as '<column>_<value>' indicator columns, while 'age' and
# 'hours-per-week' keep their names.
data_dummies = pd.get_dummies(data)

# Column names before the encoding ...
print(f"original Feature: {list(data.columns)} \n")
# ... and after it.
print(f"get_dummies 후의 특성: \n{list(data_dummies.columns)}")

original Feature: ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 

get_dummies 후의 특성: 
['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'oc

In [26]:
# Count how often each label occurs in the 'workclass' column.
# NOTE(review): the ' ?' label in the result presumably marks missing values - confirm.
data['workclass'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64