In [1]:
import numpy as np
import pandas as pd

예제 데이터 프레임 생성

In [2]:
columns = ['농장', '종류', '품질', '가격']
data = [['A농장', '고기', '하', 28700],
        ['B농장', '채소', '하', 12300],
        ['C농장', '과일', '중', 30000],
        ['A농장', '채소', '상', 22000],
        ['B농장', '고기', '중', 32000],
        ['C농장', '과일', '하', 15000],
        ['A농장', '과일', '중', 22000],
        ['B농장', '채소', '상', 24000],
        ['C농장', '고기', '상', 53000],
        ['A농장', '채소', '중', 12000]]

test_columns = ['농장', '종류', '품질']
test_data = [['A농장', '고기', '하'],
        ['B농장', '채소', '하'],
        ['D농장', '과일', '중']]


farm_df = pd.DataFrame(data, columns=columns)
test_farm_df = pd.DataFrame(test_data, columns=test_columns)
display(farm_df)
display(test_farm_df)

Unnamed: 0,농장,종류,품질,가격
0,A농장,고기,하,28700
1,B농장,채소,하,12300
2,C농장,과일,중,30000
3,A농장,채소,상,22000
4,B농장,고기,중,32000
5,C농장,과일,하,15000
6,A농장,과일,중,22000
7,B농장,채소,상,24000
8,C농장,고기,상,53000
9,A농장,채소,중,12000


Unnamed: 0,농장,종류,품질
0,A농장,고기,하
1,B농장,채소,하
2,D농장,과일,중


## Label Encoding

농장, 종류, 품질에 대해 라벨 인코딩

In [3]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

farm1 = farm_df.copy()

category = ['농장', '종류', '품질']
label_encode = {}

for c in category:
    l = LabelEncoder()
    farm1[c] = l.fit_transform(farm1[c])
    label_encode[c] = l

display(farm1)

# 인코딩 수행 과정
for c in category:
    print(f"{c} : ", end=' ')
    for i in range(len(label_encode[c].classes_)):
        print(f"{label_encode[c].classes_[i]} ({i})", end=' ')
    print()
    

Unnamed: 0,농장,종류,품질,가격
0,0,0,2,28700
1,1,2,2,12300
2,2,1,1,30000
3,0,2,0,22000
4,1,0,1,32000
5,2,1,2,15000
6,0,1,1,22000
7,1,2,0,24000
8,2,0,0,53000
9,0,2,1,12000


농장 :  A농장 (0) B농장 (1) C농장 (2) 
종류 :  고기 (0) 과일 (1) 채소 (2) 
품질 :  상 (0) 중 (1) 하 (2) 


테스트 데이터에도 라벨 인코딩 값 적용

In [4]:
test1 = test_farm_df.copy()

for c in category:
    try:
        test1[c] = label_encode[c].transform(test1[c])
    except Exception as e:
        print(e)

test1

y contains previously unseen labels: 'D농장'


Unnamed: 0,농장,종류,품질
0,A농장,0,2
1,B농장,2,2
2,D농장,1,1


학습 데이터에 없는 카테고리 처리

In [5]:
test2 = test_farm_df.copy()

# 라벨 인코더 클래스에 '기타' 추가
for l in label_encode.values():
    l.classes_ = np.append(l.classes_, 'Other')
# 새로운 카테고리를 '기타'로 매핑
for c in category:
    test2[c] = test2[c].apply(lambda x: x if x in label_encode[c].classes_ else 'Other')
    test2[c] = label_encode[c].transform(test2[c])
    
test2

Unnamed: 0,농장,종류,품질
0,0,0,2
1,1,2,2
2,3,1,1


## Direct Mapping

각 카테고리의 순서를 직접 지정

In [6]:
farm2 = farm_df.copy()
quality = {'하': 0,
           '중': 1,
           '상': 2}

# 테스트 데이터에 대해서도 똑같이 적용 가능
farm2['품질'] = farm2['품질'].replace(quality)
farm2

  farm2['품질'] = farm2['품질'].replace(quality)


Unnamed: 0,농장,종류,품질,가격
0,A농장,고기,0,28700
1,B농장,채소,0,12300
2,C농장,과일,1,30000
3,A농장,채소,2,22000
4,B농장,고기,1,32000
5,C농장,과일,0,15000
6,A농장,과일,1,22000
7,B농장,채소,2,24000
8,C농장,고기,2,53000
9,A농장,채소,1,12000


## Ordinary Encoder

In [7]:
from sklearn.preprocessing import OrdinalEncoder

farm3 = farm_df.copy()
# 카테고리 순서 지정
ordinary_encoder = OrdinalEncoder(categories=[['하', '중', '상']])
# 기본이 float형 이므로 int형 변경 필요
# 테스트 데이터에 대해서도 똑같이 적용 가능
farm3['품질'] = ordinary_encoder.fit_transform(farm3[['품질']]).astype(int)

farm3


Unnamed: 0,농장,종류,품질,가격
0,A농장,고기,0,28700
1,B농장,채소,0,12300
2,C농장,과일,1,30000
3,A농장,채소,2,22000
4,B농장,고기,1,32000
5,C농장,과일,0,15000
6,A농장,과일,1,22000
7,B농장,채소,2,24000
8,C농장,고기,2,53000
9,A농장,채소,1,12000


## One-Hot Encoding

농장, 종류 값을 One-Hot Encoding 처리

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

farm4 = farm_df.copy()

# sparse_output: 밀집 형태로 저장 -> 모든 요소를 가지는 2차원 배열 형태
# handle_unknown='ignore': 새로운 범주를 만나더라도 오류 없이 처리
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
farm4_encoded = encoder.fit_transform(farm4[['농장', '종류']])
farm4_columns = encoder.get_feature_names_out(['농장', '종류'])
farm4_encoded_df = pd.DataFrame(farm4_encoded, columns=farm4_columns)

# 인코딩 값과 나머지 값(농장, 종류를 제외한 값)을 결합
farm4_df = pd.concat([farm4, farm4_encoded_df], axis=1).drop(['농장', '종류'], axis=1)

farm4_df

Unnamed: 0,품질,가격,농장_A농장,농장_B농장,농장_C농장,종류_고기,종류_과일,종류_채소
0,하,28700,1.0,0.0,0.0,1.0,0.0,0.0
1,하,12300,0.0,1.0,0.0,0.0,0.0,1.0
2,중,30000,0.0,0.0,1.0,0.0,1.0,0.0
3,상,22000,1.0,0.0,0.0,0.0,0.0,1.0
4,중,32000,0.0,1.0,0.0,1.0,0.0,0.0
5,하,15000,0.0,0.0,1.0,0.0,1.0,0.0
6,중,22000,1.0,0.0,0.0,0.0,1.0,0.0
7,상,24000,0.0,1.0,0.0,0.0,0.0,1.0
8,상,53000,0.0,0.0,1.0,1.0,0.0,0.0
9,중,12000,1.0,0.0,0.0,0.0,0.0,1.0


테스트 데이터도 원-핫 인코딩 값 적용

In [9]:
test3 = test_farm_df.copy()

try:
    # 이미 훈련 데이터 셋에 맞춰진 인코더를 사용하므로 fit_transform이 아닌 transform 사용
    test_encoded = encoder.transform(test3[['농장', '종류']])
    test_encoded_df = pd.DataFrame(test_encoded, columns=farm4_columns)
    test_df = pd.concat([test3, test_encoded_df], axis=1).drop(['농장', '종류'], axis=1)
    display(test_df)
except Exception as e:
    print(e)

Unnamed: 0,품질,농장_A농장,농장_B농장,농장_C농장,종류_고기,종류_과일,종류_채소
0,하,1.0,0.0,0.0,1.0,0.0,0.0
1,하,0.0,1.0,0.0,0.0,0.0,1.0
2,중,0.0,0.0,0.0,0.0,1.0,0.0


불필요한 컬럼을 제거한 코드 -> drop='first'

In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

farm4 = farm_df.copy()

# sparse_output: 밀집 형태로 저장 -> 모든 요소를 가지는 2차원 배열 형태
# handle_unknown='ignore': 새로운 범주를 만나더라도 오류 없이 처리
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
farm4_encoded = encoder.fit_transform(farm4[['농장', '종류']])
farm4_columns = encoder.get_feature_names_out(['농장', '종류'])
farm4_encoded_df = pd.DataFrame(farm4_encoded, columns=farm4_columns)

# 인코딩 값과 나머지 값(농장, 종류를 제외한 값)을 결합
farm4_df = pd.concat([farm4, farm4_encoded_df], axis=1).drop(['농장', '종류'], axis=1)

farm4_df

Unnamed: 0,품질,가격,농장_B농장,농장_C농장,종류_과일,종류_채소
0,하,28700,0.0,0.0,0.0,0.0
1,하,12300,1.0,0.0,0.0,1.0
2,중,30000,0.0,1.0,1.0,0.0
3,상,22000,0.0,0.0,0.0,1.0
4,중,32000,1.0,0.0,0.0,0.0
5,하,15000,0.0,1.0,1.0,0.0
6,중,22000,0.0,0.0,1.0,0.0
7,상,24000,1.0,0.0,0.0,1.0
8,상,53000,0.0,1.0,0.0,0.0
9,중,12000,0.0,0.0,0.0,1.0


## Binary Encoding

In [15]:
import category_encoders as ce

farm5 = farm_df.copy()
encoder = ce.BinaryEncoder(cols=['농장', '종류'])
farm5_encoded = encoder.fit_transform(farm5[['농장', '종류']])
farm5_colums = farm5_encoded.columns
farm5_encoded_df = pd.DataFrame(farm5_encoded, columns=farm5_colums)

farm5_df = pd.concat([farm5, farm5_encoded_df], axis=1).drop(['농장', '종류'], axis=1)
farm5_df


Unnamed: 0,품질,가격,농장_0,농장_1,종류_0,종류_1
0,하,28700,0,1,0,1
1,하,12300,1,0,1,0
2,중,30000,1,1,1,1
3,상,22000,0,1,1,0
4,중,32000,1,0,0,1
5,하,15000,1,1,1,1
6,중,22000,0,1,1,1
7,상,24000,1,0,1,0
8,상,53000,1,1,0,1
9,중,12000,0,1,1,0
