In [2]:
import numpy as np
import pandas as pd

import category_encoders as ce

# 변수 유형

- 수치형 데이터
    - 연속형 데이터
    - 이산형 데이터 (불연속형 데이터)
- 범주형 데이터
    - 순서형 데이터
    - 명목형 데이터

# 데이터 인코딩

범주형 데이터를 수치형 데이터로 변환하는 과정
- Norminal Encoding
> 순서 정보가 있는 데이터에 대한 인코딩  
> ex) 과목 성적, 영화 평점...
- Ordinal Encoding
> 순서 정보가 없는 데이터에 대한 인코딩  
> ex) 성별, 혈액형...

## Norminal Encoding

### One hot encoding

> 데이터의 1개의 Feature에 대해 (unique, 1)의 벡터를 만들어 해당하는 항목만 1, 나머지는 0으로 표시하는 방법  
> 가장 확실한 인코딩 방법  
> Feature의 unique한 값이 많을 수록 (sparse한 데이터일수록) 벡터가 많아져서 불리해진다.  

In [5]:
data = {"color": ["Red", "Blue", "Green", "Blue"]}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,color
0,Red
1,Blue
2,Green
3,Blue


In [6]:
encoder = ce.OneHotEncoder(use_cat_names=True)
df_encoded = encoder.fit_transform(df)
df_encoded

Unnamed: 0,color_Red,color_Blue,color_Green
0,1,0,0
1,0,1,0
2,0,0,1
3,0,1,0


### Mean Encoding

> 같은 값을 가진 row들의 해당 feature 값의 평균을 저장하는 방식

In [7]:
data = {'Pincode': ['753001', '753002', '753003', '753001', '753004', '753002', '753002', '753001', '753003'], 'O/P': [1, 1, 0, 0, 1, 0, 1, 0, 1]}
df = pd.DataFrame(data) 
df

Unnamed: 0,Pincode,O/P
0,753001,1
1,753002,1
2,753003,0
3,753001,0
4,753004,1
5,753002,0
6,753002,1
7,753001,0
8,753003,1


In [8]:
group_mean = df.groupby('Pincode')['O/P'].mean()
group_mean

Pincode
753001    0.333333
753002    0.666667
753003    0.500000
753004    1.000000
Name: O/P, dtype: float64

In [9]:
df['Mean'] = df['Pincode'].map(group_mean)
df.head()

Unnamed: 0,Pincode,O/P,Mean
0,753001,1,0.333333
1,753002,1,0.666667
2,753003,0,0.5
3,753001,0,0.333333
4,753004,1,1.0


## Ordinal Encoding

### Label Encoding

> feature 값의 종류에 따라서 숫자로 분류하는 방식

In [10]:
data = {"column": ["Btech", "Masters", "High School", "PHD"]}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,column
0,Btech
1,Masters
2,High School
3,PHD


In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
encoder = LabelEncoder()
df['column_encoded'] = encoder.fit_transform(df['column'])
df

Unnamed: 0,column,column_encoded
0,Btech,0
1,Masters,2
2,High School,1
3,PHD,3


In [13]:
encoder.classes_

array(['Btech', 'High School', 'Masters', 'PHD'], dtype=object)

In [16]:
encoder.inverse_transform([0, 1])

array(['Btech', 'High School'], dtype=object)

In [29]:
encoder.classes_ = encoder.inverse_transform([1, 0, 2, 3])
encoder.classes_ 

array(['High School', 'Btech', 'Masters', 'PHD'], dtype=object)

In [26]:
df['column_encoded'] = encoder.fit_transform(df["column"])
df

Unnamed: 0,column,column_encoded
0,Btech,0
1,Masters,2
2,High School,1
3,PHD,3


### Target Encoding

> 해당 feature의 통계를 기준으로 랭킹을 매겨서 인코딩하는 방법  
> category_encoders에서는 ["average", "min", "max", "first", "dense"]를 지원

In [31]:
data = {"Column": ["Btech", "PHD", "Masters", "High School", "PHD", "Btech", "Masters", "High School", "High School"], "O/P": [1, 1, 0, 0, 1, 0, 0, 0, 1]}
df = pd.DataFrame(data)
df

Unnamed: 0,Column,O/P
0,Btech,1
1,PHD,1
2,Masters,0
3,High School,0
4,PHD,1
5,Btech,0
6,Masters,0
7,High School,0
8,High School,1


In [32]:
encoder = ce.TargetEncoder()

df_encoded = encoder.fit_transform(df["Column"], df["O/P"])
df["Encoded"] = df_encoded["Column"]
df["Rank"] = df["Encoded"].rank(method="dense", ascending=False)
df

Unnamed: 0,Column,O/P,Encoded,Rank
0,Btech,1,0.452325,2.0
1,PHD,1,0.523251,1.0
2,Masters,0,0.3814,4.0
3,High School,0,0.427282,3.0
4,PHD,1,0.523251,1.0
5,Btech,0,0.452325,2.0
6,Masters,0,0.3814,4.0
7,High School,0,0.427282,3.0
8,High School,1,0.427282,3.0


### Ordinal Encoding

In [35]:
df = pd.DataFrame(
    {'Fruit': ['사과', '딸기', '바나나', '수박', '포도',
            '메론', '자두', '체리', '화이트베리', '무화과'],
    'color':['red1','red2','yellow','red','purple','green','light red','pink','white','brown'],
    'price': [2000,300,400, 30000, 150, 8000,1000,100,300,800]})

df

Unnamed: 0,Fruit,color,price
0,사과,red1,2000
1,딸기,red2,300
2,바나나,yellow,400
3,수박,red,30000
4,포도,purple,150
5,메론,green,8000
6,자두,light red,1000
7,체리,pink,100
8,화이트베리,white,300
9,무화과,brown,800


In [36]:
encoder = ce.OrdinalEncoder(cols='color')

df_encoded = encoder.fit_transform(df)
df_encoded

Unnamed: 0,Fruit,color,price
0,사과,1,2000
1,딸기,2,300
2,바나나,3,400
3,수박,4,30000
4,포도,5,150
5,메론,6,8000
6,자두,7,1000
7,체리,8,100
8,화이트베리,9,300
9,무화과,10,800


# 예제

In [37]:
results = []

In [38]:
from sklearn.tree import DecisionTreeClassifier
SEED = 42

In [53]:
import seaborn as sns

df = sns.load_dataset('titanic')
cols = ["age", "sibsp", "parch", "fare", "pclass", "sex", "embarked", "survived"]
df = df[cols]
df.shape

(891, 8)

In [54]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, random_state=SEED, test_size=0.2)
train.shape, test.shape

((712, 8), (179, 8))

In [55]:
train.age = train.age.fillna(train.age.mean())
test.age = test.age.fillna(train.age.mean())

train.embarked = train.embarked.fillna(train.embarked.mode().values[0])
test.embarked = test.embarked.fillna(train.embarked.mode().values[0])

train.isnull().sum().sum()

0

In [58]:
cols = ["age", "fare"]
features_tr = train[cols]
target_tr = train["survived"]
features_te = test[cols]
target_te = test["survived"]

features_tr.shape, target_tr.shape

((712, 2), (712,))

In [71]:
cols_encoding = ["pclass", "sex", "embarked", "sibsp", "parch"]
tmp_tr = train[cols_encoding]
tmp_te = test[cols_encoding]

tmp_tr.shape

(712, 5)

In [72]:
tmp_tr.head()

Unnamed: 0,pclass,sex,embarked,sibsp,parch
331,1,male,S,0,0
733,2,male,S,0,0
382,3,male,S,0,0
704,3,male,S,1,0
813,3,female,S,4,2


In [73]:
tmp_tr["sex"] = tmp_tr["sex"].map({"male":1, "female":0})
tmp_tr["embarked"] = tmp_tr["embarked"].map({"S":2, "C":1, "Q":0})

tmp_te["sex"] = tmp_te["sex"].map({"male":1, "female":0})
tmp_te["embarked"] = tmp_te["embarked"].map({"S":2, "C":1, "Q":0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_tr["sex"] = tmp_tr["sex"].map({"male":1, "female":0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_tr["embarked"] = tmp_tr["embarked"].map({"S":2, "C":1, "Q":0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_te["sex"] = tmp_te["sex"].map({"male":1, "female":0})
A value is trying to b

In [None]:
tmp_tr

Unnamed: 0,pclass,sex,embarked,sibsp,parch
331,1,1,2,0,0
733,2,1,2,0,0
382,3,1,2,0,0
704,3,1,2,1,0
813,3,0,2,4,2
...,...,...,...,...,...
106,3,0,2,0,0
270,1,1,2,0,0
860,3,1,2,2,0
435,1,0,2,1,2


In [78]:
tmp_tr.isnull().sum().sum(), tmp_te.isnull().sum().sum()

(0, 0)

## One hot Encoding

In [79]:
encoder = ce.OneHotEncoder(use_cat_names=True)

In [80]:
enco_tr = pd.DataFrame()
enco_te = pd.DataFrame()

for col in tmp_tr.columns:
    _enco = encoder.fit_transform(tmp_tr[col].astype('category'))
    enco_tr = pd.concat([enco_tr, _enco], axis=1)

    _enco = encoder.transform(tmp_te[col].astype('category'))
    enco_te = pd.concat([enco_te, _enco], axis=1)

print(f"{enco_tr.shape} / {enco_te.shape}")
enco_tr.head()

(712, 22) / (179, 22)


Unnamed: 0,pclass_1.0,pclass_2.0,pclass_3.0,sex_1.0,sex_0.0,embarked_2.0,embarked_1.0,embarked_0.0,sibsp_0.0,sibsp_1.0,...,sibsp_2.0,sibsp_8.0,sibsp_5.0,parch_0.0,parch_2.0,parch_1.0,parch_6.0,parch_4.0,parch_3.0,parch_5.0
331,1,0,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
733,0,1,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
382,0,0,1,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
704,0,0,1,1,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
813,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [81]:
features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape

((712, 2), (712, 22))

In [85]:
print(features_tr.head())
enco_tr.head()

    age     fare
0  45.5  28.5000
1  23.0  13.0000
2  32.0   7.9250
3  26.0   7.8542
4   6.0  31.2750


Unnamed: 0,pclass_1.0,pclass_2.0,pclass_3.0,sex_1.0,sex_0.0,embarked_2.0,embarked_1.0,embarked_0.0,sibsp_0.0,sibsp_1.0,...,sibsp_2.0,sibsp_8.0,sibsp_5.0,parch_0.0,parch_2.0,parch_1.0,parch_6.0,parch_4.0,parch_3.0,parch_5.0
0,1,0,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,0,1,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,1,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,1,1,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [86]:
df_tr = pd.concat([features_tr, enco_tr], axis=1).reset_index(drop=True)
df_te = pd.concat([features_te, enco_te], axis=1).reset_index(drop=True)

print(f"{df_tr.shape} / {df_te.shape}")
df_tr.head()

(712, 24) / (179, 24)


Unnamed: 0,age,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_1.0,sex_0.0,embarked_2.0,embarked_1.0,embarked_0.0,...,sibsp_2.0,sibsp_8.0,sibsp_5.0,parch_0.0,parch_2.0,parch_1.0,parch_6.0,parch_4.0,parch_3.0,parch_5.0
0,45.5,28.5,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,23.0,13.0,0,1,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,32.0,7.925,0,0,1,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,7.8542,0,0,1,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,6.0,31.275,0,0,1,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [87]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr, target_tr)

tr_score = model.score(df_tr, target_tr)
te_score = model.score(df_te, target_te)

tr_score, te_score

(0.9803370786516854, 0.776536312849162)

In [88]:
results.append(
    {
        'encoding': 'one-hot',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

## Mean Encoding

In [93]:
enco_tr = pd.concat([tmp_tr, target_tr], axis=1)
enco_tr

Unnamed: 0,pclass,sex,embarked,sibsp,parch,survived
331,1,1,2,0,0,0
733,2,1,2,0,0,0
382,3,1,2,0,0,0
704,3,1,2,1,0,0
813,3,0,2,4,2,0
...,...,...,...,...,...,...
106,3,0,2,0,0,1
270,1,1,2,0,0,0
860,3,1,2,2,0,0
435,1,0,2,1,2,1


In [94]:
enco_tr = enco_tr.groupby("survived").mean()
enco_tr

Unnamed: 0_level_0,pclass,sex,embarked,sibsp,parch
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2.536036,0.855856,1.70045,0.596847,0.331081
1,1.988806,0.324627,1.58209,0.481343,0.458955


In [98]:
df_tr = features_tr.copy()
df_tr = pd.concat([df_tr.reset_index(drop=True), target_tr.reset_index(drop=True)], axis=1)
df_te = features_te.copy()
df_te = pd.concat([df_te.reset_index(drop=True), target_te.reset_index(drop=True)], axis=1)

df_tr

Unnamed: 0,age,fare,survived
0,45.500000,28.5000,0
1,23.000000,13.0000,0
2,32.000000,7.9250,0
3,26.000000,7.8542,0
4,6.000000,31.2750,0
...,...,...,...
707,21.000000,7.6500,1
708,29.498846,31.0000,0
709,41.000000,14.1083,0
710,14.000000,120.0000,1


In [99]:
df_tr = pd.merge(df_tr, enco_tr, on="survived", how="left")
df_te = pd.merge(df_te, enco_tr, on="survived", how="left")

df_tr

Unnamed: 0,age,fare,survived,pclass,sex,embarked,sibsp,parch
0,45.500000,28.5000,0,2.536036,0.855856,1.70045,0.596847,0.331081
1,23.000000,13.0000,0,2.536036,0.855856,1.70045,0.596847,0.331081
2,32.000000,7.9250,0,2.536036,0.855856,1.70045,0.596847,0.331081
3,26.000000,7.8542,0,2.536036,0.855856,1.70045,0.596847,0.331081
4,6.000000,31.2750,0,2.536036,0.855856,1.70045,0.596847,0.331081
...,...,...,...,...,...,...,...,...
707,21.000000,7.6500,1,1.988806,0.324627,1.58209,0.481343,0.458955
708,29.498846,31.0000,0,2.536036,0.855856,1.70045,0.596847,0.331081
709,41.000000,14.1083,0,2.536036,0.855856,1.70045,0.596847,0.331081
710,14.000000,120.0000,1,1.988806,0.324627,1.58209,0.481343,0.458955


In [101]:
df_tr.drop(['survived'], axis=1, inplace=True)
df_te.drop(['survived'], axis=1, inplace=True)

print(f"{df_tr.shape} / {df_te.shape}")
df_tr

(712, 7) / (179, 7)


Unnamed: 0,age,fare,pclass,sex,embarked,sibsp,parch
0,45.500000,28.5000,2.536036,0.855856,1.70045,0.596847,0.331081
1,23.000000,13.0000,2.536036,0.855856,1.70045,0.596847,0.331081
2,32.000000,7.9250,2.536036,0.855856,1.70045,0.596847,0.331081
3,26.000000,7.8542,2.536036,0.855856,1.70045,0.596847,0.331081
4,6.000000,31.2750,2.536036,0.855856,1.70045,0.596847,0.331081
...,...,...,...,...,...,...,...
707,21.000000,7.6500,1.988806,0.324627,1.58209,0.481343,0.458955
708,29.498846,31.0000,2.536036,0.855856,1.70045,0.596847,0.331081
709,41.000000,14.1083,2.536036,0.855856,1.70045,0.596847,0.331081
710,14.000000,120.0000,1.988806,0.324627,1.58209,0.481343,0.458955


In [106]:
df_tr

Unnamed: 0,age,fare,pclass,sex,embarked,sibsp,parch
0,45.500000,28.5000,2.536036,0.855856,1.70045,0.596847,0.331081
1,23.000000,13.0000,2.536036,0.855856,1.70045,0.596847,0.331081
2,32.000000,7.9250,2.536036,0.855856,1.70045,0.596847,0.331081
3,26.000000,7.8542,2.536036,0.855856,1.70045,0.596847,0.331081
4,6.000000,31.2750,2.536036,0.855856,1.70045,0.596847,0.331081
...,...,...,...,...,...,...,...
707,21.000000,7.6500,1.988806,0.324627,1.58209,0.481343,0.458955
708,29.498846,31.0000,2.536036,0.855856,1.70045,0.596847,0.331081
709,41.000000,14.1083,2.536036,0.855856,1.70045,0.596847,0.331081
710,14.000000,120.0000,1.988806,0.324627,1.58209,0.481343,0.458955


In [102]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr, target_tr)

tr_score = model.score(df_tr, target_tr)
te_score = model.score(df_te, target_te)

tr_score, te_score

(1.0, 1.0)

In [107]:
results.append(
    {
        'encoding': 'mean',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

## Label Encoding

In [112]:
encoder = LabelEncoder()
enco_tr = pd.DataFrame()
enco_te = pd.DataFrame()

for col in tmp_tr.columns:
    enco_tr[col] = encoder.fit_transform(tmp_tr[col])
    enco_te[col] = encoder.transform(tmp_te[col])

print(f"{enco_tr.shape} / {enco_te.shape}")
enco_tr

(712, 5) / (179, 5)


Unnamed: 0,pclass,sex,embarked,sibsp,parch
0,0,1,2,0,0
1,1,1,2,0,0
2,2,1,2,0,0
3,2,1,2,1,0
4,2,0,2,4,2
...,...,...,...,...,...
707,2,0,2,0,0
708,0,1,2,0,0
709,2,1,2,2,0
710,0,0,2,1,2


In [113]:
features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape

((712, 2), (712, 5))

In [114]:
df_tr = pd.concat([features_tr, enco_tr], axis=1).reset_index(drop=True)
df_te = pd.concat([features_te, enco_te], axis=1).reset_index(drop=True)

print(f"{df_tr.shape} / {df_te.shape}")
df_tr

(712, 7) / (179, 7)


Unnamed: 0,age,fare,pclass,sex,embarked,sibsp,parch
0,45.500000,28.5000,0,1,2,0,0
1,23.000000,13.0000,1,1,2,0,0
2,32.000000,7.9250,2,1,2,0,0
3,26.000000,7.8542,2,1,2,1,0
4,6.000000,31.2750,2,0,2,4,2
...,...,...,...,...,...,...,...
707,21.000000,7.6500,2,0,2,0,0
708,29.498846,31.0000,0,1,2,0,0
709,41.000000,14.1083,2,1,2,2,0
710,14.000000,120.0000,0,0,2,1,2


In [115]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr, target_tr)

tr_score = model.score(df_tr, target_tr)
te_score = model.score(df_te, target_te)

tr_score, te_score

(0.9803370786516854, 0.7821229050279329)

In [116]:
results.append(
    {
        'encoding': 'label',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

## Target Encoding

In [117]:
encoder = ce.TargetEncoder()

In [119]:
enco_tr = encoder.fit_transform(tmp_tr.reset_index(drop=True), target_tr.reset_index(drop=True))
enco_te = encoder.transform(tmp_te.reset_index(drop=True))

print(f"{enco_tr.shape} / {enco_te.shape}")
enco_tr

(712, 5) / (179, 5)


Unnamed: 0,pclass,sex,embarked,sibsp,parch
0,1,1,2,0,0
1,2,1,2,0,0
2,3,1,2,0,0
3,3,1,2,1,0
4,3,0,2,4,2
...,...,...,...,...,...
707,3,0,2,0,0
708,1,1,2,0,0
709,3,1,2,2,0
710,1,0,2,1,2


In [121]:
features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape

((712, 2), (712, 5))

In [122]:
df_tr = pd.concat([features_tr, enco_tr], axis=1).reset_index(drop=True)
df_te = pd.concat([features_te, enco_te], axis=1).reset_index(drop=True)

print(f"{df_tr.shape} / {df_te.shape}")
df_tr

(712, 7) / (179, 7)


Unnamed: 0,age,fare,pclass,sex,embarked,sibsp,parch
0,45.500000,28.5000,1,1,2,0,0
1,23.000000,13.0000,2,1,2,0,0
2,32.000000,7.9250,3,1,2,0,0
3,26.000000,7.8542,3,1,2,1,0
4,6.000000,31.2750,3,0,2,4,2
...,...,...,...,...,...,...,...
707,21.000000,7.6500,3,0,2,0,0
708,29.498846,31.0000,1,1,2,0,0
709,41.000000,14.1083,3,1,2,2,0
710,14.000000,120.0000,1,0,2,1,2


In [125]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr, target_tr)

tr_score = model.score(df_tr, target_tr)
te_score = model.score(df_te, target_te)

tr_score, te_score

(0.9803370786516854, 0.7821229050279329)

In [126]:
results.append({
    'encoding': 'target',
    'tr_score': tr_score,
    'te_score': te_score
})

## Ordinal Encoding

In [127]:
encoder = ce.OrdinalEncoder(cols = tmp_tr.columns)

In [128]:
enco_tr = encoder.fit_transform(tmp_tr)
enco_te = encoder.transform(tmp_te)

print(f"{enco_tr.shape} / {enco_te.shape}")
enco_tr

(712, 5) / (179, 5)


Unnamed: 0,pclass,sex,embarked,sibsp,parch
331,1,1,1,1,1
733,2,1,1,1,1
382,3,1,1,1,1
704,3,1,1,2,1
813,3,2,1,3,2
...,...,...,...,...,...
106,3,2,1,1,1
270,1,1,1,1,1
860,3,1,1,5,1
435,1,2,1,2,2


In [129]:
features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape

((712, 2), (712, 5))

In [130]:
df_tr = pd.concat([features_tr, enco_tr], axis=1).reset_index(drop=True)
df_te = pd.concat([features_te, enco_te], axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr

(712, 7) / (179, 7)


Unnamed: 0,age,fare,pclass,sex,embarked,sibsp,parch
0,45.500000,28.5000,1,1,1,1,1
1,23.000000,13.0000,2,1,1,1,1
2,32.000000,7.9250,3,1,1,1,1
3,26.000000,7.8542,3,1,1,2,1
4,6.000000,31.2750,3,2,1,3,2
...,...,...,...,...,...,...,...
707,21.000000,7.6500,3,2,1,1,1
708,29.498846,31.0000,1,1,1,1,1
709,41.000000,14.1083,3,1,1,5,1
710,14.000000,120.0000,1,2,1,2,2


In [131]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr,target_tr) 
te_score = model.score(df_te,target_te)

tr_score, te_score

(0.9803370786516854, 0.776536312849162)

In [132]:
results.append(
    {
        'encoding': 'ordinal',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

## 결과 확인

In [133]:
pd.DataFrame(results).sort_values(by=['te_score', 'tr_score'], ascending=[False,])

Unnamed: 0,encoding,tr_score,te_score
1,mean,1.0,1.0
2,label,0.980337,0.782123
3,target,0.980337,0.782123
0,one-hot,0.980337,0.776536
4,ordinal,0.980337,0.776536
