In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import make_column_transformer

# ---------- Load data ----------

# Small sample frame, built one column at a time.
# Columns: hour, attendance, region, score
df = pd.DataFrame({
    'hour':       [2, 3, 3, 5, 7, 2, 8, 9, 6, 9, 6, 2],
    'attendance': [3, 2, 4, 5, 5, 5, 9, 10, 12, 2, 10, 4],
    'region':     ['country', 'country', 'country', 'country', 'city', 'country',
                   'city', 'city', 'city', 'city', 'city', 'country'],
    'score':      ['A', 'B', 'A0', 'A+', 'B', 'C', 'B', 'D', 'D', 'C', 'C', 'F'],
})

df

Unnamed: 0,hour,attendance,region,score
0,2,3,country,A
1,3,2,country,B
2,3,4,country,A0
3,5,5,country,A+
4,7,5,city,B
5,2,5,country,C
6,8,9,city,B
7,9,10,city,D
8,6,12,city,D
9,9,2,city,C


In [6]:
# Distinct values that appear in the 'score' column, in order of first appearance.
df['score'].unique()

array(['A', 'B', 'A0', 'A+', 'C', 'D', 'F'], dtype=object)

In [9]:
# Explicit category order for 'score': a value's position in this list becomes its
# ordinal code ('A+' -> 0, 'A0' -> 1, ..., 'F' -> 6).
score_encoder = OrdinalEncoder(categories=[['A+', 'A0', 'A', 'B', 'C', 'D', 'F']])
region_encoder = OneHotEncoder()

# Ordinal-encode 'score', one-hot-encode 'region', and pass the remaining
# columns ('hour', 'attendance') through unchanged.
transformer = make_column_transformer(
    (score_encoder, ['score']),
    (region_encoder, ['region']),
    remainder='passthrough',
)

# fit() returns the fitted transformer itself, so the two steps can be chained.
trans_data = transformer.fit(df).transform(df)
trans_data

array([[ 2.,  0.,  1.,  2.,  3.],
       [ 3.,  0.,  1.,  3.,  2.],
       [ 1.,  0.,  1.,  3.,  4.],
       [ 0.,  0.,  1.,  5.,  5.],
       [ 3.,  1.,  0.,  7.,  5.],
       [ 4.,  0.,  1.,  2.,  5.],
       [ 3.,  1.,  0.,  8.,  9.],
       [ 5.,  1.,  0.,  9., 10.],
       [ 5.,  1.,  0.,  6., 12.],
       [ 4.,  1.,  0.,  9.,  2.],
       [ 4.,  1.,  0.,  6., 10.],
       [ 6.,  0.,  1.,  2.,  4.]])

In [10]:
# Wrap the transformed array in a DataFrame labelled with the generated feature
# names (e.g. 'ordinalencoder__score', 'onehotencoder__region_city', ...).
# The names are computed once here; a bare call earlier in the cell would be
# neither displayed nor reused.
trans_df = pd.DataFrame(data=trans_data, columns=transformer.get_feature_names_out())
trans_df

Unnamed: 0,ordinalencoder__score,onehotencoder__region_city,onehotencoder__region_country,remainder__hour,remainder__attendance
0,2.0,0.0,1.0,2.0,3.0
1,3.0,0.0,1.0,3.0,2.0
2,1.0,0.0,1.0,3.0,4.0
3,0.0,0.0,1.0,5.0,5.0
4,3.0,1.0,0.0,7.0,5.0
5,4.0,0.0,1.0,2.0,5.0
6,3.0,1.0,0.0,8.0,9.0
7,5.0,1.0,0.0,9.0,10.0
8,5.0,1.0,0.0,6.0,12.0
9,4.0,1.0,0.0,9.0,2.0


In [11]:
# Category order held by the fitted ordinal encoder; codes are assigned in this order.
# The encoder is looked up by its auto-generated name rather than by position.
transformer.named_transformers_['ordinalencoder'].categories_

[array(['A+', 'A0', 'A', 'B', 'C', 'D', 'F'], dtype=object)]

---

## 연습문제

In [31]:
import seaborn as sns
# Load seaborn's "titanic" example dataset as a pandas DataFrame.
titanic = sns.load_dataset("titanic")

In [32]:
# Summarise columns, non-null counts and dtypes to see which columns are
# categorical and which contain missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [33]:
# Preview the first three rows.
titanic.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [16]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,LabelEncoder

import seaborn as sns
titanic = sns.load_dataset("titanic")

# Make a column transformer
# one-hot: ["embarked", "who", "adult_male", "deck", "embark_town", "sex", "alive", "alone"]
# ordinal: ["class"]
onehot_columns = ["embarked", "who", "adult_male", "deck", "embark_town", "sex", "alive", "alone"]
ordinal_columns = ["class"]

# Explicit category order for 'class': position in the list becomes the code
# (First -> 0, Second -> 1, Third -> 2). One inner list per encoded column.
class_order = [["First", "Second", "Third"]]

transformer = make_column_transformer(
    (OneHotEncoder(), onehot_columns),
    (OrdinalEncoder(categories=class_order), ordinal_columns),
    # Columns not listed above are appended unchanged.
    remainder='passthrough'
)

# Fit on the full frame and encode it in one step, then label the resulting
# array with the transformer's generated feature names.
trans_data = transformer.fit_transform(titanic)
trans_df = pd.DataFrame(data=trans_data, columns=transformer.get_feature_names_out())
trans_df

Unnamed: 0,onehotencoder__embarked_C,onehotencoder__embarked_Q,onehotencoder__embarked_S,onehotencoder__embarked_nan,onehotencoder__who_child,onehotencoder__who_man,onehotencoder__who_woman,onehotencoder__adult_male_False,onehotencoder__adult_male_True,onehotencoder__deck_A,...,onehotencoder__alive_yes,onehotencoder__alone_False,onehotencoder__alone_True,ordinalencoder__class,remainder__survived,remainder__pclass,remainder__age,remainder__sibsp,remainder__parch,remainder__fare
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,2.0,0.0,3.0,22.0,1.0,0.0,7.2500
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,38.0,1.0,0.0,71.2833
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,2.0,1.0,3.0,26.0,0.0,0.0,7.9250
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,35.0,1.0,0.0,53.1000
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,2.0,0.0,3.0,35.0,0.0,0.0,8.0500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,2.0,27.0,0.0,0.0,13.0000
887,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,19.0,0.0,0.0,30.0000
888,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,2.0,0.0,3.0,,1.0,2.0,23.4500
889,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,26.0,0.0,0.0,30.0000


In [17]:
# Output column names produced by the fitted transformer, each prefixed with the
# name of the step that produced it ('onehotencoder__', 'ordinalencoder__', 'remainder__').
transformer.get_feature_names_out()

array(['onehotencoder__embarked_C', 'onehotencoder__embarked_Q',
       'onehotencoder__embarked_S', 'onehotencoder__embarked_nan',
       'onehotencoder__who_child', 'onehotencoder__who_man',
       'onehotencoder__who_woman', 'onehotencoder__adult_male_False',
       'onehotencoder__adult_male_True', 'onehotencoder__deck_A',
       'onehotencoder__deck_B', 'onehotencoder__deck_C',
       'onehotencoder__deck_D', 'onehotencoder__deck_E',
       'onehotencoder__deck_F', 'onehotencoder__deck_G',
       'onehotencoder__deck_nan', 'onehotencoder__embark_town_Cherbourg',
       'onehotencoder__embark_town_Queenstown',
       'onehotencoder__embark_town_Southampton',
       'onehotencoder__embark_town_nan', 'onehotencoder__sex_female',
       'onehotencoder__sex_male', 'onehotencoder__alive_no',
       'onehotencoder__alive_yes', 'onehotencoder__alone_False',
       'onehotencoder__alone_True', 'ordinalencoder__class',
       'remainder__survived', 'remainder__pclass', 'remainder__age',
   

In [6]:
# Count nulls per output column. For the one-hot-encoded columns, missing values
# are given their own dedicated '*_nan' indicator column.
trans_df.isna().sum()

onehotencoder__embarked_C                   0
onehotencoder__embarked_Q                   0
onehotencoder__embarked_S                   0
onehotencoder__embarked_nan                 0
onehotencoder__who_child                    0
onehotencoder__who_man                      0
onehotencoder__who_woman                    0
onehotencoder__adult_male_False             0
onehotencoder__adult_male_True              0
onehotencoder__deck_A                       0
onehotencoder__deck_B                       0
onehotencoder__deck_C                       0
onehotencoder__deck_D                       0
onehotencoder__deck_E                       0
onehotencoder__deck_F                       0
onehotencoder__deck_G                       0
onehotencoder__deck_nan                     0
onehotencoder__embark_town_Cherbourg        0
onehotencoder__embark_town_Queenstown       0
onehotencoder__embark_town_Southampton      0
onehotencoder__embark_town_nan              0
onehotencoder__sex_female         