# 변수의 종류

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Small example table; Country, Age and Job contain missing entries
# (None / np.nan), and Age mixes a string with integers.
data = dict(
    Name=["John", "Sabre", "Kim", "Sato", "Lee", "Smith", "David", "Park"],
    Country=["USA", "France", "Korea", None, "Korea", "UK", "USA", "Korea"],
    # Numerical in nature, but may end up being treated like a categorical column.
    Age=["31", 33, None, 40, 36, 55, np.nan, 35],
    Job=["Student", np.nan, "Developer", "Chef", "Professor", "CEO", "Banker", "Student"],
    Hand=["L", "R", "R", "B", "L", "L", "R", "R"],
    Height=["T", "S", "M", "S", "T", "S", "S", "T"],
    Capital=[48.35, 150.8, 99.0, 100.0, 182.3, 1101.65, 131.87, 65.8],
)

# df_nan keeps the raw frame; df is an independent copy used for the steps below.
df_nan = pd.DataFrame(data)
df = df_nan.copy()
df

Unnamed: 0,Name,Country,Age,Job,Hand,Height,Capital
0,John,USA,31.0,Student,L,T,48.35
1,Sabre,France,33.0,,R,S,150.8
2,Kim,Korea,,Developer,R,M,99.0
3,Sato,,40.0,Chef,B,S,100.0
4,Lee,Korea,36.0,Professor,L,T,182.3
5,Smith,UK,55.0,CEO,L,S,1101.65
6,David,USA,,Banker,R,S,131.87
7,Park,Korea,35.0,Student,R,T,65.8


In [None]:
# Print each column's dtype and non-null count to spot missing values and mis-typed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Name     8 non-null      object 
 1   Country  7 non-null      object 
 2   Age      6 non-null      object 
 3   Job      7 non-null      object 
 4   Hand     8 non-null      object 
 5   Height   8 non-null      object 
 6   Capital  8 non-null      float64
dtypes: float64(1), object(6)
memory usage: 576.0+ bytes


In [None]:
# Age -> Int16: cast the column through the nullable Float32 dtype first,
# then to the nullable Int16 dtype.
df['Age'] = (
    df['Age']
    .astype('Float32')
    .astype('Int16')
)

df

Unnamed: 0,Name,Country,Age,Job,Hand,Height,Capital
0,John,USA,31.0,Student,L,T,48.35
1,Sabre,France,33.0,,R,S,150.8
2,Kim,Korea,,Developer,R,M,99.0
3,Sato,,40.0,Chef,B,S,100.0
4,Lee,Korea,36.0,Professor,L,T,182.3
5,Smith,UK,55.0,CEO,L,S,1101.65
6,David,USA,,Banker,R,S,131.87
7,Park,Korea,35.0,Student,R,T,65.8


In [None]:
# Print dtypes and non-null counts again to inspect the Age column after the cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Name     8 non-null      object 
 1   Country  7 non-null      object 
 2   Age      6 non-null      Int16  
 3   Job      7 non-null      object 
 4   Hand     8 non-null      object 
 5   Height   8 non-null      object 
 6   Capital  8 non-null      float64
dtypes: Int16(1), float64(1), object(5)
memory usage: 536.0+ bytes


In [None]:
# Column names grouped by variable kind: numerical vs. categorical.
num_cols = ["Age", "Capital"]
cat_cols = ["Name", "Country", "Job", "Hand", "Height"]

In [None]:
# Display the list of numerical column names.
num_cols

['Age', 'Capital']

In [None]:
# Show only the categorical columns of df.
df[cat_cols]

Unnamed: 0,Name,Country,Job,Hand,Height
0,John,USA,Student,L,T
1,Sabre,France,,R,S
2,Kim,Korea,Developer,R,M
3,Sato,,Chef,B,S
4,Lee,Korea,Professor,L,T
5,Smith,UK,CEO,L,S
6,David,USA,Banker,R,S
7,Park,Korea,Student,R,T


In [None]:
# Encoded values are written into this copy, so df itself is left unchanged.
df_enc = df.copy()

- LabelEncoding하는 방법
    - 판다스에서 직접하는 방법
    - sklearn을 이용하는 방법

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the Hand column: fit the encoder on it, then show the codes.
# (fit() returns the encoder itself, so the construction and fit can be chained.)
ord_enc = OrdinalEncoder().fit(df[['Hand']])
ord_enc.transform(df[['Hand']])

array([[1.],
       [2.],
       [2.],
       [0.],
       [1.],
       [1.],
       [2.],
       [2.]])

In [None]:
# Overwrite Hand in the working copy with the fitted encoder's codes;
# the source values are read from df, not from df_enc.
df_enc['Hand'] = ord_enc.transform(df[['Hand']])
df_enc

Unnamed: 0,Name,Country,Age,Job,Hand,Height,Capital
0,John,USA,31.0,Student,1.0,T,48.35
1,Sabre,France,33.0,,2.0,S,150.8
2,Kim,Korea,,Developer,2.0,M,99.0
3,Sato,,40.0,Chef,0.0,S,100.0
4,Lee,Korea,36.0,Professor,1.0,T,182.3
5,Smith,UK,55.0,CEO,1.0,S,1101.65
6,David,USA,,Banker,2.0,S,131.87
7,Park,Korea,35.0,Student,2.0,T,65.8


- Hand는 순서가 없는 Nominal 변수인데 0 < 1 < 2로 인코딩되어, 이후 모델이 학습할 때 이 순서를 결과에 반영할 수 있게 됨

- 이런 경우 원핫인코딩을 사용하는 것이 더 좋음

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the Hand column and return a dense array.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# keyword first and fall back to the old one on older installations.
try:
    oh_enc = OneHotEncoder(sparse_output=False)
except TypeError:
    oh_enc = OneHotEncoder(sparse=False)
oh_enc.fit(df[['Hand']])
oh_enc.transform(df[['Hand']])

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [None]:
# Names of the generated one-hot columns, in the same order as the encoder's output columns.
oh_enc.get_feature_names_out()

array(['Hand_B', 'Hand_L', 'Hand_R'], dtype=object)

In [None]:
# Build the one-hot columns for Hand. The rows come from df, so label them
# with df's index; otherwise the new frame gets a fresh 0..n-1 index and the
# column-wise concat (which aligns on index labels) would misalign rows
# whenever the data does not use the default index.
hand_onehot = pd.DataFrame(
    oh_enc.transform(df[['Hand']]),
    columns=oh_enc.get_feature_names_out(),
    index=df.index,
)
# Append the one-hot columns and drop the original Hand column they replace.
df_enc = pd.concat([df_enc, hand_onehot], axis=1).drop('Hand', axis=1)
df_enc

Unnamed: 0,Name,Country,Age,Job,Height,Capital,Hand_B,Hand_L,Hand_R
0,John,USA,31.0,Student,T,48.35,0.0,1.0,0.0
1,Sabre,France,33.0,,S,150.8,0.0,0.0,1.0
2,Kim,Korea,,Developer,M,99.0,0.0,0.0,1.0
3,Sato,,40.0,Chef,S,100.0,1.0,0.0,0.0
4,Lee,Korea,36.0,Professor,T,182.3,0.0,1.0,0.0
5,Smith,UK,55.0,CEO,S,1101.65,0.0,1.0,0.0
6,David,USA,,Banker,S,131.87,0.0,0.0,1.0
7,Park,Korea,35.0,Student,T,65.8,0.0,0.0,1.0


- Height는 순서가 있는 Ordinal 변수이므로 그대로 OrdinalEncoding

In [None]:
# Refit the existing encoder on Height and show the resulting codes.
# (fit() returns the encoder, so transform can be chained onto it.)
ord_enc.fit(df[['Height']]).transform(df[['Height']])

array([[2.],
       [1.],
       [0.],
       [1.],
       [2.],
       [1.],
       [1.],
       [2.]])

- 인코딩된 결과를 보면 M:0 < S:1 < T:2 로 되어 S < M < T 라는 의도된 순서를 지키지 못함

- 순서대로 하기 위해 카테고리 순서를 직접 입력

In [None]:
# Ordinal-encode Height with the category order given explicitly,
# so that S -> 0, M -> 1, T -> 2.
ord_enc = OrdinalEncoder(categories=[['S', 'M', 'T']]).fit(df[['Height']])
ord_enc.transform(df[['Height']])

array([[2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [0.],
       [0.],
       [2.]])

- 결과를 보면 S:0 < M:1 < T:2 로 제대로 인코딩된 것을 확인

In [None]:
# Overwrite Height in the working copy with the encoder's codes;
# the source values are read from df, not from df_enc.
df_enc['Height'] = ord_enc.transform(df[['Height']])
df_enc

Unnamed: 0,Name,Country,Age,Job,Height,Capital,Hand_B,Hand_L,Hand_R
0,John,USA,31.0,Student,2.0,48.35,0.0,1.0,0.0
1,Sabre,France,33.0,,0.0,150.8,0.0,0.0,1.0
2,Kim,Korea,,Developer,1.0,99.0,0.0,0.0,1.0
3,Sato,,40.0,Chef,0.0,100.0,1.0,0.0,0.0
4,Lee,Korea,36.0,Professor,2.0,182.3,0.0,1.0,0.0
5,Smith,UK,55.0,CEO,0.0,1101.65,0.0,1.0,0.0
6,David,USA,,Banker,0.0,131.87,0.0,0.0,1.0
7,Park,Korea,35.0,Student,2.0,65.8,0.0,0.0,1.0
