# 범주형변수 가공 데모

## 라이브러리 import 및 설정

In [29]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [30]:
import kaggler
from lightgbm import LGBMRegressor
from matplotlib import rcParams, pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.sparse import hstack
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from warnings import simplefilter

In [31]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
simplefilter('ignore')

## 학습데이터 로드

In [32]:
data_dir = Path('./movies')
trn_file = data_dir / 'movies_train.csv'
seed = 42
target_col = 'box_off_num'

In [33]:
df = pd.read_csv(trn_file, index_col=0)
print(df.shape)
df.head()

(600, 11)


Unnamed: 0_level_0,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866
불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387


## EDA (Exploratory Data Analysis)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 600 entries, 개들의 전쟁 to 베를린
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   distributor     600 non-null    object 
 1   genre           600 non-null    object 
 2   release_time    600 non-null    object 
 3   time            600 non-null    int64  
 4   screening_rat   600 non-null    object 
 5   director        600 non-null    object 
 6   dir_prev_bfnum  270 non-null    float64
 7   dir_prev_num    600 non-null    int64  
 8   num_staff       600 non-null    int64  
 9   num_actor       600 non-null    int64  
 10  box_off_num     600 non-null    int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 56.2+ KB


* 독립변수를 포함 총 6개의 수치형변수가 있다. 그 중 `dir_prev_bfnum`은 결측값이 많음을 확인할 수 있다.
* `distributor`, `genre`, `screening_rat`, `director`의 총 4개의 범주형변수가 있다.
* `release_time`은 시계열변수이지만 문자열 (`object`)으로 인식하고 있다.

In [35]:
df['release_time'] = pd.to_datetime(df['release_time'])
print(df['release_time'].dtype)

datetime64[ns]


In [36]:
df.fillna(0, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 600 entries, 개들의 전쟁 to 베를린
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   distributor     600 non-null    object        
 1   genre           600 non-null    object        
 2   release_time    600 non-null    datetime64[ns]
 3   time            600 non-null    int64         
 4   screening_rat   600 non-null    object        
 5   director        600 non-null    object        
 6   dir_prev_bfnum  600 non-null    float64       
 7   dir_prev_num    600 non-null    int64         
 8   num_staff       600 non-null    int64         
 9   num_actor       600 non-null    int64         
 10  box_off_num     600 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(4)
memory usage: 56.2+ KB


`release_time`을 시계열타입 (`datetime`)으로 변환하고 결측값을 0으로 대체하였다.

In [37]:
num_cols = [x for x in df.columns if df[x].dtype in [np.int64, np.float64] and x != target_col]
cat_cols = ['distributor', 'genre', 'screening_rat', 'director']
print(f'    numeric ({len(num_cols)}):\t{num_cols}')
print(f'categorical ({len(cat_cols)}):\t{cat_cols}')

    numeric (5):	['time', 'dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor']
categorical (4):	['distributor', 'genre', 'screening_rat', 'director']


### 범주형변수 EDA

In [38]:
print(cat_cols)

['distributor', 'genre', 'screening_rat', 'director']


In [39]:
pd.DataFrame(df['distributor'].value_counts())

Unnamed: 0,distributor
CJ 엔터테인먼트,54
롯데엔터테인먼트,52
(주)NEW,30
(주)마운틴픽쳐스,29
인디스토리,26
...,...
(주)두타연,1
미라클 필름,1
에스와이코마드,1
(주)팝 파트너스,1


In [40]:
pd.DataFrame(df['genre'].value_counts())

Unnamed: 0,genre
드라마,221
다큐멘터리,93
멜로/로맨스,78
코미디,53
공포,42
액션,28
느와르,27
애니메이션,21
미스터리,17
SF,13


In [41]:
pd.DataFrame(df['screening_rat'].value_counts())

Unnamed: 0,screening_rat
청소년 관람불가,204
15세 관람가,202
12세 관람가,102
전체 관람가,92


In [42]:
pd.DataFrame(df['director'].value_counts())

Unnamed: 0,director
홍상수,7
전규환,4
노진수,4
장률,4
우민호,4
...,...
장규성,1
신춘수,1
박중구,1
김진만,1


## 수치형/시계열변수 가공

수치형 독립변수 중 멱변환 분포를 따르는 변수에도 `np.log1p()` 변환을 적용하였다.

In [43]:
df[['dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor']] = df[['dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor']].apply(np.log1p)
df[num_cols].describe()

Unnamed: 0,time,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
count,600.0,600.0,600.0,600.0,600.0
mean,100.863333,5.305796,0.462197,4.026352,1.44613
std,18.097528,6.254561,0.55557,1.789517,0.446256
min,45.0,0.0,0.0,0.0,0.0
25%,89.0,0.0,0.0,2.890372,1.098612
50%,100.0,0.0,0.0,4.424829,1.386294
75%,114.0,12.837611,1.098612,5.57973,1.609438
max,180.0,16.684279,1.791759,6.768493,3.258097


In [44]:
df['year'] = df['release_time'].dt.year
df['month'] = df['release_time'].dt.month
df.head()

Unnamed: 0_level_0,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num,year,month
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,0.0,0.0,4.521789,1.098612,23398,2012,11
내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,13.965312,1.098612,5.961005,1.386294,7072501,2015,11
은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,12.304905,1.609438,5.840642,1.609438,6959083,2013,6
나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,10.081425,1.098612,3.044522,1.94591,217866,2012,7
불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,0.693147,0.693147,5.529429,1.098612,483387,2010,11


In [45]:
num_cols += ['year', 'month']
print(num_cols)

['time', 'dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor', 'year', 'month']


In [46]:
features = num_cols + cat_cols
print(features)

['time', 'dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor', 'year', 'month', 'distributor', 'genre', 'screening_rat', 'director']


## 범주형변수 가공

In [47]:
rmse = lambda y, p: np.sqrt(mean_squared_error(y, p))
rmsle = lambda y, p: np.sqrt(mean_squared_error(np.log1p(y), np.log1p(p)))

### Ordinal Encoding

In [48]:
from sklearn.preprocessing import OrdinalEncoder
df_cat = df.copy()
oe = OrdinalEncoder()
df_cat[cat_cols] = oe.fit_transform(df[cat_cols])
df_cat[cat_cols].head()

Unnamed: 0_level_0,distributor,genre,screening_rat,director
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
개들의 전쟁,80.0,10.0,3.0,393.0
내부자들,21.0,2.0,3.0,252.0
은밀하게 위대하게,21.0,10.0,1.0,357.0
나는 공무원이다,6.0,11.0,2.0,17.0
불량남녀,96.0,11.0,1.0,218.0


In [49]:
trn, tst = train_test_split(df_cat, test_size=.2, random_state=seed)
clf = LGBMRegressor(random_state=seed)
clf.fit(trn[features], np.log1p(trn[target_col]))
p = np.expm1(clf.predict(tst[features]))
print(f' RMSE:\t{rmse(tst[target_col], p):12.2f}')
print(f'RMSLE:\t{rmsle(tst[target_col], p):12.2f}')

 RMSE:	  1412591.02
RMSLE:	        1.98


### Label Encoding with Grouping

In [55]:
from kaggler.preprocessing import LabelEncoder
df_cat = df.copy()
le = LabelEncoder(min_obs=2)
df_cat[cat_cols] = le.fit_transform(df[cat_cols])
df_cat[cat_cols].head()

Unnamed: 0_level_0,distributor,genre,screening_rat,director
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
개들의 전쟁,2,5,0,0
내부자들,6,6,0,5
은밀하게 위대하게,6,5,1,0
나는 공무원이다,3,3,3,0
불량남녀,67,3,1,0


In [23]:
trn, tst = train_test_split(df_cat, test_size=.2, random_state=seed)
clf = LGBMRegressor(random_state=seed)
clf.fit(trn[features], np.log1p(trn[target_col]))
p = np.expm1(clf.predict(tst[features]))
print(f' RMSE:\t{rmse(tst[target_col], p):12.2f}')
print(f'RMSLE:\t{rmsle(tst[target_col], p):12.2f}')

 RMSE:	  1182500.97
RMSLE:	        1.92


### One-Hot-Encoding

In [57]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
X = hstack((df[num_cols],
            ohe.fit_transform(df[cat_cols])))
print(X.shape)

(600, 664)


In [58]:
X_trn, X_tst, y_trn, y_tst = train_test_split(X, df[target_col], test_size=.2, random_state=seed)
clf = LGBMRegressor(random_state=seed)
clf.fit(X_trn, np.log1p(y_trn))
p = np.expm1(clf.predict(X_tst))
print(f' RMSE:\t{rmse(tst[target_col], p):12.2f}')
print(f'RMSLE:\t{rmsle(tst[target_col], p):12.2f}')

 RMSE:	  1239343.52
RMSLE:	        1.99


### One-Hot-Encoding with Grouping

In [72]:
from kaggler.preprocessing import OneHotEncoder
ohe = OneHotEncoder(min_obs=2)
ohe.fit(df[cat_cols])
X = hstack((df[num_cols],
            ohe.transform(df[cat_cols])))
print(X.shape)

(600, 187)


In [75]:
df2 = pd.DataFrame(X.toarray(), index=df.index) # 512 변경필요 # Hash함수 적용한 컬럼갯수
df2.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,177,178,179,180,181,182,183,184,185,186
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
개들의 전쟁,96.0,0.0,0.0,4.521789,1.098612,2012.0,11.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
내부자들,130.0,13.965312,1.098612,5.961005,1.386294,2015.0,11.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
은밀하게 위대하게,123.0,12.304905,1.609438,5.840642,1.609438,2013.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
나는 공무원이다,101.0,10.081425,1.098612,3.044522,1.94591,2012.0,7.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
불량남녀,108.0,0.693147,0.693147,5.529429,1.098612,2010.0,11.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
X_trn, X_tst, y_trn, y_tst = train_test_split(X, df[target_col], test_size=.2, random_state=seed)
clf = LGBMRegressor(random_state=seed)
clf.fit(X_trn, np.log1p(y_trn))
p = np.expm1(clf.predict(X_tst))
print(f' RMSE:\t{rmse(tst[target_col], p):12.2f}')
print(f'RMSLE:\t{rmsle(tst[target_col], p):12.2f}')

 RMSE:	  1183708.39
RMSLE:	        1.96


### Target Encoding without Cross-Validation

In [60]:
from kaggler.preprocessing import TargetEncoder
trn, tst = train_test_split(df, test_size=.2, random_state=seed)
te = TargetEncoder(cv=None)
trn[cat_cols] = te.fit_transform(trn[cat_cols], trn[target_col])
tst[cat_cols] = te.transform(tst[cat_cols])
trn[cat_cols].head()

Unnamed: 0_level_0,distributor,genre,screening_rat,director
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
돼지의 왕,721746.077154,199800.8,351118.2,723283.709843
청춘그루브,723430.057925,609340.9,1289088.0,723430.057925
행복한 울릉인,722860.857509,74726.85,152900.0,723430.082604
옥희의 영화,718805.251941,609340.9,351118.2,711379.735399
권법형사 : 차이나타운,90921.244145,2212935.0,351118.2,723429.982531


In [61]:
clf = LGBMRegressor(random_state=seed)
clf.fit(trn[features], np.log1p(trn[target_col]))
p = np.expm1(clf.predict(tst[features]))
print(f' RMSE:\t{rmse(tst[target_col], p):12.2f}')
print(f'RMSLE:\t{rmsle(tst[target_col], p):12.2f}')

 RMSE:	  1271836.20
RMSLE:	        3.40


### Target Encoding with Cross-Validation

In [62]:
trn, tst = train_test_split(df, test_size=.2, random_state=seed)
te = TargetEncoder()
trn[cat_cols] = te.fit_transform(trn[cat_cols], trn[target_col])
tst[cat_cols] = te.transform(tst[cat_cols])
trn[cat_cols].head()

Unnamed: 0_level_0,distributor,genre,screening_rat,director
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
돼지의 왕,723277.44785,267982.09648,348028.101562,723432.730281
청춘그루브,742425.489583,742425.489583,742425.489583,742425.489583
행복한 울릉인,768901.203125,10425.029522,122162.823956,768901.203125
옥희의 영화,722877.80549,813347.509259,380361.663366,721858.927057
권법형사 : 차이나타운,722864.097814,727096.433397,115692.865474,597129.065104


In [63]:
clf = LGBMRegressor(random_state=seed)
clf.fit(trn[features], np.log1p(trn[target_col]))
p = np.expm1(clf.predict(tst[features]))
print(f' RMSE:\t{rmse(tst[target_col], p):12.2f}')
print(f'RMSLE:\t{rmsle(tst[target_col], p):12.2f}')

 RMSE:	  1529036.45
RMSLE:	        2.12


### Frequency Encoding

In [77]:
from kaggler.preprocessing import FrequencyEncoder
df_cat = df.copy()
fe = FrequencyEncoder()
df_cat[cat_cols] = fe.fit_transform(df[cat_cols])
df_cat[cat_cols].head()
df_cat

Unnamed: 0_level_0,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num,year,month
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
개들의 전쟁,52,28,2012-11-22,96,204,1,0.000000,0.000000,4.521789,1.098612,23398,2012,11
내부자들,26,27,2015-11-19,130,204,4,13.965312,1.098612,5.961005,1.386294,7072501,2015,11
은밀하게 위대하게,26,28,2013-06-05,123,202,1,12.304905,1.609438,5.840642,1.609438,6959083,2013,6
나는 공무원이다,30,53,2012-07-12,101,92,1,10.081425,1.098612,3.044522,1.945910,217866,2012,7
불량남녀,2,53,2010-11-04,108,202,1,0.693147,0.693147,5.529429,1.098612,483387,2010,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
해무,30,221,2014-08-13,111,204,1,8.251664,0.693147,6.236370,2.079442,1475091,2014,8
파파로티,26,221,2013-03-14,127,202,1,13.114456,0.693147,5.659482,1.945910,1716438,2013,3
살인의 강,29,42,2010-09-30,99,204,1,0.000000,0.000000,4.820282,1.609438,2475,2010,9
악의 연대기,54,27,2015-05-14,102,202,1,0.000000,0.000000,6.068426,1.609438,2192525,2015,5


In [78]:
df

Unnamed: 0_level_0,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num,year,month
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,0.000000,0.000000,4.521789,1.098612,23398,2012,11
내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,13.965312,1.098612,5.961005,1.386294,7072501,2015,11
은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,12.304905,1.609438,5.840642,1.609438,6959083,2013,6
나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,10.081425,1.098612,3.044522,1.945910,217866,2012,7
불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,0.693147,0.693147,5.529429,1.098612,483387,2010,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
해무,(주)NEW,드라마,2014-08-13,111,청소년 관람불가,심성보,8.251664,0.693147,6.236370,2.079442,1475091,2014,8
파파로티,(주)쇼박스,드라마,2013-03-14,127,15세 관람가,윤종찬,13.114456,0.693147,5.659482,1.945910,1716438,2013,3
살인의 강,(주)마운틴픽쳐스,공포,2010-09-30,99,청소년 관람불가,김대현,0.000000,0.000000,4.820282,1.609438,2475,2010,9
악의 연대기,CJ 엔터테인먼트,느와르,2015-05-14,102,15세 관람가,백운학,0.000000,0.000000,6.068426,1.609438,2192525,2015,5


In [65]:
trn, tst = train_test_split(df_cat, test_size=.2, random_state=seed)
clf = LGBMRegressor(random_state=seed)
clf.fit(trn[features], np.log1p(trn[target_col]))
p = np.expm1(clf.predict(tst[features]))
print(f' RMSE:\t{rmse(tst[target_col], p):12.2f}')
print(f'RMSLE:\t{rmsle(tst[target_col], p):12.2f}')

 RMSE:	  1258846.99
RMSLE:	        1.96


### Hash Encoding

In [66]:
from sklearn.feature_extraction import FeatureHasher
fh = FeatureHasher(n_features=128, input_type='string')
X = hstack([df[num_cols]] + [fh.fit_transform(df[col]) for col in cat_cols])
print(X.shape)

(600, 519)


In [67]:
X_trn, X_tst, y_trn, y_tst = train_test_split(X, df[target_col], test_size=.2, random_state=seed)
clf = LGBMRegressor(random_state=seed)
clf.fit(X_trn, np.log1p(y_trn))
p = np.expm1(clf.predict(X_tst))
print(f' RMSE:\t{rmse(tst[target_col], p):12.2f}')
print(f'RMSLE:\t{rmsle(tst[target_col], p):12.2f}')

 RMSE:	  1261559.18
RMSLE:	        1.95
