In [160]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

In [161]:
df_price = pd.read_csv('./data/price.csv', encoding="utf-8")

In [162]:
df_price.head()

Unnamed: 0,숙박유형명,평균판매금액,지역명,성수기여부
0,Hotel,88667,강원도 강릉시,0
1,Hotel,205885,강원도 강릉시,1
2,Hotel,96455,강원도 강릉시,0
3,Hotel,142178,강원도 강릉시,1
4,Hotel,65000,강원도 강릉시,0


In [166]:
X = df_price.drop(columns=["평균판매금액","성수기여부"])
y = df_price["평균판매금액"]

In [167]:
encoder = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown="ignore"), categorical_columns)],remainder='passthrough')

In [169]:
X

Unnamed: 0,숙박유형명,지역명
0,Hotel,강원도 강릉시
1,Hotel,강원도 강릉시
2,Hotel,강원도 강릉시
3,Hotel,강원도 강릉시
4,Hotel,강원도 강릉시
...,...,...
481,Motel,경기도 평택시
482,Motel,경기도 평택시
483,Motel,경기도 평택시
484,Motel,경기도 평택시


In [170]:
X_transformed = encoder.fit_transform(X)
X_transformed

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 972 stored elements and shape (486, 60)>

In [144]:
pd.DataFrame(X_transformed)

Unnamed: 0,0
0,<Compressed Sparse Row sparse matrix of dtype ...
1,<Compressed Sparse Row sparse matrix of dtype ...
2,<Compressed Sparse Row sparse matrix of dtype ...
3,<Compressed Sparse Row sparse matrix of dtype ...
4,<Compressed Sparse Row sparse matrix of dtype ...
...,...
481,<Compressed Sparse Row sparse matrix of dtype ...
482,<Compressed Sparse Row sparse matrix of dtype ...
483,<Compressed Sparse Row sparse matrix of dtype ...
484,<Compressed Sparse Row sparse matrix of dtype ...


In [171]:
# 희소 행렬을 numpy 배열로 변환
X_transformed = X_transformed.toarray()
X_transformed 

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [172]:
# 변환된 컬럼명 가져오기
feature_names = encoder.get_feature_names_out()

In [173]:
feature_names

array(['encoder__지역명_강원도 강릉시', 'encoder__지역명_강원도 고성군',
       'encoder__지역명_강원도 동해시', 'encoder__지역명_강원도 삼척시',
       'encoder__지역명_강원도 속초시', 'encoder__지역명_강원도 양구군',
       'encoder__지역명_강원도 양양군', 'encoder__지역명_강원도 영월군',
       'encoder__지역명_강원도 원주시', 'encoder__지역명_강원도 인제군',
       'encoder__지역명_강원도 정선군', 'encoder__지역명_강원도 철원군',
       'encoder__지역명_강원도 춘천시', 'encoder__지역명_강원도 태백시',
       'encoder__지역명_강원도 평창군', 'encoder__지역명_강원도 홍천군',
       'encoder__지역명_강원도 화천군', 'encoder__지역명_강원도 횡성군',
       'encoder__지역명_경기도 가평군', 'encoder__지역명_경기도 고양시덕양구',
       'encoder__지역명_경기도 고양시일산동구', 'encoder__지역명_경기도 고양시일산서구',
       'encoder__지역명_경기도 과천시', 'encoder__지역명_경기도 광명시',
       'encoder__지역명_경기도 광주시', 'encoder__지역명_경기도 구리시',
       'encoder__지역명_경기도 군포시', 'encoder__지역명_경기도 김포시',
       'encoder__지역명_경기도 남양주시', 'encoder__지역명_경기도 동두천시',
       'encoder__지역명_경기도 부천시', 'encoder__지역명_경기도 성남시분당구',
       'encoder__지역명_경기도 성남시수정구', 'encoder__지역명_경기도 성남시중원구',
       'encoder__지역명_경기도 수원시권선구', 'encoder_

In [154]:
# 넘파이 배열을 pandas DataFrame으로 변환
df_encoded = pd.DataFrame(X_transformed, columns=feature_names) # 성수기여부 추가
df_encoded["평균판매금액"] = y  # 최종적으로 y 추가
df_encoded['성수기여부'] = df_price['성수기여부']


In [157]:
df_encoded

Unnamed: 0,encoder__지역명_강원도 강릉시,encoder__지역명_강원도 고성군,encoder__지역명_강원도 동해시,encoder__지역명_강원도 삼척시,encoder__지역명_강원도 속초시,encoder__지역명_강원도 양구군,encoder__지역명_강원도 양양군,encoder__지역명_강원도 영월군,encoder__지역명_강원도 원주시,encoder__지역명_강원도 인제군,...,encoder__지역명_경기도 의왕시,encoder__지역명_경기도 의정부시,encoder__지역명_경기도 이천시,encoder__지역명_경기도 파주시,encoder__지역명_경기도 평택시,encoder__숙박유형명_Hotel,encoder__숙박유형명_Motel,encoder__숙박유형명_Pension,평균판매금액,성수기여부
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,88667,0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,205885,1
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,96455,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,142178,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,65000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,41600,0
482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,55000,0
483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,55406,0
484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,46667,0


In [158]:
df_encoded.to_csv("data/processed_data.csv", index=False, encoding="utf-8-sig")

In [159]:
joblib.dump(encoder, "models/encoder.pkl")

['models/encoder.pkl']