## 0. 라이브러리 설정

In [1]:
from tqdm import tqdm
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows',None)
pd.options.display.float_format="{:.5f}".format
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

## 1. 데이터 불러오기

In [2]:
data = pd.read_csv('data/prf_dataset.csv')

In [3]:
data.head(1)

Unnamed: 0,공연일자,공연시작시간,선예매여부,예매시작일,장르,휴게시간,공연명,내한여부,러닝타임,가격종류,R석,A석,요일,결제내역수,티켓판매량,전체좌석,일반예매율,멤버십합계,가중예매율,멤버십유입률
0,2018-11-25,1700,1,20181029.0,클래식,15,김대진 피아노 독주회,0,1시간 50분,3,50000.0,20000.0,6,1818,1537,2505.0,0.61357,258,0.00412,0.00671


## 2. 데이터 전처리

In [4]:
def make_month(data):
    # 데이터프레임에서 '공연일자' 열을 datetime 형식으로 변환
    result = data.copy()
    result['공연일자'] = pd.to_datetime(result['공연일자'])

    # '월' 열 생성
    result['월'] = result['공연일자'].dt.month
    
    return result

In [5]:
# '러닝타임' column에서 시간과 분을 추출하여 분 단위로 변환
def convert_to_minutes(time_str):
    # '시간'과 '분'을 기준으로 문자열을 분리
    parts = time_str.split()
    
    total_minutes = 0  # 총 분 초기화
    
    for part in parts:
        if '시간' in part:
            # '시간'을 포함한 부분에서 숫자를 추출하여 분 단위로 변환
            hours = int(part.replace('시간', '')) * 60
            total_minutes += hours
        elif '분' in part:
            # '분'을 포함한 부분에서 숫자를 추출하여 분 단위로 변환
            minutes = int(part.replace('분', ''))
            total_minutes += minutes
    
    return total_minutes

In [6]:
def preprocessing_data(data):
    # '러닝타임' column을 분 단위로 변환하여 새로운 column에 저장
    data['러닝타임'] = data['러닝타임'].apply(convert_to_minutes)
    
    data['가격종류'] = data['가격종류'].replace(0, 1)
    data['요일'] = data['요일'].replace(0, '월요일')
    data['요일'] = data['요일'].replace(1, '화요일')
    data['요일'] = data['요일'].replace(2, '수요일')
    data['요일'] = data['요일'].replace(3, '목요일')
    data['요일'] = data['요일'].replace(4, '금요일')
    data['요일'] = data['요일'].replace(5, '토요일')
    data['요일'] = data['요일'].replace(6, '일요일')
    
    data_temp = data[['공연시작시간', '선예매여부', '장르', '내한여부', '러닝타임', '요일', '일반예매율', '멤버십유입률', '월']]
    data_temp = pd.get_dummies(data_temp, columns=['장르', '요일', '월'])
    
    return data_temp

In [7]:
data = make_month(data)

In [8]:
data_temp = preprocessing_data(data)

## 3. train test split

In [9]:
X = data_temp.drop(['일반예매율', '멤버십유입률'], axis=1)
y = data_temp[['일반예매율', '멤버십유입률']]

In [10]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
train_X

Unnamed: 0,공연시작시간,선예매여부,내한여부,러닝타임,장르_교향곡,장르_기타,장르_독주,장르_복합장르,장르_성악,장르_실내악,장르_오페라,장르_재즈,장르_콘서트,장르_크로스오버,장르_클래식,장르_합창,요일_금요일,요일_목요일,요일_수요일,요일_일요일,요일_토요일,요일_화요일,월_1,월_2,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12
329,1930,1,1,110,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
173,1930,1,0,120,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
272,1930,1,0,120,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
496,1930,1,0,120,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
182,1930,1,0,90,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2000,1,0,120,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
106,2000,1,0,90,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
270,1930,1,0,100,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
435,1700,0,1,100,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [12]:
# one-hot encoding 값들만 남기기
train_X_new = train_X.drop(['공연시작시간','러닝타임'], axis=1)
test_X_new = test_X.drop(['공연시작시간','러닝타임'], axis=1)

In [13]:
# scaling이 필요한 값들
train_X_new_cont = train_X[['공연시작시간','러닝타임']]
test_X_new_cont = test_X[['공연시작시간','러닝타임']]

In [14]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

scaler = StandardScaler()
scaler.fit(train_X_new_cont)

train_X_new_cont_rbs = pd.DataFrame(data=scaler.transform(train_X_new_cont), columns=train_X_new_cont.columns)

test_X_new_cont_rbs = pd.DataFrame(data=scaler.transform(test_X_new_cont), columns=train_X_new_cont.columns)

In [15]:
train_X_new_cont_rbs.reset_index(drop=True, inplace=True)
train_X_new.reset_index(drop=True, inplace=True)
test_X_new_cont_rbs.reset_index(drop=True, inplace=True)
test_X_new.reset_index(drop=True, inplace=True)
train_y.reset_index(drop=True, inplace=True)
test_y.reset_index(drop=True, inplace=True)

In [16]:
train_X.shape

(412, 34)

In [17]:
test_X.shape

(104, 34)

In [18]:
train_X = pd.concat([train_X_new_cont_rbs, train_X_new], axis=1)
test_X = pd.concat([test_X_new_cont_rbs, test_X_new], axis=1)

In [19]:
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((412, 34), (104, 34), (412, 2), (104, 2))

In [20]:
train_y_res = train_y['일반예매율']
train_y_mem = train_y['멤버십유입률']

test_y_res = test_y['일반예매율']
test_y_mem = test_y['멤버십유입률']

In [21]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error

## 4. 예측 모델

### 일반예매율 예측 - CatBoost

In [22]:
from catboost import CatBoostRegressor

cb_res = CatBoostRegressor(depth=7, learning_rate=0.001, random_state=42)
cb_res.fit(train_X, train_y_res)

print(f"MSE for CatBoost : {mean_squared_error(test_y_res, cb_res.predict(test_X))}")
print(f"MSLE for CatBoost : {mean_squared_log_error(test_y_res, cb_res.predict(test_X))}")

0:	learn: 0.2535106	total: 144ms	remaining: 2m 23s
1:	learn: 0.2534917	total: 145ms	remaining: 1m 12s
2:	learn: 0.2534704	total: 146ms	remaining: 48.6s
3:	learn: 0.2534516	total: 147ms	remaining: 36.7s
4:	learn: 0.2534387	total: 149ms	remaining: 29.6s
5:	learn: 0.2534298	total: 149ms	remaining: 24.7s
6:	learn: 0.2534149	total: 150ms	remaining: 21.3s
7:	learn: 0.2534010	total: 151ms	remaining: 18.8s
8:	learn: 0.2533830	total: 152ms	remaining: 16.8s
9:	learn: 0.2533535	total: 153ms	remaining: 15.2s
10:	learn: 0.2533421	total: 154ms	remaining: 13.9s
11:	learn: 0.2533206	total: 155ms	remaining: 12.8s
12:	learn: 0.2533063	total: 156ms	remaining: 11.9s
13:	learn: 0.2532711	total: 157ms	remaining: 11.1s
14:	learn: 0.2532439	total: 158ms	remaining: 10.4s
15:	learn: 0.2532184	total: 159ms	remaining: 9.8s
16:	learn: 0.2532031	total: 160ms	remaining: 9.28s
17:	learn: 0.2531902	total: 162ms	remaining: 8.82s
18:	learn: 0.2531736	total: 163ms	remaining: 8.4s
19:	learn: 0.2531603	total: 164ms	remaini

210:	learn: 0.2497973	total: 429ms	remaining: 1.6s
211:	learn: 0.2497807	total: 431ms	remaining: 1.6s
212:	learn: 0.2497685	total: 432ms	remaining: 1.6s
213:	learn: 0.2497540	total: 434ms	remaining: 1.59s
214:	learn: 0.2497383	total: 435ms	remaining: 1.59s
215:	learn: 0.2497164	total: 436ms	remaining: 1.58s
216:	learn: 0.2497026	total: 437ms	remaining: 1.58s
217:	learn: 0.2496779	total: 439ms	remaining: 1.57s
218:	learn: 0.2496518	total: 440ms	remaining: 1.57s
219:	learn: 0.2496300	total: 441ms	remaining: 1.56s
220:	learn: 0.2496181	total: 442ms	remaining: 1.56s
221:	learn: 0.2496051	total: 443ms	remaining: 1.55s
222:	learn: 0.2495857	total: 444ms	remaining: 1.55s
223:	learn: 0.2495762	total: 445ms	remaining: 1.54s
224:	learn: 0.2495617	total: 446ms	remaining: 1.53s
225:	learn: 0.2495532	total: 447ms	remaining: 1.53s
226:	learn: 0.2495426	total: 448ms	remaining: 1.52s
227:	learn: 0.2495292	total: 449ms	remaining: 1.52s
228:	learn: 0.2495123	total: 450ms	remaining: 1.51s
229:	learn: 0.2

478:	learn: 0.2453545	total: 735ms	remaining: 799ms
479:	learn: 0.2453407	total: 736ms	remaining: 798ms
480:	learn: 0.2453297	total: 738ms	remaining: 796ms
481:	learn: 0.2453251	total: 738ms	remaining: 794ms
482:	learn: 0.2453137	total: 740ms	remaining: 792ms
483:	learn: 0.2452999	total: 741ms	remaining: 790ms
484:	learn: 0.2452884	total: 742ms	remaining: 788ms
485:	learn: 0.2452603	total: 743ms	remaining: 786ms
486:	learn: 0.2452536	total: 744ms	remaining: 784ms
487:	learn: 0.2452329	total: 745ms	remaining: 782ms
488:	learn: 0.2452080	total: 746ms	remaining: 780ms
489:	learn: 0.2451953	total: 747ms	remaining: 778ms
490:	learn: 0.2451700	total: 748ms	remaining: 776ms
491:	learn: 0.2451634	total: 749ms	remaining: 773ms
492:	learn: 0.2451466	total: 750ms	remaining: 771ms
493:	learn: 0.2451344	total: 751ms	remaining: 769ms
494:	learn: 0.2451228	total: 752ms	remaining: 767ms
495:	learn: 0.2451083	total: 753ms	remaining: 765ms
496:	learn: 0.2450940	total: 754ms	remaining: 763ms
497:	learn: 

752:	learn: 0.2412381	total: 1.04s	remaining: 342ms
753:	learn: 0.2412167	total: 1.04s	remaining: 341ms
754:	learn: 0.2411997	total: 1.04s	remaining: 339ms
755:	learn: 0.2411849	total: 1.05s	remaining: 338ms
756:	learn: 0.2411705	total: 1.05s	remaining: 336ms
757:	learn: 0.2411564	total: 1.05s	remaining: 335ms
758:	learn: 0.2411458	total: 1.05s	remaining: 333ms
759:	learn: 0.2411312	total: 1.05s	remaining: 332ms
760:	learn: 0.2411199	total: 1.05s	remaining: 331ms
761:	learn: 0.2411121	total: 1.05s	remaining: 329ms
762:	learn: 0.2410996	total: 1.05s	remaining: 328ms
763:	learn: 0.2410835	total: 1.06s	remaining: 326ms
764:	learn: 0.2410692	total: 1.06s	remaining: 325ms
765:	learn: 0.2410606	total: 1.06s	remaining: 324ms
766:	learn: 0.2410444	total: 1.06s	remaining: 322ms
767:	learn: 0.2410189	total: 1.06s	remaining: 321ms
768:	learn: 0.2410059	total: 1.06s	remaining: 319ms
769:	learn: 0.2409882	total: 1.06s	remaining: 318ms
770:	learn: 0.2409745	total: 1.06s	remaining: 316ms
771:	learn: 

MSE for CatBoost : 0.07164038267910824
MSLE for CatBoost : 0.0317352509182896


### 멤버십유입률 예측 - CatBoost

In [23]:
from catboost import CatBoostRegressor

cb_mem = CatBoostRegressor(depth=3, learning_rate=0.01, random_state=42)
cb_mem.fit(train_X, train_y_mem)

print(f"MSE for CatBoost : {mean_squared_error(test_y_mem, cb_mem.predict(test_X))}")
print(f"MSLE for CatBoost : {mean_squared_log_error(test_y_res, cb_mem.predict(test_X))}")

0:	learn: 0.1337776	total: 479us	remaining: 479ms
1:	learn: 0.1337070	total: 1.06ms	remaining: 532ms
2:	learn: 0.1336577	total: 1.75ms	remaining: 583ms
3:	learn: 0.1335624	total: 2.2ms	remaining: 548ms
4:	learn: 0.1334748	total: 2.6ms	remaining: 518ms
5:	learn: 0.1333553	total: 3.17ms	remaining: 525ms
6:	learn: 0.1332507	total: 3.68ms	remaining: 522ms
7:	learn: 0.1331161	total: 4.3ms	remaining: 534ms
8:	learn: 0.1330680	total: 4.86ms	remaining: 535ms
9:	learn: 0.1329967	total: 5.48ms	remaining: 542ms
10:	learn: 0.1328619	total: 6.09ms	remaining: 548ms
11:	learn: 0.1327494	total: 7.13ms	remaining: 587ms
12:	learn: 0.1326631	total: 8.04ms	remaining: 611ms
13:	learn: 0.1325577	total: 8.92ms	remaining: 628ms
14:	learn: 0.1324782	total: 9.87ms	remaining: 648ms
15:	learn: 0.1323574	total: 10.7ms	remaining: 656ms
16:	learn: 0.1322431	total: 11.4ms	remaining: 658ms
17:	learn: 0.1321973	total: 12.1ms	remaining: 661ms
18:	learn: 0.1320873	total: 12.8ms	remaining: 659ms
19:	learn: 0.1319446	total

180:	learn: 0.1233404	total: 131ms	remaining: 592ms
181:	learn: 0.1232958	total: 132ms	remaining: 593ms
182:	learn: 0.1232573	total: 133ms	remaining: 592ms
183:	learn: 0.1231978	total: 133ms	remaining: 591ms
184:	learn: 0.1231640	total: 134ms	remaining: 589ms
185:	learn: 0.1231416	total: 134ms	remaining: 588ms
186:	learn: 0.1231198	total: 135ms	remaining: 587ms
187:	learn: 0.1230834	total: 136ms	remaining: 586ms
188:	learn: 0.1230671	total: 136ms	remaining: 585ms
189:	learn: 0.1230209	total: 137ms	remaining: 584ms
190:	learn: 0.1229951	total: 137ms	remaining: 582ms
191:	learn: 0.1229637	total: 138ms	remaining: 582ms
192:	learn: 0.1229386	total: 139ms	remaining: 583ms
193:	learn: 0.1229073	total: 140ms	remaining: 583ms
194:	learn: 0.1228664	total: 141ms	remaining: 582ms
195:	learn: 0.1228432	total: 141ms	remaining: 580ms
196:	learn: 0.1228017	total: 142ms	remaining: 579ms
197:	learn: 0.1227810	total: 143ms	remaining: 578ms
198:	learn: 0.1227672	total: 143ms	remaining: 577ms
199:	learn: 

372:	learn: 0.1187238	total: 268ms	remaining: 450ms
373:	learn: 0.1187060	total: 268ms	remaining: 449ms
374:	learn: 0.1186914	total: 269ms	remaining: 448ms
375:	learn: 0.1186742	total: 270ms	remaining: 447ms
376:	learn: 0.1186646	total: 270ms	remaining: 447ms
377:	learn: 0.1186353	total: 271ms	remaining: 446ms
378:	learn: 0.1186116	total: 272ms	remaining: 445ms
379:	learn: 0.1186005	total: 272ms	remaining: 444ms
380:	learn: 0.1185873	total: 273ms	remaining: 444ms
381:	learn: 0.1185654	total: 274ms	remaining: 443ms
382:	learn: 0.1185416	total: 275ms	remaining: 443ms
383:	learn: 0.1185358	total: 276ms	remaining: 442ms
384:	learn: 0.1185231	total: 276ms	remaining: 441ms
385:	learn: 0.1184982	total: 277ms	remaining: 440ms
386:	learn: 0.1184824	total: 278ms	remaining: 440ms
387:	learn: 0.1184675	total: 278ms	remaining: 439ms
388:	learn: 0.1184414	total: 279ms	remaining: 438ms
389:	learn: 0.1184128	total: 280ms	remaining: 437ms
390:	learn: 0.1183922	total: 280ms	remaining: 436ms
391:	learn: 

585:	learn: 0.1154900	total: 404ms	remaining: 286ms
586:	learn: 0.1154724	total: 405ms	remaining: 285ms
587:	learn: 0.1154590	total: 405ms	remaining: 284ms
588:	learn: 0.1154444	total: 406ms	remaining: 283ms
589:	learn: 0.1154131	total: 407ms	remaining: 283ms
590:	learn: 0.1153954	total: 408ms	remaining: 282ms
591:	learn: 0.1153839	total: 408ms	remaining: 281ms
592:	learn: 0.1153753	total: 409ms	remaining: 281ms
593:	learn: 0.1153604	total: 409ms	remaining: 280ms
594:	learn: 0.1153451	total: 410ms	remaining: 279ms
595:	learn: 0.1153340	total: 411ms	remaining: 278ms
596:	learn: 0.1153215	total: 412ms	remaining: 278ms
597:	learn: 0.1153069	total: 412ms	remaining: 277ms
598:	learn: 0.1152969	total: 413ms	remaining: 276ms
599:	learn: 0.1152791	total: 413ms	remaining: 276ms
600:	learn: 0.1152699	total: 414ms	remaining: 275ms
601:	learn: 0.1152510	total: 415ms	remaining: 274ms
602:	learn: 0.1152421	total: 415ms	remaining: 273ms
603:	learn: 0.1152318	total: 416ms	remaining: 273ms
604:	learn: 

796:	learn: 0.1129653	total: 543ms	remaining: 138ms
797:	learn: 0.1129583	total: 543ms	remaining: 138ms
798:	learn: 0.1129442	total: 544ms	remaining: 137ms
799:	learn: 0.1129348	total: 545ms	remaining: 136ms
800:	learn: 0.1129256	total: 545ms	remaining: 135ms
801:	learn: 0.1129211	total: 546ms	remaining: 135ms
802:	learn: 0.1129042	total: 547ms	remaining: 134ms
803:	learn: 0.1129007	total: 547ms	remaining: 133ms
804:	learn: 0.1128972	total: 548ms	remaining: 133ms
805:	learn: 0.1128852	total: 549ms	remaining: 132ms
806:	learn: 0.1128732	total: 549ms	remaining: 131ms
807:	learn: 0.1128648	total: 550ms	remaining: 131ms
808:	learn: 0.1128540	total: 550ms	remaining: 130ms
809:	learn: 0.1128301	total: 551ms	remaining: 129ms
810:	learn: 0.1128214	total: 552ms	remaining: 129ms
811:	learn: 0.1128105	total: 552ms	remaining: 128ms
812:	learn: 0.1127935	total: 553ms	remaining: 127ms
813:	learn: 0.1127797	total: 553ms	remaining: 126ms
814:	learn: 0.1127764	total: 554ms	remaining: 126ms
815:	learn: 

MSLE for CatBoost : 0.09847077891874695


## 5. 신규데이터 적용

In [24]:
new_data = {'공연일자' : ['2023-10-03','2023-10-19', '2023-11-04', '2023-11-12', '2023-12-01'],
            '공연시작시간': [1700, 1930, 1700, 1700, 2000],
            '선예매여부': [1, 1, 0, 1, 1],
            '예매시작일': [20230719, 20230823, 20230901, 20230906, 20221210],
            '장르': ['클래식', '클래식', '독주','교향곡', '교향곡'],
            '휴게시간': [20, 0, 20, 20, 15],
            '공연명': ['안드라스 쉬프 피아노 리사이틀', '예술의전당 전관 개관 30주년 특별음악회-코리안챔버오케스트라 초청 콘서트', '바리톤 김기훈 리사이틀', '키릴 페트렌코 & 베를린 필하모닉 (11.12)','2023 서울시향 얍 판 츠베덴의 베토벤 삼중 협주곡 ②'],
            '내한여부': [1, 0, 0, 1, 1],
            '러닝타임': ['2시간', '2시간', '1시간 30분', '2시간','1시간 40분'],
            '가격종류': [4, 4, 3, 5, 5],
            'R석': [160000, 80000, 100000, 550000, 100000],
            'A석': [60000, 20000, 60000, 100000, 10000],
            '요일': [1, 3, 5, 6, 4],
            '결제내역수': [0, 0, 0, 0, 0],
            '티켓판매량': [0, 0, 0, 0, 0],
            '전체좌석': [0, 0, 0, 0, 0],
            '일반예매율': [0, 0, 0, 0, 0],
            '멤버십합계': [0, 0, 0, 0, 0],
            '가중예매율': [0, 0, 0, 0, 0],
            '멤버십유입률': [0, 0, 0, 0, 0]
           }

In [25]:
new_data = pd.DataFrame(new_data)

In [26]:
new_data

Unnamed: 0,공연일자,공연시작시간,선예매여부,예매시작일,장르,휴게시간,공연명,내한여부,러닝타임,가격종류,R석,A석,요일,결제내역수,티켓판매량,전체좌석,일반예매율,멤버십합계,가중예매율,멤버십유입률
0,2023-10-03,1700,1,20230719,클래식,20,안드라스 쉬프 피아노 리사이틀,1,2시간,4,160000,60000,1,0,0,0,0,0,0,0
1,2023-10-19,1930,1,20230823,클래식,0,예술의전당 전관 개관 30주년 특별음악회-코리안챔버오케스트라 초청 콘서트,0,2시간,4,80000,20000,3,0,0,0,0,0,0,0
2,2023-11-04,1700,0,20230901,독주,20,바리톤 김기훈 리사이틀,0,1시간 30분,3,100000,60000,5,0,0,0,0,0,0,0
3,2023-11-12,1700,1,20230906,교향곡,20,키릴 페트렌코 & 베를린 필하모닉 (11.12),1,2시간,5,550000,100000,6,0,0,0,0,0,0,0
4,2023-12-01,2000,1,20221210,교향곡,15,2023 서울시향 얍 판 츠베덴의 베토벤 삼중 협주곡 ②,1,1시간 40분,5,100000,10000,4,0,0,0,0,0,0,0


In [27]:
new_data = make_month(new_data)

In [28]:
new_data_temp = preprocessing_data(new_data)

In [29]:
new_data_temp

Unnamed: 0,공연시작시간,선예매여부,내한여부,러닝타임,일반예매율,멤버십유입률,장르_교향곡,장르_독주,장르_클래식,요일_금요일,요일_목요일,요일_일요일,요일_토요일,요일_화요일,월_10,월_11,월_12
0,1700,1,1,120,0,0,0,0,1,0,0,0,0,1,1,0,0
1,1930,1,0,120,0,0,0,0,1,0,1,0,0,0,1,0,0
2,1700,0,0,90,0,0,0,1,0,0,0,0,1,0,0,1,0
3,1700,1,1,120,0,0,1,0,0,0,0,1,0,0,0,1,0
4,2000,1,1,100,0,0,1,0,0,1,0,0,0,0,0,0,1


In [30]:
new_data_temp = new_data_temp.drop(['일반예매율','멤버십유입률'], axis=1)

# one-hot encoding 값들만 남기기
new_data_temp_new = new_data_temp.drop(['공연시작시간','러닝타임'], axis=1)

# scaling이 필요한 값들
new_data_temp_cont = new_data_temp[['공연시작시간','러닝타임']]

In [31]:
new_data_temp_cont_rbs = pd.DataFrame(data=scaler.transform(new_data_temp_cont), columns=new_data_temp_cont.columns)

In [32]:
new_data_final = pd.concat([new_data_temp_cont_rbs, new_data_temp_new], axis=1)

In [33]:
new_data_final

Unnamed: 0,공연시작시간,러닝타임,선예매여부,내한여부,장르_교향곡,장르_독주,장르_클래식,요일_금요일,요일_목요일,요일_일요일,요일_토요일,요일_화요일,월_10,월_11,월_12
0,-1.08252,0.55189,1,1,0,0,1,0,0,0,0,1,1,0,0
1,0.42978,0.55189,1,0,0,0,1,0,1,0,0,0,1,0,0
2,-1.08252,-1.53034,0,0,0,1,0,0,0,0,1,0,0,1,0
3,-1.08252,0.55189,1,1,1,0,0,0,0,1,0,0,0,1,0
4,0.89005,-0.83626,1,1,1,0,0,1,0,0,0,0,0,0,1


In [34]:
input_data = train_X.copy()[:len(new_data_final)]
input_data.loc[:, :] = 0

In [35]:
input_data

Unnamed: 0,공연시작시간,러닝타임,선예매여부,내한여부,장르_교향곡,장르_기타,장르_독주,장르_복합장르,장르_성악,장르_실내악,장르_오페라,장르_재즈,장르_콘서트,장르_크로스오버,장르_클래식,장르_합창,요일_금요일,요일_목요일,요일_수요일,요일_일요일,요일_토요일,요일_화요일,월_1,월_2,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
for i in new_data_final.columns:
    input_data[i] = new_data_final[i]

In [37]:
input_data

Unnamed: 0,공연시작시간,러닝타임,선예매여부,내한여부,장르_교향곡,장르_기타,장르_독주,장르_복합장르,장르_성악,장르_실내악,장르_오페라,장르_재즈,장르_콘서트,장르_크로스오버,장르_클래식,장르_합창,요일_금요일,요일_목요일,요일_수요일,요일_일요일,요일_토요일,요일_화요일,월_1,월_2,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12
0,-1.08252,0.55189,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,0.42978,0.55189,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,-1.08252,-1.53034,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
3,-1.08252,0.55189,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0.89005,-0.83626,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [38]:
new_data['일반예매율'] = cb_res.predict(input_data)
new_data['멤버십유입률'] = cb_mem.predict(input_data)

In [39]:
new_data

Unnamed: 0,공연일자,공연시작시간,선예매여부,예매시작일,장르,휴게시간,공연명,내한여부,러닝타임,가격종류,R석,A석,요일,결제내역수,티켓판매량,전체좌석,일반예매율,멤버십합계,가중예매율,멤버십유입률,월
0,2023-10-03,1700,1,20230719,클래식,20,안드라스 쉬프 피아노 리사이틀,1,120,4,160000,60000,화요일,0,0,0,0.51596,0,0,0.14879,10
1,2023-10-19,1930,1,20230823,클래식,0,예술의전당 전관 개관 30주년 특별음악회-코리안챔버오케스트라 초청 콘서트,0,120,4,80000,20000,목요일,0,0,0,0.51146,0,0,0.12985,10
2,2023-11-04,1700,0,20230901,독주,20,바리톤 김기훈 리사이틀,0,90,3,100000,60000,토요일,0,0,0,0.47639,0,0,0.22579,11
3,2023-11-12,1700,1,20230906,교향곡,20,키릴 페트렌코 & 베를린 필하모닉 (11.12),1,120,5,550000,100000,일요일,0,0,0,0.49723,0,0,0.186,11
4,2023-12-01,2000,1,20221210,교향곡,15,2023 서울시향 얍 판 츠베덴의 베토벤 삼중 협주곡 ②,1,100,5,100000,10000,금요일,0,0,0,0.45764,0,0,0.26033,12


In [40]:
new_data.to_csv('data/new_data_reserve.csv', index=False) 