In [1]:
# --- Imports: standard library first, then third-party ---
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

# --- pandas: show wide/long frames in full and silence chained-assignment warnings ---
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.options.mode.chained_assignment = None  # default='warn'

# --- matplotlib: font setup ---
# 'Malgun Gothic' is presumably chosen so the Korean labels render; with
# axes.unicode_minus disabled, tick labels use an ASCII hyphen for minus.
matplotlib.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'Malgun Gothic'
font = {'size': 16}
matplotlib.rc('font', **font)

# --- tqdm: register the progress_* variants of pandas apply/map ---
tqdm.tqdm.pandas()

In [2]:
%%time
training_data_df = pd.read_csv('./Prepped Data/training_data_20211223.csv')
print(training_data_df.shape)
training_data_df.head()

(5068169, 15)
Wall time: 5.87 s


Unnamed: 0,주소_건축년도,전용면적(㎡),월세(만원),x좌표,y좌표,계약날짜일수_2006년기준,거래유형,층,건축년도,부동산유형,브랜드,대장구분명,구,동,거래/보증금(만원)
0,서울특별시 강남구 개포동 12 1991,25.35,12,206607.181143,443586.061143,4070,0,0,0,0,0,0,0,0,2253
1,서울특별시 강남구 개포동 12 1991,25.35,10,206607.181143,443586.061143,4133,0,1,0,0,0,0,0,0,730
2,서울특별시 강남구 개포동 12 1991,25.35,4,206607.181143,443586.061143,4260,0,1,0,0,0,0,0,0,1815
3,서울특별시 강남구 개포동 12 1991,25.35,17,206607.181143,443586.061143,4470,0,2,0,0,0,0,0,0,3865
4,서울특별시 강남구 개포동 12 1991,25.35,7,206607.181143,443586.061143,4577,0,3,0,0,0,0,0,0,3062


In [3]:
# Directory listed below to discover the categorical columns: it holds one
# .xlsx file per categorical column (see the listing output).
basedir = './Training/Training1_20211223/cat_col_maps/'

In [4]:
# List the category-map workbooks in `basedir`.
# os.listdir() returns entries in arbitrary order and includes every entry in
# the directory, so keep only the .xlsx files and sort them: the derived
# categorical column list is then deterministic and free of stray entries
# (e.g. checkpoint folders).
filenames = sorted(f for f in os.listdir(basedir) if f.endswith('.xlsx'))
filenames

['거래유형.xlsx',
 '건축년도.xlsx',
 '구.xlsx',
 '대장구분명.xlsx',
 '동.xlsx',
 '부동산유형.xlsx',
 '브랜드.xlsx',
 '층.xlsx']

In [5]:
# Categorical column names are the workbook file names without their extension.
# os.path.splitext strips only the trailing extension, whereas
# str.replace('.xlsx', '') would also remove that substring from the middle of a name.
cat_cols = [os.path.splitext(name)[0] for name in filenames]
cat_cols

['거래유형', '건축년도', '구', '대장구분명', '동', '부동산유형', '브랜드', '층']

# Training - CatBoost Regressor

In [6]:
import catboost

In [7]:
from sklearn.model_selection import GroupKFold

In [8]:
# Five-fold splitter that assigns all rows of a group to the same fold.
gkfold = GroupKFold(
    n_splits=5,
)

In [9]:
# Grouped 5-fold cross-validation of a CatBoost regressor.
# Folds are grouped on '주소_건축년도', so all rows sharing that value land on
# the same side of each train/validation split.

# Loop-invariant: build the feature frame and the target once instead of
# re-dropping columns from the full frame twice in every fold.
features_df = training_data_df.drop(columns=['주소_건축년도', '거래/보증금(만원)'])
target = training_data_df['거래/보증금(만원)']

# Keep every fold's fitted model and summary. Previously each fold overwrote
# `model`, so only the last fold survived and scores lived only in the log.
fold_models = []
fold_results = []

count = 0
for train_idx, val_idx in gkfold.split(training_data_df, groups=training_data_df['주소_건축년도']):
    print('-------------- Fold {} starting ---------------'.format(count))
    
    # split() yields positional indices, so select with .iloc; .loc only gives
    # the same rows when the frame has the default 0..n-1 index.
    x_train = features_df.iloc[train_idx]
    y_train = target.iloc[train_idx]
    
    x_val = features_df.iloc[val_idx]
    y_val = target.iloc[val_idx]
    
    train_pool = catboost.Pool(x_train, y_train, cat_features=cat_cols)
    val_pool = catboost.Pool(x_val, y_val, cat_features=cat_cols)
    
    # Up to 10000 trees of depth 6 on GPU, optimising RMSE.
    model = catboost.CatBoostRegressor(iterations=10000, depth=6, task_type='GPU', loss_function='RMSE')
    
    # Log every 50 iterations; stop once the validation metric has not
    # improved for 200 consecutive iterations.
    model.fit(train_pool, eval_set=val_pool, verbose=50, early_stopping_rounds=200)
    
    fold_models.append(model)
    fold_results.append({
        'fold': count,
        'best_iteration': model.get_best_iteration(),
        'best_score': model.get_best_score(),
    })
    
    print('-------------- Fold {} ending ---------------'.format(count))
    count += 1

-------------- Fold 0 starting ---------------
Learning rate set to 0.079026
0:	learn: 29495.4130184	test: 29224.0833510	best: 29224.0833510 (0)	total: 416ms	remaining: 1h 9m 16s
50:	learn: 13107.5961292	test: 13899.4340976	best: 13899.4340976 (50)	total: 8.5s	remaining: 27m 37s
100:	learn: 11452.1981834	test: 12738.3843224	best: 12738.3843224 (100)	total: 16.4s	remaining: 26m 45s
150:	learn: 10645.0050800	test: 12203.3633587	best: 12203.3633587 (150)	total: 24.1s	remaining: 26m 13s
200:	learn: 10099.4094592	test: 11894.6813997	best: 11894.6813997 (200)	total: 32.2s	remaining: 26m 7s
250:	learn: 9726.2781054	test: 11790.6130185	best: 11788.0866266 (249)	total: 40.2s	remaining: 26m 1s
300:	learn: 9459.0826921	test: 11661.6142151	best: 11661.6142151 (300)	total: 47.9s	remaining: 25m 44s
350:	learn: 9237.9533715	test: 11630.3586631	best: 11630.3586631 (350)	total: 55.6s	remaining: 25m 28s
400:	learn: 9061.4117854	test: 11541.4135533	best: 11541.4135533 (400)	total: 1m 3s	remaining: 25m 16

350:	learn: 9032.2003244	test: 13600.7777844	best: 13577.9150819 (338)	total: 54.5s	remaining: 24m 57s
400:	learn: 8842.6596776	test: 13483.1633629	best: 13483.1633629 (400)	total: 1m 2s	remaining: 24m 49s
450:	learn: 8689.3492194	test: 13446.1423699	best: 13444.6294451 (441)	total: 1m 9s	remaining: 24m 41s
500:	learn: 8547.7471603	test: 13452.3590854	best: 13433.1840517 (456)	total: 1m 17s	remaining: 24m 33s
550:	learn: 8408.6454645	test: 13419.0322558	best: 13398.7243576 (523)	total: 1m 25s	remaining: 24m 23s
600:	learn: 8281.9789820	test: 13372.2211702	best: 13372.2211702 (600)	total: 1m 33s	remaining: 24m 17s
650:	learn: 8171.8750494	test: 13437.5740623	best: 13370.6225123 (602)	total: 1m 40s	remaining: 24m 9s
700:	learn: 8070.3788697	test: 13383.2184458	best: 13370.6225123 (602)	total: 1m 48s	remaining: 24m 3s
750:	learn: 7985.3750790	test: 13425.9081477	best: 13369.5591101 (713)	total: 1m 56s	remaining: 23m 55s
800:	learn: 7904.3588546	test: 13383.0453012	best: 13369.5591101 (713