In [1]:
# Notebook setup: library imports plus global pandas / matplotlib / tqdm configuration.
import pandas as pd
import numpy as np
import os
# Let pandas render up to 500 rows and 500 columns before truncating a frame.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph
# (presumably because the font selected below lacks that glyph -- TODO confirm).
matplotlib.rcParams['axes.unicode_minus'] = False
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'
# Font family for all plot text. NOTE(review): 'Malgun Gothic' must be installed on
# the machine; presumably chosen so the Korean column names render -- confirm.
plt.rcParams['font.family'] = 'Malgun Gothic'
# Global font size for matplotlib figures.
font = {'size': 16}
matplotlib.rc('font', **font)
# Register tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.tqdm.pandas()

In [2]:
%%time
# Load the prepared training table, then report its (rows, columns) shape and
# preview the first rows as the cell output.
training_csv = './Prepped Data/training_data_20220125.csv'
training_data_df = pd.read_csv(training_csv)
print(training_data_df.shape)
training_data_df.head()

(5068169, 19)
Wall time: 7.75 s


Unnamed: 0,주소_건축년도,건축년도,전용면적(㎡),월세(만원),x좌표,y좌표,계약날짜일수_2006년기준,계약날짜기준_건물연식,계약날짜일수_sin,계약날짜일수_cos,거래유형,층,부동산유형,브랜드,대장구분명,지목,구,동,거래/보증금(만원)
0,서울특별시 강남구 개포동 12 1991,1991,25.35,12,2.066072,4.435861,4070,26,0.811539,0.584298,0,0,0,0,0,0,0,0,2253
1,서울특별시 강남구 개포동 12 1991,1991,25.35,10,2.066072,4.435861,4133,26,0.895839,-0.444378,0,1,0,0,0,0,0,0,730
2,서울특별시 강남구 개포동 12 1991,1991,25.35,4,2.066072,4.435861,4260,26,-0.880012,-0.474951,0,1,0,0,0,0,0,0,1815
3,서울특별시 강남구 개포동 12 1991,1991,25.35,17,2.066072,4.435861,4470,27,0.999769,0.021516,0,2,0,0,0,0,0,0,3865
4,서울특별시 강남구 개포동 12 1991,1991,25.35,7,2.066072,4.435861,4577,27,-0.247022,-0.96901,0,3,0,0,0,0,0,0,3062


In [3]:
# Directory whose file names are listed below to derive the categorical column names.
basedir = './Training/Training3_20220125/cat_col_maps/'

In [4]:
# List the category-map workbooks stored in `basedir`.
# os.listdir() returns every directory entry in arbitrary order, so keep only the
# .xlsx files (a stray entry would otherwise turn into a bogus column name in the
# next step) and sort them so the listing is reproducible across machines.
filenames = sorted(f for f in os.listdir(basedir) if f.endswith('.xlsx'))
filenames

['거래유형.xlsx',
 '구.xlsx',
 '대장구분명.xlsx',
 '동.xlsx',
 '부동산유형.xlsx',
 '브랜드.xlsx',
 '지목.xlsx',
 '층.xlsx']

In [5]:
# Categorical feature names: one per '<column>.xlsx' workbook in `filenames`.
# Only the trailing extension is stripped (str.replace would also remove '.xlsx'
# from the middle of a name), and entries that are not .xlsx workbooks are skipped
# so they cannot end up in the list passed to CatBoost as `cat_features`.
cat_cols = [os.path.splitext(name)[0] for name in filenames if name.endswith('.xlsx')]
cat_cols

['거래유형', '구', '대장구분명', '동', '부동산유형', '브랜드', '지목', '층']

# Training - CatBoost Regressor

In [6]:
import catboost

In [7]:
from sklearn.model_selection import GroupKFold

In [8]:
# 5-fold splitter that keeps all rows of a group in the same fold; the group labels
# themselves are supplied later, when split() is called.
gkfold = GroupKFold(n_splits=5)

In [9]:
# Grouped 5-fold cross-validation with the RMSE loss: fit one CatBoost regressor
# per fold and save each fitted model to disk as catboost_RMSE_<fold>.
count = 0
models_list = []  # NOTE(review): never appended to in this cell; kept in case other cells expect the name

target_col = '거래/보증금(만원)'
group_col = '주소_건축년도'

# Loop-invariant work hoisted out of the fold loop: the original dropped these two
# columns from the full frame twice in every fold.
features_df = training_data_df.drop(columns=[group_col, target_col])
target_sr = training_data_df[target_col]

# Rows sharing the same group_col value are kept in the same fold by the splitter.
for train_idx, val_idx in gkfold.split(training_data_df, groups=training_data_df[group_col]):
    print('-------------- Fold {} starting ---------------'.format(count))

    # split() yields positional row numbers, so select with .iloc. Label-based .loc
    # only gives the same rows while the frame still has its default RangeIndex.
    x_train = features_df.iloc[train_idx]
    y_train = target_sr.iloc[train_idx]

    x_val = features_df.iloc[val_idx]
    y_val = target_sr.iloc[val_idx]

    train_pool = catboost.Pool(x_train, y_train, cat_features=cat_cols)
    val_pool = catboost.Pool(x_val, y_val, cat_features=cat_cols)

    model = catboost.CatBoostRegressor(iterations=10000, depth=6, task_type='GPU', loss_function='RMSE')

    # The validation fold is also the early-stopping set: log every 50 iterations
    # and stop once the eval metric has not improved for 200 iterations.
    model.fit(train_pool, eval_set=val_pool, verbose=50, early_stopping_rounds=200)

    model.save_model('./Training/Training3_20220125/catboost_RMSE_{}'.format(count))

    print('-------------- Fold {} ending ---------------'.format(count))
    count += 1

# Release the full-size intermediates; the per-fold slices above are independent copies.
del features_df, target_sr

-------------- Fold 0 starting ---------------
Learning rate set to 0.079026
0:	learn: 29495.4096515	test: 29224.0890147	best: 29224.0890147 (0)	total: 278ms	remaining: 46m 20s
50:	learn: 12916.0447118	test: 13667.3793701	best: 13667.3793701 (50)	total: 7.9s	remaining: 25m 40s
100:	learn: 11259.0964052	test: 12227.6212789	best: 12227.6212789 (100)	total: 15.5s	remaining: 25m 17s
150:	learn: 10503.6461666	test: 11598.0713354	best: 11598.0713354 (150)	total: 23s	remaining: 25m
200:	learn: 10067.4900832	test: 11303.3293486	best: 11303.3293486 (200)	total: 30.3s	remaining: 24m 37s
250:	learn: 9708.3286504	test: 11084.2967118	best: 11084.2967118 (250)	total: 37.6s	remaining: 24m 21s
300:	learn: 9440.5546647	test: 10921.9612532	best: 10921.9612532 (300)	total: 45s	remaining: 24m 9s
350:	learn: 9223.0501598	test: 10814.8786916	best: 10814.8786916 (350)	total: 52.3s	remaining: 23m 56s
400:	learn: 9042.5777967	test: 10733.2436246	best: 10732.8010377 (399)	total: 59.9s	remaining: 23m 53s
450:	le

850:	learn: 8185.8835714	test: 9788.6112652	best: 9788.6112652 (850)	total: 2m 7s	remaining: 22m 47s
900:	learn: 8123.7845424	test: 9769.6273116	best: 9764.1912786 (879)	total: 2m 14s	remaining: 22m 42s
950:	learn: 8066.1423887	test: 9734.7512612	best: 9734.7512612 (950)	total: 2m 22s	remaining: 22m 36s
1000:	learn: 8012.5892998	test: 9705.7318497	best: 9705.7318497 (1000)	total: 2m 30s	remaining: 22m 29s
1050:	learn: 7963.5262982	test: 9691.8776025	best: 9687.1715382 (1039)	total: 2m 37s	remaining: 22m 21s
1100:	learn: 7918.5997366	test: 9680.3188888	best: 9679.9794846 (1097)	total: 2m 45s	remaining: 22m 15s
1150:	learn: 7869.8099230	test: 9667.9738488	best: 9664.9585500 (1130)	total: 2m 52s	remaining: 22m 8s
1200:	learn: 7828.0254512	test: 9659.3333856	best: 9656.0852728 (1194)	total: 3m	remaining: 22m
1250:	learn: 7786.1889303	test: 9637.9952242	best: 9637.9952242 (1250)	total: 3m 7s	remaining: 21m 51s
1300:	learn: 7750.2438849	test: 9626.6546489	best: 9626.0077231 (1298)	total: 3m 

1150:	learn: 7826.3953455	test: 9359.2369314	best: 9358.2712956 (1148)	total: 3m 3s	remaining: 23m 32s
1200:	learn: 7784.3369088	test: 9342.9714203	best: 9342.9714203 (1200)	total: 3m 11s	remaining: 23m 22s
1250:	learn: 7739.9666194	test: 9318.8303055	best: 9318.8303055 (1250)	total: 3m 19s	remaining: 23m 11s
1300:	learn: 7700.5709457	test: 9306.8838755	best: 9306.8563099 (1299)	total: 3m 26s	remaining: 23m 1s
1350:	learn: 7659.0312287	test: 9283.1343357	best: 9283.1343357 (1350)	total: 3m 34s	remaining: 22m 51s
1400:	learn: 7625.8336137	test: 9270.8459950	best: 9270.8459950 (1400)	total: 3m 43s	remaining: 22m 51s
1450:	learn: 7590.7687307	test: 9263.6451801	best: 9262.9018742 (1449)	total: 3m 53s	remaining: 22m 57s
1500:	learn: 7555.5983457	test: 9253.7378728	best: 9247.5050665 (1497)	total: 4m 3s	remaining: 22m 59s
1550:	learn: 7525.0836171	test: 9244.2984601	best: 9243.7228091 (1548)	total: 4m 11s	remaining: 22m 48s
1600:	learn: 7496.8236602	test: 9238.7091905	best: 9238.7091905 (16

1600:	learn: 7247.3963619	test: 11873.2209791	best: 11873.2209791 (1600)	total: 4m 2s	remaining: 21m 13s
1650:	learn: 7218.6182393	test: 11863.7615477	best: 11863.7245766 (1619)	total: 4m 10s	remaining: 21m 6s
1700:	learn: 7189.3467962	test: 11857.1865460	best: 11854.4258147 (1696)	total: 4m 18s	remaining: 21m
1750:	learn: 7159.5306126	test: 11844.5992359	best: 11842.0948440 (1748)	total: 4m 25s	remaining: 20m 52s
1800:	learn: 7128.3702499	test: 11849.2649086	best: 11842.0948440 (1748)	total: 4m 33s	remaining: 20m 45s
1850:	learn: 7104.1820405	test: 11843.3743582	best: 11841.4386114 (1836)	total: 4m 41s	remaining: 20m 37s
1900:	learn: 7080.8363253	test: 11828.3077125	best: 11828.3077125 (1900)	total: 4m 48s	remaining: 20m 30s
1950:	learn: 7055.2263714	test: 11821.3930654	best: 11815.5600619 (1930)	total: 4m 56s	remaining: 20m 22s
2000:	learn: 7033.4538670	test: 11820.0068500	best: 11815.5600619 (1930)	total: 5m 3s	remaining: 20m 14s
2050:	learn: 7010.0278480	test: 11816.8599576	best: 1

1550:	learn: 7527.3772610	test: 8749.7833639	best: 8749.3402326 (1548)	total: 3m 52s	remaining: 21m 8s
1600:	learn: 7498.9424846	test: 8743.8978656	best: 8743.8978656 (1600)	total: 4m	remaining: 21m
1650:	learn: 7465.5215859	test: 8731.2041416	best: 8731.2041416 (1650)	total: 4m 7s	remaining: 20m 52s
1700:	learn: 7440.9165047	test: 8723.5166187	best: 8723.5166187 (1700)	total: 4m 14s	remaining: 20m 44s
1750:	learn: 7413.1986640	test: 8708.2604389	best: 8708.2604389 (1750)	total: 4m 22s	remaining: 20m 36s
1800:	learn: 7385.7689836	test: 8706.3205845	best: 8706.3205845 (1800)	total: 4m 29s	remaining: 20m 28s
1850:	learn: 7362.3632471	test: 8705.6794156	best: 8705.0614913 (1837)	total: 4m 37s	remaining: 20m 19s
1900:	learn: 7336.0225566	test: 8695.6522155	best: 8695.6522155 (1900)	total: 4m 44s	remaining: 20m 12s
1950:	learn: 7311.2677585	test: 8697.7314645	best: 8693.1393243 (1930)	total: 4m 52s	remaining: 20m 5s
2000:	learn: 7285.8260194	test: 8688.4542696	best: 8688.4542696 (2000)	tota

In [None]:
# Grouped 5-fold cross-validation with the quantile loss at alpha=0.5 (median):
# fit one CatBoost regressor per fold and save it as catboost_quantile05_<fold>.
count = 0
models_list = []  # NOTE(review): never appended to in this cell; kept in case other cells expect the name

target_col = '거래/보증금(만원)'
group_col = '주소_건축년도'

# Loop-invariant work hoisted out of the fold loop: the original dropped these two
# columns from the full frame twice in every fold.
features_df = training_data_df.drop(columns=[group_col, target_col])
target_sr = training_data_df[target_col]

# Rows sharing the same group_col value are kept in the same fold by the splitter.
for train_idx, val_idx in gkfold.split(training_data_df, groups=training_data_df[group_col]):
    print('-------------- Fold {} starting ---------------'.format(count))

    # split() yields positional row numbers, so select with .iloc. Label-based .loc
    # only gives the same rows while the frame still has its default RangeIndex.
    x_train = features_df.iloc[train_idx]
    y_train = target_sr.iloc[train_idx]

    x_val = features_df.iloc[val_idx]
    y_val = target_sr.iloc[val_idx]

    train_pool = catboost.Pool(x_train, y_train, cat_features=cat_cols)
    val_pool = catboost.Pool(x_val, y_val, cat_features=cat_cols)

    model = catboost.CatBoostRegressor(iterations=10000, depth=6, task_type='GPU', loss_function='Quantile:alpha=0.5')

    # The validation fold is also the early-stopping set: log every 50 iterations
    # and stop once the eval metric has not improved for 200 iterations.
    model.fit(train_pool, eval_set=val_pool, verbose=50, early_stopping_rounds=200)

    model.save_model('./Training/Training3_20220125/catboost_quantile05_{}'.format(count))

    print('-------------- Fold {} ending ---------------'.format(count))
    count += 1

# Release the full-size intermediates; the per-fold slices above are independent copies.
del features_df, target_sr

-------------- Fold 0 starting ---------------
0:	learn: 9218.8652558	test: 9195.8651742	best: 9195.8651742 (0)	total: 347ms	remaining: 57m 49s
50:	learn: 5168.4997268	test: 5175.3462729	best: 5175.3462729 (50)	total: 8.58s	remaining: 27m 54s


In [None]:
# Grouped 5-fold cross-validation with the quantile loss at alpha=0.25:
# fit one CatBoost regressor per fold and save it as catboost_quantile025_<fold>.
count = 0
models_list = []  # NOTE(review): never appended to in this cell; kept in case other cells expect the name

target_col = '거래/보증금(만원)'
group_col = '주소_건축년도'

# Loop-invariant work hoisted out of the fold loop: the original dropped these two
# columns from the full frame twice in every fold.
features_df = training_data_df.drop(columns=[group_col, target_col])
target_sr = training_data_df[target_col]

# Rows sharing the same group_col value are kept in the same fold by the splitter.
for train_idx, val_idx in gkfold.split(training_data_df, groups=training_data_df[group_col]):
    print('-------------- Fold {} starting ---------------'.format(count))

    # split() yields positional row numbers, so select with .iloc. Label-based .loc
    # only gives the same rows while the frame still has its default RangeIndex.
    x_train = features_df.iloc[train_idx]
    y_train = target_sr.iloc[train_idx]

    x_val = features_df.iloc[val_idx]
    y_val = target_sr.iloc[val_idx]

    train_pool = catboost.Pool(x_train, y_train, cat_features=cat_cols)
    val_pool = catboost.Pool(x_val, y_val, cat_features=cat_cols)

    model = catboost.CatBoostRegressor(iterations=10000, depth=6, task_type='GPU', loss_function='Quantile:alpha=0.25')

    # The validation fold is also the early-stopping set: log every 50 iterations
    # and stop once the eval metric has not improved for 200 iterations.
    model.fit(train_pool, eval_set=val_pool, verbose=50, early_stopping_rounds=200)

    model.save_model('./Training/Training3_20220125/catboost_quantile025_{}'.format(count))

    print('-------------- Fold {} ending ---------------'.format(count))
    count += 1

# Release the full-size intermediates; the per-fold slices above are independent copies.
del features_df, target_sr

In [None]:
# Grouped 5-fold cross-validation with the quantile loss at alpha=0.75:
# fit one CatBoost regressor per fold and save it as catboost_quantile075_<fold>.
count = 0
models_list = []  # NOTE(review): never appended to in this cell; kept in case other cells expect the name

target_col = '거래/보증금(만원)'
group_col = '주소_건축년도'

# Loop-invariant work hoisted out of the fold loop: the original dropped these two
# columns from the full frame twice in every fold.
features_df = training_data_df.drop(columns=[group_col, target_col])
target_sr = training_data_df[target_col]

# Rows sharing the same group_col value are kept in the same fold by the splitter.
for train_idx, val_idx in gkfold.split(training_data_df, groups=training_data_df[group_col]):
    print('-------------- Fold {} starting ---------------'.format(count))

    # split() yields positional row numbers, so select with .iloc. Label-based .loc
    # only gives the same rows while the frame still has its default RangeIndex.
    x_train = features_df.iloc[train_idx]
    y_train = target_sr.iloc[train_idx]

    x_val = features_df.iloc[val_idx]
    y_val = target_sr.iloc[val_idx]

    train_pool = catboost.Pool(x_train, y_train, cat_features=cat_cols)
    val_pool = catboost.Pool(x_val, y_val, cat_features=cat_cols)

    model = catboost.CatBoostRegressor(iterations=10000, depth=6, task_type='GPU', loss_function='Quantile:alpha=0.75')

    # The validation fold is also the early-stopping set: log every 50 iterations
    # and stop once the eval metric has not improved for 200 iterations.
    model.fit(train_pool, eval_set=val_pool, verbose=50, early_stopping_rounds=200)

    model.save_model('./Training/Training3_20220125/catboost_quantile075_{}'.format(count))

    print('-------------- Fold {} ending ---------------'.format(count))
    count += 1

# Release the full-size intermediates; the per-fold slices above are independent copies.
del features_df, target_sr

In [None]:
# Grouped 5-fold cross-validation with the quantile loss at alpha=0.125:
# fit one CatBoost regressor per fold and save it as catboost_quantile0125_<fold>.
count = 0
models_list = []  # NOTE(review): never appended to in this cell; kept in case other cells expect the name

target_col = '거래/보증금(만원)'
group_col = '주소_건축년도'

# Loop-invariant work hoisted out of the fold loop: the original dropped these two
# columns from the full frame twice in every fold.
features_df = training_data_df.drop(columns=[group_col, target_col])
target_sr = training_data_df[target_col]

# Rows sharing the same group_col value are kept in the same fold by the splitter.
for train_idx, val_idx in gkfold.split(training_data_df, groups=training_data_df[group_col]):
    print('-------------- Fold {} starting ---------------'.format(count))

    # split() yields positional row numbers, so select with .iloc. Label-based .loc
    # only gives the same rows while the frame still has its default RangeIndex.
    x_train = features_df.iloc[train_idx]
    y_train = target_sr.iloc[train_idx]

    x_val = features_df.iloc[val_idx]
    y_val = target_sr.iloc[val_idx]

    train_pool = catboost.Pool(x_train, y_train, cat_features=cat_cols)
    val_pool = catboost.Pool(x_val, y_val, cat_features=cat_cols)

    model = catboost.CatBoostRegressor(iterations=10000, depth=6, task_type='GPU', loss_function='Quantile:alpha=0.125')

    # The validation fold is also the early-stopping set: log every 50 iterations
    # and stop once the eval metric has not improved for 200 iterations.
    model.fit(train_pool, eval_set=val_pool, verbose=50, early_stopping_rounds=200)

    model.save_model('./Training/Training3_20220125/catboost_quantile0125_{}'.format(count))

    print('-------------- Fold {} ending ---------------'.format(count))
    count += 1

# Release the full-size intermediates; the per-fold slices above are independent copies.
del features_df, target_sr

In [None]:
# Grouped 5-fold cross-validation with the quantile loss at alpha=0.875:
# fit one CatBoost regressor per fold and save it as catboost_quantile0875_<fold>.
count = 0
models_list = []  # NOTE(review): never appended to in this cell; kept in case other cells expect the name

target_col = '거래/보증금(만원)'
group_col = '주소_건축년도'

# Loop-invariant work hoisted out of the fold loop: the original dropped these two
# columns from the full frame twice in every fold.
features_df = training_data_df.drop(columns=[group_col, target_col])
target_sr = training_data_df[target_col]

# Rows sharing the same group_col value are kept in the same fold by the splitter.
for train_idx, val_idx in gkfold.split(training_data_df, groups=training_data_df[group_col]):
    print('-------------- Fold {} starting ---------------'.format(count))

    # split() yields positional row numbers, so select with .iloc. Label-based .loc
    # only gives the same rows while the frame still has its default RangeIndex.
    x_train = features_df.iloc[train_idx]
    y_train = target_sr.iloc[train_idx]

    x_val = features_df.iloc[val_idx]
    y_val = target_sr.iloc[val_idx]

    train_pool = catboost.Pool(x_train, y_train, cat_features=cat_cols)
    val_pool = catboost.Pool(x_val, y_val, cat_features=cat_cols)

    model = catboost.CatBoostRegressor(iterations=10000, depth=6, task_type='GPU', loss_function='Quantile:alpha=0.875')

    # The validation fold is also the early-stopping set: log every 50 iterations
    # and stop once the eval metric has not improved for 200 iterations.
    model.fit(train_pool, eval_set=val_pool, verbose=50, early_stopping_rounds=200)

    model.save_model('./Training/Training3_20220125/catboost_quantile0875_{}'.format(count))

    print('-------------- Fold {} ending ---------------'.format(count))
    count += 1

# Release the full-size intermediates; the per-fold slices above are independent copies.
del features_df, target_sr