In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
# Directory prefix for the prepared input CSV files read by this notebook.
basedir = './prepped_data/'

In [3]:
%%time
# Load the main data table and display its (rows, columns) shape.
main_data_df = pd.read_csv(basedir + 'main_data_baseline_checkpoint_2.csv')
main_data_df.shape

Wall time: 5.9 s


(4441256, 17)

In [4]:
%%time
# Load the land-price table and display its (rows, columns) shape.
land_prices_df = pd.read_csv(basedir + 'land_prices_ver_2.csv')
land_prices_df.shape

Wall time: 1.07 s


(1594565, 4)

In [6]:
%%time
# Load the land-specification table and display its (rows, columns) shape.
land_specs_df = pd.read_csv(basedir + 'land_specs_ver_3.csv')
land_specs_df.shape

Wall time: 3.19 s


(1563645, 11)

In [5]:
%%time
# Load the land-plans table (the widest input) and display its (rows, columns) shape.
land_plans_df = pd.read_csv(basedir + 'land_plans_ver_3.csv')
land_plans_df.shape



Wall time: 49.8 s


(1565190, 288)

In [7]:
# Replace every missing value in the land-plans table, in place, with the
# placeholder string '해당사항없음'.
land_plans_df.fillna('해당사항없음', inplace=True)

In [8]:
# Sanity check: print the shape of each of the four loaded tables.
dfs_list = [main_data_df, land_prices_df, land_specs_df, land_plans_df]
for df in dfs_list:
    print(df.shape)

(4441256, 17)
(1594565, 4)
(1563645, 11)
(1565190, 288)


In [9]:
# Rename the area column in place, removing the unit suffix: '전용면적(㎡)' -> '전용면적'.
main_data_df.rename(columns={'전용면적(㎡)':'전용면적'}, inplace=True)

In [10]:
%%time
# Group-level statistics of 전용면적: rows are grouped by (지번주소, 건축년도) and every
# row receives the mean / min / max of its group, aligned to the original index.
# The grouped selection is built once and reused; the previous version re-sliced
# the frame and rebuilt the groupby for each of the three statistics.
area_by_lot_and_year = main_data_df.groupby(['지번주소', '건축년도'])['전용면적']
main_data_df['전용면적_mean'] = area_by_lot_and_year.transform('mean')
main_data_df['전용면적_min'] = area_by_lot_and_year.transform('min')
main_data_df['전용면적_max'] = area_by_lot_and_year.transform('max')

Wall time: 3.15 s


In [11]:
# Keep only rows whose 전월세매매구분 is '매매' or '전세', then drop the two columns
# that are not used afterwards.
is_sale_or_jeonse = main_data_df['전월세매매구분'].isin(['매매', '전세'])
main_data_df = main_data_df[is_sale_or_jeonse].drop(columns=['월세(만원)', '도로명주소'])
print(main_data_df.shape)

(3607377, 18)


In [12]:
# 건물나이 = 년 - 건축년도; the value is negative when 년 is earlier than 건축년도.
main_data_df['건물나이'] = main_data_df['년'].sub(main_data_df['건축년도'])
main_data_df['건물나이'].describe()

count    3.607377e+06
mean     1.393074e+01
std      9.846349e+00
min     -1.200000e+01
25%      6.000000e+00
50%      1.300000e+01
75%      2.100000e+01
max      8.900000e+01
Name: 건물나이, dtype: float64

In [13]:
# Drop 건축년도 in place; 건물나이 has already been derived from it.
main_data_df.drop(columns=['건축년도'], inplace=True)

In [14]:
# Continuous time feature: whole years since the earliest 년 in the data plus the
# fraction of the year elapsed before the start of the month ((월 - 1) / 12).
years_since_first = main_data_df['년'] - main_data_df['년'].min()
month_fraction = (main_data_df['월']-1)/12
main_data_df['year_linear'] = years_since_first + month_fraction

In [15]:
def month_circular_sine_func(x):
    """Return the sine component of a cyclical month encoding.

    The month number ``x`` is placed on a circle so that ``x = 1`` maps to angle 0
    and each further month advances the angle by 1/12 of a full turn.
    """
    turn_fraction = (x-1)/12
    return math.sin(2*math.pi*turn_fraction)

def month_circular_cosine_func(x):
    """Return the cosine component of a cyclical month encoding.

    The month number ``x`` is placed on a circle so that ``x = 1`` maps to angle 0
    and each further month advances the angle by 1/12 of a full turn.
    """
    turn_fraction = (x-1)/12
    return math.cos(2*math.pi*turn_fraction)

In [16]:
# Encode 월 as a point on the unit circle (element-wise sine and cosine).
month_numbers = main_data_df['월']
main_data_df['month_sin'] = month_numbers.map(month_circular_sine_func)
main_data_df['month_cos'] = month_numbers.map(month_circular_cosine_func)

In [17]:
# Drop 월 and 계약일 in place; 월 is already encoded in year_linear, month_sin and month_cos.
main_data_df.drop(columns=['월', '계약일'], inplace=True)

In [18]:
# Observed bounds of the X / Y coordinates; used below to centre the coordinates.
x_coords = main_data_df['X좌표']
y_coords = main_data_df['Y좌표']
xmax, xmin = x_coords.max(), x_coords.min()
ymax, ymin = y_coords.max(), y_coords.min()

In [19]:
# Persist the coordinate bounds as a one-row table and display it.
# NOTE(review): this file is written under training_data_ver_10 while every other
# artifact produced by this notebook is written under training_data_ver_12 —
# confirm the path is intentional and not left over from an earlier version.
coor_minmax_df = pd.DataFrame({'xmax':xmax, 'xmin':xmin, 'ymax':ymax, 'ymin':ymin}, index=[0])
coor_minmax_df.to_csv('./Training/training_data_ver_10/coor_minmax.csv', index=False)
coor_minmax_df

Unnamed: 0,xmax,xmin,ymax,ymin
0,971728.508488,938089.149737,1965830.0,1937374.0


In [20]:
# Centre each coordinate on the midpoint of its observed range and divide by 10000.
x_half_span = (xmax - xmin)/2
y_half_span = (ymax - ymin)/2
main_data_df['xnorm'] = (main_data_df['X좌표'] - xmin - x_half_span) / 10000
main_data_df['ynorm'] = (main_data_df['Y좌표'] - ymin - y_half_span) / 10000

# Derived terms of the normalised coordinates: half-squares, sine / cosine and the
# cross product. Columns are added in the same order as before.
xnorm = main_data_df['xnorm']
ynorm = main_data_df['ynorm']
main_data_df['x_2nd'] = xnorm**2 / 2
main_data_df['y_2nd'] = ynorm**2 / 2
main_data_df['x_sin'] = np.sin(xnorm)
main_data_df['y_sin'] = np.sin(ynorm)
main_data_df['x_cos'] = np.cos(xnorm)
main_data_df['y_cos'] = np.cos(ynorm)
main_data_df['xy'] = xnorm * ynorm

In [21]:
# Drop the raw coordinates in place; the normalised and derived coordinate columns replace them.
main_data_df.drop(columns=['X좌표', 'Y좌표'], inplace=True)

In [22]:
# Combine 건물종류 and 전월세매매구분 into a single '<건물종류>_<전월세매매구분>' label,
# then drop the two source columns in place.
building_kind = main_data_df['건물종류']
deal_kind = main_data_df['전월세매매구분']
main_data_df['trade_type'] = building_kind + '_' + deal_kind
main_data_df.drop(columns=['건물종류', '전월세매매구분'], inplace=True)

In [23]:
# Attach land prices by (지번주소, 년). This is an inner join, so rows without a
# matching key in land_prices_df are dropped.
merge1_df = pd.merge(main_data_df, land_prices_df, how='inner', on=['지번주소', '년'])
merge1_df.shape

(3493309, 27)

In [24]:
# Divide 공시지가 by 10000 and rename the column to '공시지가(만원)' to reflect the new scale.
# Not idempotent: after the rename a re-run fails on the first line because
# '공시지가' no longer exists.
merge1_df['공시지가'] = merge1_df['공시지가'] / 10000
merge1_df.rename(columns={'공시지가':'공시지가(만원)'}, inplace=True)

In [25]:
# Preview the first rows of the land-specification table.
land_specs_df.head()

Unnamed: 0,지번주소,년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면,공시지가
0,서울특별시 강남구 개포동 1163-4,2006,대,336.1,제3종일반주거지역,지정되지않음,상업나지,평지,세로장방,광대세각,6290000.0
1,서울특별시 강남구 개포동 1163-4,2007,대,336.1,제3종일반주거지역,지정되지않음,상업나지,평지,세로장방,광대세각,6290000.0
2,서울특별시 강남구 개포동 1163-4,2008,대,336.1,제3종일반주거지역,지정되지않음,상업나지,평지,세로장방,광대세각,6290000.0
3,서울특별시 강남구 개포동 1163-4,2009,대,336.1,제3종일반주거지역,지정되지않음,상업나지,평지,세로장방,광대세각,6290000.0
4,서울특별시 강남구 개포동 1163-4,2010,대,336.1,제3종일반주거지역,지정되지않음,상업나지,평지,세로장방,광대세각,6290000.0


In [26]:
# Columns of land_specs_df to merge: everything except '공시지가'.
# presumably excluded because the land price is already merged from land_prices_df — confirm
cols = [col for col in land_specs_df.columns if col != '공시지가']
cols

['지번주소',
 '년',
 '지목명',
 '토지면적',
 '용도지역명1',
 '용도지역명2',
 '토지이동상황',
 '지형높이',
 '지형형상',
 '도로접면']

In [28]:
# Attach the selected land-specification columns by (지번주소, 년). This is an inner
# join, so rows without a matching key are dropped.
land_specs_selected = land_specs_df[cols]
merge2_df = pd.merge(merge1_df, land_specs_selected, how='inner', on=['지번주소', '년'])
merge2_df.shape

(3492486, 35)

In [48]:
# Attach the land-plan columns by (지번주소, 년). This is an inner join, so rows
# without a matching key are dropped.
merge3_df = pd.merge(merge2_df, land_plans_df, how='inner', on=['지번주소', '년'])
merge3_df.shape

(3490570, 321)

In [49]:
# Drop the 지번주소 join key in place now that all three merges are done.
merge3_df.drop(columns=['지번주소'], inplace=True)

In [50]:
# Remove the characters : / \ * ? " | < > from every column name of merge3_df.
# presumably so the names are safe to use in file names later — confirm
_chars_to_strip = str.maketrans('', '', ':/\\*?"|<>')
merge3_df.columns = [name.translate(_chars_to_strip) for name in merge3_df.columns]

In [51]:
# Remove the characters : / \ * ? " | < > from every column name of land_plans_df,
# so its names match the sanitised names of the merged frame.
_chars_to_strip = str.maketrans('', '', ':/\\*?"|<>')
land_plans_df.columns = [name.translate(_chars_to_strip) for name in land_plans_df.columns]

In [52]:
# Land-plan categorical columns: every column after the first two.
# assumes the first two columns of land_plans_df are the merge keys (지번주소, 년) — TODO confirm
lp_catcols = land_plans_df.columns.tolist()[2:]
lp_catcols[:5]

['(한강)오염행위 제한지역', '(한강)폐기물매립시설 설치제한지역', '4대문안', '가로구역별 최고높이 제한지역', '가스공급설비']

In [53]:
# Land-specification categorical columns: the selected columns minus the two merge
# keys and 토지면적.
non_categorical = ('지번주소', '년', '토지면적')
ls_catcols = [name for name in cols if name not in non_categorical]
ls_catcols

['지목명', '용도지역명1', '용도지역명2', '토지이동상황', '지형높이', '지형형상', '도로접면']

In [54]:
# Replace 전용면적 with its natural logarithm (zeros would become -inf, negatives NaN).
# Not idempotent: re-running this cell applies the log a second time.
merge3_df['전용면적'] = merge3_df['전용면적'].apply(np.log)

In [55]:
# Replace 토지면적 with its natural logarithm (zeros would become -inf, negatives NaN).
# Not idempotent: re-running this cell applies the log a second time.
merge3_df['토지면적'] = merge3_df['토지면적'].apply(np.log)

In [56]:
# Drop 년 in place; it was needed only as a merge key and as the source of year_linear / 건물나이.
merge3_df.drop(columns=['년'], inplace=True)

In [57]:
def floor_feature_engi(x):
    """Map a numeric floor value to a categorical floor label.

    Missing values are returned unchanged. Negative values become '지하', 0 and 1
    become '1층', and the exact values 2..10 become '<n>층'. Any other value gets
    the label of the first band whose exclusive upper bound it is below
    ('13층이하' ... '49층이하'); values of 50 or more become '50층이상'.
    """
    # x is a number
    if pd.isna(x):
        return x

    if x < 0:
        return '지하'
    if x == 0 or x == 1:
        return '1층'

    # Floors 2 through 10 each keep their own label.
    for level in range(2, 11):
        if x == level:
            return '{}층'.format(level)

    # (exclusive upper bound, label), checked in ascending order.
    banded_labels = (
        (14, '13층이하'),
        (17, '16층이하'),
        (20, '19층이하'),
        (25, '24층이하'),
        (30, '29층이하'),
        (35, '34층이하'),
        (40, '39층이하'),
        (45, '44층이하'),
        (50, '49층이하'),
    )
    for upper_bound, label in banded_labels:
        if x < upper_bound:
            return label
    return '50층이상'

In [58]:
# Replace the numeric 층 column with its categorical floor label and show the label counts.
# Not idempotent: on a re-run the column holds strings and the numeric comparisons
# inside floor_feature_engi fail.
merge3_df['층'] = merge3_df['층'].apply(floor_feature_engi)
merge3_df['층'].value_counts()

2층       461668
3층       437283
4층       376866
13층이하    351766
1층       286718
5층       282597
16층이하    233363
6층       183856
7층       160137
8층       147584
9층       141884
10층      135725
19층이하    114488
24층이하     86557
지하        54534
29층이하     23291
34층이하      7462
39층이하      2841
44층이하       854
49층이하       564
50층이상       532
Name: 층, dtype: int64

In [59]:
def building_age_feature_engi(x):
    """Map a numeric building age to a categorical age-band label.

    Missing values are returned unchanged. Negative ages become '건축전계약'. Other
    values get the label of the first band whose exclusive upper bound they are
    below ('5년이하' ... '39년이하'); ages of 40 or more become '40년이상'.
    """
    # x is a number
    if pd.isna(x):
        return x

    # (exclusive upper bound, label), checked in ascending order.
    age_bands = (
        (0, '건축전계약'),
        (6, '5년이하'),
        (11, '10년이하'),
        (16, '15년이하'),
        (20, '19년이하'),
        (30, '29년이하'),
        (40, '39년이하'),
    )
    for upper_bound, label in age_bands:
        if x < upper_bound:
            return label
    return '40년이상'

In [60]:
# Replace the numeric 건물나이 column with its categorical age band and show the band counts.
# Not idempotent: on a re-run the column holds strings and the numeric comparisons
# inside building_age_feature_engi fail.
merge3_df['건물나이'] = merge3_df['건물나이'].apply(building_age_feature_engi)
merge3_df['건물나이'].value_counts()

5년이하     764528
29년이하    717015
15년이하    657039
10년이하    635306
19년이하    435394
39년이하    248569
40년이상     31239
건축전계약      1480
Name: 건물나이, dtype: int64

In [61]:
# Full list of categorical columns: four hand-picked columns plus the
# land-specification and land-plan categorical columns.
cat_cols = ['trade_type', '표준지여부', '층', '건물나이'] + ls_catcols + lp_catcols

In [62]:
# Integer-encode every categorical column and save each category -> code table.
# Codes follow the order in which the categories first appear in the column.
feature_map_dir = './Training/training_data_ver_12/feature_maps'
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(feature_map_dir, exist_ok=True)

for col in tqdm(cat_cols):
    # Compute the distinct values once; they supply both the saved table and the mapping.
    categories = merge3_df[col].unique()
    codes = list(range(len(categories)))
    col_map_df = pd.DataFrame({'cat': categories, 'map': codes})
    col_map = dict(zip(categories, codes))
    merge3_df[col] = merge3_df[col].map(col_map)
    # The row index is written too (no index=False); kept so the file layout is unchanged.
    col_map_df.to_csv('{}/{}_map.csv'.format(feature_map_dir, col))

100%|████████████████████████████████████████████████████████████████████████████████| 297/297 [17:49<00:00,  3.60s/it]


In [63]:
# Wrap the categorical column names in a one-column table so the list can be saved.
cat_cols_df = pd.DataFrame({'colname':cat_cols})
cat_cols_df.head()

Unnamed: 0,colname
0,trade_type
1,표준지여부
2,층
3,건물나이
4,지목명


In [64]:
# Save the list of categorical column names next to the training data.
cat_cols_df.to_csv('./Training/training_data_ver_12/cat_cols.csv', index=False)

In [65]:
# Collect the columns whose standard deviation is exactly 0 (a single repeated value).
cols_to_drop = []
for col in tqdm(merge3_df.columns):
    try:
        is_constant = bool(merge3_df[col].std() == 0)
    except (TypeError, ValueError):
        # std() is undefined for this column (e.g. non-numeric values): keep it.
        # Only these two errors are skipped, so interrupts and real failures
        # are no longer silently swallowed.
        continue
    if is_constant:
        cols_to_drop.append(col)

100%|████████████████████████████████████████████████████████████████████████████████| 319/319 [00:19<00:00, 16.58it/s]


In [66]:
# Number of zero-variance columns found.
len(cols_to_drop)

1

In [67]:
# Names of the zero-variance columns found.
cols_to_drop

['기타유통및공급시설']

In [68]:
# Remove the zero-variance columns in place.
merge3_df.drop(columns=cols_to_drop, inplace=True)

In [69]:
%%time
# Write the final training table without the row index.
# NOTE(review): the recorded run took about 18 minutes for this write; a binary
# format may be worth considering if downstream readers allow it.
merge3_df.to_csv('./Training/training_data_ver_12/training_data_ver_12.csv', index=False)

Wall time: 18min 4s
