In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
# Directory prefix for every input CSV read by this notebook (string concatenation
# is used below, so the trailing slash is required).
basedir = './prepped_data/'

In [3]:
%%time
# Load the main transaction table, then report its size and preview the first rows.
main_data_path = f'{basedir}main_data_monthly_converted.csv'
main_data_df = pd.read_csv(main_data_path)
print(main_data_df.shape)
main_data_df.head()

(4441027, 17)
Wall time: 6.59 s


Unnamed: 0,전월세매매구분,전용면적(㎡),계약일,금액(만원),층,건축년도,가격/면적,년,월,target_log_transformed,target/area_log_transformed,건물종류,지번주소,도로명주소,X좌표,Y좌표,주택가격지수
0,매매,77.75,8,57000.0,2,1988,733.118971,2013,9,10.950807,6.597308,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,79.1
1,매매,77.75,16,57000.0,2,1988,733.118971,2013,12,10.950807,6.597308,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,79.213
2,매매,77.75,29,55000.0,7,1988,707.395498,2014,1,10.915088,6.56159,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,79.451
3,매매,77.75,10,59500.0,4,1988,765.273312,2014,11,10.993732,6.640233,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,82.1
4,매매,77.75,28,75000.0,5,1988,964.630225,2016,3,11.225243,6.871745,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,89.466


In [4]:
%%time
# Load the land-price table (joined to the main data on address and year further down).
land_prices_path = f'{basedir}land_prices_ver_2.csv'
land_prices_df = pd.read_csv(land_prices_path)
land_prices_df.shape

Wall time: 627 ms


(1594565, 4)

In [5]:
%%time
# Load the land-specification table (joined on address and year further down).
land_specs_path = f'{basedir}land_specs_ver_3.csv'
land_specs_df = pd.read_csv(land_specs_path)
land_specs_df.shape

Wall time: 1.66 s


(1563645, 11)

In [6]:
%%time
# Load the wide land-use-plan table; this is the slowest read in the notebook.
land_plans_path = f'{basedir}land_plans_ver_3.csv'
land_plans_df = pd.read_csv(land_plans_path)
land_plans_df.shape



Wall time: 29.2 s


(1565190, 288)

In [7]:
# Replace every missing value in the land-plan table with the placeholder
# category '해당사항없음' so that missing entries become an ordinary category.
# Kept in place to avoid holding a second copy of this wide table.
land_plans_df.fillna(value='해당사항없음', inplace=True)

In [8]:
%%time
# Load the building-information feature table (joined on address and year further down).
bdinfo_path = f'{basedir}bdinfo_features_structure_purpose_roof.csv'
bdinfo_df = pd.read_csv(bdinfo_path)
bdinfo_df.shape

Wall time: 3.55 s


(1436953, 63)

In [9]:
# Strip the unit suffix from the floor-area column name so later cells can
# refer to it simply as '전용면적'.
main_data_df = main_data_df.rename(columns={'전용면적(㎡)': '전용면적'})

In [10]:
%%time
# For each (address, construction year) group, broadcast the mean / min / max
# of '전용면적' back onto every row of that group as three new columns.
for stat in ('mean', 'min', 'max'):
    main_data_df['전용면적_' + stat] = (
        main_data_df
        .groupby(['지번주소', '건축년도'])['전용면적']
        .transform(stat)
    )

Wall time: 1.71 s


In [11]:
# Recompute price per unit area from the price and floor-area columns,
# overwriting the '가격/면적' column that came with the CSV.
main_data_df['가격/면적'] = main_data_df['금액(만원)'].div(main_data_df['전용면적'])

In [12]:
# Building age at contract time = contract year - construction year.
# Negative values are possible at this point (see the describe() output);
# they are filtered out later in the notebook.
main_data_df['건물나이'] = main_data_df['년'].sub(main_data_df['건축년도'])
main_data_df['건물나이'].describe()

count    4.441027e+06
mean     1.366252e+01
std      9.898723e+00
min     -1.200000e+01
25%      5.000000e+00
50%      1.200000e+01
75%      2.000000e+01
max      8.900000e+01
Name: 건물나이, dtype: float64

In [13]:
# Continuous time feature: whole years elapsed since the earliest contract year
# in the data, plus the fraction of the year given by the month (Jan -> 0, Dec -> 11/12).
years_since_start = main_data_df['년'] - main_data_df['년'].min()
month_fraction = (main_data_df['월'] - 1) / 12
main_data_df['year_linear'] = years_since_start + month_fraction

In [14]:
def month_circular_sine_func(x):
    """Return the sine component of a cyclical encoding of month number ``x``.

    The month is shifted so that ``x == 1`` maps to angle 0, and twelve
    months span one full turn (2*pi).

    Args:
        x: Month number as a scalar (``math.sin`` is used, so not array-like).

    Returns:
        float: ``sin(2*pi*(x-1)/12)``.
    """
    angle = 2 * math.pi * ((x - 1) / 12)
    return math.sin(angle)

def month_circular_cosine_func(x):
    """Return the cosine component of a cyclical encoding of month number ``x``.

    The month is shifted so that ``x == 1`` maps to angle 0, and twelve
    months span one full turn (2*pi).

    Args:
        x: Month number as a scalar (``math.cos`` is used, so not array-like).

    Returns:
        float: ``cos(2*pi*(x-1)/12)``.
    """
    angle = 2 * math.pi * ((x - 1) / 12)
    return math.cos(angle)

In [15]:
# Encode the contract month as a point on the unit circle so that December
# and January end up adjacent.
contract_month = main_data_df['월']
main_data_df['month_sin'] = contract_month.map(month_circular_sine_func)
main_data_df['month_cos'] = contract_month.map(month_circular_cosine_func)

In [16]:
# The raw month and contract-day columns are no longer needed once the
# time features above have been derived.
main_data_df = main_data_df.drop(columns=['월', '계약일'])

In [17]:
# Bounding box of the X / Y coordinates; used below to centre the coordinates
# and saved to disk alongside the training data.
x_coords = main_data_df['X좌표']
y_coords = main_data_df['Y좌표']
xmax, xmin = x_coords.max(), x_coords.min()
ymax, ymin = y_coords.max(), y_coords.min()

In [18]:
# Persist the coordinate bounding box as a one-row table so the same centring
# can be reproduced outside this notebook.
# NOTE(review): this file is written under training_data_ver_14, while every
# other output of this notebook goes to training_data_ver_15 — confirm the
# directory is intentional.
coor_minmax_df = pd.DataFrame([{'xmax': xmax, 'xmin': xmin, 'ymax': ymax, 'ymin': ymin}])
coor_minmax_df.to_csv('./Training/training_data_ver_14/coor_minmax.csv', index=False)
coor_minmax_df

Unnamed: 0,xmax,xmin,ymax,ymin
0,971728.508488,938060.786948,1965830.0,1937374.0


In [None]:
# Centre the coordinates on the midpoint of the bounding box, then derive
# quadratic, trigonometric and interaction features from the centred values.
# NOTE(review): xnorm / ynorm are centred but not rescaled, so sin / cos are
# evaluated on raw coordinate offsets — confirm that this is intended.
x_half_span = (xmax - xmin) / 2
y_half_span = (ymax - ymin) / 2
main_data_df['xnorm'] = main_data_df['X좌표'] - xmin - x_half_span
main_data_df['ynorm'] = main_data_df['Y좌표'] - ymin - y_half_span

main_data_df['x_2nd'] = main_data_df['xnorm'].pow(2) / 2
main_data_df['y_2nd'] = main_data_df['ynorm'].pow(2) / 2

main_data_df['x_sin'] = np.sin(main_data_df['xnorm'])
main_data_df['y_sin'] = np.sin(main_data_df['ynorm'])
main_data_df['x_cos'] = np.cos(main_data_df['xnorm'])
main_data_df['y_cos'] = np.cos(main_data_df['ynorm'])

main_data_df['xy'] = main_data_df['xnorm'].mul(main_data_df['ynorm'])

In [None]:
# The raw coordinates are superseded by the centred features derived from them.
main_data_df = main_data_df.drop(columns=['X좌표', 'Y좌표'])

In [None]:
# Combine the building-kind and transaction-kind columns into a single
# categorical key of the form '<건물종류>_<전월세매매구분>', then discard the two sources.
building_kind = main_data_df['건물종류']
deal_kind = main_data_df['전월세매매구분']
main_data_df['trade_type'] = building_kind + '_' + deal_kind
main_data_df = main_data_df.drop(columns=['건물종류', '전월세매매구분'])

In [None]:
# Attach land prices by address and year. The default inner join is used, so
# transactions without a matching land-price row are discarded here.
merge1_df = pd.merge(main_data_df, land_prices_df, on=['지번주소', '년'])
merge1_df.shape

In [None]:
# Divide the land price by 10,000 and rename the column to carry the new
# unit suffix '(만원)', matching the unit used by the '금액(만원)' column.
merge1_df['공시지가'] = merge1_df['공시지가'].div(10000)
merge1_df = merge1_df.rename(columns={'공시지가': '공시지가(만원)'})

In [None]:
# Prepare the building-info table for joining: remove '사용승인년도' and rename
# the reference-year column to '년' so it matches the join key of the main data.
bdinfo_df = (
    bdinfo_df
    .drop(columns=['사용승인년도'])
    .rename(columns={'기준년도': '년'})
)

In [None]:
# Row/column count before the building-info join, for comparison with the result below.
merge1_df.shape

In [None]:
# Attach building-info features by address and year (default inner join:
# rows without a match are discarded).
merge2_df = pd.merge(merge1_df, bdinfo_df, on=['지번주소', '년'])
merge2_df.shape

In [None]:
%%time
# Load the second building-feature table, then report its size and preview it.
bdinfo2_path = f'{basedir}extracted_building_features.csv'
bdinfo2_df = pd.read_csv(bdinfo2_path)
print(bdinfo2_df.shape)
bdinfo2_df.head()

In [None]:
# Keep only the join keys plus every column whose name contains '층'
# (the floor-related features).
floor_cols = [name for name in bdinfo2_df.columns if '층' in name]
bdinfo2_df = bdinfo2_df[['지번주소', '기준년도', *floor_cols]]
print(bdinfo2_df.shape)
bdinfo2_df.head()

In [None]:
# Rename the reference-year column to '년' so it matches the join key of merge2_df.
bdinfo2_df = bdinfo2_df.rename(columns={'기준년도': '년'})

In [None]:
# Attach the floor-related building features by address and year
# (default inner join: rows without a match are discarded).
merge2_df = pd.merge(merge2_df, bdinfo2_df, on=['지번주소', '년'])
print(merge2_df.shape)

In [None]:
# Collapse every below-ground floor number to -1; non-negative (and missing)
# values are left as they are.
floor_no = merge2_df['층']
merge2_df['층'] = floor_no.mask(floor_no < 0, -1)

In [None]:
# Number of floors between the unit and the building's recorded top floor.
# A negative value means the unit's floor exceeds the recorded maximum.
merge2_df['최고층-층'] = merge2_df['층max'].sub(merge2_df['층'])
merge2_df['최고층-층'].describe()

In [None]:
# Rows whose floor is higher than the building's recorded top floor.
floor_exceeds_top = merge2_df['최고층-층'].lt(0)
negative_df = merge2_df.loc[floor_exceeds_top]
negative_df.shape

In [None]:
# How many distinct addresses have at least one such inconsistent row.
negative_df['지번주소'].nunique()

In [None]:
# Collect the index labels of the rows to discard: for every address that has
# at least one row in negative_df, drop that address's rows from the earliest
# offending year onwards (rows from earlier years are kept).
#
# This replaces a per-address Python loop that re-filtered the whole of
# merge2_df once per address. The set of selected labels is the same; only
# their order in the list differs, which does not matter to the set()/drop()
# calls that consume it.
first_bad_year_by_addr = negative_df.groupby('지번주소')['년'].min()
first_bad_year = merge2_df['지번주소'].map(first_bad_year_by_addr)
# Addresses without an offending row map to NaN; `>= NaN` is False, so they are kept.
rows_to_drop = (merge2_df['년'] >= first_bad_year).values
idx_to_drop = merge2_df.index[rows_to_drop].tolist()

In [None]:
# Number of distinct row labels scheduled for removal.
len(set(idx_to_drop))

In [None]:
# Remove the rows collected in idx_to_drop (matched by index label).
merge2_df = merge2_df.drop(index=idx_to_drop)

In [None]:
# Size after removing the floor-inconsistent rows.
merge2_df.shape

In [None]:
# Relative height of the unit within its building: floor / recorded top floor.
# NOTE(review): a '층max' of 0 would yield inf/NaN here — confirm it cannot occur.
merge2_df['층norm'] = merge2_df['층'].div(merge2_df['층max'])

In [None]:
# Sanity-check the distribution of the relative-floor feature.
merge2_df['층norm'].describe()

In [None]:
# '최고층-층' was only needed to detect inconsistent rows; remove it again.
merge2_df = merge2_df.drop(columns=['최고층-층'])

In [None]:
# Binary flag: 1 when the row's floor is below ground (negative), otherwise 0
# (missing floors also yield 0, because the comparison is False for NaN).
merge2_df['지하세대있음'] = merge2_df['층'].lt(0).astype('int64')
merge2_df['지하세대있음'].value_counts()

In [None]:
# Land-spec columns to join, leaving out '공시지가' (a land-price column was
# already joined in from land_prices_df).
cols = [name for name in land_specs_df.columns if not name == '공시지가']
cols

In [None]:
# Attach the selected land-spec columns by address and year
# (default inner join: rows without a match are discarded).
land_specs_subset = land_specs_df[cols]
merge2_df = pd.merge(merge2_df, land_specs_subset, on=['지번주소', '년'])
merge2_df.shape

In [None]:
# Inspect the land-area distribution (it is log-transformed further down).
merge2_df['토지면적'].describe()

In [None]:
# Attach the land-use-plan columns by address and year
# (default inner join: rows without a match are discarded).
merge3_df = pd.merge(merge2_df, land_plans_df, on=['지번주소', '년'])
merge3_df.shape

In [None]:
# Make column names safe to embed in file names (they are used for the
# per-column map CSVs later): delete : / \ * ? " | < > and turn 'ㆍ' and
# spaces into underscores.
merge3_name_table = str.maketrans('ㆍ ', '__', ':/\\*?"|<>')
merge3_df.columns = [name.translate(merge3_name_table) for name in merge3_df.columns]

In [None]:
# Apply the same clean-up to the land-plan column names so they still match
# the corresponding merge3_df columns: delete : / \ * ? " | < > and turn 'ㆍ'
# and spaces into underscores.
land_plans_name_table = str.maketrans('ㆍ ', '__', ':/\\*?"|<>')
land_plans_df.columns = [name.translate(land_plans_name_table) for name in land_plans_df.columns]

In [None]:
# Every land-plan column after the first two is treated as categorical.
# NOTE(review): presumably the first two columns are the join keys
# ('지번주소', '년') — verify against the CSV layout.
lp_catcols = list(land_plans_df.columns[2:])
lp_catcols[:5]

In [None]:
# Land-spec columns treated as categorical: everything joined from
# land_specs_df except the two join keys and the numeric land area.
non_categorical = {'지번주소', '년', '토지면적'}
ls_catcols = [name for name in cols if name not in non_categorical]
ls_catcols

In [None]:
# Remove the raw floor-area column from the training table.
merge3_df = merge3_df.drop(columns=['전용면적'])

In [None]:
# Compress the land-area scale with log(1 + x).
merge3_df['토지면적'] = np.log1p(merge3_df['토지면적'])

In [None]:
# The contract year was only needed as a join key and to derive
# 'year_linear' / '건물나이'; remove it now that all joins are done.
merge3_df = merge3_df.drop(columns=['년'])

In [None]:
def building_age_feature_engi(x):
    """Bucket a numeric building age into a categorical label.

    Args:
        x: Building age as a scalar number, or a missing value.

    Returns:
        ``x`` unchanged when it is missing (per ``pd.isna``); otherwise one of
        '건축전계약' (x < 0), '5년미만', '10년미만', '15년미만', '20년미만',
        '30년미만', '40년미만' (first exclusive upper bound that x is below),
        or '40년이상' (x >= 40).
    """
    # Missing ages pass straight through so the caller can handle them.
    if pd.isna(x):
        return x

    # A negative age means the contract predates the construction year.
    if x < 0:
        return '건축전계약'

    # (exclusive upper bound, label) pairs, in ascending order.
    age_buckets = (
        (5, '5년미만'),
        (10, '10년미만'),
        (15, '15년미만'),
        (20, '20년미만'),
        (30, '30년미만'),
        (40, '40년미만'),
    )
    for upper_bound, label in age_buckets:
        if x < upper_bound:
            return label
    return '40년이상'

In [None]:
# Keep only rows with a non-negative building age; contracts dated before the
# construction year (and rows with a missing age) are discarded.
age_is_valid = merge3_df['건물나이'].ge(0)
merge3_df = merge3_df.loc[age_is_valid]
merge3_df.shape

In [None]:
# Replace the numeric building age with its categorical bucket label and
# show how many rows fall into each bucket.
merge3_df['건물나이'] = merge3_df['건물나이'].map(building_age_feature_engi)
merge3_df['건물나이'].value_counts()

In [None]:
# Remove the road-name address column from the training table.
merge3_df = merge3_df.drop(columns=['도로명주소'])

In [None]:
# Full list of categorical columns to integer-encode: three hand-picked columns
# followed by the land-spec and land-plan categorical columns.
# NOTE(review): if '표준지여부' is also present in ls_catcols it would appear
# twice here — verify against the land_specs schema.
cat_cols = ['trade_type', '표준지여부', '건물나이', *ls_catcols, *lp_catcols]

In [None]:
# Integer-encode each categorical column. Codes are assigned in order of first
# appearance (0, 1, 2, ...), and the category -> code table for every column is
# saved to its own CSV so the encoding can be reproduced later.
for col in tqdm(cat_cols):
    categories = merge3_df[col].unique()
    col_map_df = pd.DataFrame({'cat': categories, 'map': list(range(len(categories)))})
    col_map = dict(zip(col_map_df['cat'], col_map_df['map']))
    merge3_df[col] = merge3_df[col].map(col_map)
    # Written with the default index column, as before.
    col_map_df.to_csv('./Training/training_data_ver_15/feature_maps/{}_map.csv'.format(col))

In [None]:
# One-column table listing the names of the encoded categorical columns.
cat_cols_df = pd.DataFrame(cat_cols, columns=['colname'])
cat_cols_df.head()

In [None]:
# Save the categorical-column list next to the training data.
cat_cols_path = './Training/training_data_ver_15/cat_cols.csv'
cat_cols_df.to_csv(cat_cols_path, index=False)

In [None]:
# Find columns that carry no information: those whose standard deviation is
# exactly 0 (every value identical).
#
# Only the errors expected from "this column has no meaningful std" are
# skipped — TypeError / ValueError, e.g. for string columns. The previous bare
# `except:` also swallowed KeyboardInterrupt and any genuine bug.
cols_to_drop = []
for col in tqdm(merge3_df.columns):
    try:
        is_constant = bool(merge3_df[col].std() == 0)
    except (TypeError, ValueError):
        # Non-numeric column: leave it in place.
        continue
    if is_constant:
        cols_to_drop.append(col)

In [None]:
# Number of constant columns found.
len(cols_to_drop)

In [None]:
# Show which columns will be removed.
cols_to_drop

In [None]:
# Remove the constant (zero-variance) columns from the training table.
merge3_df = merge3_df.drop(columns=cols_to_drop)

In [None]:
%%time
# Write the final training table to disk (without the index column).
training_data_path = './Training/training_data_ver_15/training_data_ver_15.csv'
merge3_df.to_csv(training_data_path, index=False)

In [None]:
# Final size of the training table that was written out.
merge3_df.shape