In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc
import requests
from urllib.parse import urlparse

In [2]:
# Directory holding the preprocessed input CSVs that later cells read with pd.read_csv.
basedir = './prepped_data/'

In [3]:
%%time
# Load the transaction table. Rows missing either coordinate or the '지번주소'
# address are discarded, and columns that are not needed downstream are removed
# ('가격/면적' is recomputed later in this notebook).
main_data_df = (
    pd.read_csv(basedir + 'main_data_monthly_converted.csv')
    .dropna(subset=['X좌표', 'Y좌표', '지번주소'])
    .drop(columns=['가격/면적', 'target_log_transformed', 'target/area_log_transformed', '주택가격지수'])
)
print(main_data_df.shape)
main_data_df.head()

(4439689, 13)
Wall time: 10.2 s


Unnamed: 0,전월세매매구분,전용면적(㎡),계약일,금액(만원),층,건축년도,년,월,건물종류,지번주소,도로명주소,X좌표,Y좌표
0,매매,77.75,8,57000.0,2,1988,2013,9,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0
1,매매,77.75,16,57000.0,2,1988,2013,12,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0
2,매매,77.75,29,55000.0,7,1988,2014,1,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0
3,매매,77.75,10,59500.0,4,1988,2014,11,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0
4,매매,77.75,28,75000.0,5,1988,2016,3,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0


In [4]:
# '지번주소' is a space-separated address; its second token becomes '구' and its
# third token becomes '동'.
for token_index, target_col in ((1, '구'), (2, '동')):
    main_data_df[target_col] = main_data_df['지번주소'].map(
        lambda address, idx=token_index: address.split(' ')[idx]
    )

In [5]:
# '건물연식' = '년' minus '건축년도' (presumably the building's age in years at
# the time of the transaction).
main_data_df['건물연식'] = main_data_df['년'] - main_data_df['건축년도']

In [6]:
# Keep only rows whose '건물연식' is zero or positive. Rows with a missing value
# are removed as well, because a comparison against NaN evaluates to False.
main_data_df = main_data_df.loc[main_data_df['건물연식'].ge(0)]

In [7]:
def bdyear_feature_engi(x):
    """Bucket a numeric value into 5-unit-wide labelled bins.

    Missing values (anything pd.isna reports as missing) are returned
    unchanged. Otherwise the smallest upper bound among 5, 10, ..., 40 that is
    >= x selects the label '<bound>년_이하'; values above 40 get '40년_초과'.
    """
    if pd.isna(x):
        return x

    # First upper bound that x does not exceed, or None when x is above 40.
    upper_bound = next((bound for bound in range(5, 45, 5) if x <= bound), None)
    if upper_bound is None:
        return '40년_초과'
    return '{}년_이하'.format(upper_bound)

In [8]:
# Convert each numeric '건물연식' value into its labelled bucket.
main_data_df['건물연식_engi'] = main_data_df['건물연식'].map(bdyear_feature_engi)

In [9]:
# Sanity check: number of rows falling into each '건물연식_engi' bucket.
main_data_df['건물연식_engi'].value_counts()

5년_이하     1113301
15년_이하     807855
10년_이하     798256
20년_이하     628055
25년_이하     450286
30년_이하     351545
35년_이하     187953
40년_이하      65792
40년_초과      31506
Name: 건물연식_engi, dtype: int64

In [10]:
# The bucketed '건물연식_engi' replaces the raw age and its source column; '층'
# is removed in the same step.
main_data_df = main_data_df.drop(columns=['건물연식', '건축년도', '층'])

In [11]:
# Load the station coordinates, ordered by '역명' and re-indexed from 0 so that
# positional and label-based row access coincide.
subway_df = (
    pd.read_csv('./지하철역/subway_coor.csv')
    .sort_values(by=['역명'])
    .reset_index(drop=True)
)
print(subway_df.shape)
subway_df.head()

(292, 5)


Unnamed: 0,지번주소,도로명주소,역명,x좌표,y좌표
0,서울특별시 송파구 송파대로 지하 257,서울특별시 송파구 송파대로 지하 257,가락시장역_0,966274.711075,1943790.0
1,서울특별시 송파구 송파대로 지하 257,서울특별시 송파구 송파대로 지하 257,가락시장역_1,966274.711075,1943790.0
2,서울특별시 금천구 벚꽃로 309,서울특별시 금천구 벚꽃로 309,가산디지털단지역,945410.718812,1942663.0
3,서울특별시 강남구 학동로 지하346,서울특별시 강남구 학동로 지하 346,강남구청역,959464.837464,1946531.0
4,서울특별시 강남구 강남대로 지하 396,서울특별시 강남구 강남대로 지하 396,강남역,958305.955213,1944427.0


In [12]:
# Add one '<역명>_distance' column per station holding the straight-line
# (Euclidean) distance from every row's (X좌표, Y좌표) to that station.
# NOTE(review): this materialises one full-length column per station; the
# columns are reduced to a minimum and dropped again a few cells later.
station_rows = zip(subway_df['역명'], subway_df['x좌표'], subway_df['y좌표'])
for station_name, subway_x, subway_y in tqdm(station_rows, total=subway_df.shape[0]):
    delta_x = main_data_df['X좌표'] - subway_x
    delta_y = main_data_df['Y좌표'] - subway_y
    main_data_df['{}_distance'.format(station_name)] = np.sqrt(delta_x**2 + delta_y**2)

100%|████████████████████████████████████████████████████████████████████████████████| 292/292 [00:18<00:00, 16.03it/s]


In [13]:
# Preview the frame with the per-station distance columns appended.
main_data_df.head()

Unnamed: 0,전월세매매구분,전용면적(㎡),계약일,금액(만원),년,월,건물종류,지번주소,도로명주소,X좌표,Y좌표,구,동,건물연식_engi,가락시장역_0_distance,가락시장역_1_distance,가산디지털단지역_distance,강남구청역_distance,강남역_distance,강동구청역_distance,강동역_distance,강변역_distance,개롱역_distance,개화산역_distance,거여역_distance,건대입구역_0_distance,건대입구역_1_distance,경복궁역_distance,경찰병원역_distance,고덕역_distance,고려대역_distance,고속터미널역_0_distance,고속터미널역_1_distance,공덕역_0_distance,공덕역_1_distance,공릉역_distance,광나루역_distance,광명사거리역_distance,광화문역_distance,광흥창역_distance,교대역_0_distance,교대역_1_distance,구로디지털단지역_distance,구산역_distance,구의역_distance,구파발역_distance,군자역_0_distance,군자역_1_distance,굴포천역_distance,굽은다리역_distance,금호역_distance,길동역_distance,길음역_distance,김포공항역_distance,까치산역_0_distance,까치산역_1_distance,까치울역_distance,낙성대역_distance,남구로역_distance,남부터미널역_distance,남성역_distance,남태령역_distance,남한산성입구역_distance,내방역_distance,노원역_0_distance,노원역_1_distance,녹번역_distance,녹사평역_distance,논현역_distance,단대오거리역_distance,답십리역_distance,당고개역_distance,당산역_distance,대림역_0_distance,대림역_1_distance,대청역_distance,대치역_distance,대흥역_distance,도곡역_distance,도림천역_distance,도봉산역_distance,독립문역_distance,독바위역_distance,돌곶이역_distance,동대문역_0_distance,동대문역_1_distance,동대문역사문화공원역_0_distance,동대문역사문화공원역_1_distance,동대문역사문화공원역_2_distance,동대입구역_distance,동묘앞역_0_distance,동묘앞역_1_distance,동작역_distance,둔촌동역_distance,둔촌오륜역_distance,디지털미디어시티역_distance,뚝섬역_distance,뚝섬유원지역_distance,마곡역_distance,마들역_distance,마장역_distance,마천역_distance,마포구청역_distance,마포역_distance,망원역_distance,매봉역_distance,먹골역_distance,면목역_distance,명동역_distance,명일역_distance,모란역_distance,목동역_distance,몽촌토성역_distance,무악재역_distance,문래역_distance,문정역_distance,미아사거리역_distance,미아역_distance,반포역_distance,발산역_distance,방배역_distance,방이역_distance,방화역_distance,버티고개역_distance,보라매역_distance,보문역_distance,복정역_distance,봉은사역_distance,봉천역_distance,봉화산역_distance,부천시청역_distance,부천종합운동장역_distance,부평구청역_distance,불광역_0_distance,불광역_1_distance,사가정역_distance,사당역_0_distance,사당역_1_distance,산성역_distance,삼각지역_0_distance,삼각지역_1_distance,삼산체육관역_distance,삼성역_distance,삼성중앙역_distance,삼전역_distance,상계역_distance,상도역_
distance,상동역_distance,상봉역_distance,상수역_distance,상왕십리역_distance,상월곡역_distance,상일동역_distance,새절역_distance,서대문역_distance,서울대입구역_distance,서울역_0_distance,서울역_1_distance,서초역_distance,석계역_distance,석촌고분역_distance,석촌역_0_distance,석촌역_1_distance,선릉역_distance,선정릉역_distance,성수역_distance,성신여대입구역_distance,송정역_distance,송파나루역_distance,송파역_distance,수락산역_distance,수서역_distance,수유역_distance,수진역_distance,숙대입구역_distance,숭실대입구역_distance,시청역_0_distance,시청역_1_distance,신금호역_distance,신길역_distance,신내역_distance,신답역_distance,신당역_0_distance,신당역_1_distance,신대방삼거리역_distance,신대방역_distance,신도림역_distance,신림역_distance,신사역_distance,신설동역_0_distance,신설동역_1_distance,신용산역_distance,신정네거리역_distance,신정역_distance,신중동역_distance,신촌역_distance,신풍역_distance,신흥역_distance,쌍문역_distance,아차산역_distance,아현역_distance,안국역_distance,안암역_distance,암사역_distance,압구정역_distance,애오개역_distance,약수역_0_distance,약수역_1_distance,양재역_distance,양천구청역_distance,양평역_distance,어린이대공원역_distance,언주역_distance,여의나루역_distance,여의도역_distance,역삼역_distance,역촌역_distance,연신내역_0_distance,연신내역_1_distance,영등포구청역_0_distance,영등포구청역_1_distance,영등포시장역_distance,오금역_0_distance,오금역_1_distance,오목교역_distance,옥수역_distance,온수역_distance,올림픽공원역_0_distance,올림픽공원역_1_distance,왕십리역_0_distance,왕십리역_1_distance,용답역_distance,용두역_distance,용마산역_distance,우장산역_distance,월곡역_distance,월드컵경기장역_distance,을지로3가역_0_distance,을지로3가역_1_distance,을지로4가역_0_distance,을지로4가역_1_distance,을지로입구역_distance,응암역_distance,이대역_distance,이수역_distance,이촌역_distance,이태원역_distance,일원역_distance,잠실나루역_distance,잠실새내역_distance,잠실역_0_distance,잠실역_1_distance,잠원역_distance,장승배기역_distance,장암역_distance,장지역_distance,장한평역_distance,제기동역_distance,종각역_distance,종로3가역_0_distance,종로3가역_1_distance,종로3가역_2_distance,종로5가역_distance,종합운동장역_0_distance,종합운동장역_1_distance,중계역_distance,중곡역_distance,중앙보훈병원역_distance,중화역_distance,증산역_distance,지축역_distance,창동역_distance,창신역_distance,천왕역_distance,천호역_0_distance,천호역_1_distance,철산역_distance,청구역_0_distance,청구역_1_distance,청담역_distance,청량리역_distance,총신대입구역_distance,춘의역_distance,충무로역_0_distance,충무로역_1_dista
nce,충정로역_0_distance,충정로역_1_distance,태릉입구역_0_distance,태릉입구역_1_distance,하계역_distance,학동역_distance,학여울역_distance,한강진역_distance,한성대입구역_distance,한성백제역_distance,한양대역_distance,합정역_0_distance,합정역_1_distance,행당역_distance,혜화역_distance,홍대입구역_distance,홍제역_distance,화곡역_distance,화랑대역_distance,회현역_distance,효창공원앞역_distance
0,매매,77.75,8,57000.0,2013,9,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,25년_이하,5931.271069,5931.271069,15063.428185,4103.275138,2865.482155,8103.164567,9318.374645,7016.464949,7494.226249,24037.113002,8160.12951,6706.610677,6777.48304,12637.190727,6484.103485,12104.230451,12213.048954,5004.408577,5004.408577,11387.899904,11276.834805,16086.062118,8390.049468,17533.893317,12064.894186,12966.43127,3653.401352,3653.401352,13386.279656,18743.709262,6850.845387,20928.36525,8689.430787,8689.430787,28534.361247,10691.281057,8095.877461,9920.889176,13740.115703,23951.868578,19079.214974,19079.214974,21529.537498,7929.664083,14653.005137,3274.191145,7189.126771,5971.616549,10012.255092,5327.709093,19424.755108,19424.755108,16795.242852,8313.217275,4312.521045,10015.33631,9438.857403,21088.260979,14545.913024,14030.462177,13877.1664,2718.93629,1731.652761,12224.156841,1074.173276,15488.368579,23056.333254,13314.691164,18539.600195,14335.420645,10687.30635,10598.684743,10113.70263,10118.00774,10113.70263,9580.163545,10696.231002,10696.231002,6852.219591,8972.54911,8656.381077,17057.906934,7318.819869,5704.381196,21914.384808,20387.021291,9460.202908,8909.597781,16048.557353,11452.467515,15104.21213,834.751415,14507.138916,12260.863991,10626.222439,11175.549509,8585.860059,17361.320281,6620.730473,14415.36561,14557.06827,6155.949769,14773.824557,16273.094191,4712.545132,20911.737735,4882.143556,7116.918869,23736.210693,8449.923594,11878.270532,11908.433046,6628.325218,3727.207227,9853.427643,15484.916864,25667.808382,22744.505017,29498.866126,17952.145843,17952.145843,11468.181963,6346.778123,6327.178127,9010.795278,9206.974677,9206.974677,27636.67507,3179.333665,3496.329514,4026.958804,20001.873633,9614.434151,26636.625643,13099.624437,13687.578325,9454.629712,13862.497892,13012.072497,17350.494904,12083.04137,8866.866518,11007.076521,10660.54159,4172.744672,14876.942428,4490.959317,5452.592858,5452.592858,2590.582019,3373.310922,7009.916801,127
66.572447,23148.223434,6203.534298,5617.97759,21774.992537,4376.488752,14773.824557,9138.261964,10072.786208,8888.179298,11491.987746,11491.987746,8621.131006,12895.787316,15211.189699,9843.264337,9804.763491,9804.763491,11222.048567,12375.164178,14626.830165,10890.741586,4841.493408,10703.930262,10703.930262,9204.091294,18180.189955,17963.623564,24555.209981,13196.479287,12815.802906,9524.935527,18649.588787,8511.404948,12016.259218,12127.286549,11826.566379,10083.888708,5447.119966,11671.685776,8933.07149,8897.414034,1655.285283,16880.183256,15473.588061,7624.614399,3329.687953,11763.040714,12209.545758,2585.934709,17975.845798,19196.737657,19196.737657,14665.28906,14665.28906,13861.971432,7019.220302,7019.220302,16407.507258,7293.482225,20332.647956,7872.00308,7872.00308,8975.236223,8975.236223,8947.904129,10362.390784,10712.852237,20553.844072,13359.569636,16778.529133,10924.1483,10924.1483,10639.015593,10639.015593,11275.423766,17803.449508,12638.212665,6376.584514,8361.244601,7880.973222,2803.392445,6265.143971,4467.979057,5629.937126,5629.937126,5073.282951,10435.047131,24269.400387,6508.028297,8948.447585,10862.792376,11622.458188,11251.607852,11251.607852,11251.607852,10907.995305,3747.772386,3747.772386,18169.233286,9736.674556,9574.674578,13541.260796,17014.739856,22889.160772,19071.209876,11440.652635,18949.042774,8912.818112,8912.818112,16369.754678,9418.022913,9418.022913,4224.276884,10990.366093,6291.029394,23649.133651,10267.975878,10267.975878,11700.13225,11860.493588,15289.575027,15289.575027,17257.087791,4107.935529,2378.506133,7950.284687,12579.674157,6983.947969,8275.21208,14406.069721,14428.259312,8678.25904,12015.096335,14166.027088,15298.647232,19922.492557,15608.00911,10809.940927,10339.981725
1,매매,77.75,16,57000.0,2013,12,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,25년_이하,5931.271069,5931.271069,15063.428185,4103.275138,2865.482155,8103.164567,9318.374645,7016.464949,7494.226249,24037.113002,8160.12951,6706.610677,6777.48304,12637.190727,6484.103485,12104.230451,12213.048954,5004.408577,5004.408577,11387.899904,11276.834805,16086.062118,8390.049468,17533.893317,12064.894186,12966.43127,3653.401352,3653.401352,13386.279656,18743.709262,6850.845387,20928.36525,8689.430787,8689.430787,28534.361247,10691.281057,8095.877461,9920.889176,13740.115703,23951.868578,19079.214974,19079.214974,21529.537498,7929.664083,14653.005137,3274.191145,7189.126771,5971.616549,10012.255092,5327.709093,19424.755108,19424.755108,16795.242852,8313.217275,4312.521045,10015.33631,9438.857403,21088.260979,14545.913024,14030.462177,13877.1664,2718.93629,1731.652761,12224.156841,1074.173276,15488.368579,23056.333254,13314.691164,18539.600195,14335.420645,10687.30635,10598.684743,10113.70263,10118.00774,10113.70263,9580.163545,10696.231002,10696.231002,6852.219591,8972.54911,8656.381077,17057.906934,7318.819869,5704.381196,21914.384808,20387.021291,9460.202908,8909.597781,16048.557353,11452.467515,15104.21213,834.751415,14507.138916,12260.863991,10626.222439,11175.549509,8585.860059,17361.320281,6620.730473,14415.36561,14557.06827,6155.949769,14773.824557,16273.094191,4712.545132,20911.737735,4882.143556,7116.918869,23736.210693,8449.923594,11878.270532,11908.433046,6628.325218,3727.207227,9853.427643,15484.916864,25667.808382,22744.505017,29498.866126,17952.145843,17952.145843,11468.181963,6346.778123,6327.178127,9010.795278,9206.974677,9206.974677,27636.67507,3179.333665,3496.329514,4026.958804,20001.873633,9614.434151,26636.625643,13099.624437,13687.578325,9454.629712,13862.497892,13012.072497,17350.494904,12083.04137,8866.866518,11007.076521,10660.54159,4172.744672,14876.942428,4490.959317,5452.592858,5452.592858,2590.582019,3373.310922,7009.916801,1
2766.572447,23148.223434,6203.534298,5617.97759,21774.992537,4376.488752,14773.824557,9138.261964,10072.786208,8888.179298,11491.987746,11491.987746,8621.131006,12895.787316,15211.189699,9843.264337,9804.763491,9804.763491,11222.048567,12375.164178,14626.830165,10890.741586,4841.493408,10703.930262,10703.930262,9204.091294,18180.189955,17963.623564,24555.209981,13196.479287,12815.802906,9524.935527,18649.588787,8511.404948,12016.259218,12127.286549,11826.566379,10083.888708,5447.119966,11671.685776,8933.07149,8897.414034,1655.285283,16880.183256,15473.588061,7624.614399,3329.687953,11763.040714,12209.545758,2585.934709,17975.845798,19196.737657,19196.737657,14665.28906,14665.28906,13861.971432,7019.220302,7019.220302,16407.507258,7293.482225,20332.647956,7872.00308,7872.00308,8975.236223,8975.236223,8947.904129,10362.390784,10712.852237,20553.844072,13359.569636,16778.529133,10924.1483,10924.1483,10639.015593,10639.015593,11275.423766,17803.449508,12638.212665,6376.584514,8361.244601,7880.973222,2803.392445,6265.143971,4467.979057,5629.937126,5629.937126,5073.282951,10435.047131,24269.400387,6508.028297,8948.447585,10862.792376,11622.458188,11251.607852,11251.607852,11251.607852,10907.995305,3747.772386,3747.772386,18169.233286,9736.674556,9574.674578,13541.260796,17014.739856,22889.160772,19071.209876,11440.652635,18949.042774,8912.818112,8912.818112,16369.754678,9418.022913,9418.022913,4224.276884,10990.366093,6291.029394,23649.133651,10267.975878,10267.975878,11700.13225,11860.493588,15289.575027,15289.575027,17257.087791,4107.935529,2378.506133,7950.284687,12579.674157,6983.947969,8275.21208,14406.069721,14428.259312,8678.25904,12015.096335,14166.027088,15298.647232,19922.492557,15608.00911,10809.940927,10339.981725
2,매매,77.75,29,55000.0,2014,1,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,30년_이하,5931.271069,5931.271069,15063.428185,4103.275138,2865.482155,8103.164567,9318.374645,7016.464949,7494.226249,24037.113002,8160.12951,6706.610677,6777.48304,12637.190727,6484.103485,12104.230451,12213.048954,5004.408577,5004.408577,11387.899904,11276.834805,16086.062118,8390.049468,17533.893317,12064.894186,12966.43127,3653.401352,3653.401352,13386.279656,18743.709262,6850.845387,20928.36525,8689.430787,8689.430787,28534.361247,10691.281057,8095.877461,9920.889176,13740.115703,23951.868578,19079.214974,19079.214974,21529.537498,7929.664083,14653.005137,3274.191145,7189.126771,5971.616549,10012.255092,5327.709093,19424.755108,19424.755108,16795.242852,8313.217275,4312.521045,10015.33631,9438.857403,21088.260979,14545.913024,14030.462177,13877.1664,2718.93629,1731.652761,12224.156841,1074.173276,15488.368579,23056.333254,13314.691164,18539.600195,14335.420645,10687.30635,10598.684743,10113.70263,10118.00774,10113.70263,9580.163545,10696.231002,10696.231002,6852.219591,8972.54911,8656.381077,17057.906934,7318.819869,5704.381196,21914.384808,20387.021291,9460.202908,8909.597781,16048.557353,11452.467515,15104.21213,834.751415,14507.138916,12260.863991,10626.222439,11175.549509,8585.860059,17361.320281,6620.730473,14415.36561,14557.06827,6155.949769,14773.824557,16273.094191,4712.545132,20911.737735,4882.143556,7116.918869,23736.210693,8449.923594,11878.270532,11908.433046,6628.325218,3727.207227,9853.427643,15484.916864,25667.808382,22744.505017,29498.866126,17952.145843,17952.145843,11468.181963,6346.778123,6327.178127,9010.795278,9206.974677,9206.974677,27636.67507,3179.333665,3496.329514,4026.958804,20001.873633,9614.434151,26636.625643,13099.624437,13687.578325,9454.629712,13862.497892,13012.072497,17350.494904,12083.04137,8866.866518,11007.076521,10660.54159,4172.744672,14876.942428,4490.959317,5452.592858,5452.592858,2590.582019,3373.310922,7009.916801,12
766.572447,23148.223434,6203.534298,5617.97759,21774.992537,4376.488752,14773.824557,9138.261964,10072.786208,8888.179298,11491.987746,11491.987746,8621.131006,12895.787316,15211.189699,9843.264337,9804.763491,9804.763491,11222.048567,12375.164178,14626.830165,10890.741586,4841.493408,10703.930262,10703.930262,9204.091294,18180.189955,17963.623564,24555.209981,13196.479287,12815.802906,9524.935527,18649.588787,8511.404948,12016.259218,12127.286549,11826.566379,10083.888708,5447.119966,11671.685776,8933.07149,8897.414034,1655.285283,16880.183256,15473.588061,7624.614399,3329.687953,11763.040714,12209.545758,2585.934709,17975.845798,19196.737657,19196.737657,14665.28906,14665.28906,13861.971432,7019.220302,7019.220302,16407.507258,7293.482225,20332.647956,7872.00308,7872.00308,8975.236223,8975.236223,8947.904129,10362.390784,10712.852237,20553.844072,13359.569636,16778.529133,10924.1483,10924.1483,10639.015593,10639.015593,11275.423766,17803.449508,12638.212665,6376.584514,8361.244601,7880.973222,2803.392445,6265.143971,4467.979057,5629.937126,5629.937126,5073.282951,10435.047131,24269.400387,6508.028297,8948.447585,10862.792376,11622.458188,11251.607852,11251.607852,11251.607852,10907.995305,3747.772386,3747.772386,18169.233286,9736.674556,9574.674578,13541.260796,17014.739856,22889.160772,19071.209876,11440.652635,18949.042774,8912.818112,8912.818112,16369.754678,9418.022913,9418.022913,4224.276884,10990.366093,6291.029394,23649.133651,10267.975878,10267.975878,11700.13225,11860.493588,15289.575027,15289.575027,17257.087791,4107.935529,2378.506133,7950.284687,12579.674157,6983.947969,8275.21208,14406.069721,14428.259312,8678.25904,12015.096335,14166.027088,15298.647232,19922.492557,15608.00911,10809.940927,10339.981725
3,매매,77.75,10,59500.0,2014,11,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,30년_이하,5931.271069,5931.271069,15063.428185,4103.275138,2865.482155,8103.164567,9318.374645,7016.464949,7494.226249,24037.113002,8160.12951,6706.610677,6777.48304,12637.190727,6484.103485,12104.230451,12213.048954,5004.408577,5004.408577,11387.899904,11276.834805,16086.062118,8390.049468,17533.893317,12064.894186,12966.43127,3653.401352,3653.401352,13386.279656,18743.709262,6850.845387,20928.36525,8689.430787,8689.430787,28534.361247,10691.281057,8095.877461,9920.889176,13740.115703,23951.868578,19079.214974,19079.214974,21529.537498,7929.664083,14653.005137,3274.191145,7189.126771,5971.616549,10012.255092,5327.709093,19424.755108,19424.755108,16795.242852,8313.217275,4312.521045,10015.33631,9438.857403,21088.260979,14545.913024,14030.462177,13877.1664,2718.93629,1731.652761,12224.156841,1074.173276,15488.368579,23056.333254,13314.691164,18539.600195,14335.420645,10687.30635,10598.684743,10113.70263,10118.00774,10113.70263,9580.163545,10696.231002,10696.231002,6852.219591,8972.54911,8656.381077,17057.906934,7318.819869,5704.381196,21914.384808,20387.021291,9460.202908,8909.597781,16048.557353,11452.467515,15104.21213,834.751415,14507.138916,12260.863991,10626.222439,11175.549509,8585.860059,17361.320281,6620.730473,14415.36561,14557.06827,6155.949769,14773.824557,16273.094191,4712.545132,20911.737735,4882.143556,7116.918869,23736.210693,8449.923594,11878.270532,11908.433046,6628.325218,3727.207227,9853.427643,15484.916864,25667.808382,22744.505017,29498.866126,17952.145843,17952.145843,11468.181963,6346.778123,6327.178127,9010.795278,9206.974677,9206.974677,27636.67507,3179.333665,3496.329514,4026.958804,20001.873633,9614.434151,26636.625643,13099.624437,13687.578325,9454.629712,13862.497892,13012.072497,17350.494904,12083.04137,8866.866518,11007.076521,10660.54159,4172.744672,14876.942428,4490.959317,5452.592858,5452.592858,2590.582019,3373.310922,7009.916801,1
2766.572447,23148.223434,6203.534298,5617.97759,21774.992537,4376.488752,14773.824557,9138.261964,10072.786208,8888.179298,11491.987746,11491.987746,8621.131006,12895.787316,15211.189699,9843.264337,9804.763491,9804.763491,11222.048567,12375.164178,14626.830165,10890.741586,4841.493408,10703.930262,10703.930262,9204.091294,18180.189955,17963.623564,24555.209981,13196.479287,12815.802906,9524.935527,18649.588787,8511.404948,12016.259218,12127.286549,11826.566379,10083.888708,5447.119966,11671.685776,8933.07149,8897.414034,1655.285283,16880.183256,15473.588061,7624.614399,3329.687953,11763.040714,12209.545758,2585.934709,17975.845798,19196.737657,19196.737657,14665.28906,14665.28906,13861.971432,7019.220302,7019.220302,16407.507258,7293.482225,20332.647956,7872.00308,7872.00308,8975.236223,8975.236223,8947.904129,10362.390784,10712.852237,20553.844072,13359.569636,16778.529133,10924.1483,10924.1483,10639.015593,10639.015593,11275.423766,17803.449508,12638.212665,6376.584514,8361.244601,7880.973222,2803.392445,6265.143971,4467.979057,5629.937126,5629.937126,5073.282951,10435.047131,24269.400387,6508.028297,8948.447585,10862.792376,11622.458188,11251.607852,11251.607852,11251.607852,10907.995305,3747.772386,3747.772386,18169.233286,9736.674556,9574.674578,13541.260796,17014.739856,22889.160772,19071.209876,11440.652635,18949.042774,8912.818112,8912.818112,16369.754678,9418.022913,9418.022913,4224.276884,10990.366093,6291.029394,23649.133651,10267.975878,10267.975878,11700.13225,11860.493588,15289.575027,15289.575027,17257.087791,4107.935529,2378.506133,7950.284687,12579.674157,6983.947969,8275.21208,14406.069721,14428.259312,8678.25904,12015.096335,14166.027088,15298.647232,19922.492557,15608.00911,10809.940927,10339.981725
4,매매,77.75,28,75000.0,2016,3,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,30년_이하,5931.271069,5931.271069,15063.428185,4103.275138,2865.482155,8103.164567,9318.374645,7016.464949,7494.226249,24037.113002,8160.12951,6706.610677,6777.48304,12637.190727,6484.103485,12104.230451,12213.048954,5004.408577,5004.408577,11387.899904,11276.834805,16086.062118,8390.049468,17533.893317,12064.894186,12966.43127,3653.401352,3653.401352,13386.279656,18743.709262,6850.845387,20928.36525,8689.430787,8689.430787,28534.361247,10691.281057,8095.877461,9920.889176,13740.115703,23951.868578,19079.214974,19079.214974,21529.537498,7929.664083,14653.005137,3274.191145,7189.126771,5971.616549,10012.255092,5327.709093,19424.755108,19424.755108,16795.242852,8313.217275,4312.521045,10015.33631,9438.857403,21088.260979,14545.913024,14030.462177,13877.1664,2718.93629,1731.652761,12224.156841,1074.173276,15488.368579,23056.333254,13314.691164,18539.600195,14335.420645,10687.30635,10598.684743,10113.70263,10118.00774,10113.70263,9580.163545,10696.231002,10696.231002,6852.219591,8972.54911,8656.381077,17057.906934,7318.819869,5704.381196,21914.384808,20387.021291,9460.202908,8909.597781,16048.557353,11452.467515,15104.21213,834.751415,14507.138916,12260.863991,10626.222439,11175.549509,8585.860059,17361.320281,6620.730473,14415.36561,14557.06827,6155.949769,14773.824557,16273.094191,4712.545132,20911.737735,4882.143556,7116.918869,23736.210693,8449.923594,11878.270532,11908.433046,6628.325218,3727.207227,9853.427643,15484.916864,25667.808382,22744.505017,29498.866126,17952.145843,17952.145843,11468.181963,6346.778123,6327.178127,9010.795278,9206.974677,9206.974677,27636.67507,3179.333665,3496.329514,4026.958804,20001.873633,9614.434151,26636.625643,13099.624437,13687.578325,9454.629712,13862.497892,13012.072497,17350.494904,12083.04137,8866.866518,11007.076521,10660.54159,4172.744672,14876.942428,4490.959317,5452.592858,5452.592858,2590.582019,3373.310922,7009.916801,12
766.572447,23148.223434,6203.534298,5617.97759,21774.992537,4376.488752,14773.824557,9138.261964,10072.786208,8888.179298,11491.987746,11491.987746,8621.131006,12895.787316,15211.189699,9843.264337,9804.763491,9804.763491,11222.048567,12375.164178,14626.830165,10890.741586,4841.493408,10703.930262,10703.930262,9204.091294,18180.189955,17963.623564,24555.209981,13196.479287,12815.802906,9524.935527,18649.588787,8511.404948,12016.259218,12127.286549,11826.566379,10083.888708,5447.119966,11671.685776,8933.07149,8897.414034,1655.285283,16880.183256,15473.588061,7624.614399,3329.687953,11763.040714,12209.545758,2585.934709,17975.845798,19196.737657,19196.737657,14665.28906,14665.28906,13861.971432,7019.220302,7019.220302,16407.507258,7293.482225,20332.647956,7872.00308,7872.00308,8975.236223,8975.236223,8947.904129,10362.390784,10712.852237,20553.844072,13359.569636,16778.529133,10924.1483,10924.1483,10639.015593,10639.015593,11275.423766,17803.449508,12638.212665,6376.584514,8361.244601,7880.973222,2803.392445,6265.143971,4467.979057,5629.937126,5629.937126,5073.282951,10435.047131,24269.400387,6508.028297,8948.447585,10862.792376,11622.458188,11251.607852,11251.607852,11251.607852,10907.995305,3747.772386,3747.772386,18169.233286,9736.674556,9574.674578,13541.260796,17014.739856,22889.160772,19071.209876,11440.652635,18949.042774,8912.818112,8912.818112,16369.754678,9418.022913,9418.022913,4224.276884,10990.366093,6291.029394,23649.133651,10267.975878,10267.975878,11700.13225,11860.493588,15289.575027,15289.575027,17257.087791,4107.935529,2378.506133,7950.284687,12579.674157,6983.947969,8275.21208,14406.069721,14428.259312,8678.25904,12015.096335,14166.027088,15298.647232,19922.492557,15608.00911,10809.940927,10339.981725


In [14]:
# Names of every column whose label contains '_distance' (the per-station
# distance columns created above); the count is printed as a sanity check.
has_distance_tag = main_data_df.columns.str.contains('_distance', regex=False)
distance_cols = main_data_df.columns[has_distance_tag].tolist()
print(len(distance_cols))

292


In [15]:
# Row-wise minimum over all station distance columns = distance to the closest station.
main_data_df['distance_nearest_station'] = main_data_df.loc[:, distance_cols].min(axis='columns')

In [16]:
# Label of the smallest distance column per row, reduced to the text before the
# first underscore. This strips the '_distance' suffix and also any '_0'/'_1'
# style suffix, so duplicated station entries collapse to one station name.
main_data_df['nearest_station_name'] = (
    main_data_df[distance_cols]
    .idxmin(axis='columns')
    .map(lambda column_label: column_label.partition('_')[0])
)

In [17]:
# The per-station distance columns were only needed to derive the two
# nearest-station features above, so remove them again.
main_data_df = main_data_df.drop(columns=distance_cols)

In [18]:
# Preview the frame after the distance columns were reduced to two features.
main_data_df.head()

Unnamed: 0,전월세매매구분,전용면적(㎡),계약일,금액(만원),년,월,건물종류,지번주소,도로명주소,X좌표,Y좌표,구,동,건물연식_engi,distance_nearest_station,nearest_station_name
0,매매,77.75,8,57000.0,2013,9,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,25년_이하,834.751415,매봉역
1,매매,77.75,16,57000.0,2013,12,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,25년_이하,834.751415,매봉역
2,매매,77.75,29,55000.0,2014,1,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,30년_이하,834.751415,매봉역
3,매매,77.75,10,59500.0,2014,11,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,30년_이하,834.751415,매봉역
4,매매,77.75,28,75000.0,2016,3,아파트,서울특별시 강남구 개포동 655-2,서울특별시 강남구 언주로 103-0,960473.743425,1942553.0,강남구,개포동,30년_이하,834.751415,매봉역


In [19]:
# Price per unit of floor area: '금액(만원)' divided by '전용면적(㎡)'.
# A zero area yields inf (or NaN for 0/0) rather than raising.
main_data_df['가격/면적'] = main_data_df['금액(만원)'] / main_data_df['전용면적(㎡)']

In [20]:
# Shorten the area column name by dropping the '(㎡)' unit suffix (modifies the frame in place).
main_data_df.rename(columns={'전용면적(㎡)':'전용면적'}, inplace=True)

In [21]:
# Continuous time feature: years elapsed since the smallest '년' in the data,
# plus the month as a fraction of a year (month 1 contributes 0).
# NOTE(review): the baseline year is derived from the data and is not saved by
# this notebook; confirm that downstream code uses the same baseline.
main_data_df['year_linear'] = (
    main_data_df['년']
    .sub(main_data_df['년'].min())
    .add(main_data_df['월'].sub(1).div(12))
)

In [22]:
def month_circular_sine_func(x):
    """Return the sine component of a cyclical encoding of month number x.

    The month is shifted so that x == 1 maps to angle 0, and twelve months
    span one full turn (2*pi).
    """
    fraction_of_year = (x - 1) / 12
    return math.sin(2 * math.pi * fraction_of_year)

def month_circular_cosine_func(x):
    """Return the cosine component of a cyclical encoding of month number x.

    The month is shifted so that x == 1 maps to angle 0, and twelve months
    span one full turn (2*pi).
    """
    fraction_of_year = (x - 1) / 12
    return math.cos(2 * math.pi * fraction_of_year)

In [23]:
# Encode '월' as a point on the unit circle so that month 12 and month 1 end up adjacent.
main_data_df['month_sin'] = main_data_df['월'].map(month_circular_sine_func)
main_data_df['month_cos'] = main_data_df['월'].map(month_circular_cosine_func)

In [24]:
# '월' is now represented by month_sin / month_cos / year_linear; '계약일' is
# removed in the same step.
main_data_df = main_data_df.drop(columns=['월', '계약일'])

In [25]:
# Bounding box of all remaining coordinates; used below to centre X/Y.
xmin, xmax = main_data_df['X좌표'].agg(['min', 'max'])
ymin, ymax = main_data_df['Y좌표'].agg(['min', 'max'])

In [26]:
# Persist the coordinate bounding box as a one-row CSV so the same centring can
# be reproduced outside this notebook. The output directory is created first:
# to_csv does not create missing parent directories and would otherwise fail
# on a fresh checkout.
coor_minmax_path = './Training/training_data_ver_17/coor_minmax.csv'
os.makedirs(os.path.dirname(coor_minmax_path), exist_ok=True)

coor_minmax_df = pd.DataFrame({'xmax':xmax, 'xmin':xmin, 'ymax':ymax, 'ymin':ymin}, index=[0])
coor_minmax_df.to_csv(coor_minmax_path, index=False)
coor_minmax_df

Unnamed: 0,xmax,xmin,ymax,ymin
0,971728.508488,938060.786948,1965830.0,1937374.0


In [27]:
# Centre the coordinates on the midpoint of the bounding box. Despite the
# name, 'xnorm'/'ynorm' are only shifted, not rescaled.
main_data_df['xnorm'] = main_data_df['X좌표'] - xmin - (xmax - xmin) / 2
main_data_df['ynorm'] = main_data_df['Y좌표'] - ymin - (ymax - ymin) / 2

# Derived terms, created in the order x_2nd, y_2nd, x_sin, y_sin, x_cos, y_cos
# so that the resulting column order is unchanged.
# NOTE(review): sin/cos are applied to the centred offsets in raw coordinate
# units, so their period is 2*pi of those units -- confirm this is intended.
coordinate_transforms = (
    ('2nd', lambda offset: offset**2 / 2),
    ('sin', np.sin),
    ('cos', np.cos),
)
for suffix, transform in coordinate_transforms:
    for axis in ('x', 'y'):
        main_data_df['{}_{}'.format(axis, suffix)] = transform(main_data_df[axis + 'norm'])

# Interaction term between the two centred coordinates.
main_data_df['xy'] = main_data_df['xnorm'] * main_data_df['ynorm']

In [28]:
# The raw coordinates are superseded by the centred/derived coordinate features.
main_data_df = main_data_df.drop(columns=['X좌표', 'Y좌표'])

In [29]:
# Combine '건물종류' and '전월세매매구분' into a single '<건물종류>_<전월세매매구분>'
# category, then discard the two source columns.
main_data_df['trade_type'] = main_data_df['건물종류'].add('_').add(main_data_df['전월세매매구분'])
main_data_df = main_data_df.drop(columns=['건물종류', '전월세매매구분'])

In [30]:
%%time
# Load the land-specification table and discard the columns not used as features.
land_specs_df = (
    pd.read_csv(basedir + 'land_specs_ver_3.csv')
    .drop(columns=['토지면적', '공시지가', '용도지역명2'])
)
land_specs_df.shape

Wall time: 2.4 s


(1563645, 8)

In [31]:
# Attach the land specifications to each transaction by address and year.
# This is an inner join, so transactions without a matching land-spec row are
# dropped. NOTE(review): duplicate ('지번주소', '년') keys in land_specs_df
# would duplicate transactions -- confirm the keys are unique.
merge1_df = pd.merge(main_data_df, land_specs_df, how='inner', on=['지번주소', '년'])
merge1_df.shape

(4356650, 30)

In [32]:
# Replace a fixed set of special characters in every column name with '_'
# (presumably so the names are safe to embed in file names, as the feature-map
# export does). Each listed character maps to '_' in a single pass.
unsafe_chars = ':/\\*?"|<>ㆍ ·-~'
sanitize_table = str.maketrans(dict.fromkeys(unsafe_chars, '_'))
merge1_df.columns = [col.translate(sanitize_table) for col in merge1_df.columns]

In [33]:
# Every land-spec column other than the join keys (and '토지면적') is treated
# as a categorical feature.
excluded_cols = ('지번주소', '년', '토지면적')
ls_catcols = [col for col in land_specs_df.columns if col not in excluded_cols]
ls_catcols

['지목명', '용도지역명1', '토지이동상황', '지형높이', '지형형상', '도로접면']

In [34]:
# '년' has served its purpose as a join key (time is carried by year_linear);
# '도로명주소' is removed in the same step.
merge1_df = merge1_df.drop(columns=['년', '도로명주소'])

In [35]:
# Preview the merged frame before the categorical columns are integer-encoded.
merge1_df.head()

Unnamed: 0,전용면적,금액(만원),지번주소,구,동,건물연식_engi,distance_nearest_station,nearest_station_name,가격_면적,year_linear,month_sin,month_cos,xnorm,ynorm,x_2nd,y_2nd,x_sin,y_sin,x_cos,y_cos,xy,trade_type,지목명,용도지역명1,토지이동상황,지형높이,지형형상,도로접면
0,77.75,57000.0,서울특별시 강남구 개포동 655-2,강남구,개포동,25년_이하,834.751415,매봉역,733.118971,7.666667,-0.866025,-0.5,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,아파트_매매,대,제3종일반주거지역,아파트,평지,세로장방,광대세각
1,77.75,57000.0,서울특별시 강남구 개포동 655-2,강남구,개포동,25년_이하,834.751415,매봉역,733.118971,7.916667,-0.5,0.866025,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,아파트_매매,대,제3종일반주거지역,아파트,평지,세로장방,광대세각
2,77.75,35000.0,서울특별시 강남구 개포동 655-2,강남구,개포동,25년_이하,834.751415,매봉역,450.160772,7.0,0.0,1.0,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,아파트_전세,대,제3종일반주거지역,아파트,평지,세로장방,광대세각
3,77.75,32000.0,서울특별시 강남구 개포동 655-2,강남구,개포동,25년_이하,834.751415,매봉역,411.575563,7.166667,0.866025,0.5,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,아파트_전세,대,제3종일반주거지역,아파트,평지,세로장방,광대세각
4,77.75,37000.0,서울특별시 강남구 개포동 655-2,강남구,개포동,25년_이하,834.751415,매봉역,475.884244,7.666667,-0.866025,-0.5,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,아파트_전세,대,제3종일반주거지역,아파트,평지,세로장방,광대세각


In [36]:
# Full list of categorical feature columns: the ones engineered in this
# notebook followed by the land-spec categoricals.
cat_cols = [
    'trade_type',
    '건물연식_engi',
    'nearest_station_name',
    '구',
    '동',
    *ls_catcols,
]
cat_cols

['trade_type',
 '건물연식_engi',
 'nearest_station_name',
 '구',
 '동',
 '지목명',
 '용도지역명1',
 '토지이동상황',
 '지형높이',
 '지형형상',
 '도로접면']

In [37]:
# Integer-encode every categorical column and export the category -> code
# table for each one.
#
# Codes are assigned in order of first appearance (Series.unique order).
# The lookup CSV keeps its index column, exactly as before, so existing readers
# of these files are unaffected.
# NOTE: this cell is not idempotent -- running it twice re-encodes the integer
# codes and overwrites the exported maps.
feature_map_dir = './Training/training_data_ver_17/feature_maps'
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(feature_map_dir, exist_ok=True)

for col in tqdm(cat_cols):
    # Compute the distinct categories once and reuse them for both columns.
    categories = merge1_df[col].unique()
    col_map_df = pd.DataFrame({'cat': categories, 'map': list(range(len(categories)))})
    col_map = dict(zip(col_map_df['cat'], col_map_df['map']))
    merge1_df[col] = merge1_df[col].map(col_map)
    col_map_df.to_csv('{}/{}_map.csv'.format(feature_map_dir, col))

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:08<00:00,  1.26it/s]


In [38]:
# Collect numeric columns that carry no information because every non-missing
# value is identical.
#
# The previous version tested `merge3_df`, a name that does not exist in this
# notebook; the resulting NameError was swallowed by a bare `except`, so the
# check silently never ran and the list was always empty. The check now runs
# against merge1_df and skips non-numeric columns explicitly instead of hiding
# all errors.
cols_to_drop = []
for col in tqdm(merge1_df.columns):
    column = merge1_df[col]
    # Non-numeric columns (e.g. the raw address string) have no spread to test.
    if not pd.api.types.is_numeric_dtype(column):
        continue
    # min == max is an exact constancy test. `std() == 0` can miss constant
    # float columns because of rounding in the mean. An all-NaN column compares
    # NaN == NaN -> False and is therefore kept, as with the std-based test.
    if column.min() == column.max():
        cols_to_drop.append(col)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<?, ?it/s]


In [39]:
# Number of columns flagged for removal.
len(cols_to_drop)

0

In [40]:
# Names of the columns flagged for removal.
cols_to_drop

[]

In [41]:
# Remove every column listed in cols_to_drop (a no-op when the list is empty).
merge1_df = merge1_df.drop(columns=cols_to_drop)

In [42]:
# Preview the final frame; the categorical columns now hold integer codes.
merge1_df.head()

Unnamed: 0,전용면적,금액(만원),지번주소,구,동,건물연식_engi,distance_nearest_station,nearest_station_name,가격_면적,year_linear,month_sin,month_cos,xnorm,ynorm,x_2nd,y_2nd,x_sin,y_sin,x_cos,y_cos,xy,trade_type,지목명,용도지역명1,토지이동상황,지형높이,지형형상,도로접면
0,77.75,57000.0,서울특별시 강남구 개포동 655-2,0,0,0,834.751415,0,733.118971,7.666667,-0.866025,-0.5,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,0,0,0,0,0,0,0
1,77.75,57000.0,서울특별시 강남구 개포동 655-2,0,0,0,834.751415,0,733.118971,7.916667,-0.5,0.866025,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,0,0,0,0,0,0,0
2,77.75,35000.0,서울특별시 강남구 개포동 655-2,0,0,0,834.751415,0,450.160772,7.0,0.0,1.0,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,1,0,0,0,0,0,0
3,77.75,32000.0,서울특별시 강남구 개포동 655-2,0,0,0,834.751415,0,411.575563,7.166667,0.866025,0.5,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,1,0,0,0,0,0,0
4,77.75,37000.0,서울특별시 강남구 개포동 655-2,0,0,0,834.751415,0,475.884244,7.666667,-0.866025,-0.5,5579.095707,-9048.956121,15563150.0,40941800.0,-0.364267,-0.920469,0.931294,0.390815,-50484990.0,1,0,0,0,0,0,0


In [43]:
%%time
# Write the encoded training table to CSV without the index column.
# The target directory must already exist; to_csv does not create it.
merge1_df.to_csv('./Training/training_data_ver_17/training_data_ver_17_nocut_subway.csv', index=False)

Wall time: 58.8 s


In [44]:
# Keep the categorical-column list in sync with the frame: forget any column
# that was dropped. The list is updated in place (entries in cat_cols are unique).
dropped_names = set(cols_to_drop)
cat_cols[:] = [name for name in cat_cols if name not in dropped_names]

In [45]:
# One-column table ('colname') listing the categorical feature names, for export.
cat_cols_df = pd.Series(cat_cols, name='colname').to_frame()
print(cat_cols_df.shape)
cat_cols_df.head()

(11, 1)


Unnamed: 0,colname
0,trade_type
1,건물연식_engi
2,nearest_station_name
3,구
4,동


In [46]:
# Save the list of categorical column names next to the training data (no index column).
cat_cols_df.to_csv('./Training/training_data_ver_17/cat_cols.csv', index=False)