In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
def landnum_modifier(x):
    """Normalise a lot-number string to the two-part ``'<main>-<sub>'`` form.

    Parameters
    ----------
    x : str
        Lot number, either ``'<main>'`` or ``'<main>-<sub>'``.

    Returns
    -------
    str
        ``x`` with ``'-0'`` appended when it contains no hyphen, otherwise
        ``x`` unchanged.

    Raises
    ------
    ValueError
        If ``x`` contains more than one hyphen.
    """
    n_parts = len(x.split('-'))
    if n_parts == 1:
        return x + '-0'
    if n_parts == 2:
        return x
    # Include the offending value so a bad record can be located in the data.
    raise ValueError("unexpected lot number format (more than one '-'): {!r}".format(x))

In [3]:
def create_land_specs_df(basedir='./토지특성정보/'):
    """Load every ``AL_*.csv`` file under ``basedir`` into one DataFrame.

    Files are read as EUC-KR, concatenated, and exact duplicate rows are
    dropped. A ``'지번주소'`` column is built from ``'법정동명'`` and the
    normalised ``'지번'`` (see ``landnum_modifier``); those two source columns
    are then removed.

    Parameters
    ----------
    basedir : str, optional
        Directory that holds the CSV files.

    Returns
    -------
    pandas.DataFrame
        Combined table sorted by ``'지번주소'`` and ``'기준년도'``.

    Raises
    ------
    ValueError
        If ``basedir`` contains no matching CSV file.
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # concatenation order (and therefore the resulting index and the order of
    # tied rows after sorting) reproducible across machines.
    filenames = sorted(
        f for f in os.listdir(basedir) if f.endswith('.csv') and 'AL_' in f
    )
    if not filenames:
        raise ValueError("no 'AL_*.csv' files found in {!r}".format(basedir))

    # NOTE(review): the recorded run shows a warning while loading; if it is a
    # DtypeWarning, consider passing explicit dtype= to read_csv — TODO confirm.
    frames = [
        pd.read_csv(os.path.join(basedir, name), encoding='euc-kr')
        for name in filenames
    ]

    df = pd.concat(frames).drop_duplicates().reset_index(drop=True)

    df['지번주소'] = df['법정동명'] + ' ' + df['지번'].apply(landnum_modifier)
    df = df.drop(columns=['법정동명', '지번'])

    return df.sort_values(['지번주소', '기준년도'])

In [4]:
# Build the combined table and report its size before filtering.
land_specs_df = create_land_specs_df()
print(land_specs_df.shape)
# Keep only rows whose '대장구분명' is '일반' or '산'.
land_specs_df = land_specs_df[land_specs_df['대장구분명'].isin(['일반', '산'])]
land_specs_df.shape

  if (await self.run_code(code, result,  async_=asy)):


(7515555, 25)


(7483019, 25)

In [5]:
# Preview the first five rows of the filtered table.
land_specs_df.head(n=5)

Unnamed: 0,고유번호,법정동코드,대장구분코드,대장구분명,토지일련번호,기준년도,기준월,지목코드,지목명,토지면적,용도지역코드1,용도지역명1,용도지역코드2,용도지역명2,토지이용상황코드,토지이동상황,지형높이코드,지형높이,지형형상코드,지형형상,도로접면코드,도로접면,공시지가,데이터기준일자,지번주소
5931139,1168010300101000000,1168010300,1.0,일반,5961,2013,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,325000,2017-10-17,서울특별시 강남구 개포동 100-0
5931140,1168010300101000000,1168010300,1.0,일반,5960,2014,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,330000,2017-10-17,서울특별시 강남구 개포동 100-0
5931141,1168010300101000000,1168010300,1.0,일반,5954,2015,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,335000,2017-10-17,서울특별시 강남구 개포동 100-0
5931142,1168010300101000000,1168010300,1.0,일반,5987,2016,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,345000,2017-10-17,서울특별시 강남구 개포동 100-0
5931143,1168010300101000000,1168010300,1.0,일반,5964,2017,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,355000,2018-05-23,서울특별시 강남구 개포동 100-0


In [6]:
# Rows registered under the '산' ledger. An explicit copy is taken because the
# '지번주소' column of this frame is overwritten later; without it the
# assignment targets a selection of land_specs_df and relies on pandas'
# chained-assignment warning being silenced.
san_df = land_specs_df[land_specs_df['대장구분명'] == '산'].copy()

In [7]:
def modify_san_addr(x):
    """Prefix the lot number (last space-separated token) of an address with '산'.

    Parameters
    ----------
    x : str or missing value
        Address whose final token is the lot number, e.g.
        ``'서울특별시 강남구 개포동 100-0'``.

    Returns
    -------
    str or missing value
        The address with ``'산'`` inserted directly before the last token.
        Missing values (anything ``pd.isna`` accepts) are returned unchanged.
    """
    if pd.isna(x):
        return x
    # Split on the LAST space only, so addresses with any number of leading
    # region tokens are handled instead of assuming exactly four tokens.
    region, sep, lot = x.rpartition(' ')
    return region + sep + '산' + lot

In [8]:
# Rewrite every address in san_df so that its lot number carries the '산' prefix.
san_df['지번주소'] = san_df['지번주소'].map(modify_san_addr)

In [9]:
# Write the rewritten addresses back onto the matching rows of the main table.
# Rows are matched by index label: san_df was selected from land_specs_df, so
# both frames share the same labels for these rows.
land_specs_df.loc[san_df.index, '지번주소'] = san_df['지번주소']

In [10]:
# Shorten the year column name from '기준년도' to '년' (modifies the frame in place).
land_specs_df.rename({'기준년도': '년'}, axis='columns', inplace=True)

In [11]:
# Keep only the first row for each (address, ledger type, year) combination.
land_specs_df = land_specs_df[
    ~land_specs_df.duplicated(subset=['지번주소', '대장구분명', '년'], keep='first')
]
land_specs_df.shape

(7422744, 25)

In [12]:
# Narrow the table to the address, the year and the attribute columns used downstream.
selected_df = land_specs_df.loc[:, [
    '지번주소', '년', '지목명', '토지면적', '용도지역명1', '용도지역명2',
    '토지이동상황', '지형높이', '지형형상', '도로접면',
]]
print(selected_df.shape)

(7422744, 10)


In [13]:
# Preview the first five rows of the column subset.
selected_df.head(n=5)

Unnamed: 0,지번주소,년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면
5931139,서울특별시 강남구 개포동 100-0,2013,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
5931140,서울특별시 강남구 개포동 100-0,2014,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
5931141,서울특별시 강남구 개포동 100-0,2015,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
5931142,서울특별시 강남구 개포동 100-0,2016,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
5931143,서울특별시 강남구 개포동 100-0,2017,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지


In [14]:
# Load the prepared main dataset, report its size and preview it.
main_df = pd.read_csv(
    filepath_or_buffer='./prepped_data/메인_데이터_20060101_20210409_ver_3.csv'
)
print(main_df.shape)
main_df.head(n=5)

(1911851, 17)


Unnamed: 0,지번주소,도로명,건물(단지)명,전용면적(㎡),거래금액(만원),층,건축년도,계약날짜기준_건물연식,계약날짜,해제사유발생일,부동산유형,전용면적_classes,가격_면적,가격_면적_classes,도로명주소,X좌표,Y좌표
0,서울특별시 강남구 개포동 1264-3,개포로31길 23-7,(1264-3),53.28,11500.0,-1,1992.0,14.0,2006-08-29,,연립다세대,50~55,215.840841,210~220,서울특별시 강남구 개포로31길 23-7,960076.154238,1942459.0
1,서울특별시 강남구 개포동 171-13,선릉로14길 11-0,(171-13),68.08,56500.0,2,1988.0,18.0,2006-12-20,,연립다세대,65~70,829.905993,820~830,서울특별시 강남구 선릉로14길 11-0,961145.46232,1942843.0
2,서울특별시 강남구 개포동 171-13,선릉로14길 11-0,(171-13),44.8,23300.0,-1,1988.0,28.0,2016-01-16,,연립다세대,40~45,520.089286,520~530,서울특별시 강남구 선릉로14길 11-0,961145.46232,1942843.0
3,서울특별시 강남구 개포동 171-13,선릉로14길 11-0,(171-13),44.8,24500.0,-1,1988.0,28.0,2016-04-09,,연립다세대,40~45,546.875,540~550,서울특별시 강남구 선릉로14길 11-0,961145.46232,1942843.0
4,서울특별시 강남구 개포동 171-13,선릉로14길 11-0,(171-13),68.08,60000.0,2,1988.0,31.0,2019-10-23,,연립다세대,65~70,881.316099,880~890,서울특별시 강남구 선릉로14길 11-0,961145.46232,1942843.0


In [15]:
# Keep only the land records whose address also occurs in the main dataset.
in_main_data = selected_df['지번주소'].isin(main_df['지번주소'].unique())
selected_df2 = selected_df.loc[in_main_data]
print(selected_df2.shape)

(681515, 10)


In [16]:
def create_complete_land_plans_df_iteratively(df):
    """Give every address one row for each year from 2006 through 2021.

    For each distinct ``'지번주소'`` the years missing from ``'년'`` are added as
    new rows. The address's rows are then sorted by year and every column is
    forward-filled and afterwards backward-filled, so an added year takes the
    values of the nearest earlier year, or of the earliest later year when no
    earlier one exists.

    Addresses that already cover all years are passed through untouched
    (their existing missing values, if any, are not filled).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'지번주소'`` and ``'년'``; all other columns
        are treated as attributes to be filled.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``df`` with a fresh ``RangeIndex``. An empty input
        yields an empty frame with the same columns.
    """
    # Imported locally so the function is self-contained when it is sent to a
    # worker process (presumably required by the multiprocess pool — confirm).
    import gc
    import pandas as pd

    all_years = range(2006, 2022)

    completed = []
    # groupby visits each address once instead of re-scanning the whole frame
    # per address; sort=False keeps addresses in order of first appearance.
    for n_done, (addr, addr_df) in enumerate(df.groupby('지번주소', sort=False)):
        if n_done % 5000 == 0:
            gc.collect()

        present_years = set(addr_df['년'].tolist())
        missing_years = [y for y in all_years if y not in present_years]
        if not missing_years:
            completed.append(addr_df)
            continue

        # Only the key columns are supplied; concat leaves every other column
        # missing on these rows, ready to be filled from neighbouring years.
        filler = pd.DataFrame({'지번주소': addr, '년': missing_years})

        filled = (
            pd.concat([addr_df, filler])
            .sort_values(['년'])
            .reset_index(drop=True)
            .ffill()
            .bfill()
        )
        completed.append(filled)

    if not completed:
        # pd.concat([]) raises; return an empty frame with the input's columns.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(completed).reset_index(drop=True)

In [17]:
def parallelize(df, func, num_processors=6):
    """Apply ``func`` to address-disjoint chunks of ``df`` in worker processes.

    The distinct ``'지번주소'`` values are split into contiguous groups, one per
    worker, so that all rows of an address land in the same chunk. The chunk
    results are concatenated and sorted by ``'지번주소'`` and ``'년'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'지번주소'`` and ``'년'``.
    func : callable
        Called with one chunk (a DataFrame); must return a DataFrame.
    num_processors : int, optional
        Upper bound on the number of worker processes.

    Returns
    -------
    pandas.DataFrame
        Combined result with a fresh ``RangeIndex``.
    """
    unique_addrs = df['지번주소'].unique()
    if len(unique_addrs) == 0:
        # Nothing to distribute; avoid handing empty chunks to the workers.
        return df.sort_values(['지번주소', '년']).reset_index(drop=True)

    # Never use more workers than addresses: otherwise the chunk size would be
    # zero and all but the last worker would receive an empty frame.
    n_workers = max(1, min(num_processors, len(unique_addrs)))
    chunk_size = len(unique_addrs) // n_workers

    data_split = []
    for i in range(n_workers):
        start = chunk_size * i
        # The last chunk also takes the remainder of the division.
        stop = chunk_size * (i + 1) if i < n_workers - 1 else None
        data_split.append(df[df['지번주소'].isin(unique_addrs[start:stop])])

    pool = mp.Pool(n_workers)
    try:
        results = pool.map(func, data_split)
    finally:
        # Release the worker processes even when a worker raised.
        pool.close()
        pool.join()

    return pd.concat(results).sort_values(['지번주소', '년']).reset_index(drop=True)

In [18]:
%%time
completed_df = parallelize(selected_df2, create_complete_land_plans_df_iteratively)
print(completed_df.shape)
completed_df.head()

(1369024, 10)
Wall time: 3min 54s


Unnamed: 0,지번주소,년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면
0,서울특별시 강남구 개포동 1164-0,2006,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)
1,서울특별시 강남구 개포동 1164-0,2007,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)
2,서울특별시 강남구 개포동 1164-0,2008,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)
3,서울특별시 강남구 개포동 1164-0,2009,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)
4,서울특별시 강남구 개포동 1164-0,2010,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)


In [19]:
# Persist the completed table without writing the index column.
completed_df.to_csv(path_or_buf='./prepped_data/land_specs_ver_6.csv', index=False)