In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
def landnum_modifier(x):
    """Normalize a lot-number string to the two-part ``main-sub`` form.

    Parameters
    ----------
    x : str
        Lot number written either as ``'main'`` or as ``'main-sub'``.

    Returns
    -------
    str
        ``x + '-0'`` when ``x`` contains no hyphen, or ``x`` unchanged when
        it contains exactly one hyphen.

    Raises
    ------
    ValueError
        If ``x`` contains more than one hyphen. The message includes the
        offending value so a failure inside a large ``apply`` can be traced.
    """
    hyphen_count = x.count('-')
    if hyphen_count == 0:
        return x + '-0'
    if hyphen_count == 1:
        return x
    raise ValueError(
        'unexpected lot number format (more than one "-"): {!r}'.format(x)
    )

In [3]:
def create_land_specs_df(basedir='./토지특성정보/'):
    """Load every matching land-characteristics CSV in ``basedir`` into one frame.

    A file is used when its name ends with ``.csv`` and contains ``AL_``.
    Each file is read as EUC-KR, the frames are concatenated, exact duplicate
    rows are dropped, and a ``지번주소`` column is built as
    ``법정동명 + ' ' + landnum_modifier(지번)``. The two source columns are
    then removed.

    Parameters
    ----------
    basedir : str, optional
        Directory holding the CSV files. Defaults to the path the notebook
        originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The combined rows sorted by ``지번주소`` then ``기준년도``. The index
        is reset before sorting, so labels are unique but not in row order.

    Raises
    ------
    ValueError
        If no matching CSV file is found in ``basedir``.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the
    # concatenated row order reproducible from run to run.
    filenames = sorted(
        f for f in os.listdir(basedir) if f.endswith('.csv') and 'AL_' in f
    )
    if not filenames:
        raise ValueError(
            "no 'AL_*.csv' files found in {!r}".format(basedir)
        )

    frames = [
        pd.read_csv(os.path.join(basedir, name), encoding='euc-kr')
        for name in filenames
    ]

    # NOTE(review): the head() output shown in this notebook lists both a
    # '기준년도' and a '기준연도' column, which suggests the source files do not
    # agree on the year column name -- confirm, and coalesce them if so.
    combined = pd.concat(frames).drop_duplicates().reset_index(drop=True)

    combined['지번주소'] = (
        combined['법정동명'] + ' ' + combined['지번'].apply(landnum_modifier)
    )
    combined = combined.drop(columns=['법정동명', '지번'])

    return combined.sort_values(['지번주소', '기준년도'])

In [4]:
land_specs_df = create_land_specs_df()
print(land_specs_df.shape)
# Keep only the rows whose '대장구분명' is '일반' or '산'.
land_specs_df = land_specs_df[land_specs_df['대장구분명'].isin(['일반', '산'])]
land_specs_df.shape

  if (await self.run_code(code, result,  async_=asy)):


(14987366, 26)


(14922293, 26)

In [5]:
# Preview the first five rows of the filtered frame.
land_specs_df.head(5)

Unnamed: 0,고유번호,법정동코드,대장구분코드,대장구분명,토지일련번호,기준년도,기준월,지목코드,지목명,토지면적,용도지역코드1,용도지역명1,용도지역코드2,용도지역명2,토지이용상황코드,토지이동상황,지형높이코드,지형높이,지형형상코드,지형형상,도로접면코드,도로접면,공시지가,데이터기준일자,기준연도,지번주소
5931139,1168010300101000000,1168010300,1.0,일반,5961,2013.0,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,325000,2017-10-17,,서울특별시 강남구 개포동 100-0
5931140,1168010300101000000,1168010300,1.0,일반,5960,2014.0,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,330000,2017-10-17,,서울특별시 강남구 개포동 100-0
5931141,1168010300101000000,1168010300,1.0,일반,5954,2015.0,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,335000,2017-10-17,,서울특별시 강남구 개포동 100-0
5931142,1168010300101000000,1168010300,1.0,일반,5987,2016.0,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,345000,2017-10-17,,서울특별시 강남구 개포동 100-0
5931143,1168010300101000000,1168010300,1.0,일반,5964,2017.0,1,1,전,876.0,43,자연녹지지역,0,지정되지않음,510,전,3,완경사,7,부정형,12,맹지,355000,2018-05-23,,서울특별시 강남구 개포동 100-0


In [6]:
# Take an explicit, independent copy of the '산' rows. The slice is assigned
# into afterwards, and this notebook silences pandas' chained-assignment
# warning globally, so an accidental write to a temporary would go unnoticed.
san_df = land_specs_df[land_specs_df['대장구분명'] == '산'].copy()

In [7]:
def modify_san_addr(x):
    """Prefix the last space-separated token of an address with '산'.

    The original implementation rebuilt the address from exactly the first
    four tokens. For an address with more than four tokens it dropped the
    tail and put '산' on the wrong token. Splitting at the last space gives
    the same result for four-token addresses and handles any token count.

    Parameters
    ----------
    x : str or missing value
        Address whose final token is the lot number.

    Returns
    -------
    str or missing value
        ``x`` unchanged when ``pd.isna(x)`` is true; otherwise ``x`` with
        '산' inserted directly before its last token. A string without any
        space is treated as a lone token and simply gets the prefix.
    """
    if pd.isna(x):
        return x

    # rpartition splits at the LAST space: everything before it is kept
    # verbatim, and only the final token receives the prefix.
    head, sep, lot = x.rpartition(' ')
    return head + sep + '산' + lot

In [8]:
# Rewrite every address in san_df through modify_san_addr, element by element.
san_df['지번주소'] = san_df['지번주소'].map(modify_san_addr)

In [9]:
# Copy the addresses held in san_df back into land_specs_df. The target rows
# are selected by san_df's index labels, which san_df carries over from
# land_specs_df because it was sliced out of that frame.
land_specs_df.loc[san_df.index, '지번주소'] = san_df['지번주소']

In [10]:
# Rename the '기준년도' column to '년'; land_specs_df is modified in place.
land_specs_df.rename({'기준년도': '년'}, axis='columns', inplace=True)

In [11]:
# Rows count as duplicates when '지번주소', '대장구분명' and '년' all match;
# only the first occurrence of each combination is kept (in place).
dedup_keys = ['지번주소', '대장구분명', '년']
land_specs_df.drop_duplicates(subset=dedup_keys, keep='first', inplace=True)
land_specs_df.shape

(8405705, 26)

In [12]:
# Columns carried over into selected_df, in output order.
baseline_columns = [
    '지번주소', '년', '지목명', '토지면적',
    '용도지역명1', '용도지역명2', '토지이동상황',
    '지형높이', '지형형상', '도로접면',
]
selected_df = land_specs_df[baseline_columns]
print(selected_df.shape)

(8405705, 10)


In [13]:
# Preview the first five rows of the column subset.
selected_df.head(5)

Unnamed: 0,지번주소,년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면
5931139,서울특별시 강남구 개포동 100-0,2013.0,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
5931140,서울특별시 강남구 개포동 100-0,2014.0,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
5931141,서울특별시 강남구 개포동 100-0,2015.0,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
5931142,서울특별시 강남구 개포동 100-0,2016.0,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
5931143,서울특별시 강남구 개포동 100-0,2017.0,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지


In [14]:
%%time
selected_df.to_csv('./prepped_data/land_specs_baseline.csv', index=False)

Wall time: 20.9 s
