In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
def landnum_modifier(x):
    # x is a string
    
    if pd.isna(x) == True:
        return x
    else:
        splitted = x.split('-')
        if len(splitted) == 1:
            return x + '-0'
        else:
            return x

In [3]:
def create_land_specs_df():
    basedir = './토지특성정보/경기도/'
    filenames = [f for f in os.listdir(basedir) if (f.endswith('.csv'))&('AL_' in f)]
    
    dfs_list = []
    for file in filenames:
        df = pd.read_csv(basedir + file, encoding='euc-kr')
        dfs_list.append(df)
    
    df = pd.concat(dfs_list).drop_duplicates().reset_index(drop=True)
    
    df['지번주소'] = df['법정동명'] + ' ' + df['지번'].apply(landnum_modifier)
    df.drop(columns=['법정동명', '지번'], inplace=True)
    
    return df.sort_values(['지번주소', '기준년도'])

In [4]:
land_specs_df = create_land_specs_df()
print(land_specs_df.shape)
land_specs_df = land_specs_df[(land_specs_df['대장구분명'] == '일반')|(land_specs_df['대장구분명'] == '산')]
land_specs_df.shape

  if (await self.run_code(code, result,  async_=asy)):


(35909968, 25)


(35711880, 25)

In [5]:
land_specs_df.head()

Unnamed: 0,고유번호,법정동코드,대장구분코드,대장구분명,토지일련번호,기준년도,기준월,지목코드,지목명,토지면적,용도지역코드1,용도지역명1,용도지역코드2,용도지역명2,토지이용상황코드,토지이동상황,지형높이코드,지형높이,지형형상코드,지형형상,도로접면코드,도로접면,공시지가,데이터기준일자,지번주소
32427361,4182025030100010000,4182025030,1,일반,21885,2013,1,5,임야,1861.0,62,보전관리지역,0,지정되지않음,710,조림,4,급경사,7,부정형,8,세로한면(가),11800,2017-10-17,경기도 가평군 가평읍 개곡리 1-0
32443451,4182025030200010000,4182025030,2,산,23785,2013,1,5,임야,10017.0,71,농림지역,0,지정되지않음,720,자연림,4,급경사,7,부정형,12,맹지,1390,2017-10-17,경기도 가평군 가평읍 개곡리 1-0
32427362,4182025030100010000,4182025030,1,일반,22340,2014,1,5,임야,1861.0,62,보전관리지역,0,지정되지않음,710,조림,4,급경사,7,부정형,8,세로한면(가),13100,2017-10-17,경기도 가평군 가평읍 개곡리 1-0
32443452,4182025030200010000,4182025030,2,산,24256,2014,1,5,임야,10017.0,71,농림지역,0,지정되지않음,720,자연림,4,급경사,7,부정형,12,맹지,1490,2017-10-17,경기도 가평군 가평읍 개곡리 1-0
32427363,4182025030100010000,4182025030,1,일반,22656,2015,1,5,임야,1861.0,62,보전관리지역,0,지정되지않음,710,조림,4,급경사,7,부정형,8,세로한면(가),13600,2017-10-17,경기도 가평군 가평읍 개곡리 1-0


In [6]:
san_df = land_specs_df[land_specs_df['대장구분명'] == '산']

In [7]:
def modify_san_addr(x):
    # x is a string
    
    if pd.isna(x) == True:
        return x
    else:
        splitted = x.split(' ')
        return splitted[0] + ' ' + splitted[1] + ' ' + splitted[2] + ' ' + '산' + splitted[3]

In [8]:
san_df['지번주소'] = san_df['지번주소'].apply(modify_san_addr)

In [9]:
land_specs_df.loc[san_df.index, '지번주소'] = san_df['지번주소']

In [10]:
land_specs_df.rename(columns={'기준년도':'년'}, inplace=True)

In [11]:
land_specs_df.drop_duplicates(subset=['지번주소', '대장구분명', '년'], keep='first', inplace=True)
land_specs_df.shape

(32868339, 25)

In [12]:
selected_df = land_specs_df[['지번주소', '년', '지목명', '토지면적', '용도지역명1', '용도지역명2', '토지이동상황',
                           '지형높이', '지형형상', '도로접면']]
print(selected_df.shape)

(32868339, 10)


In [16]:
selected_df = selected_df.sort_values(['지번주소', '년']).reset_index(drop=True)

In [17]:
selected_df.head()

Unnamed: 0,지번주소,년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면
0,경기도 가평군 가평읍 개곡리 1-0,2013,임야,1861.0,보전관리지역,지정되지않음,조림,급경사,부정형,세로한면(가)
1,경기도 가평군 가평읍 개곡리 1-0,2014,임야,1861.0,보전관리지역,지정되지않음,조림,급경사,부정형,세로한면(가)
2,경기도 가평군 가평읍 개곡리 1-0,2015,임야,1861.0,보전관리지역,지정되지않음,조림,급경사,부정형,세로한면(가)
3,경기도 가평군 가평읍 개곡리 1-0,2016,임야,1861.0,보전관리지역,지정되지않음,조림,급경사,부정형,세로한면(가)
4,경기도 가평군 가평읍 개곡리 1-0,2017,임야,1861.0,보전관리지역,지정되지않음,조림,급경사,부정형,세로한면(가)


In [18]:
%%time
selected_df.to_csv('./prepped_data/land_specs_baseline_경기도.csv', index=False)

Wall time: 2min 7s
