In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
def bon_bun_allocator(x):
    """Return the main lot number of a lot string as an int.

    The main number is whatever precedes the first '-' in ``x``; a string
    without '-' is converted as a whole.
    """
    main_part, _, _ = x.partition('-')
    return int(main_part)

def bu_bun_allocator(x):
    """Return the sub lot number of a lot string as an int.

    'main-sub' yields ``int(sub)``; a string without '-' yields 0.  Any other
    shape prints 'error' and yields None.
    """
    parts = x.split('-')
    n_parts = len(parts)

    if n_parts == 1:
        # No sub number present.
        return 0
    if n_parts == 2:
        return int(parts[1])

    # More than one '-' is not an expected format: report it and give up.
    print('error')
    return None

In [3]:
def land_plans_full_address_allocator(df):
    """Add a '지번주소' column of the form '<법정동명> <main>-<sub>' and return the frame.

    The main and sub numbers are parsed from the '지번' column with
    bon_bun_allocator and bu_bun_allocator.  The column is written onto the
    frame that was passed in, and that same frame is returned.
    """
    lot_numbers = df['지번']
    main_text = lot_numbers.apply(bon_bun_allocator).apply(str)
    sub_text = lot_numbers.apply(bu_bun_allocator).apply(str)

    lot_label = main_text + '-' + sub_text
    df['지번주소'] = df['법정동명'] + ' ' + lot_label

    return df

In [4]:
def land_plan_unstack(df):
    """Reshape long rows into one row per (지번주소, 년).

    The innermost index level, 용도지역지구명, is moved into the columns, so
    every remaining data column is repeated once per distinct 용도지역지구명.
    The result has MultiIndex columns.
    """
    index_keys = ['지번주소', '년', '용도지역지구명']
    wide = df.set_index(index_keys).unstack(level=-1)
    return wide.reset_index()

In [5]:
def san_addr_modifier(x):
    """Prefix the lot number of an address string with '산'.

    The address is expected to look like '<district name> <lot>', where the lot
    is the text after the last space.  Splitting on the last space (instead of
    indexing a fixed number of tokens) keeps district names of any length
    intact.

    Parameters
    ----------
    x : str or missing value
        Address string.  Missing values (None / NaN) are returned unchanged.

    Returns
    -------
    str or missing value
        The address with '산' inserted directly before the lot.

    Raises
    ------
    ValueError
        If ``x`` contains no space, so no lot part can be identified.
    """
    if pd.isna(x):
        return x

    district, separator, lot = x.rpartition(' ')
    if not separator:
        raise ValueError('address has no lot part to prefix: %r' % (x,))

    return district + ' 산' + lot

In [6]:
basedir = './토지이용계획정보/'
# os.listdir returns entries in arbitrary order.  Sort them so the files are
# always loaded in the same order: a later cell de-duplicates with keep='first',
# which makes the result depend on row order.
filenames = sorted(f for f in os.listdir(basedir) if f.endswith('.csv') and 'AL_' in f)
filenames

['AL_11_D155_20160919.csv',
 'AL_11_D155_20171104.csv',
 'AL_11_D155_20181214.csv',
 'AL_11_D155_20191207.csv',
 'AL_11_D155_20201205.csv',
 'AL_11_D155_20210306.csv']

In [7]:
# Only these columns are used downstream; parsing just them keeps memory down.
land_plan_cols = ['법정동명', '대장구분명', '지번', '저촉여부', '용도지역지구명', '등록일자']

dfs_list = []
for file in tqdm(filenames):
    df = pd.read_csv(
        os.path.join(basedir, file),
        encoding='euc-kr',
        usecols=land_plan_cols,
        # The lot number is split on '-' later, so it must always be text.
        dtype={'지번': str},
    )
    # usecols keeps the file's column order; re-select to fix the order.
    df = df[land_plan_cols]
    dfs_list.append(df)

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:56<00:00, 19.49s/it]


In [8]:
# Drop the loop variable's reference to the last loaded frame (the frame itself
# is still held by dfs_list), then run a garbage-collection pass.
del df
gc.collect()

46

In [9]:
# Stack all yearly snapshots into one frame with a fresh 0..n-1 index.
concat_df = pd.concat(dfs_list, ignore_index=True)
print(concat_df.shape)
concat_df.head()

(50300540, 6)


Unnamed: 0,법정동명,대장구분명,지번,저촉여부,용도지역지구명,등록일자
0,서울특별시 종로구 청운동,토지대장,1,포함,대공방어협조구역(위탁고도:54-236m),2009-12-29
1,서울특별시 종로구 청운동,토지대장,1,저촉,상대정화구역,2009-12-28
2,서울특별시 종로구 청운동,토지대장,1,저촉,상대정화구역,2009-12-28
3,서울특별시 종로구 청운동,토지대장,1,포함,도시지역,2009-12-28
4,서울특별시 종로구 청운동,토지대장,1,저촉,자연경관지구,2009-12-28


In [10]:
# Keep only the text before the first '-' of the registration date (the year).
# Missing dates stay NaN instead of becoming the string 'nan', so the dropna()
# in the next cell can remove them.
concat_df['등록일자'] = (
    concat_df['등록일자'].astype(str).str.split('-').str[0]
    .where(concat_df['등록일자'].notna())
)
concat_df = concat_df.rename(columns={'등록일자': '년'})

In [11]:
# Remove rows with any missing value, then exact duplicate rows, and renumber.
concat_df = (
    concat_df
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [12]:
# Run a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

20

In [13]:
# Number of distinct 용도지역지구명 values; each becomes a column after the unstack.
concat_df['용도지역지구명'].nunique()

353

In [14]:
# Preview the first 50 distinct 용도지역지구명 values.
concat_df['용도지역지구명'].unique()[:50]

array(['대공방어협조구역(위탁고도:54-236m)', '상대정화구역', '도시지역', '자연경관지구', '역사문화미관지구',
       '도로', '제1종일반주거지역', '가축사육제한구역', '자연녹지지역', '과밀억제권역', '공원',
       '문화재보존영향 검토대상구역', '통제보호구역(민통선이남:300m)', '공익용산지', '문화재보호구역',
       '국가지정문화재구역', '제1종지구단위계획구역', '제한보호구역(후방지역:500m)', '임업용산지', '개발제한구역',
       '기타교통시설', '시도지정문화재구역', '비오톱1등급', '학교', '최고고도지구', '문화재', '절대정화구역',
       '제2종일반주거지역', '등록문화재구역', '주차장', '도시기타용도지역지구용도지역지구', '도시관리계획 입안중',
       '제1종전용주거지역', '정비구역', '사회복지시설', '공공청사', '문화시설', '방화지구', '노외주차장',
       '제3종일반주거지역', '중심지미관지구', '일반상업지역', '4대문안', '도시철도', '교통광장', '녹지',
       '시·도 생태·경관보전지역', '전기공급설비', '소로2류(폭 8M~10M)', '경관녹지'], dtype=object)

In [15]:
# Add the '지번주소' column ('<법정동명> <main>-<sub>') to every row.
concat_df = land_plans_full_address_allocator(concat_df)

In [16]:
# Rows from the 임야대장 register get '산' put in front of their lot number.
# Work on an explicit copy of just the address column so that nothing is written
# into a slice of concat_df and the other columns are not duplicated in memory.
san_df = concat_df.loc[concat_df['대장구분명'] == '임야대장', ['지번주소']].copy()
san_df['지번주소'] = san_df['지번주소'].apply(san_addr_modifier)
concat_df.loc[san_df.index, '지번주소'] = san_df['지번주소']

simple_cols = ['지번주소', '저촉여부', '용도지역지구명', '년']

concat_df = concat_df[simple_cols]
# One row per (address, zone, year) is required for the unstack that follows;
# when several rows share the key, the first one encountered is kept.
concat_df = concat_df.drop_duplicates(subset=['지번주소', '용도지역지구명', '년'], keep='first').reset_index(drop=True)
gc.collect()

0

In [17]:
# The 임야대장 helper frame is no longer needed; release it and collect garbage.
del san_df
gc.collect()

20

In [18]:
%%time
# Reshape to one row per (지번주소, 년) with one column per 용도지역지구명.
concat_df = land_plan_unstack(concat_df)

Wall time: 1min 48s


In [19]:
# Run a garbage-collection pass after the reshape.
gc.collect()

20

In [20]:
# Flatten the two-level column labels left by the unstack.  The first two
# columns (the former index levels) keep their level-0 label; every other
# column takes its level-1 label.
cols = concat_df.columns
correct_cols = [
    label[0] if position < 2 else label[1]
    for position, label in enumerate(cols)
]

concat_df.columns = correct_cols

In [21]:
# Order rows by address and then year, renumber the index, and collect garbage.
concat_df = concat_df.sort_values(['지번주소', '년']).reset_index(drop=True)
gc.collect()

40

In [22]:
# Persist the wide table (without the index) next to the raw input files.
concat_df.to_csv('./토지이용계획정보/original_land_plans.csv', index=False)

In [23]:
# Summarise row count, column count, dtypes and memory usage of the wide table.
concat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5149472 entries, 0 to 5149471
Columns: 355 entries, 지번주소 to 환경정비구역
dtypes: object(355)
memory usage: 13.6+ GB


In [None]:
def create_complete_land_plans_df_iteratively(df, start_year=2006, end_year=2020):
    """Give every address one row for each year in ``start_year..end_year``.

    For each distinct '지번주소', years of the target range that have no row are
    added as blank rows and then filled:

    * rows earlier than the address's first recorded year are back-filled from
      that first recorded row;
    * rows from the first recorded year onwards are forward-filled, which also
      fills missing cells of existing rows from earlier years.

    Rows whose year lies outside the target range are kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table whose first column is the address ('지번주소') and whose
        second column is the year ('년').  All columns from the third onwards
        are treated as values to fill.  Years are compared with integers, so
        '년' must hold numeric years.
    start_year, end_year : int, optional
        Inclusive bounds of the years every address must have (default
        2006-2020).

    Returns
    -------
    pandas.DataFrame
        The completed rows of all addresses with a fresh 0..n-1 index.
        Addresses appear in order of first occurrence in ``df``.  An empty
        input gives an empty frame with the same columns.  Rows whose address
        is missing are not part of any group and are therefore dropped.
    """
    # NOTE(review): the imports are kept inside the function, presumably so it is
    # self-contained when sent to a worker process -- confirm before hoisting.
    import numpy as np
    import pandas as pd
    import gc

    target_years = range(start_year, end_year + 1)

    def _complete_one_address(addr_df):
        """Return ``addr_df`` extended with filled rows for its missing years."""
        present_years = set(addr_df['년'].unique().tolist())
        missing_years = [year for year in target_years if year not in present_years]
        if not missing_years:
            return addr_df

        min_year = addr_df['년'].min()

        # Template row: keeps the address and year columns, blanks the rest.
        empty_row = addr_df.iloc[[0], :].copy()
        empty_row.iloc[:, 2:] = np.nan

        empty_rows = []
        for year in missing_years:
            row = empty_row.copy()
            row.iloc[0, 1] = year
            empty_rows.append(row)

        combined = (
            pd.concat([addr_df] + empty_rows)
            .sort_values(['년'])
            .reset_index(drop=True)
        )

        # Years before the first record copy that first record backwards;
        # later years carry the most recent known values forwards.
        up_to_first = combined[combined['년'] <= min_year].bfill()
        from_first = combined[combined['년'] >= min_year].ffill()

        # The first recorded year is in both halves; drop_duplicates removes the
        # repeated row.
        return (
            pd.concat([up_to_first, from_first])
            .drop_duplicates()
            .sort_values(['년'])
            .reset_index(drop=True)
        )

    completed = []
    # groupby hands over each address's rows in one pass instead of re-filtering
    # the whole frame per address.
    for n_done, (_, addr_df) in enumerate(df.groupby('지번주소', sort=False)):
        if n_done % 5000 == 0:
            gc.collect()
        completed.append(_complete_one_address(addr_df))

    if not completed:
        # pd.concat([]) would raise; an empty input simply has nothing to fill.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(completed).reset_index(drop=True)

In [None]:
def parallelize(df, func, num_processors=6):
    """Apply ``func`` to address-wise chunks of ``df`` in parallel and combine the results.

    The distinct '지번주소' values are divided into at most ``num_processors``
    contiguous, near-equal groups, so all rows of one address always land in
    the same chunk.  Each chunk is passed to ``func``; the returned frames are
    concatenated and sorted by address and year.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with '지번주소' and '년' columns.
    func : callable
        Takes one DataFrame chunk and returns a DataFrame that still has the
        '지번주소' and '년' columns.  It must be usable from a worker process.
    num_processors : int, optional
        Maximum number of worker processes (default 6).

    Returns
    -------
    pandas.DataFrame
        Combined result sorted by ['지번주소', '년'] with a fresh index.  If
        ``df`` has no addresses, an empty copy of ``df`` is returned and
        ``func`` is not called.

    Raises
    ------
    ValueError
        If ``num_processors`` is smaller than 1.
    """
    if num_processors < 1:
        raise ValueError('num_processors must be at least 1')

    unique_addrs = df['지번주소'].unique()
    # Never create more chunks than there are addresses; empty chunks would only
    # give func nothing to work on.
    n_chunks = min(num_processors, len(unique_addrs))
    if n_chunks == 0:
        return df.iloc[0:0].reset_index(drop=True)

    base_size, remainder = divmod(len(unique_addrs), n_chunks)
    data_split = []
    start = 0
    for chunk_no in range(n_chunks):
        # The first `remainder` chunks take one extra address each.
        stop = start + base_size + (1 if chunk_no < remainder else 0)
        picked_addrs = unique_addrs[start:stop]
        data_split.append(df[df['지번주소'].isin(picked_addrs)])
        start = stop

    if n_chunks == 1:
        # A single chunk does not need a process pool.
        results = [func(data_split[0])]
    else:
        # The context manager shuts the pool down even if a worker raises.
        with mp.Pool(n_chunks) as pool:
            results = pool.map(func, data_split)

    concat_df = pd.concat(results)
    concat_df = concat_df.sort_values(['지번주소', '년']).reset_index(drop=True)

    return concat_df

In [None]:
%%time
completed_df = parallelize(land_plans_df, create_complete_land_plans_df_iteratively)
print(completed_df.shape)
completed_df.head()

In [None]:
# to_csv does not create missing folders; make sure the output folder exists so
# the long-running result is not lost to a missing-directory error.
os.makedirs('./prepped_data', exist_ok=True)
completed_df.to_csv('./prepped_data/land_plans_ver_3.csv', index=False)