In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [None]:
%%time
# Read the land plans CSV into land_plans_df, then report its shape and
# display the first rows as the cell output.
land_plans_df = pd.read_csv('./토지이용계획정보/original_land_plans.csv')
print(land_plans_df.shape)
land_plans_df.head()

In [3]:
# Swap each of these characters for '_' in every column name.
# NOTE(review): presumably done so the names can be embedded in file names
# by the feature-map export further down -- confirm.
_underscore_table = str.maketrans(dict.fromkeys(':/\\*?"|<>ㆍ ·-~', '_'))
land_plans_df.columns = [name.translate(_underscore_table) for name in land_plans_df.columns]

In [4]:
# Every column other than the address key and the year is treated as a
# categorical feature column.
_key_columns = ['지번주소', '년']
cat_cols = land_plans_df.columns.drop(_key_columns)
cat_cols[:5]

Index(['(한강)오염행위_제한지역', '(한강)폐기물매립시설_설치제한지역', '4대문안', '가로구역별_최고높이_제한지역',
       '가스공급설비'],
      dtype='object')

In [5]:
# Integer-encode every categorical column in place (codes follow the order in
# which values first appear) and save each column's category -> code table.
# NOTE: this cell overwrites land_plans_df and the map files; re-running it on
# an already-encoded frame replaces the saved maps with int -> int tables.
feature_map_dir = './토지이용계획정보/feature_maps'
os.makedirs(feature_map_dir, exist_ok=True)  # to_csv does not create missing directories

for col in tqdm(cat_cols):
    # Compute the distinct values once; they define both the lookup dict and
    # the table that is written to disk.
    categories = land_plans_df[col].unique()
    codes = range(len(categories))
    col_map_df = pd.DataFrame({'cat': categories, 'map': codes})
    col_map = dict(zip(categories, codes))
    land_plans_df[col] = land_plans_df[col].map(col_map)
    col_map_df.to_csv('{}/{}_map.csv'.format(feature_map_dir, col))

100%|████████████████████████████████████████████████████████████████████████████████| 353/353 [23:39<00:00,  4.02s/it]


In [6]:
# Write the integer-encoded frame to CSV without the row index.
land_plans_df.to_csv('./토지이용계획정보/original_land_plans_mapped.csv', index=False)

In [7]:
# Summarise index range, column count, dtypes and memory usage of the encoded frame.
land_plans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5149472 entries, 0 to 5149471
Columns: 355 entries, 지번주소 to 환경정비구역
dtypes: int64(354), object(1)
memory usage: 13.6+ GB


In [None]:
def create_complete_land_plans_df_iteratively(df, start_year=2006, end_year=2020):
    """Give every address one row for each year in ``start_year..end_year``.

    For each distinct value of the '지번주소' column, the years of the range
    that are absent from the '년' column are added as new rows. The other
    columns of a new row are filled from that address's existing rows:

    * years before the address's earliest recorded year are back-filled
      from the following row;
    * later years are forward-filled from the preceding row.

    Addresses that already cover the whole range are passed through
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '지번주소' and '년'; the position of these
        columns does not matter.
    start_year, end_year : int, optional
        Inclusive bounds of the year range to complete (default 2006-2020).

    Returns
    -------
    pandas.DataFrame
        Per-address blocks (in order of first appearance of each address),
        each sorted by '년' when rows were added, with a fresh RangeIndex.
        An empty frame with the same columns is returned when there is
        nothing to process. Rows whose '지번주소' is missing are not
        included, because ``groupby`` skips missing keys by default.
    """
    # Imported inside the function rather than relying on notebook globals.
    # NOTE(review): presumably so the function still works after being
    # serialised to worker processes -- confirm before hoisting.
    import pandas as pd
    import gc

    wanted_years = range(start_year, end_year + 1)
    completed_blocks = []

    # One pass over the frame instead of a boolean scan per address.
    # sort=False keeps addresses in order of first appearance.
    for position, (addr, addr_df) in enumerate(df.groupby('지번주소', sort=False)):
        # Periodic collection, driven by the loop position so it fires once
        # every 5000 addresses regardless of which branch is taken below.
        if position % 5000 == 0:
            gc.collect()

        present_years = set(addr_df['년'].tolist())
        missing_years = [year for year in wanted_years if year not in present_years]
        if not missing_years:
            completed_blocks.append(addr_df)
            continue

        min_year = addr_df['년'].min()

        # Placeholder rows: address and year are set, every other column is NaN.
        placeholder_df = pd.DataFrame(
            {'지번주소': addr, '년': missing_years}
        ).reindex(columns=df.columns)

        timeline_df = (
            pd.concat([addr_df, placeholder_df])
            .sort_values(['년'])
            .reset_index(drop=True)
        )

        # Both halves include the earliest recorded year, so placeholders on
        # either side can copy from it; the duplicate row is dropped below.
        up_to_first_df = timeline_df[timeline_df['년'] <= min_year].bfill()
        from_first_df = timeline_df[timeline_df['년'] >= min_year].ffill()

        filled_df = (
            pd.concat([up_to_first_df, from_first_df])
            .drop_duplicates()
            .sort_values(['년'])
            .reset_index(drop=True)
        )
        completed_blocks.append(filled_df)

    if not completed_blocks:
        # pd.concat([]) raises, so hand back an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(completed_blocks).reset_index(drop=True)

In [None]:
def parallelize(df, func, num_processors=6):
    """Apply ``func`` to address-wise chunks of ``df`` in a process pool.

    The distinct values of the '지번주소' column are divided into at most
    ``num_processors`` contiguous, near-equal groups. Each chunk holds all
    rows of the addresses in its group, so one address is never split
    across workers. The per-chunk results are concatenated and sorted by
    '지번주소' and '년'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '지번주소' and '년'.
    func : callable
        Called once per chunk with a DataFrame and expected to return a
        DataFrame. It is handed to ``pool.map``, so it must be transferable
        to worker processes.
    num_processors : int, optional
        Upper bound on the number of chunks and worker processes (default 6).

    Returns
    -------
    pandas.DataFrame
        Concatenated results sorted by '지번주소' then '년', with a fresh
        RangeIndex. For an empty ``df`` the (empty) frame is returned
        without starting a pool or calling ``func``.

    Raises
    ------
    ValueError
        If ``num_processors`` is smaller than 1.
    """
    if num_processors < 1:
        raise ValueError('num_processors must be at least 1')

    unique_addrs = df['지번주소'].unique()
    if len(unique_addrs) == 0:
        # Nothing to distribute; skip the pool entirely.
        return df.sort_values(['지번주소', '년']).reset_index(drop=True)

    # Never create more chunks than there are addresses, so no worker is
    # handed an empty frame.
    num_chunks = min(num_processors, len(unique_addrs))
    base_size, remainder = divmod(len(unique_addrs), num_chunks)

    data_split = []
    start = 0
    for chunk_index in range(num_chunks):
        # The first `remainder` chunks take one extra address each.
        stop = start + base_size + (1 if chunk_index < remainder else 0)
        picked_addrs = unique_addrs[start:stop]
        data_split.append(df[df['지번주소'].isin(picked_addrs)])
        start = stop

    pool = mp.Pool(num_chunks)
    try:
        results = pool.map(func, data_split)
    finally:
        # Release the workers even when a chunk raises.
        pool.close()
        pool.join()

    concat_df = pd.concat(results)
    concat_df = concat_df.sort_values(['지번주소', '년']).reset_index(drop=True)

    return concat_df

In [None]:
%%time
# Run create_complete_land_plans_df_iteratively over land_plans_df through
# parallelize (default number of processes), then report the result's shape
# and display its first rows.
completed_df = parallelize(land_plans_df, create_complete_land_plans_df_iteratively)
print(completed_df.shape)
completed_df.head()

In [None]:
# Write the completed frame to CSV without the row index.
completed_df.to_csv('./prepped_data/land_plans_ver_3.csv', index=False)