In [2]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [3]:
%%time
# Load the baseline land-spec table; its columns include the parcel address
# (지번주소) and the year (년), as the preview below shows.
land_specs_df = pd.read_csv(filepath_or_buffer='./prepped_data/land_specs_baseline.csv')
print(land_specs_df.shape)
land_specs_df.head(n=5)

(7422744, 10)
Wall time: 11.3 s


Unnamed: 0,지번주소,년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면
0,서울특별시 강남구 개포동 100-0,2013,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
1,서울특별시 강남구 개포동 100-0,2014,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
2,서울특별시 강남구 개포동 100-0,2015,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
3,서울특별시 강남구 개포동 100-0,2016,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
4,서울특별시 강남구 개포동 100-0,2017,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지


In [4]:
# Load the main dataset; only its 지번주소 column is used later in this
# notebook, to decide which land-spec rows to keep.
main_df = pd.read_csv(filepath_or_buffer='./prepped_data/메인_데이터_20060101_20210416_ver_4.csv')
print(main_df.shape)
main_df.head(n=5)

(1480907, 18)


Unnamed: 0,지번주소,도로명주소,건물(단지)명,전용면적(㎡),거래금액(만원),층,건축년도,계약날짜기준_건물연식,계약날짜,건물연식,부동산유형,전용면적_classes,가격_면적,가격_면적_classes,X좌표,Y좌표,구,동
0,서울특별시 강남구 개포동 1264-3,서울특별시 강남구 개포로31길 23-7,(1264-3),53.28,11500,-1,1992.0,14.0,2006-08-29,29.0,연립다세대,50~55,215.840841,210~220,960076.154238,1942459.0,강남구,개포동
1,서울특별시 강남구 개포동 171-13,서울특별시 강남구 선릉로14길 11-0,(171-13),68.08,56500,2,1988.0,18.0,2006-12-20,33.0,연립다세대,65~70,829.905993,820~830,961145.46232,1942843.0,강남구,개포동
2,서울특별시 강남구 개포동 171-13,서울특별시 강남구 선릉로14길 11-0,(171-13),44.8,23300,-1,1988.0,28.0,2016-01-16,33.0,연립다세대,40~45,520.089286,520~530,961145.46232,1942843.0,강남구,개포동
3,서울특별시 강남구 개포동 171-13,서울특별시 강남구 선릉로14길 11-0,(171-13),44.8,24500,-1,1988.0,28.0,2016-04-09,33.0,연립다세대,40~45,546.875,540~550,961145.46232,1942843.0,강남구,개포동
4,서울특별시 강남구 개포동 171-13,서울특별시 강남구 선릉로14길 11-0,(171-13),68.08,60000,2,1988.0,31.0,2019-10-23,33.0,연립다세대,65~70,881.316099,880~890,961145.46232,1942843.0,강남구,개포동


In [5]:
# Keep only the land-spec rows whose parcel address also occurs in main_df.
selected_df = land_specs_df.loc[
    land_specs_df['지번주소'].isin(main_df['지번주소'].unique())
]
print(selected_df.shape)

(554766, 10)


In [6]:
def create_complete_land_plans_df_iteratively(df, start_year=2006, end_year=2021):
    """Give every parcel address a row for each year in ``start_year..end_year``.

    For each distinct value of ``지번주소`` the years already present in ``년``
    are compared with the requested window. Missing years are added as new
    rows whose other columns start out as NaN; the address's rows are then
    sorted by year and NaNs are filled from the previous year (forward fill)
    and, for leading gaps, from the next year (backward fill).

    Notes on behaviour kept from the original implementation:
      * Addresses that already contain every requested year are passed
        through untouched (not re-sorted, pre-existing NaNs not filled).
      * Rows whose year lies outside the window are kept, not dropped.
      * For addresses that do get filler rows, pre-existing NaNs are filled
        as well, because the fill runs over the whole per-address frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``지번주소`` (address) and ``년`` (year).
        The input frame is not modified.
    start_year, end_year : int, optional
        Inclusive year window every address must cover. The defaults
        (2006 and 2021) reproduce the previously hard-coded range.

    Returns
    -------
    pandas.DataFrame
        All per-address frames concatenated in order of first appearance of
        each address, with a fresh 0..n-1 index. An empty input yields an
        empty frame with the same columns.
    """
    # Imported inside the function, as in the original cell -- presumably so
    # the function is self-contained when it is sent to a worker process.
    import gc

    import pandas as pd

    if df.empty:
        # pd.concat([]) raises ValueError; an empty chunk has nothing to fill.
        return df.copy().reset_index(drop=True)

    required_years = range(start_year, end_year + 1)
    completed_frames = []

    # A single groupby pass replaces re-scanning the whole frame with a
    # boolean mask for every address; sort=False keeps first-appearance order.
    grouped = df.groupby('지번주소', sort=False)
    for position, (addr, addr_df) in enumerate(grouped):
        # enumerate() advances for every address, so the periodic collection
        # really runs once per 5000 addresses.
        if position and position % 5000 == 0:
            gc.collect()

        present_years = set(addr_df['년'].tolist())
        missing_years = [year for year in required_years if year not in present_years]
        if not missing_years:
            completed_frames.append(addr_df)
            continue

        # Filler rows carry only address and year; pd.concat leaves every
        # other column NaN, independent of the column order of ``df``.
        filler = pd.DataFrame({'지번주소': addr, '년': missing_years})
        filled = (
            pd.concat([addr_df, filler], ignore_index=True)
            .sort_values('년', kind='mergesort')  # stable: deterministic on ties
            .reset_index(drop=True)
            .ffill()
            .bfill()
        )
        completed_frames.append(filled)

    if not completed_frames:
        # Only reachable when every address is NaN (groupby drops NaN keys).
        return df.iloc[0:0].copy().reset_index(drop=True)

    return pd.concat(completed_frames).reset_index(drop=True)

In [7]:
def parallelize(df, func, num_processors=6):
    """Apply ``func`` to address-wise chunks of ``df`` and merge the results.

    The distinct values of ``지번주소`` are split into at most
    ``num_processors`` near-equal groups, so all rows of one address always
    land in the same chunk. Each chunk is passed to ``func`` and the returned
    frames are concatenated, sorted by ``지번주소`` then ``년``, and re-indexed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``지번주소`` and ``년``.
    func : callable
        Takes one chunk (a DataFrame) and returns a DataFrame.
    num_processors : int, optional
        Upper bound on the number of chunks and worker processes.

    Returns
    -------
    pandas.DataFrame
        Combined output of ``func`` over all chunks, sorted by address and
        year with a fresh index. If ``df`` has no addresses, ``func`` is not
        called and an empty copy of ``df`` is returned.
    """
    unique_addrs = np.asarray(df['지번주소'].unique())
    if len(unique_addrs) == 0:
        # Nothing to dispatch; avoid handing empty frames to ``func``.
        return df.sort_values(['지번주소', '년']).reset_index(drop=True)

    # Never create more chunks than addresses: that would produce empty
    # chunks (which ``func`` may not handle) and idle worker processes.
    n_chunks = min(num_processors, len(unique_addrs))
    data_split = [
        df[df['지번주소'].isin(addr_chunk)]
        for addr_chunk in np.array_split(unique_addrs, n_chunks)
    ]

    if n_chunks == 1:
        # A single chunk gains nothing from a pool; run it in-process.
        results = [func(data_split[0])]
    else:
        pool = mp.Pool(n_chunks)
        try:
            results = pool.map(func, data_split)
        finally:
            # Release the workers even when ``func`` raises in a worker.
            pool.close()
            pool.join()

    concat_df = pd.concat(results)
    return concat_df.sort_values(['지번주소', '년']).reset_index(drop=True)

In [8]:
%%time
# Fill in the missing years for every selected address, spreading the work
# across worker processes via parallelize().
completed_df = parallelize(df=selected_df, func=create_complete_land_plans_df_iteratively)
print(completed_df.shape)
completed_df.head(n=5)

(1114560, 10)
Wall time: 2min 53s


Unnamed: 0,지번주소,년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면
0,서울특별시 강남구 개포동 1164-0,2006,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)
1,서울특별시 강남구 개포동 1164-0,2007,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)
2,서울특별시 강남구 개포동 1164-0,2008,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)
3,서울특별시 강남구 개포동 1164-0,2009,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)
4,서울특별시 강남구 개포동 1164-0,2010,대,330.4,제2종일반주거지역,지정되지않음,상업용,평지,세로장방,세로한면(가)


In [9]:
# Save the completed table; index=False keeps the row index out of the file.
completed_df.to_csv(path_or_buf='./prepped_data/land_specs_ver_20060101_20210416.csv', index=False)