In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
prac_df = pd.read_csv('living_area_per_room.csv')
print(prac_df.shape)
prac_df.head()

(918093, 9)


Unnamed: 0,지번주소,기준년월,관리_건축물대장_PK,기준년월_최초사용승인일,주소별_최초생성일자,주소별_기준년월_총세대수합_최대,전체연면적,세대별연면적,outlier
0,서울특별시 강남구 개포동 1163-4,201412,11680-100218729,20130307.0,20130313,20,540.44,27.022,-1
1,서울특별시 강남구 개포동 1163-4,201512,11680-100218729,20130307.0,20130313,20,540.44,27.022,-1
2,서울특별시 강남구 개포동 1163-4,201612,11680-100218729,20130307.0,20130313,20,540.44,27.022,-1
3,서울특별시 강남구 개포동 1163-4,201712,11680-100218729,20130307.0,20130313,20,540.44,27.022,-1
4,서울특별시 강남구 개포동 1163-4,201812,11680-100218729,20130307.0,20130313,20,540.44,27.022,-1


In [3]:
prac_df.isna().sum()

지번주소                     0
기준년월                     0
관리_건축물대장_PK              0
기준년월_최초사용승인일         22250
주소별_최초생성일자               0
주소별_기준년월_총세대수합_최대        0
전체연면적                    0
세대별연면적                   0
outlier                  0
dtype: int64

In [4]:
prac_df['기준년월_최초사용승인일'].fillna(prac_df['주소별_최초생성일자'], inplace=True)
prac_df.isna().sum()

지번주소                 0
기준년월                 0
관리_건축물대장_PK          0
기준년월_최초사용승인일         0
주소별_최초생성일자           0
주소별_기준년월_총세대수합_최대    0
전체연면적                0
세대별연면적               0
outlier              0
dtype: int64

In [5]:
prac_df.drop(columns=['관리_건축물대장_PK', '주소별_최초생성일자', 'outlier'], inplace=True)
prac_df.head()

Unnamed: 0,지번주소,기준년월,기준년월_최초사용승인일,주소별_기준년월_총세대수합_최대,전체연면적,세대별연면적
0,서울특별시 강남구 개포동 1163-4,201412,20130307.0,20,540.44,27.022
1,서울특별시 강남구 개포동 1163-4,201512,20130307.0,20,540.44,27.022
2,서울특별시 강남구 개포동 1163-4,201612,20130307.0,20,540.44,27.022
3,서울특별시 강남구 개포동 1163-4,201712,20130307.0,20,540.44,27.022
4,서울특별시 강남구 개포동 1163-4,201812,20130307.0,20,540.44,27.022


In [6]:
prac_df.drop_duplicates(inplace=True)
prac_df.shape

(736090, 6)

In [7]:
prac_df.head()

Unnamed: 0,지번주소,기준년월,기준년월_최초사용승인일,주소별_기준년월_총세대수합_최대,전체연면적,세대별연면적
0,서울특별시 강남구 개포동 1163-4,201412,20130307.0,20,540.44,27.022
1,서울특별시 강남구 개포동 1163-4,201512,20130307.0,20,540.44,27.022
2,서울특별시 강남구 개포동 1163-4,201612,20130307.0,20,540.44,27.022
3,서울특별시 강남구 개포동 1163-4,201712,20130307.0,20,540.44,27.022
4,서울특별시 강남구 개포동 1163-4,201812,20130307.0,20,540.44,27.022


In [8]:
prac_df['기준년도'] = prac_df['기준년월'].apply(lambda x: int(str(x)[:4]))
prac_df['사용승인년도'] = prac_df['기준년월_최초사용승인일'].apply(lambda x: int(str(x)[:4]))
prac_df.drop(columns=['기준년월', '기준년월_최초사용승인일'], inplace=True)

In [9]:
def create_full_df_iteratively(df):
    # df is a pandas dataframe
    
    import pandas as pd
    import numpy as np
    import gc
    
    df = df.copy()
    
    dfs_list = []
    count = 0
    for addr in df['지번주소'].unique():
        addr_df = df[df['지번주소'] == addr]
        
        min_record_year = addr_df['기준년도'].min()
        max_record_year = addr_df['기준년도'].max()
        unique_record_years = addr_df['기준년도'].unique()

        unique_approv_years = sorted(addr_df['사용승인년도'].unique().tolist())
        min_approved_year = addr_df['사용승인년도'].min()
        max_approved_year = addr_df['사용승인년도'].max()

        if min_approved_year <= 2006:
            min_year = 2006
        else:
            min_year = min_approved_year

        years_for_concat = [y for y in range(min_year, 2021) if y not in unique_record_years]
        if years_for_concat == []:
            dfs_list.append(addr_df)
            continue

        rows_list = []
        for y in years_for_concat:
            empty_row = addr_df.iloc[[0],:]
            empty_row['기준년도'] = y
            empty_row.iloc[0, 2:] = np.nan
            rows_list.append(empty_row)

        empty_df = pd.concat(rows_list)

        addr_concat_df = pd.concat([addr_df, empty_df]).sort_values(['기준년도']).reset_index(drop=True)

        addr_concat_df.fillna(method='ffill', inplace=True)
        addr_concat_df.fillna(method='bfill', inplace=True)
        
        dfs_list.append(addr_concat_df)
        count += 1
        if count % 5000 == 0:
            gc.collect()
    
    concat_df = pd.concat(dfs_list).reset_index(drop=True)
    
    return concat_df

In [10]:
def parallelize(df, func, num_processors=6):
    # df is a pandas dataframe
    # func is the function to use
    # num_processors is the number of cpu cores
    
    data_split = []
    unique_addrs = df['지번주소'].unique()
    divided = int(len(unique_addrs) / num_processors)
    for i in range(num_processors):
        if i < (num_processors - 1):
            picked_addrs = unique_addrs[divided*i:divided*(i+1)]
        else:
            picked_addrs = unique_addrs[divided*i:]
        data_split.append(df[df['지번주소'].isin(picked_addrs)])
            
    pool = mp.Pool(num_processors)
    concat_df = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return concat_df

In [11]:
%%time
completed_df = parallelize(prac_df, create_full_df_iteratively)
print(completed_df.shape)
completed_df.head()

(1475121, 6)
Wall time: 6min 45s


Unnamed: 0,지번주소,주소별_기준년월_총세대수합_최대,전체연면적,세대별연면적,기준년도,사용승인년도
0,서울특별시 강남구 개포동 1163-4,20,540.44,27.022,2014.0,2013.0
1,서울특별시 강남구 개포동 1163-4,20,540.44,27.022,2015.0,2013.0
2,서울특별시 강남구 개포동 1163-4,20,540.44,27.022,2016.0,2013.0
3,서울특별시 강남구 개포동 1163-4,20,540.44,27.022,2017.0,2013.0
4,서울특별시 강남구 개포동 1163-4,20,540.44,27.022,2018.0,2013.0


In [12]:
completed_df.to_csv('./prepped_data/living_area_per_room_full.csv', index=False)