In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
# Directory that holds the raw .txt dumps and the column-spec spreadsheet.
basedir = './국토교통부_건축물대장_표제부/'
# NOTE: os.listdir returns entries in arbitrary order, so any downstream logic that
# depends on a file's position in this list is sensitive to that order.
filenames = [f for f in os.listdir(basedir) if f.endswith('.txt')]
filenames

['mart_djy_03_201412.txt',
 'mart_djy_03_201612.txt',
 'mart_djy_03_201712.txt',
 'mart_djy_03_201812.txt',
 'mart_djy_03_201912.txt',
 'mart_djy_03_202012.txt',
 'MART_DJY_TITLE_201512.txt']

In [3]:
# Load the column specification sheet; header=1 takes the second spreadsheet row as the header.
colnames_df = pd.read_excel(basedir + '데이터구조.xls', header=1)
print(colnames_df.shape)
colnames_df.head()

(77, 3)


Unnamed: 0,컬럼 한글명,데이터 타입,비고
0,관리_건축물대장_PK,VARCHAR(33),
1,대장_구분_코드,VARCHAR(1),
2,대장_구분_코드_명,VARCHAR(100),
3,대장_종류_코드,VARCHAR(1),
4,대장_종류_코드_명,VARCHAR(100),


In [4]:
# Column names in spec-sheet order; used below to label the columns of the raw files.
colnames = colnames_df['컬럼 한글명'].tolist()
print(len(colnames))
colnames[:5]

77


['관리_건축물대장_PK', '대장_구분_코드', '대장_구분_코드_명', '대장_종류_코드', '대장_종류_코드_명']

In [5]:
%%time
# Load every yearly dump into a DataFrame with consistent column names.
#
# Three column layouts occur in the raw files:
#   * 77 columns with the two seismic fields at positions 17 and 26 -> new_cols
#   * 77 columns in spec-sheet order (the 2020 file)                -> colnames
#   * 75 columns without the two seismic fields                     -> colnames_75
#
# Per-file quirks are keyed on the YYYYMM suffix of the file name rather than on the
# file's position in `filenames`, because os.listdir gives no ordering guarantee.
SEISMIC_DESIGN_COL = '내진_설계_적용_여부'
SEISMIC_CAPACITY_COL = '내진_능력'
SEISMIC_DESIGN_POS = 17
SEISMIC_CAPACITY_POS = 26

FILES_WITH_HEADER_ROW = {'201412', '201712'}  # first line is a header that must be replaced
FILES_DEFAULT_ENCODING = {'201412'}           # read with pandas' default encoding, not euc-kr

# Legacy 77-column layout: the spec names minus the last two, with the seismic fields
# inserted at their positional slots.
n_base_cols = len(colnames) - 2
new_cols = (
    colnames[:SEISMIC_DESIGN_POS]
    + [SEISMIC_DESIGN_COL]
    + colnames[SEISMIC_DESIGN_POS:SEISMIC_CAPACITY_POS - 1]
    + [SEISMIC_CAPACITY_COL]
    + colnames[SEISMIC_CAPACITY_POS - 1:n_base_cols]
)

# 75-column layout: the spec names without the two seismic fields.
colnames_75 = colnames.copy()
colnames_75.remove(SEISMIC_DESIGN_COL)
colnames_75.remove(SEISMIC_CAPACITY_COL)

dfs_list = []
for file in filenames:
    yearmonth = file.replace('.txt', '')[-6:]
    year = yearmonth[:4]
    print(yearmonth)

    if yearmonth in FILES_WITH_HEADER_ROW:
        read_kwargs = {} if yearmonth in FILES_DEFAULT_ENCODING else {'encoding': 'euc-kr'}
        # header=0 together with names= discards the file's own header row.
        df = pd.read_csv(basedir + file, sep='|', header=0, names=colnames, **read_kwargs)
        df.columns = new_cols
    else:
        df = pd.read_csv(basedir + file, sep='|', encoding='euc-kr', header=None)
        n_cols = df.shape[1]
        if n_cols == len(new_cols):
            # Only the 2020 dump follows the spec-sheet column order.
            df.columns = colnames if year == '2020' else new_cols
        elif n_cols == len(colnames_75):
            df.columns = colnames_75
        else:
            # Fail loudly instead of carrying unnamed integer columns downstream.
            raise ValueError(
                f'{file}: unexpected column count {n_cols} '
                f'(expected {len(new_cols)} or {len(colnames_75)})'
            )

    # Tag every row with the snapshot it came from.
    df['기준년월'] = int(yearmonth)

    dfs_list.append(df)

201412




201612




201712




201812




201912




202012
201512




Wall time: 6min 34s


In [6]:
# Report the (rows, columns) shape of every loaded snapshot.
for shape in [frame.shape for frame in dfs_list]:
    print(shape)

(7389045, 78)
(7563464, 78)
(7645979, 78)
(7725700, 76)
(7791811, 76)
(7837517, 78)
(7483690, 78)


In [7]:
# Bring every snapshot to the same 76-column layout: frames that still carry the two
# seismic fields (78 columns) lose them; the others are left as they are.
SEISMIC_COLS = ['내진_능력', '내진_설계_적용_여부']
for idx, n_columns in enumerate([frame.shape[1] for frame in dfs_list]):
    if n_columns == 78:
        dfs_list[idx] = dfs_list[idx].drop(columns=SEISMIC_COLS)

    print(dfs_list[idx].shape)

(7389045, 76)
(7563464, 76)
(7645979, 76)
(7725700, 76)
(7791811, 76)
(7837517, 76)
(7483690, 76)


In [8]:
# Keep only the rows whose site address mentions Seoul.
for i in range(len(dfs_list)):
    df = dfs_list[i]
    # na=False treats a missing address as "not Seoul", which replaces the former
    # chained `fillna(..., inplace=True)` (deprecated, and a no-op under Copy-on-Write).
    # regex=False makes the match a plain substring test.
    is_seoul = df['대지_위치'].str.contains('서울특별시', na=False, regex=False)
    df = df[is_seoul]
    dfs_list[i] = df

In [9]:
del df
gc.collect()

185

In [10]:
prac_df = dfs_list[0]

In [11]:
# Strip the last two characters of the primary key in this snapshot
# (presumably so keys match the other snapshots; TODO confirm).
# The vectorised .str slice leaves missing keys as NaN instead of raising.
prac_df['관리_건축물대장_PK'] = prac_df['관리_건축물대장_PK'].str[:-2]

In [12]:
dfs_list[0] = prac_df

In [13]:
# Stack all yearly snapshots into one frame and renumber the rows 0..n-1;
# the '기준년월' column set at load time records which snapshot each row came from.
concat_df = pd.concat(dfs_list).reset_index(drop=True)
print(concat_df.shape)
concat_df.head()

(4366892, 76)


Unnamed: 0,관리_건축물대장_PK,대장_구분_코드,대장_구분_코드_명,대장_종류_코드,대장_종류_코드_명,대지_위치,도로명_대지_위치,건물_명,시군구_코드,법정동_코드,대지_구분_코드,번,지,특수지_명,블록,로트,외필지_수,새주소_도로_코드,새주소_법정동_코드,새주소_지상지하_코드,새주소_본_번,새주소_부_번,동_명,주_부속_구분_코드,주_부속_구분_코드_명,대지_면적(㎡),건축_면적(㎡),건폐_율(%),연면적(㎡),용적_률_산정_연면적(㎡),용적_률(%),구조_코드,구조_코드_명,기타_구조,주_용도_코드,주_용도_코드_명,기타_용도,지붕_코드,지붕_코드_명,기타_지붕,세대_수(세대),가구_수(가구),높이(m),지상_층_수,지하_층_수,승용_승강기_수,비상용_승강기_수,부속_건축물_수,부속_건축물_면적(㎡),총_동_연면적(㎡),옥내_기계식_대수(대),옥내_기계식_면적(㎡),옥외_기계식_대수(대),옥외_기계식_면적(㎡),옥내_자주식_대수(대),옥내_자주식_면적(㎡),옥외_자주식_대수(대),옥외_자주식_면적(㎡),허가_일,착공_일,사용승인_일,허가번호_년,허가번호_기관_코드,허가번호_기관_코드_명,허가번호_구분_코드,허가번호_구분_코드_명,호_수(호),에너지효율_등급,에너지절감_율,에너지_EPI점수,친환경_건축물_등급,친환경_건축물_인증점수,지능형_건축물_등급,지능형_건축물_인증점수,생성_일자,기준년월
0,11305-100193109,2,집합,3,표제부,서울특별시 강북구 수유동 484-80번지,서울특별시강북구 삼양로77가길 36,삼일교회,11305,10300.0,0.0,484.0,80.0,,,,1,113054100000.0,10301.0,0.0,36.0,0.0,삼일교회,0.0,주건축물,393.0,233.81,59.49,761.29,761.29,193.71,21,철근콘크리트구조,철근콘크리트구조(조적조(4층)),4000,제2종근린생활시설,"근린생활시설, 공동주택(다세대주택)",10,(철근)콘크리트,(철근)콘크리트(평스라브),3,0,12.2,4,0,0,0,0,0.0,761.29,0,0.0,0,0.0,0,0.0,7,80.5,20070821.0,20070829.0,20101200.0,2007.0,3080080.0,건축과,1101.0,신축허가,0,,0.0,0,,0,,0,20101228,201412
1,11440-21639,2,집합,3,표제부,서울특별시 마포구 서교동 484-16번지,서울특별시마포구 양화로7길 61-10,,11440,12000.0,0.0,484.0,16.0,,,,0,114404100000.0,12001.0,0.0,61.0,10.0,,0.0,주건축물,177.62,105.3,59.28,393.13,385.21,216.87,21,철근콘크리트구조,철근콘크리트구조,2000,공동주택,다세대주택,10,(철근)콘크리트,(철근)콘크리트 평스라브,4,0,12.0,4,1,0,0,0,0.0,393.13,0,0.0,0,0.0,0,0.0,4,46.0,20030628.0,20030820.0,20040300.0,2003.0,3130080.0,건축과,1101.0,신축허가,0,,0.0,0,,0,,0,20101230,201412
2,11200-23602,1,일반,2,일반건축물,서울특별시 성동구 송정동 73-566번지,서울특별시성동구 송정14길 17,대한예수교장로회비전교회,11200,11800.0,0.0,73.0,566.0,,,,0,112004100000.0,11801.0,0.0,17.0,0.0,,0.0,주건축물,806.0,376.05,46.66,1185.07,797.98,99.0,21,철근콘크리트구조,"철근콘크리트,철골조",6000,종교시설,교회,10,(철근)콘크리트,"슬라브,경량철골조",0,0,13.2,2,1,0,0,0,0.0,1185.07,0,0.0,0,0.0,0,0.0,0,0.0,,,19781200.0,,,,,,0,,0.0,0,,0,,0,20110131,201412
3,11380-26906,1,일반,2,일반건축물,서울특별시 은평구 응암동 594-87번지,서울특별시은평구 응암로 163-13,,11380,10700.0,0.0,594.0,87.0,,,,0,113803000000.0,10702.0,0.0,163.0,13.0,,0.0,주건축물,0.0,0.0,0.0,246.15,0.0,0.0,11,벽돌구조,연와조,1000,단독주택,"단독주택,다가구용 (6가구)",10,(철근)콘크리트,슬라브,0,6,0.0,2,1,0,0,0,0.0,246.15,0,0.0,0,0.0,0,0.0,0,0.0,19900403.0,,19911100.0,,,,,,0,,0.0,0,,0,,0,20110105,201412
4,11110-100178925,1,일반,2,일반건축물,서울특별시 종로구 평창동 365-1번지,서울특별시종로구 평창11길 45,,11110,18300.0,0.0,365.0,1.0,,,,0,111104100000.0,18301.0,0.0,45.0,0.0,,0.0,주건축물,356.0,104.22,29.28,404.8,194.31,54.58,21,철근콘크리트구조,철근콘크리트구조,3000,제1종근린생활시설,제1종근린생활시설,10,(철근)콘크리트,(철근)콘크리트,0,0,7.9,2,1,0,0,0,0.0,404.8,0,0.0,0,0.0,1,33.29,2,40.19,20070605.0,20070616.0,20080300.0,2007.0,3000080.0,건축과,1101.0,신축허가,0,,0.0,0,,0,,0,20110101,201412


In [14]:
def correct_addr(x):
    """Normalize a lot-number address string.

    Removes every occurrence of '번지', strips leading spaces, and appends '-0'
    when the address contains no hyphen, so a lot number always has the form
    'main-sub'.

    Parameters
    ----------
    x : str or missing
        Raw address. Missing values and the placeholder '없음' are returned
        unchanged.

    Returns
    -------
    str or missing
        The normalized address. An address that is empty once '번지' and the
        leading spaces are removed is returned as an empty string instead of
        raising IndexError.
    """
    if pd.isna(x) or x == '없음':
        return x

    cleaned = x.replace('번지', '').lstrip(' ')
    if not cleaned:
        # Nothing left to normalize.
        return cleaned

    # The hyphen test looks at the whole address, not only the last token.
    if '-' not in cleaned:
        return cleaned + '-0'
    return cleaned

In [15]:
concat_df['지번주소'] = concat_df['대지_위치'].apply(correct_addr)

In [16]:
# Remove every raw code column (name ending in '_코드'); columns ending in '_코드_명'
# do not match and are kept.
code_columns = [name for name in concat_df.columns if name.endswith('_코드')]
concat_df.drop(columns=code_columns, inplace=True)
concat_df.shape

(4366892, 63)

In [19]:
# Remove the '_코드_명' substring from column names (e.g. '구조_코드_명' becomes '구조').
concat_df.columns = [col.replace('_코드_명', '') for col in concat_df.columns]

In [20]:
concat_df.head()

Unnamed: 0,관리_건축물대장_PK,대장_구분,대장_종류,대지_위치,도로명_대지_위치,건물_명,번,지,특수지_명,블록,로트,외필지_수,새주소_본_번,새주소_부_번,동_명,주_부속_구분,대지_면적(㎡),건축_면적(㎡),건폐_율(%),연면적(㎡),용적_률_산정_연면적(㎡),용적_률(%),구조,기타_구조,주_용도,기타_용도,지붕,기타_지붕,세대_수(세대),가구_수(가구),높이(m),지상_층_수,지하_층_수,승용_승강기_수,비상용_승강기_수,부속_건축물_수,부속_건축물_면적(㎡),총_동_연면적(㎡),옥내_기계식_대수(대),옥내_기계식_면적(㎡),옥외_기계식_대수(대),옥외_기계식_면적(㎡),옥내_자주식_대수(대),옥내_자주식_면적(㎡),옥외_자주식_대수(대),옥외_자주식_면적(㎡),허가_일,착공_일,사용승인_일,허가번호_년,허가번호_기관,허가번호_구분,호_수(호),에너지효율_등급,에너지절감_율,에너지_EPI점수,친환경_건축물_등급,친환경_건축물_인증점수,지능형_건축물_등급,지능형_건축물_인증점수,생성_일자,기준년월,지번주소
0,11305-100193109,집합,표제부,서울특별시 강북구 수유동 484-80번지,서울특별시강북구 삼양로77가길 36,삼일교회,484.0,80.0,,,,1,36.0,0.0,삼일교회,주건축물,393.0,233.81,59.49,761.29,761.29,193.71,철근콘크리트구조,철근콘크리트구조(조적조(4층)),제2종근린생활시설,"근린생활시설, 공동주택(다세대주택)",(철근)콘크리트,(철근)콘크리트(평스라브),3,0,12.2,4,0,0,0,0,0.0,761.29,0,0.0,0,0.0,0,0.0,7,80.5,20070821.0,20070829.0,20101200.0,2007.0,건축과,신축허가,0,,0.0,0,,0,,0,20101228,201412,서울특별시 강북구 수유동 484-80
1,11440-21639,집합,표제부,서울특별시 마포구 서교동 484-16번지,서울특별시마포구 양화로7길 61-10,,484.0,16.0,,,,0,61.0,10.0,,주건축물,177.62,105.3,59.28,393.13,385.21,216.87,철근콘크리트구조,철근콘크리트구조,공동주택,다세대주택,(철근)콘크리트,(철근)콘크리트 평스라브,4,0,12.0,4,1,0,0,0,0.0,393.13,0,0.0,0,0.0,0,0.0,4,46.0,20030628.0,20030820.0,20040300.0,2003.0,건축과,신축허가,0,,0.0,0,,0,,0,20101230,201412,서울특별시 마포구 서교동 484-16
2,11200-23602,일반,일반건축물,서울특별시 성동구 송정동 73-566번지,서울특별시성동구 송정14길 17,대한예수교장로회비전교회,73.0,566.0,,,,0,17.0,0.0,,주건축물,806.0,376.05,46.66,1185.07,797.98,99.0,철근콘크리트구조,"철근콘크리트,철골조",종교시설,교회,(철근)콘크리트,"슬라브,경량철골조",0,0,13.2,2,1,0,0,0,0.0,1185.07,0,0.0,0,0.0,0,0.0,0,0.0,,,19781200.0,,,,0,,0.0,0,,0,,0,20110131,201412,서울특별시 성동구 송정동 73-566
3,11380-26906,일반,일반건축물,서울특별시 은평구 응암동 594-87번지,서울특별시은평구 응암로 163-13,,594.0,87.0,,,,0,163.0,13.0,,주건축물,0.0,0.0,0.0,246.15,0.0,0.0,벽돌구조,연와조,단독주택,"단독주택,다가구용 (6가구)",(철근)콘크리트,슬라브,0,6,0.0,2,1,0,0,0,0.0,246.15,0,0.0,0,0.0,0,0.0,0,0.0,19900403.0,,19911100.0,,,,0,,0.0,0,,0,,0,20110105,201412,서울특별시 은평구 응암동 594-87
4,11110-100178925,일반,일반건축물,서울특별시 종로구 평창동 365-1번지,서울특별시종로구 평창11길 45,,365.0,1.0,,,,0,45.0,0.0,,주건축물,356.0,104.22,29.28,404.8,194.31,54.58,철근콘크리트구조,철근콘크리트구조,제1종근린생활시설,제1종근린생활시설,(철근)콘크리트,(철근)콘크리트,0,0,7.9,2,1,0,0,0,0.0,404.8,0,0.0,0,0.0,1,33.29,2,40.19,20070605.0,20070616.0,20080300.0,2007.0,건축과,신축허가,0,,0.0,0,,0,,0,20110101,201412,서울특별시 종로구 평창동 365-1


In [21]:
concat_df['구조'].unique()

array(['철근콘크리트구조', '벽돌구조', '일반철골구조', '기타조적구조', '일반목구조', '블록구조',
       '철골철근콘크리트구조', '경량철골구조', '철골콘크리트구조', '석구조', nan, '기타콘크리트구조',
       '강파이프구조', '프리케스트콘크리트구조', '기타강구조', '기타구조', '기타철골철근콘크리트구조', '조적구조',
       '통나무구조', '목구조', '철골철근콘크리트합성구조', '조립식판넬조', '시멘트블럭조', '라멘조',
       '스틸하우스조', '컨테이너조'], dtype=object)

In [17]:
# Drop the address/name detail columns that are no longer needed once '지번주소' exists.
detail_cols = ['대지_위치', '도로명_대지_위치', '건물_명', '번', '지', '특수지_명', '블록', '로트',
               '새주소_본_번', '새주소_부_번', '동_명']

# The two permit-number label columns carry a '_코드_명' suffix only until the rename
# cell has run. Accept either spelling so this cell works top-to-bottom as well as in
# the order it was originally executed (it used to raise KeyError after the rename).
permit_label_cols = [
    col
    for col in ('허가번호_기관_코드_명', '허가번호_구분_코드_명', '허가번호_기관', '허가번호_구분')
    if col in concat_df.columns
]

concat_df.drop(columns=detail_cols + permit_label_cols, inplace=True)

In [18]:
# Combine the three unit-count columns into a single '총세대수' column, then drop the originals.
# NOTE: `+` between Series propagates missing values, so a NaN in any of the three
# inputs makes the total NaN for that row.
concat_df['총세대수'] = concat_df['세대_수(세대)'] + concat_df['가구_수(가구)'] + concat_df['호_수(호)']
concat_df.drop(columns=['세대_수(세대)', '가구_수(가구)', '호_수(호)'], inplace=True)

In [19]:
# Continue under a new name; `del` removes only the old name, and the frame itself
# stays alive through bdinfo_df.
bdinfo_df = concat_df
del concat_df

In [21]:
def apply_int(x):
    """Convert a numeric or numeric-looking value to ``int``.

    Parameters
    ----------
    x : number, str or missing
        Missing values are returned unchanged. Strings have every space
        replaced by '0' before conversion (e.g. '2010 216' -> 20100216).

    Returns
    -------
    int or missing

    Raises
    ------
    ValueError
        If a string cannot be parsed as an integer. The offending value is the
        exception argument and the original parsing error is chained as its cause.
    """
    if pd.isna(x):
        return x

    if isinstance(x, str):
        try:
            return int(x.replace(' ', '0'))
        except ValueError as exc:
            # Surface the raw value so the bad record can be located.
            raise ValueError(x) from exc

    return int(x)

In [22]:
# Parse the '사용승인_일' values into integers via apply_int; missing values pass through unchanged.
bdinfo_df['사용승인_일_int'] = bdinfo_df['사용승인_일'].apply(apply_int)

In [23]:
bdinfo_df.drop(columns=['허가_일', '착공_일', '허가번호_년'], inplace=True)

In [24]:
# Overwrite the original column with the parsed values cast to pandas' nullable 'Int64' dtype.
bdinfo_df['사용승인_일'] = bdinfo_df['사용승인_일_int'].astype('Int64')

In [25]:
bdinfo_df.head()

Unnamed: 0,관리_건축물대장_PK,대장_구분,대장_종류,외필지_수,주_부속_구분,대지_면적(㎡),건축_면적(㎡),건폐_율(%),연면적(㎡),용적_률_산정_연면적(㎡),용적_률(%),구조,기타_구조,주_용도,기타_용도,지붕,기타_지붕,높이(m),지상_층_수,지하_층_수,승용_승강기_수,비상용_승강기_수,부속_건축물_수,부속_건축물_면적(㎡),총_동_연면적(㎡),옥내_기계식_대수(대),옥내_기계식_면적(㎡),옥외_기계식_대수(대),옥외_기계식_면적(㎡),옥내_자주식_대수(대),옥내_자주식_면적(㎡),옥외_자주식_대수(대),옥외_자주식_면적(㎡),사용승인_일,에너지효율_등급,에너지절감_율,에너지_EPI점수,친환경_건축물_등급,친환경_건축물_인증점수,지능형_건축물_등급,지능형_건축물_인증점수,생성_일자,기준년월,지번주소,총세대수,사용승인_일_int
0,11305-100193109,집합,표제부,1,주건축물,393.0,233.81,59.49,761.29,761.29,193.71,철근콘크리트구조,철근콘크리트구조(조적조(4층)),제2종근린생활시설,"근린생활시설, 공동주택(다세대주택)",(철근)콘크리트,(철근)콘크리트(평스라브),12.2,4,0,0,0,0,0.0,761.29,0,0.0,0,0.0,0,0.0,7,80.5,20101216,,0.0,0,,0,,0,20101228,201412,서울특별시 강북구 수유동 484-80,3,20101216.0
1,11440-21639,집합,표제부,0,주건축물,177.62,105.3,59.28,393.13,385.21,216.87,철근콘크리트구조,철근콘크리트구조,공동주택,다세대주택,(철근)콘크리트,(철근)콘크리트 평스라브,12.0,4,1,0,0,0,0.0,393.13,0,0.0,0,0.0,0,0.0,4,46.0,20040323,,0.0,0,,0,,0,20101230,201412,서울특별시 마포구 서교동 484-16,4,20040323.0
2,11200-23602,일반,일반건축물,0,주건축물,806.0,376.05,46.66,1185.07,797.98,99.0,철근콘크리트구조,"철근콘크리트,철골조",종교시설,교회,(철근)콘크리트,"슬라브,경량철골조",13.2,2,1,0,0,0,0.0,1185.07,0,0.0,0,0.0,0,0.0,0,0.0,19781205,,0.0,0,,0,,0,20110131,201412,서울특별시 성동구 송정동 73-566,0,19781205.0
3,11380-26906,일반,일반건축물,0,주건축물,0.0,0.0,0.0,246.15,0.0,0.0,벽돌구조,연와조,단독주택,"단독주택,다가구용 (6가구)",(철근)콘크리트,슬라브,0.0,2,1,0,0,0,0.0,246.15,0,0.0,0,0.0,0,0.0,0,0.0,19911128,,0.0,0,,0,,0,20110105,201412,서울특별시 은평구 응암동 594-87,6,19911128.0
4,11110-100178925,일반,일반건축물,0,주건축물,356.0,104.22,29.28,404.8,194.31,54.58,철근콘크리트구조,철근콘크리트구조,제1종근린생활시설,제1종근린생활시설,(철근)콘크리트,(철근)콘크리트,7.9,2,1,0,0,0,0.0,404.8,0,0.0,0,0.0,1,33.29,2,40.19,20080319,,0.0,0,,0,,0,20110101,201412,서울특별시 종로구 평창동 365-1,0,20080319.0


In [26]:
bdinfo_df.drop(columns=['사용승인_일_int'], inplace=True)

In [27]:
# Number of space-separated tokens in each normalized address.
# Counting the separators (spaces + 1) equals len(x.split(' ')), runs vectorised,
# and leaves a missing address as NaN instead of raising.
bdinfo_df['addr_split_len'] = bdinfo_df['지번주소'].str.count(' ') + 1

In [28]:
bdinfo_df['addr_split_len'].value_counts()

4    4329762
5      33434
6       2510
7        924
8        137
3        109
9         16
Name: addr_split_len, dtype: int64

In [29]:
# Rows whose address has 3, 9, 8, 7 or 6 tokens are set aside in their own frames and
# removed from bdinfo_df, leaving only the 4- and 5-token addresses.
len3_df, len9_df, len8_df, len7_df, len6_df = [
    bdinfo_df[bdinfo_df['addr_split_len'] == n_tokens] for n_tokens in (3, 9, 8, 7, 6)
]
for rejected in (len3_df, len9_df, len8_df, len7_df, len6_df):
    bdinfo_df.drop(rejected.index, inplace=True)

In [30]:
len5_df = bdinfo_df[bdinfo_df['addr_split_len'] == 5]

In [31]:
len5_df['fourth_addr'] = len5_df['지번주소'].apply(lambda x: x.split(' ')[3])

In [32]:
# Start the label column as all-missing with object dtype. It later receives string
# labels, and writing strings into a float NaN column relies on silent upcasting,
# which pandas has deprecated.
bdinfo_df['대장구분명'] = pd.Series(np.nan, index=bdinfo_df.index, dtype=object)

In [33]:
def landtype_name(x):
    """Map an address token to a land-category label.

    Returns ``x`` itself when it is missing, '산' when the token is exactly '산',
    '지구블록' when it contains both '블록' and '지구', 'BL' when it contains '블록'
    only, and ``None`` for anything else.
    """
    if pd.isna(x):
        return x
    if x == '산':
        return '산'
    if '블록' not in x:
        # No label applies.
        return None
    return '지구블록' if '지구' in x else 'BL'

In [34]:
len5_df['대장구분명'] = len5_df['fourth_addr'].apply(landtype_name)

In [35]:
# Write the labels back into the main frame at the same row labels
# (len5_df was selected from bdinfo_df, so its index identifies the matching rows).
bdinfo_df.loc[len5_df.index, '대장구분명'] = len5_df['대장구분명']

In [36]:
def weird_addr(x):
    """Flag addresses whose last token is not a numeric lot number.

    Parameters
    ----------
    x : str or missing
        Address; missing values are returned unchanged.

    Returns
    -------
    int or missing
        0 when the last space-separated token, with hyphens removed, parses as
        an integer; 1 otherwise. Non-string, non-missing input also yields 1.
    """
    if pd.isna(x):
        return x
    if not isinstance(x, str):
        # Such values used to land in the bare ``except`` and were flagged; keep that.
        return 1

    lot_number = x.split(' ')[-1].replace('-', '')
    try:
        int(lot_number)
    except ValueError:
        # Catch only the parsing failure so interrupts and real bugs are not swallowed.
        return 1
    return 0

In [37]:
# Among the 4-token addresses, mark those whose last token is not numeric (weird_addr == 1).
len4_df = bdinfo_df[bdinfo_df['addr_split_len'] == 4]
len4_df['weird_addr'] = len4_df['지번주소'].apply(weird_addr)

In [38]:
weird_addr_df = len4_df[len4_df['weird_addr'] == 1]

In [39]:
bdinfo_df.drop(weird_addr_df.index, inplace=True)

In [40]:
bdinfo_df = bdinfo_df[bdinfo_df['대장구분명'] != '지구블록']

In [41]:
bdinfo_df = bdinfo_df[bdinfo_df['대장구분명'] != 'BL']

In [42]:
# Every row without a label gets the default '일반'.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which is deprecated and has no effect under Copy-on-Write.
bdinfo_df['대장구분명'] = bdinfo_df['대장구분명'].fillna('일반')
bdinfo_df['대장구분명'].value_counts()

일반    4328868
산       30939
Name: 대장구분명, dtype: int64

In [43]:
bdinfo_df.drop(columns=['addr_split_len'], inplace=True)

In [44]:
bdinfo_san_df = bdinfo_df[bdinfo_df['대장구분명'] == '산']

In [45]:
def san_new_addr(x):
    """Join the fourth and fifth address tokens without a space.

    The first three space-separated tokens are kept as they are, and the fourth
    and fifth are concatenated, e.g. 'A B C 산 12-3' -> 'A B C 산12-3'. Tokens
    after the fifth are discarded. Missing values are returned unchanged; an
    address with fewer than five tokens raises IndexError.
    """
    if pd.isna(x):
        return x

    tokens = x.split(' ')
    leading = ' '.join(tokens[:3])
    return leading + ' ' + tokens[3] + tokens[4]

In [46]:
bdinfo_san_df['지번주소'] = bdinfo_san_df['지번주소'].apply(san_new_addr)

In [47]:
# Copy the rewritten addresses back into bdinfo_df at the matching row labels
# (bdinfo_san_df was selected from bdinfo_df, so the indexes line up).
bdinfo_df.loc[bdinfo_san_df.index, '지번주소'] = bdinfo_san_df['지번주소']

In [48]:
def addr_split_len(x):
    """Return the number of space-separated tokens in ``x``.

    Missing values are returned unchanged rather than counted.
    """
    return x if pd.isna(x) else len(x.split(' '))

In [49]:
%%time
bdinfo_df.to_csv('./국토교통부_건축물대장_표제부/bdinfo_baseline_checkpoint_3.csv', index=False)

Wall time: 1min 31s


In [50]:
bdinfo_df['구조'].value_counts()

철근콘크리트구조        1950006
벽돌구조            1776898
일반목구조            373350
블록구조             160531
일반철골구조            35926
경량철골구조            26402
철골철근콘크리트구조        15148
기타조적구조            10015
철골콘크리트구조           4610
석구조                1552
기타구조               1107
프리케스트콘크리트구조        1101
기타강구조              1012
기타콘크리트구조            598
강파이프구조              590
통나무구조               217
기타철골철근콘크리트구조        116
조적구조                 38
철골철근콘크리트합성구조         35
조립식판넬조               30
시멘트블럭조               20
목구조                  14
라멘조                   6
컨테이너조                 2
스틸하우스조                2
Name: 구조, dtype: int64

In [53]:
bdinfo_df.shape

(4359807, 46)

In [51]:
main_data_df = pd.read_csv('./prepped_data/main_data_baseline_checkpoint_3.csv')

In [52]:
# Keep only the register rows whose address also occurs in the main dataset.
main_addresses = main_data_df['지번주소'].unique()
address_in_main = bdinfo_df['지번주소'].isin(main_addresses)
inter_df = bdinfo_df[address_in_main]
print(inter_df.shape)
inter_df.head()

(935418, 46)


Unnamed: 0,관리_건축물대장_PK,대장_구분,대장_종류,외필지_수,주_부속_구분,대지_면적(㎡),건축_면적(㎡),건폐_율(%),연면적(㎡),용적_률_산정_연면적(㎡),용적_률(%),구조,기타_구조,주_용도,기타_용도,지붕,기타_지붕,높이(m),지상_층_수,지하_층_수,승용_승강기_수,비상용_승강기_수,부속_건축물_수,부속_건축물_면적(㎡),총_동_연면적(㎡),옥내_기계식_대수(대),옥내_기계식_면적(㎡),옥외_기계식_대수(대),옥외_기계식_면적(㎡),옥내_자주식_대수(대),옥내_자주식_면적(㎡),옥외_자주식_대수(대),옥외_자주식_면적(㎡),사용승인_일,에너지효율_등급,에너지절감_율,에너지_EPI점수,친환경_건축물_등급,친환경_건축물_인증점수,지능형_건축물_등급,지능형_건축물_인증점수,생성_일자,기준년월,지번주소,총세대수,대장구분명
1,11440-21639,집합,표제부,0,주건축물,177.62,105.3,59.28,393.13,385.21,216.87,철근콘크리트구조,철근콘크리트구조,공동주택,다세대주택,(철근)콘크리트,(철근)콘크리트 평스라브,12.0,4,1,0,0,0,0.0,393.13,0,0.0,0,0.0,0,0.0,4,46.0,20040323,,0.0,0,,0,,0,20101230,201412,서울특별시 마포구 서교동 484-16,4,일반
6,11590-100182663,집합,표제부,1,주건축물,389.48,232.42,59.67,947.74,659.89,169.43,철근콘크리트구조,철근콘크리트구조,공동주택,다세대주택,(철근)콘크리트,(철근)콘크리트,11.26,4,1,0,0,0,0.0,947.74,0,0.0,0,0.0,8,287.85,2,11.5,20080826,,0.0,0,,0,,0,20110119,201412,서울특별시 동작구 상도동 302-28,10,일반
7,11320-100182241,집합,표제부,0,주건축물,208.0,81.085,38.98,268.165,268.165,128.93,철근콘크리트구조,철근콘크리트구조,공동주택,다세대주택,(철근)콘크리트,(철근)콘크리트,10.4,4,0,0,0,0,0.0,268.165,0,0.0,0,0.0,3,0.0,1,0.0,20080721,,0.0,0,,0,,0,20110202,201412,서울특별시 도봉구 쌍문동 380-24,4,일반
8,11440-32174,집합,표제부,0,주건축물,314.0,180.38,57.45,627.88,627.88,199.96,철근콘크리트구조,철근콘크리트구조,공동주택,다세대주택,(철근)콘크리트,평스라브,13.3,5,0,0,0,0,0.0,627.88,0,0.0,0,0.0,7,80.5,1,11.5,20050901,,0.0,0,,0,,0,20110125,201412,서울특별시 마포구 성산동 200-260,8,일반
11,11740-100179671,집합,표제부,0,주건축물,0.0,564.01,0.0,6401.19,6401.19,0.0,철근콘크리트구조,철근콘크리트구조,공동주택,공동주택(아파트),(철근)콘크리트,(철근)콘크리트,45.6,14,0,3,0,0,0.0,6401.19,0,0.0,0,0.0,0,0.0,0,0.0,20090311,,0.0,0,,0,,0,20110122,201412,서울특별시 강동구 강일동 674-0,78,일반


In [54]:
inter_df['구조'].value_counts()

철근콘크리트구조       784561
벽돌구조           136483
일반목구조            4111
블록구조             2901
철골철근콘크리트구조       2362
기타조적구조           1539
일반철골구조           1003
경량철골구조            859
철골콘크리트구조          718
프리케스트콘크리트구조       564
기타콘크리트구조          127
기타강구조              51
기타구조               44
석구조                27
강파이프구조             26
라멘조                 3
조립식판넬조              1
Name: 구조, dtype: int64

In [56]:
inter_df['기타_구조'].value_counts()

철근콘크리트구조             455878
철근콘크리트조              211649
연와조                  108659
철근콘크리트조, 연와조          20507
철근콘크리트                17175
                      ...  
연와조및세멘부록조                 1
연와조,경량판넬조                 1
연와조, 경량철골, 철근콘크리트         1
세멘벽돌조 세멘블럭조               1
연와조/샌드위판넬                 1
Name: 기타_구조, Length: 2053, dtype: int64

In [58]:
inter_df.to_csv('./국토교통부_건축물대장_표제부/bdinfo_baseline_maindata_inter_checkpoint_4.csv', index=False)