In [1]:
# Data handling, file listing, progress bars and plotting used by the cells below.
import pandas as pd
import numpy as np
import os
# Let displayed DataFrames show up to 500 rows and 500 columns before truncating.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False
# Turns off pandas' chained-assignment warning for the whole notebook; several
# later cells assign columns into frames obtained by filtering/selection.
pd.options.mode.chained_assignment = None  # default='warn'
# NOTE(review): presumably chosen so Korean labels render in figures; the font
# must be installed on the machine running the notebook - confirm.
plt.rcParams['font.family'] = 'Malgun Gothic'
# Global default font size for matplotlib text.
font = {'size': 16}
matplotlib.rc('font', **font)

In [2]:
def day_modifier(x):
    """Return a day value as text, left-padded with '0' when it is one character.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Every other value is converted with ``str``; results that are
    not exactly one character long are returned without padding.
    """
    if pd.isna(x):
        return x

    day_text = str(x)
    return '0' + day_text if len(day_text) == 1 else day_text

In [3]:
def landnum_modifier(x):
    """Normalise a lot number (번지) to the ``'<main>-<sub>'`` form.

    Missing values are returned unchanged. Otherwise the value is converted to
    text, everything from the first '외' onward is discarded, spaces are
    removed, and ``'-0'`` is appended when the result has no '-' separator.

    The previous implementation deleted only the '외' character itself, so any
    text following it was glued onto the lot number
    (``'123-4 외 2필지'`` became ``'123-42필지'``). Values where '외' is the last
    non-space character give the same result as before.
    """
    if pd.isna(x):
        return x

    # Keep only the part before '외', then strip all spaces.
    lot = str(x).split('외', 1)[0].replace(' ', '')
    return lot if '-' in lot else lot + '-0'

In [4]:
def yunrip_data_prep():
    """Load and clean the 연립다세대 sale records.

    Reads every ``.csv`` file in the hard-coded source directory whose name
    contains ``'('``, concatenates them and derives the analysis columns.
    ``officetel_data_prep`` performs the same steps on a different directory.

    Returns:
        pandas.DataFrame: contracts dated 2015 or later. ``지번주소`` is the
        first column, followed by the remaining source columns and the derived
        ``건물연식``, ``계약날짜``, ``전용면적단가(만원/㎡)`` and ``년``. The
        columns named in ``cols_to_drop`` are removed. The index is not reset
        after the year filter.

    Raises:
        ValueError: from ``pd.concat`` when no matching file exists, or from
            ``pd.to_datetime`` when a contract date cannot be parsed.
    """
    basedir = './국토교통부_실거래가_공개시스템/연립다세대/매매/'
    # os.listdir returns names in arbitrary order; sort so the row order of the
    # concatenated frame is reproducible between runs and machines.
    filenames = sorted(f for f in os.listdir(basedir) if f.endswith('.csv') and '(' in f)

    dfs_list = []
    # Iterating the list directly lets tqdm know the total number of files.
    for f in tqdm(filenames):
        # header=15: the 16th line of each file holds the column names; the
        # lines before it are skipped.
        df = pd.read_csv(basedir + f, encoding='euc-kr', header=15)
        # Only some files carry this column; drop it so all frames line up.
        df = df.drop(columns=['해제사유발생일'], errors='ignore')
        dfs_list.append(df)

    concat_df = pd.concat(dfs_list).reset_index(drop=True)

    concat_df['번지'] = concat_df['번지'].apply(landnum_modifier)

    concat_df['계약년월'] = concat_df['계약년월'].apply(str)
    # NOTE(review): after this conversion a missing day is the text 'nan', so
    # day_modifier's missing-value guard never fires here and pd.to_datetime
    # below would raise for such a row.
    concat_df['계약일'] = concat_df['계약일'].apply(str)

    contract_year = concat_df['계약년월'].str[:4]
    contract_month = concat_df['계약년월'].str[-2:]

    # Building age at contract time = contract year - construction year.
    concat_df['건물연식'] = contract_year.astype('int64') - concat_df['건축년도']

    concat_df['계약일'] = concat_df['계약일'].apply(day_modifier)

    concat_df['계약날짜'] = pd.to_datetime(
        contract_year + '-' + contract_month + '-' + concat_df['계약일'],
        format='%Y-%m-%d',
    )

    # Prices arrive as text with thousands separators, e.g. '24,800'.
    concat_df['거래금액(만원)'] = concat_df['거래금액(만원)'].apply(lambda x: int(x.replace(',', '')))
    concat_df['전용면적단가(만원/㎡)'] = concat_df['거래금액(만원)'] / concat_df['전용면적(㎡)']

    concat_df['지번주소'] = concat_df['시군구'] + ' ' + concat_df['번지']

    concat_df['년'] = concat_df['계약날짜'].dt.year

    # '지번주소' is listed so it is not repeated after being placed first below.
    cols_to_drop = ['시군구', '번지', '본번', '부번', '지번주소', '계약년월', '계약일', '도로명']

    concat_df = concat_df[concat_df['년'] >= 2015]

    return concat_df[['지번주소'] + [col for col in concat_df.columns if col not in cols_to_drop]]

In [5]:
def officetel_data_prep():
    """Load and clean the 오피스텔 sale records.

    Reads every ``.csv`` file in the hard-coded source directory whose name
    contains ``'('``, concatenates them and derives the analysis columns.
    ``yunrip_data_prep`` performs the same steps on a different directory.

    Returns:
        pandas.DataFrame: contracts dated 2015 or later. ``지번주소`` is the
        first column, followed by the remaining source columns and the derived
        ``건물연식``, ``계약날짜``, ``전용면적단가(만원/㎡)`` and ``년``. The
        columns named in ``cols_to_drop`` are removed. The index is not reset
        after the year filter.

    Raises:
        ValueError: from ``pd.concat`` when no matching file exists, or from
            ``pd.to_datetime`` when a contract date cannot be parsed.
    """
    basedir = './국토교통부_실거래가_공개시스템/오피스텔/매매/'
    # os.listdir returns names in arbitrary order; sort so the row order of the
    # concatenated frame is reproducible between runs and machines.
    filenames = sorted(f for f in os.listdir(basedir) if f.endswith('.csv') and '(' in f)

    dfs_list = []
    # Iterating the list directly lets tqdm know the total number of files.
    for f in tqdm(filenames):
        # header=15: the 16th line of each file holds the column names; the
        # lines before it are skipped.
        df = pd.read_csv(basedir + f, encoding='euc-kr', header=15)
        # Only some files carry this column; drop it so all frames line up.
        df = df.drop(columns=['해제사유발생일'], errors='ignore')
        dfs_list.append(df)

    concat_df = pd.concat(dfs_list).reset_index(drop=True)

    concat_df['번지'] = concat_df['번지'].apply(landnum_modifier)

    concat_df['계약년월'] = concat_df['계약년월'].apply(str)
    # NOTE(review): after this conversion a missing day is the text 'nan', so
    # day_modifier's missing-value guard never fires here and pd.to_datetime
    # below would raise for such a row.
    concat_df['계약일'] = concat_df['계약일'].apply(str)

    contract_year = concat_df['계약년월'].str[:4]
    contract_month = concat_df['계약년월'].str[-2:]

    # Building age at contract time = contract year - construction year.
    concat_df['건물연식'] = contract_year.astype('int64') - concat_df['건축년도']

    concat_df['계약일'] = concat_df['계약일'].apply(day_modifier)

    concat_df['계약날짜'] = pd.to_datetime(
        contract_year + '-' + contract_month + '-' + concat_df['계약일'],
        format='%Y-%m-%d',
    )

    # Prices arrive as text with thousands separators, e.g. '22,000'.
    concat_df['거래금액(만원)'] = concat_df['거래금액(만원)'].apply(lambda x: int(x.replace(',', '')))
    concat_df['전용면적단가(만원/㎡)'] = concat_df['거래금액(만원)'] / concat_df['전용면적(㎡)']

    concat_df['지번주소'] = concat_df['시군구'] + ' ' + concat_df['번지']

    concat_df['년'] = concat_df['계약날짜'].dt.year

    # '지번주소' is listed so it is not repeated after being placed first below.
    cols_to_drop = ['시군구', '번지', '본번', '부번', '지번주소', '계약년월', '계약일', '도로명']

    concat_df = concat_df[concat_df['년'] >= 2015]

    return concat_df[['지번주소'] + [col for col in concat_df.columns if col not in cols_to_drop]]

In [6]:
# Cleaned 연립다세대 trades: discard rows with any missing field, renumber rows.
yunrip_df = (
    yunrip_data_prep()
    .dropna()
    .reset_index(drop=True)
)
print(yunrip_df.shape)
yunrip_df.head()

16it [00:01, 11.04it/s]


(329064, 11)


Unnamed: 0,지번주소,건물명,전용면적(㎡),대지권면적(㎡),거래금액(만원),층,건축년도,건물연식,계약날짜,전용면적단가(만원/㎡),년
0,서울특별시 강남구 개포동 1216-4,(1216-4),33.77,21.67,24800,4,2012.0,3.0,2015-03-26,734.379627,2015
1,서울특별시 강남구 개포동 1216-4,(1216-4),35.87,23.02,27200,4,2012.0,3.0,2015-06-23,758.293839,2015
2,서울특별시 강남구 개포동 1216-4,(1216-4),29.12,18.68,22200,4,2012.0,3.0,2015-07-20,762.362637,2015
3,서울특별시 강남구 개포동 1216-4,(1216-4),29.97,19.23,22500,3,2012.0,3.0,2015-08-06,750.750751,2015
4,서울특별시 강남구 개포동 170-18,(170-18),26.6,21.53,21000,1,1988.0,27.0,2015-08-21,789.473684,2015


In [7]:
# Cleaned 오피스텔 trades: discard rows with any missing field, renumber rows.
officetel_df = (
    officetel_data_prep()
    .dropna()
    .reset_index(drop=True)
)
print(officetel_df.shape)
officetel_df.head()

16it [00:00, 37.82it/s]


(76332, 10)


Unnamed: 0,지번주소,단지명,전용면적(㎡),거래금액(만원),층,건축년도,건물연식,계약날짜,전용면적단가(만원/㎡),년
0,서울특별시 강남구 개포동 13-3,대청타워,43.24,22000,14,1997.0,18.0,2015-01-08,508.788159,2015
1,서울특별시 강남구 개포동 13-3,대청타워,32.44,15800,21,1997.0,18.0,2015-01-12,487.053021,2015
2,서울특별시 강남구 개포동 13-3,대청타워,32.44,16000,10,1997.0,18.0,2015-01-19,493.218249,2015
3,서울특별시 강남구 개포동 13-3,대청타워,32.44,15400,21,1997.0,18.0,2015-01-26,474.722565,2015
4,서울특별시 강남구 개포동 13-3,대청타워,31.91,16000,26,1997.0,18.0,2015-01-28,501.410216,2015


In [8]:
def land_shape2(x):
    """Map a detailed lot-shape label to a coarser shape category.

    Missing values are returned unchanged. Otherwise ``x`` (a string) is tested
    against the keywords below in order and the category of the first keyword
    it contains is returned; labels matching none of them become '기타'.
    """
    if pd.isna(x):
        return x

    # (keyword searched in x, category returned) - order matters, first hit wins.
    keyword_to_category = (
        ('사다리', '사다리형'),
        ('장방', '장방형'),
        ('정방', '정방형'),
        ('부정', '부정형'),
        ('않음', '지정되지않음'),
        ('자루', '자루형'),
        ('삼각', '삼각형'),
    )
    for keyword, category in keyword_to_category:
        if keyword in x:
            return category
    return '기타'

In [9]:
# Per-lot, per-year land attributes; add the coarse shape category as 지형형상2.
land_specs_df = pd.read_csv('./prepped_data/land_specs_ver_4.csv')
land_specs_df['지형형상2'] = land_specs_df['지형형상'].map(land_shape2)
print(land_specs_df.shape)
land_specs_df.head()

(8706295, 12)


Unnamed: 0,지번주소,년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면,공시지가,지형형상2
0,서울특별시 강남구 개포동 100-0,2013,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지,325000.0,부정형
1,서울특별시 강남구 개포동 100-0,2014,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지,330000.0,부정형
2,서울특별시 강남구 개포동 100-0,2015,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지,335000.0,부정형
3,서울특별시 강남구 개포동 100-0,2016,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지,345000.0,부정형
4,서울특별시 강남구 개포동 100-0,2017,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지,355000.0,부정형


In [10]:
# Frequency of each raw lot-shape label (the input values of land_shape2).
land_specs_df['지형형상'].value_counts()

사다리형      2278575
세로장방      1636461
부정형       1364328
지정되지않음    1241904
가로장방       866599
정방형        845940
자루형        301454
삼각형        165686
역삼각형         5348
Name: 지형형상, dtype: int64

In [11]:
# One row per lot address, without the year-specific columns.
# NOTE(review): keep='last' returns the most recent year only if the rows of
# each address are already ordered by 년 in land_specs_df - confirm.
last_df = (
    land_specs_df
    .drop_duplicates(subset=['지번주소'], keep='last')
    .drop(columns=['년', '공시지가'])
    .reset_index(drop=True)
)
print(last_df.shape)
last_df.head()

(981464, 10)


Unnamed: 0,지번주소,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면,지형형상2
0,서울특별시 강남구 개포동 100-0,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지,부정형
1,서울특별시 강남구 개포동 101-0,전,641.0,자연녹지지역,지정되지않음,전,평지,부정형,맹지,부정형
2,서울특별시 강남구 개포동 102-0,전,509.0,자연녹지지역,지정되지않음,전,평지,부정형,맹지,부정형
3,서울특별시 강남구 개포동 103-0,전,48.7,자연녹지지역,개발제한구역,전,평지,부정형,맹지,부정형
4,서울특별시 강남구 개포동 104-1,전,2995.0,개발제한구역,자연녹지지역,전,완경사,부정형,맹지,부정형


In [12]:
# Frequency of each road-frontage label among the de-duplicated lots.
last_df['도로접면'].value_counts()

세로한면(가)    291412
지정되지않음     151282
세로한면(불)    106248
세로각지(가)     80726
소로한면        68840
광대소각        57931
소로각지        57412
중로각지        51606
중로한면        33053
광대로한면       31004
맹지          20664
광대세각        20419
세로각지(불)     10852
Name: 도로접면, dtype: int64

In [13]:
# Attach the land attributes of the contract year to each 연립다세대 trade.
# Inner join: trades without a matching (지번주소, 년) land row are dropped.
yunrip_merge_df = pd.merge(yunrip_df, land_specs_df, how='inner', on=['지번주소', '년'])
print(yunrip_merge_df.shape)
yunrip_merge_df.head()

(326774, 21)


Unnamed: 0,지번주소,건물명,전용면적(㎡),대지권면적(㎡),거래금액(만원),층,건축년도,건물연식,계약날짜,전용면적단가(만원/㎡),년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면,공시지가,지형형상2
0,서울특별시 강남구 개포동 1216-4,(1216-4),33.77,21.67,24800,4,2012.0,3.0,2015-03-26,734.379627,2015,대,257.0,제2종일반주거지역,지정되지않음,다세대,평지,정방형,세로각지(가),3470000.0,정방형
1,서울특별시 강남구 개포동 1216-4,(1216-4),35.87,23.02,27200,4,2012.0,3.0,2015-06-23,758.293839,2015,대,257.0,제2종일반주거지역,지정되지않음,다세대,평지,정방형,세로각지(가),3470000.0,정방형
2,서울특별시 강남구 개포동 1216-4,(1216-4),29.12,18.68,22200,4,2012.0,3.0,2015-07-20,762.362637,2015,대,257.0,제2종일반주거지역,지정되지않음,다세대,평지,정방형,세로각지(가),3470000.0,정방형
3,서울특별시 강남구 개포동 1216-4,(1216-4),29.97,19.23,22500,3,2012.0,3.0,2015-08-06,750.750751,2015,대,257.0,제2종일반주거지역,지정되지않음,다세대,평지,정방형,세로각지(가),3470000.0,정방형
4,서울특별시 강남구 개포동 170-18,(170-18),26.6,21.53,21000,1,1988.0,27.0,2015-08-21,789.473684,2015,대,130.3,제2종일반주거지역,지정되지않음,다세대,평지,세로장방,세로한면(가),4370000.0,장방형


In [14]:
# Attach the land attributes of the contract year to each 오피스텔 trade.
# Inner join: trades without a matching (지번주소, 년) land row are dropped.
officetel_merge_df = pd.merge(officetel_df, land_specs_df, how='inner', on=['지번주소', '년'])
print(officetel_merge_df.shape)
officetel_merge_df.head()

(74257, 20)


Unnamed: 0,지번주소,단지명,전용면적(㎡),거래금액(만원),층,건축년도,건물연식,계약날짜,전용면적단가(만원/㎡),년,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면,공시지가,지형형상2
0,서울특별시 강남구 개포동 13-3,대청타워,43.24,22000,14,1997.0,18.0,2015-01-08,508.788159,2015,대,4188.5,일반상업지역,지정되지않음,업무용,평지,세로장방,광대소각,12670000.0,장방형
1,서울특별시 강남구 개포동 13-3,대청타워,32.44,15800,21,1997.0,18.0,2015-01-12,487.053021,2015,대,4188.5,일반상업지역,지정되지않음,업무용,평지,세로장방,광대소각,12670000.0,장방형
2,서울특별시 강남구 개포동 13-3,대청타워,32.44,16000,10,1997.0,18.0,2015-01-19,493.218249,2015,대,4188.5,일반상업지역,지정되지않음,업무용,평지,세로장방,광대소각,12670000.0,장방형
3,서울특별시 강남구 개포동 13-3,대청타워,32.44,15400,21,1997.0,18.0,2015-01-26,474.722565,2015,대,4188.5,일반상업지역,지정되지않음,업무용,평지,세로장방,광대소각,12670000.0,장방형
4,서울특별시 강남구 개포동 13-3,대청타워,31.91,16000,26,1997.0,18.0,2015-01-28,501.410216,2015,대,4188.5,일반상업지역,지정되지않음,업무용,평지,세로장방,광대소각,12670000.0,장방형


In [15]:
# The workbook has a two-row header, read as a two-level column index.
raw_reference = pd.read_excel('./감정평가사_자료/이승준_총괄표.xlsx', header=[0, 1])

# Drop rows without a location, then order by district, neighbourhood, lot.
location_sort_keys = [('소재지', '지역'), ('소재지', '소재지'), ('소재지', '지번')]
reference_df = (
    raw_reference
    .dropna(subset=[('소재지', '소재지')])
    .sort_values(location_sort_keys)
    .reset_index(drop=True)
)

# Flatten the two header levels into single '<upper>_<lower>' column names.
reference_df.columns = ['_'.join(levels) for levels in reference_df.columns]
print(reference_df.shape)
reference_df.head()

(131, 39)


Unnamed: 0,소재지_지역,소재지_소재지,소재지_지번,토지내용_용도지역,토지내용_도로너비,토지내용_토지면적(㎡),토지내용_토지매매금액,토지내용_토지매매단가\n(원/㎡),토지내용_공시지가와격차\n(개공/매매),건물내용_건물용도,건물내용_공급면적(㎡),건물내용_전유면적(㎡),건물내용_전용율,건물내용_용적율\n(%),건물내용_건폐율\n(%),건물내용_구분건물호,건물내용_건물동수,건물내용_건물층수,수입/비용_의뢰인,수입/비용_기준시점,수입/비용_전체평가금액,수입/비용_투입비용합계,수입/비용_순이익,수입/비용_순이익율,평가단가(원/전유㎡) _전유면적,평가단가(원/전유㎡) _공급면적,매출원가(원/전유㎡) _전유면적기준,매출원가(원/전유㎡) _공급면적기준,사업진행일자_토지계약일자,사업진행일자_허가일자,사업진행일자_착공일자,사업진행일자_사용승인일,건축비\n(원/평)_Unnamed: 32_level_1,원가비율_토지원가비율,원가비율_건축원가비율,원가비율_부대비용,원가비율_금융비용,원가비율_합계,비고_Unnamed: 38_level_1
0,강남구,개포동,1195-10,2종일주,6미터,265.7,2970000000.0,11178020.0,0.433,다세대주택,519.54,432.22,0.832,195.54,59.62,15개호,1개동,지상5층,SH공사,2020.09.23,4719500000.0,4167814000.0,551685800.0,0.117,10919208.0,9083997.0,9642807.0,8372395.0,2019.02.26,2019.06.18,2019.06.18,2020.01.13,5500000.0,0.7563,0.2139,0.006,0.0239,1.0,
1,강남구,개포동,1199-7,2종일주,4미터,248.0,2550000000.0,10282258.0,0.365,다세대주택,463.84,390.67,0.842,187.04,59.61,14개호,1개동,지상5층,LH공사,2019.11.29,4040000000.0,3593249000.0,446751100.0,0.111,10341209.0,8709524.0,9197658.0,8074624.0,2018.08.21,2018.12.31,2019.01.07,2019.06.04,5400000.0,0.752,0.2175,0.0061,0.0244,1.0,
2,강남구,개포동,1216-7,2종일주,8미터,258.1,2931000000.0,11356064.0,0.472,도시행생활주택,489.46,400.29,0.818,199.62,59.89,15개호,1개동,지상5층,SH공사,2019.11.29,4586000000.0,4054965000.0,531034800.0,0.116,11456694.0,9369509.0,10130070.0,8312283.0,2019.01.24,2019.06.17,2019.07.23,2020.02.25,5400000.0,0.7667,0.2034,0.0057,0.0242,1.0,
3,강남구,역삼동,707-9,일반상업,8미터,363.5,9500000000.0,26134801.0,0.58,도시행생활주택,1695.85,1289.62,0.76,199.62,59.89,15개호,1개동,지상5층,LH공사,2019.11.29,22199830000.0,13988110000.0,8211721000.0,0.37,17214239.0,11159450.0,10846690.0,16602582.0,2019.08.14,2019.07.15,2020.01.03,2020.10.07,5800000.0,0.7189,0.2568,0.0072,0.017,1.0,
4,강남구,역삼동,751-6,2종일주,6미터,391.8,5900000000.0,15058703.0,0.441,도시행생활주택,658.88,537.65,0.816,168.17,46.35,20개호,1개동,지상6층,LH공사,2019.11.29,9810500000.0,7604511000.0,2205989000.0,0.225,18247001.0,14889660.0,14143980.0,12107552.0,2019.01.04,2019.03.05,2019.04.20,2019.11.29,5500000.0,0.8249,0.1486,0.0042,0.0223,1.0,


In [16]:
# Normalise lot numbers to the '<main>-<sub>' form ('-0' added when there is no
# sub-number), the same form used by the 지번주소 values of the land data.
reference_df['소재지_지번'] = reference_df['소재지_지번'].apply(landnum_modifier)

In [17]:
# Full lot address: fixed city prefix + district + neighbourhood + lot number.
reference_df['지번주소'] = (
    '서울특별시 '
    + reference_df['소재지_지역']
    + ' '
    + reference_df['소재지_소재지']
    + ' '
    + reference_df['소재지_지번']
)

In [18]:
def date_modifier(x):
    """Return a date value as dot-separated text, or NaN when it has no separator.

    Missing values are returned unchanged. Any other value (string or
    datetime) is converted with ``str`` and every '-' is replaced by '.'.
    If the resulting text contains no '.', ``np.nan`` is returned instead.
    """
    if pd.isna(x):
        return x

    dotted = str(x).replace('-', '.')
    return dotted if '.' in dotted else np.nan

In [19]:
# Appraisal reference date as dot-separated text; values containing neither
# '.' nor '-' become NaN (see date_modifier).
reference_df['기준시점2'] = reference_df['수입/비용_기준시점'].apply(date_modifier)

In [20]:
def get_year(x):
    """Return the integer before the first '.' of a dotted date string.

    Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x

    year_text, _, _ = x.partition('.')
    return int(year_text)

In [21]:
# Appraisal year taken from the normalised reference date (NaN where the date is NaN).
reference_df['년'] = reference_df['기준시점2'].apply(get_year)

In [22]:
# Rows without a usable reference date have no year; they are assigned 2020.
# NOTE(review): 2020 is a hard-coded fallback - confirm it is the intended year
# for appraisals whose date is missing or unparseable.
reference_df['년'] = reference_df['년'].fillna(2020)
# Number of years still missing after the fill.
reference_df['년'].isna().sum()

0

In [23]:
def get_bd_type(x):
    """Classify a building-usage label as '오피스텔' or '다세대'.

    Missing values are returned unchanged. A string containing '오피' maps to
    '오피스텔'; every other string maps to '다세대'.
    """
    if pd.isna(x):
        return x

    return '오피스텔' if '오피' in x else '다세대'

In [24]:
# Two-way building type ('오피스텔' / '다세대') used later to pick the trade table.
reference_df['건물유형'] = reference_df['건물내용_건물용도'].apply(get_bd_type)

In [25]:
# Attach the de-duplicated land attributes of each appraised lot.
# The default inner join drops appraisals whose 지번주소 is not in last_df.
# NOTE(review): this cell overwrites its own input, so it is not idempotent -
# running it a second time merges again and produces suffixed duplicate
# columns (_x / _y). Re-run from the cell that builds reference_df instead.
reference_df = reference_df.merge(last_df, on=['지번주소']).reset_index(drop=True)

In [26]:
# Preview the appraisal table with the derived and merged land columns.
reference_df.head()

Unnamed: 0,소재지_지역,소재지_소재지,소재지_지번,토지내용_용도지역,토지내용_도로너비,토지내용_토지면적(㎡),토지내용_토지매매금액,토지내용_토지매매단가\n(원/㎡),토지내용_공시지가와격차\n(개공/매매),건물내용_건물용도,건물내용_공급면적(㎡),건물내용_전유면적(㎡),건물내용_전용율,건물내용_용적율\n(%),건물내용_건폐율\n(%),건물내용_구분건물호,건물내용_건물동수,건물내용_건물층수,수입/비용_의뢰인,수입/비용_기준시점,수입/비용_전체평가금액,수입/비용_투입비용합계,수입/비용_순이익,수입/비용_순이익율,평가단가(원/전유㎡) _전유면적,평가단가(원/전유㎡) _공급면적,매출원가(원/전유㎡) _전유면적기준,매출원가(원/전유㎡) _공급면적기준,사업진행일자_토지계약일자,사업진행일자_허가일자,사업진행일자_착공일자,사업진행일자_사용승인일,건축비\n(원/평)_Unnamed: 32_level_1,원가비율_토지원가비율,원가비율_건축원가비율,원가비율_부대비용,원가비율_금융비용,원가비율_합계,비고_Unnamed: 38_level_1,지번주소,기준시점2,년,건물유형,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면,지형형상2
0,강남구,개포동,1195-10,2종일주,6미터,265.7,2970000000.0,11178020.0,0.433,다세대주택,519.54,432.22,0.832,195.54,59.62,15개호,1개동,지상5층,SH공사,2020.09.23,4719500000.0,4167814000.0,551685800.0,0.117,10919208.0,9083997.0,9642807.0,8372395.0,2019.02.26,2019.06.18,2019.06.18,2020.01.13,5500000.0,0.7563,0.2139,0.006,0.0239,1.0,,서울특별시 강남구 개포동 1195-10,2020.09.23,2020.0,다세대,대,265.7,제2종일반주거지역,지정되지않음,상업기타,평지,세로장방,세로한면(가),장방형
1,강남구,개포동,1199-7,2종일주,4미터,248.0,2550000000.0,10282258.0,0.365,다세대주택,463.84,390.67,0.842,187.04,59.61,14개호,1개동,지상5층,LH공사,2019.11.29,4040000000.0,3593249000.0,446751100.0,0.111,10341209.0,8709524.0,9197658.0,8074624.0,2018.08.21,2018.12.31,2019.01.07,2019.06.04,5400000.0,0.752,0.2175,0.0061,0.0244,1.0,,서울특별시 강남구 개포동 1199-7,2019.11.29,2019.0,다세대,대,248.0,제2종일반주거지역,지정되지않음,다세대,평지,세로장방,세로한면(가),장방형
2,강남구,개포동,1216-7,2종일주,8미터,258.1,2931000000.0,11356064.0,0.472,도시행생활주택,489.46,400.29,0.818,199.62,59.89,15개호,1개동,지상5층,SH공사,2019.11.29,4586000000.0,4054965000.0,531034800.0,0.116,11456694.0,9369509.0,10130070.0,8312283.0,2019.01.24,2019.06.17,2019.07.23,2020.02.25,5400000.0,0.7667,0.2034,0.0057,0.0242,1.0,,서울특별시 강남구 개포동 1216-7,2019.11.29,2019.0,다세대,대,258.1,제2종일반주거지역,지정되지않음,상업기타,평지,정방형,소로한면,정방형
3,강남구,역삼동,707-9,일반상업,8미터,363.5,9500000000.0,26134801.0,0.58,도시행생활주택,1695.85,1289.62,0.76,199.62,59.89,15개호,1개동,지상5층,LH공사,2019.11.29,22199830000.0,13988110000.0,8211721000.0,0.37,17214239.0,11159450.0,10846690.0,16602582.0,2019.08.14,2019.07.15,2020.01.03,2020.10.07,5800000.0,0.7189,0.2568,0.0072,0.017,1.0,,서울특별시 강남구 역삼동 707-9,2019.11.29,2019.0,다세대,대,363.5,일반상업지역,지정되지않음,업무용,평지,정방형,세로한면(가),정방형
4,강남구,역삼동,751-6,2종일주,6미터,391.8,5900000000.0,15058703.0,0.441,도시행생활주택,658.88,537.65,0.816,168.17,46.35,20개호,1개동,지상6층,LH공사,2019.11.29,9810500000.0,7604511000.0,2205989000.0,0.225,18247001.0,14889660.0,14143980.0,12107552.0,2019.01.04,2019.03.05,2019.04.20,2019.11.29,5500000.0,0.8249,0.1486,0.0042,0.0223,1.0,,서울특별시 강남구 역삼동 751-6,2019.11.29,2019.0,다세대,대,391.8,제2종일반주거지역,지정되지않음,상업기타,평지,세로장방,세로한면(가),장방형


In [27]:
# Pre-create, as NaN, each comparable-sales statistic column followed by its
# ratio-to-appraisal column (same column order as assigning them one by one).
for stat_name in ['최근3년50분위', '최근3년평균', '최근2년50분위', '최근2년평균', '당해50분위', '당해평균']:
    reference_df[stat_name] = np.nan
    reference_df[stat_name + '_평가단가와의격차'] = np.nan

In [28]:
count = 0  # not used in this cell; kept in case code outside this cell reads it


def _lot_price_stats(trades, first_year, last_year):
    """Return (median, mean) of per-lot average unit prices.

    Trades whose 년 lies in [first_year, last_year] are averaged per 지번주소
    first, so a lot with many trades counts once. Both values are NaN when no
    trade falls in the window.
    """
    in_window = trades[(trades['년'] >= first_year) & (trades['년'] <= last_year)]
    # sort=False keeps lots in order of first appearance.
    per_lot_mean = in_window.groupby('지번주소', sort=False)['전용면적단가(만원/㎡)'].mean()
    return per_lot_mean.median(), per_lot_mean.mean()


# (output column prefix, number of years to look back from the appraisal year)
year_windows = (('최근3년', 2), ('최근2년', 1), ('당해', 0))

for i in tqdm(range(reference_df.shape[0])):
    gu = reference_df.loc[i, '소재지_지역'].replace(' ', '')
    dong = reference_df.loc[i, '소재지_소재지'].replace(' ', '')

    # First three characters of the zoning label, matched as a substring of 용도지역명1.
    land_purpose = reference_df.loc[i, '토지내용_용도지역'].replace(' ', '')[:3]
    year = int(reference_df.loc[i, '년'])
    bd_type = reference_df.loc[i, '건물유형']
    doro = reference_df.loc[i, '도로접면']

    # Officetel appraisals are compared with officetel trades, all others with
    # 연립다세대 trades.
    source_df = officetel_merge_df if bd_type == '오피스텔' else yunrip_merge_df

    # Comparable trades: same district + neighbourhood, same zoning, same road
    # frontage, building age of at most 2 years at contract time. Lot shape
    # (지형형상2) is not used as a filter.
    # regex=False matches the texts literally; na=False treats missing values
    # as non-matching instead of leaving NaN in the mask.
    target_df = source_df[
        source_df['지번주소'].str.contains(gu + ' ' + dong, regex=False, na=False)
        & source_df['용도지역명1'].str.contains(land_purpose, regex=False, na=False)
        & (source_df['도로접면'] == doro)
        & (source_df['건물연식'] <= 2)
    ]

    for prefix, years_back in year_windows:
        median_price, mean_price = _lot_price_stats(target_df, year - years_back, year)
        reference_df.loc[i, prefix + '50분위'] = median_price
        reference_df.loc[i, prefix + '평균'] = mean_price

100%|████████████████████████████████████████████████████████████████████████████████| 130/130 [00:24<00:00,  5.24it/s]


In [29]:
# Columns kept for comparing appraised unit prices with the comparable-sales statistics.
cols = ['지번주소', '기준시점2', '건물내용_건물용도', '건물유형', '토지내용_용도지역', '토지내용_도로너비', '도로접면', '지형형상2',
        '평가단가(원/전유㎡) _전유면적', '최근3년50분위', '최근3년50분위_평가단가와의격차', '최근3년평균',
        '최근3년평균_평가단가와의격차', '최근2년50분위', '최근2년50분위_평가단가와의격차', '최근2년평균',
        '최근2년평균_평가단가와의격차', '당해50분위', '당해50분위_평가단가와의격차', '당해평균', '당해평균_평가단가와의격차'
       ]
# Later cells add and overwrite columns on selected_df, so take an explicit
# copy instead of relying on the globally silenced chained-assignment warning.
selected_df = reference_df[cols].copy()

In [30]:
# Convert the appraised unit price from 원 to 만원 so it is comparable with the
# trade-based statistics (전용면적단가(만원/㎡)). It is derived from reference_df
# rather than from selected_df itself, so re-running this cell does not divide
# by 10000 a second time.
selected_df['평가단가(원/전유㎡) _전유면적'] = reference_df['평가단가(원/전유㎡) _전유면적'] / 10000

In [31]:
selected_cols = ['최근3년50분위', '최근3년평균', '최근2년50분위', '최근2년평균', '당해50분위', '당해평균']

# For every statistic: ratio to the appraised price, signed and absolute
# difference, and both differences as a percentage of the appraised price.
appraised_price = selected_df['평가단가(원/전유㎡) _전유면적']
for col in selected_cols:
    estimate = selected_df[col]
    signed_error = estimate - appraised_price
    absolute_error = signed_error.abs()

    selected_df[col+'_평가단가와의격차'] = estimate / appraised_price
    selected_df[col+'_오차'] = signed_error
    selected_df[col+'_오차절대값'] = absolute_error
    selected_df[col+'_오차율'] = signed_error / appraised_price * 100
    selected_df[col+'_절대값오차율'] = absolute_error / appraised_price * 100

In [32]:
# Identification columns shown in the final comparison table.
base_cols = ['지번주소', '기준시점2', '건물내용_건물용도', '건물유형', '토지내용_용도지역', '토지내용_도로너비', '도로접면',
             '지형형상2', '평가단가(원/전유㎡) _전유면적'
            ]
# Each statistic immediately followed by its signed percentage error.
cols_to_show = [name for stat in selected_cols for name in (stat, stat+'_오차율')]

In [33]:
# Keep appraisals whose usage label mentions '다세' or '오피' and that have a
# reference date, then narrow to the display columns.
usage_label = selected_df['건물내용_건물용도']
is_target_usage = (usage_label.str.contains('다세')) | (usage_label.str.contains('오피'))

selected2_df = selected_df[is_target_usage].dropna(subset=['기준시점2'])
selected2_df = selected2_df[base_cols + cols_to_show].reset_index(drop=True)
print(selected2_df.shape)
selected2_df

(51, 21)


Unnamed: 0,지번주소,기준시점2,건물내용_건물용도,건물유형,토지내용_용도지역,토지내용_도로너비,도로접면,지형형상2,평가단가(원/전유㎡) _전유면적,최근3년50분위,최근3년50분위_오차율,최근3년평균,최근3년평균_오차율,최근2년50분위,최근2년50분위_오차율,최근2년평균,최근2년평균_오차율,당해50분위,당해50분위_오차율,당해평균,당해평균_오차율
0,서울특별시 강남구 개포동 1195-10,2020.09.23,다세대주택,다세대,2종일주,6미터,세로한면(가),장방형,1091.9208,1097.178201,0.481482,1039.889568,-4.76511,1122.177441,2.770956,1090.256392,-0.152429,1229.894917,12.635909,1203.81941,10.247869
1,서울특별시 강남구 개포동 1199-7,2019.11.29,다세대주택,다세대,2종일주,4미터,세로한면(가),장방형,1034.1209,949.838048,-8.150193,969.236554,-6.274348,1072.297113,3.691659,1000.25716,-3.27464,1078.688631,4.309721,1038.445435,0.418185
2,서울특별시 강남구 역삼동 778-15,2019.11.29,도생-다세대,다세대,2종일주,6미터 및\n 4미터,세로각지(가),장방형,1656.9493,1244.999451,-24.861947,1320.05046,-20.332477,1287.127545,-22.319437,1359.70707,-17.939126,1333.28352,-19.533837,1410.535753,-14.87152
3,서울특별시 강동구 고덕동 292-5,2019.06.14,다세대주택,다세대,2종일반주거,4미터,세로한면(가),정방형,989.5452,,,,,,,,,,,,
4,서울특별시 강동구 고덕동 292-6,2019.06.14,다세대주택,다세대,2종일주,4미터,세로한면(가),정방형,988.1178,,,,,,,,,,,,
5,서울특별시 강동구 길동 140-0,2019.06.14,단지형다세대,다세대,3종일주,6미터,세로한면(가),사다리형,750.4237,,,,,,,,,,,,
6,서울특별시 강동구 길동 96-4,2019.06.14,다세대-도시형주택,다세대,3종일주,4미터,세로한면(가),정방형,746.9423,,,,,,,,,,,,
7,서울특별시 강동구 명일동 350-1,2020.11.11,도생(단지형다세대),다세대,3종일주,8미터,소로각지,부정형,713.2919,715.074115,0.249858,715.074115,0.249858,715.074115,0.249858,715.074115,0.249858,715.074115,0.249858,715.074115,0.249858
8,서울특별시 강동구 암사동 433-69,2020.10.23,다세대주택,다세대,3종일주,4미터,세로각지(가),사다리형,711.012,,,,,,,,,,,,
9,서울특별시 강동구 암사동 454-26,2019.04.01,다세대주택,다세대,2종일주,4미터,세로한면(가),사다리형,609.4015,697.009794,14.37612,735.250341,20.651219,738.792888,21.232535,776.945948,27.493278,704.810481,15.656178,726.680517,19.244951


In [34]:
# Mean absolute percentage error of each statistic against the appraised price
# (missing values are skipped by mean()).
for col in [name for name in selected2_df.columns if '오차율' in name]:
    print(col, selected2_df[col].abs().mean())

최근3년50분위_오차율 5.934458459939919
최근3년평균_오차율 7.084302834918586
최근2년50분위_오차율 6.367786045496198
최근2년평균_오차율 7.478096721729216
당해50분위_오차율 7.930385474871837
당해평균_오차율 8.95426703244608
