In [1]:
import pandas as pd
import os
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import tqdm
import datetime

In [2]:
# Load the land-spec baseline and keep one row per '지번주소': after sorting by
# ('지번주소', '년') the last row of each address is the one with the largest '년'.
land_specs_df = (
    pd.read_csv('./prepped_data/land_specs_baseline.csv')
    .sort_values(['지번주소', '년'])
    .drop_duplicates(subset=['지번주소'], keep='last')
    .drop(columns=['년'])
    .reset_index(drop=True)
)
print(land_specs_df.shape)
land_specs_df.head()

(981464, 9)


Unnamed: 0,지번주소,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면
0,서울특별시 강남구 개포동 100-0,전,876.0,자연녹지지역,지정되지않음,전,완경사,부정형,맹지
1,서울특별시 강남구 개포동 101-0,전,641.0,자연녹지지역,지정되지않음,전,평지,부정형,맹지
2,서울특별시 강남구 개포동 102-0,전,509.0,자연녹지지역,지정되지않음,전,평지,부정형,맹지
3,서울특별시 강남구 개포동 103-0,전,48.7,자연녹지지역,개발제한구역,전,평지,부정형,맹지
4,서울특별시 강남구 개포동 104-1,전,2995.0,개발제한구역,자연녹지지역,전,완경사,부정형,맹지


In [3]:
# Root folder of the transaction CSV files; later cells build every input and
# output path by appending a sub-folder and file name to this string.
basedir = './국토교통부_실거래가_공개시스템/'

In [4]:
# CSV files under '연립다세대/매매/'. os.listdir() returns entries in arbitrary
# order, so the names are sorted to make the load order (and therefore the row
# order of the concatenated frame) reproducible.
yunrip_filenames = sorted(
    f for f in os.listdir(basedir + '연립다세대/매매/') if f.endswith('.csv')
)
yunrip_filenames

['연립다세대(매매)__실거래가_20060101_20061231.csv',
 '연립다세대(매매)__실거래가_20070101_20071231.csv',
 '연립다세대(매매)__실거래가_20080101_20081231.csv',
 '연립다세대(매매)__실거래가_20090101_20091231.csv',
 '연립다세대(매매)__실거래가_20100101_20101231.csv',
 '연립다세대(매매)__실거래가_20110101_20111231.csv',
 '연립다세대(매매)__실거래가_20120101_20121231.csv',
 '연립다세대(매매)__실거래가_20130101_20131231.csv',
 '연립다세대(매매)__실거래가_20140101_20141231.csv',
 '연립다세대(매매)__실거래가_20150101_20151231.csv',
 '연립다세대(매매)__실거래가_20160101_20161231.csv',
 '연립다세대(매매)__실거래가_20170101_20171231.csv',
 '연립다세대(매매)__실거래가_20180101_20181231.csv',
 '연립다세대(매매)__실거래가_20190101_20191231.csv',
 '연립다세대(매매)__실거래가_20200101_20201231.csv',
 '연립다세대(매매)__실거래가_20210101_20210510.csv']

In [5]:
# Read every yunrip sale file (column names on 0-based line 15, EUC-KR text).
# Files that carry a '해제사유발생일' column keep only the rows where it is
# empty, and the column itself is then removed.
yunrip_dir = basedir + '연립다세대/매매/'
yunrip_dfs_list = []
for f in yunrip_filenames:
    df = pd.read_csv(yunrip_dir + f, header=15, encoding='euc-kr')
    if '해제사유발생일' in df.columns:
        df = df.loc[df['해제사유발생일'].isna()].drop(columns=['해제사유발생일'])
    yunrip_dfs_list.append(df)

In [6]:
# Stack the per-file frames into one table with a fresh 0..n-1 index.
yunrip_df = pd.concat(yunrip_dfs_list, ignore_index=True)
print(yunrip_df.shape)
yunrip_df.head()

(690718, 13)


Unnamed: 0,시군구,번지,본번,부번,건물명,전용면적(㎡),대지권면적(㎡),계약년월,계약일,거래금액(만원),층,건축년도,도로명
0,서울특별시 강남구 개포동,1264-3,1264,3,(1264-3),53.28,29.23,200608,29,11500,-1,1992.0,개포로31길 23-7
1,서울특별시 강남구 개포동,171-13,171,13,(171-13),68.08,56.1,200612,20,56500,2,1988.0,선릉로14길 11
2,서울특별시 강남구 개포동,1239-7,1239,7,강남빌라 가동,52.59,45.0,200612,9,40500,2,1988.0,개포로15길 25
3,서울특별시 강남구 개포동,1239-7,1239,7,강남빌라 가동,52.59,45.0,200612,19,42000,1,1988.0,개포로15길 25
4,서울특별시 강남구 개포동,1239-6,1239,6,강남빌라 나동,52.75,44.68,200611,2,22000,2,1988.0,개포로15길 27


In [7]:
def landnum_modifier(x):
    """Return the lot number as text that always contains a '-' part.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Every other value is converted with ``str``; if the text has no
    '-', the suffix '-0' is appended, otherwise the text is returned as is.
    """
    if pd.isna(x):
        return x
    lot = str(x)
    return lot if '-' in lot else lot + '-0'

In [8]:
# Normalise every lot number with landnum_modifier, so values without a '-'
# get '-0' appended.
yunrip_df['번지'] = yunrip_df['번지'].apply(landnum_modifier)

In [9]:
# Build '지번주소' as '시군구' + one space + '번지'; this is the column the
# land-spec table is merged on in the next cell.
yunrip_df['지번주소'] = yunrip_df['시군구'] + ' ' + yunrip_df['번지']

In [10]:
# Attach the land-spec columns by address. This is an inner join, so sales
# whose '지번주소' has no land-spec row are dropped.
yunrip_df = pd.merge(yunrip_df, land_specs_df, how='inner', on='지번주소')
print(yunrip_df.shape)
yunrip_df.head()

(687754, 22)


Unnamed: 0,시군구,번지,본번,부번,건물명,전용면적(㎡),대지권면적(㎡),계약년월,계약일,거래금액(만원),층,건축년도,도로명,지번주소,지목명,토지면적,용도지역명1,용도지역명2,토지이동상황,지형높이,지형형상,도로접면
0,서울특별시 강남구 개포동,1264-3,1264,3,(1264-3),53.28,29.23,200608,29,11500,-1,1992.0,개포로31길 23-7,서울특별시 강남구 개포동 1264-3,대,116.9,제2종일반주거지역,지정되지않음,다세대,평지,세로장방,세로한면(가)
1,서울특별시 강남구 개포동,171-13,171,13,(171-13),68.08,56.1,200612,20,56500,2,1988.0,선릉로14길 11,서울특별시 강남구 개포동 171-13,대,181.5,제2종일반주거지역,지정되지않음,다세대,평지,정방형,세로한면(가)
2,서울특별시 강남구 개포동,171-13,171,13,(171-13),44.8,39.6,201601,16,23300,-1,1988.0,선릉로14길 11,서울특별시 강남구 개포동 171-13,대,181.5,제2종일반주거지역,지정되지않음,다세대,평지,정방형,세로한면(가)
3,서울특별시 강남구 개포동,171-13,171,13,(171-13),44.8,39.6,201604,9,24500,-1,1988.0,선릉로14길 11,서울특별시 강남구 개포동 171-13,대,181.5,제2종일반주거지역,지정되지않음,다세대,평지,정방형,세로한면(가)
4,서울특별시 강남구 개포동,171-13,171,13,(171-13),68.08,56.1,201910,23,60000,2,1988.0,선릉로14길 11,서울특별시 강남구 개포동 171-13,대,181.5,제2종일반주거지역,지정되지않음,다세대,평지,정방형,세로한면(가)


In [11]:
# Land sale CSV files under '토지/'. The names are sorted because os.listdir()
# gives no ordering guarantee, and the two conditions are combined with the
# logical `and` rather than the bitwise `&`.
land_filenames = sorted(
    f for f in os.listdir(basedir + '토지/')
    if '토지(매매)' in f and f.endswith('.csv')
)
land_filenames

['토지(매매)__실거래가_20060101_20061231.csv',
 '토지(매매)__실거래가_20070101_20071231.csv',
 '토지(매매)__실거래가_20080101_20081231.csv',
 '토지(매매)__실거래가_20090101_20091231.csv',
 '토지(매매)__실거래가_20100101_20101231.csv',
 '토지(매매)__실거래가_20110101_20111231.csv',
 '토지(매매)__실거래가_20120101_20121231.csv',
 '토지(매매)__실거래가_20130101_20131231.csv',
 '토지(매매)__실거래가_20140101_20141231.csv',
 '토지(매매)__실거래가_20150101_20151231.csv',
 '토지(매매)__실거래가_20160101_20161231.csv',
 '토지(매매)__실거래가_20170101_20171231.csv',
 '토지(매매)__실거래가_20180101_20181231.csv',
 '토지(매매)__실거래가_20190101_20191231.csv',
 '토지(매매)__실거래가_20200101_20201231.csv',
 '토지(매매)__실거래가_20210101_20210510.csv']

In [12]:
# Read every land sale file. The first attempt expects the column names on
# 0-based line 14 and EUC-KR text; if that cannot be decoded or parsed, the
# file is re-read with pandas defaults.
land_dfs_list = []
for f in land_filenames:
    try:
        df = pd.read_csv(basedir + '토지/' + f, header=14, encoding='euc-kr')
    except ValueError:
        # UnicodeDecodeError and pandas' ParserError / EmptyDataError are all
        # ValueError subclasses. Unlike a bare `except:`, this no longer
        # swallows KeyboardInterrupt or unrelated failures such as a missing file.
        df = pd.read_csv(basedir + '토지/' + f)
    if '해제사유발생일' in df.columns:
        # Keep only rows where '해제사유발생일' is empty, then drop the column.
        df = df[df['해제사유발생일'].isna()]
        df = df.drop(columns=['해제사유발생일'])
    land_dfs_list.append(df)

In [13]:
# Stack the per-file land frames into one table with a fresh 0..n-1 index.
land_df = pd.concat(land_dfs_list, ignore_index=True)
print(land_df.shape)
land_df.head()

(112882, 10)


Unnamed: 0,시군구,번지,지목,용도지역,도로조건,계약년월,계약일,계약면적(㎡),거래금액(만원),지분구분
0,서울특별시 구로구 오류동,2*,대,제2종일반주거지역,8m미만,200601,1,23.14,1890,
1,서울특별시 관악구 신림동,산9*,임야,제3종일반주거지역,25m미만,200601,2,26.0,1432,지분
2,서울특별시 강남구 역삼동,7**,대,제3종일반주거지역,8m미만,200601,2,393.3,267750,
3,서울특별시 서초구 신원동,3**,전,개발제한구역,8m미만,200601,2,684.0,53820,
4,서울특별시 관악구 신림동,산1**,임야,제3종일반주거지역,25m미만,200601,3,24.0,1325,지분


In [14]:
# Commercial sale CSV files under '상업업무용/'. The names are sorted because
# os.listdir() gives no ordering guarantee, and the two conditions are combined
# with the logical `and` rather than the bitwise `&`.
commerce_filenames = sorted(
    f for f in os.listdir(basedir + '상업업무용/')
    if '상업업무용(매매)' in f and f.endswith('.csv')
)
commerce_filenames

['상업업무용(매매)__실거래가_20060101_20061231.csv',
 '상업업무용(매매)__실거래가_20070101_20071231.csv',
 '상업업무용(매매)__실거래가_20080101_20081231.csv',
 '상업업무용(매매)__실거래가_20090101_20091231.csv',
 '상업업무용(매매)__실거래가_20100101_20101231.csv',
 '상업업무용(매매)__실거래가_20110101_20111231.csv',
 '상업업무용(매매)__실거래가_20120101_20121231.csv',
 '상업업무용(매매)__실거래가_20130101_20131231.csv',
 '상업업무용(매매)__실거래가_20140101_20141231.csv',
 '상업업무용(매매)__실거래가_20150101_20151231.csv',
 '상업업무용(매매)__실거래가_20160101_20161231.csv',
 '상업업무용(매매)__실거래가_20170101_20171231.csv',
 '상업업무용(매매)__실거래가_20180101_20181231.csv',
 '상업업무용(매매)__실거래가_20190101_20191231.csv',
 '상업업무용(매매)__실거래가_20200101_20201231.csv',
 '상업업무용(매매)__실거래가_20210101_20210510.csv']

In [15]:
# Read every commercial sale file (column names on 0-based line 15, EUC-KR
# text). Files that carry a '해제사유발생일' column keep only the rows where it
# is empty, and the column itself is then removed.
commerce_dir = basedir + '상업업무용/'
commerce_dfs_list = []
for f in commerce_filenames:
    df = pd.read_csv(commerce_dir + f, header=15, encoding='euc-kr')
    if '해제사유발생일' in df.columns:
        df = df.loc[df['해제사유발생일'].isna()].drop(columns=['해제사유발생일'])
    commerce_dfs_list.append(df)

In [16]:
# Stack the per-file commercial frames into one table with a fresh 0..n-1 index.
commerce_df = pd.concat(commerce_dfs_list, ignore_index=True)
print(commerce_df.shape)
commerce_df.head()

(232101, 15)


Unnamed: 0,시군구,유형,지번,도로명,용도지역,건축물주용도,도로조건,전용/연면적(㎡),대지면적(㎡),거래금액(만원),층,계약년월,계약일,지분구분,건축년도
0,서울특별시 강남구 개포동,집합,1**,삼성로,제2종일반주거,판매,25m이상,14.17,,66000,1.0,200601,3,,1982.0
1,서울특별시 강남구 개포동,집합,1*,개포로,일반상업,업무,25m이상,35.22,,7500,14.0,200601,4,,1997.0
2,서울특별시 강남구 개포동,집합,1**,삼성로,제2종일반주거,판매,25m이상,31.28,,62000,,200601,4,,1982.0
3,서울특별시 강남구 개포동,집합,1*,개포로,일반상업,업무,25m이상,32.44,,7100,7.0,200601,10,,1997.0
4,서울특별시 강남구 개포동,집합,6**,언주로,제2종일반주거,제2종근린생활,25m이상,85.17,,93000,2.0,200601,12,,1984.0


In [17]:
# Discard rows whose '대지면적(㎡)' is exactly a single-space string; the
# remaining values are converted to float in a later cell.
has_land_area = commerce_df['대지면적(㎡)'].ne(' ')
commerce_df = commerce_df.loc[has_land_area]
print(commerce_df.shape)
commerce_df.head()

(44999, 15)


Unnamed: 0,시군구,유형,지번,도로명,용도지역,건축물주용도,도로조건,전용/연면적(㎡),대지면적(㎡),거래금액(만원),층,계약년월,계약일,지분구분,건축년도
37,서울특별시 강남구 개포동,일반,6**,언주로,제3종일반주거,판매,25m이상,38.54,17.55,24000,,200606,8,,1984.0
63,서울특별시 강남구 개포동,일반,6**,언주로,제3종일반주거,기타,25m이상,52.63,26.59,55000,,200611,7,지분,1984.0
64,서울특별시 강남구 개포동,일반,6**,언주로,제3종일반주거,기타,25m이상,52.63,26.59,55000,,200611,7,지분,1984.0
66,서울특별시 강남구 개포동,일반,6**,언주로,제3종일반주거,기타,25m이상,35.09,17.73,38500,,200611,10,,1984.0
67,서울특별시 강남구 개포동,일반,6**,언주로,제3종일반주거,제1종근린생활,25m이상,35.09,17.73,38500,,200611,10,,1984.0


In [18]:
def price_to_number(x):
    """Convert a price such as '11,500' to a float.

    Missing values (as reported by ``pd.isna``) are returned unchanged. Any
    other value is turned into text, stripped of ',' separators and passed to
    ``float``.
    """
    if pd.isna(x):
        return x
    digits = ''.join(str(x).split(','))
    return float(digits)

In [19]:
# Turn the '거래금액(만원)' column of all three frames into floats.
for frame in (yunrip_df, land_df, commerce_df):
    frame['거래금액(만원)'] = frame['거래금액(만원)'].apply(price_to_number)

In [20]:
def building_age(df, reference_date=None):
    """Return a copy of ``df`` with a '건물연식' (building age) column added.

    The age is ``reference year - df['건축년도']``; rows with a missing
    construction year get a missing age.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric '건축년도' column. It is not modified.
    reference_date : optional
        Any single value ``pd.to_datetime`` can parse (string, ``datetime``,
        ``Timestamp``). Only its year is used. When omitted, the current
        date is used as before, which means the result changes from year to
        year; pass an explicit date to make a run reproducible.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra '건물연식' column.
    """
    if reference_date is None:
        reference_date = datetime.datetime.now()
    reference_year = pd.to_datetime(reference_date).year

    aged = df.copy()
    aged['건물연식'] = reference_year - aged['건축년도']
    return aged

In [21]:
# Add the '건물연식' column to the two frames that have a '건축년도' column.
yunrip_df = building_age(yunrip_df)
commerce_df = building_age(commerce_df)

In [22]:
# Unit price per ㎡ of '대지권면적(㎡)', then scaled by 3.305785 for the
# per-pyeong column '단가(평)'.
yunrip_df['단가(㎡)'] = yunrip_df['거래금액(만원)'].div(yunrip_df['대지권면적(㎡)'])
yunrip_df['단가(평)'] = yunrip_df['단가(㎡)'].mul(3.305785)

In [23]:
# '대지면적(㎡)' is converted element by element with float() first; then the
# unit price per ㎡ and its 3.305785-scaled per-pyeong counterpart are derived.
commerce_df['대지면적(㎡)'] = commerce_df['대지면적(㎡)'].apply(float)
commerce_df['단가(㎡)'] = commerce_df['거래금액(만원)'].div(commerce_df['대지면적(㎡)'])
commerce_df['단가(평)'] = commerce_df['단가(㎡)'].mul(3.305785)

In [24]:
# Unit price per ㎡ of '계약면적(㎡)', then scaled by 3.305785 for the
# per-pyeong column '단가(평)'.
land_df['단가(㎡)'] = land_df['거래금액(만원)'].div(land_df['계약면적(㎡)'])
land_df['단가(평)'] = land_df['단가(㎡)'].mul(3.305785)

In [25]:
# Rename '용도지역명1' to '용도지역', the column name the land and commercial
# frames already use.
yunrip_df.rename(columns={'용도지역명1':'용도지역'}, inplace=True)

In [26]:
def dorotype(x):
    """Translate a road-frontage label into a road-condition label.

    Missing values (as reported by ``pd.isna``) are returned unchanged.
    Otherwise ``x`` must be a string; the first keyword found in it, checked
    in the order listed below, decides the result. '-' is returned when none
    of the keywords occurs.
    """
    if pd.isna(x):
        return x

    # Order matters: earlier keywords take precedence over later ones.
    keyword_to_condition = (
        ('광대', '25m이상'),
        ('중로', '25m미만'),
        ('소로', '12m미만'),
        ('세로', '8m미만'),
    )
    for keyword, condition in keyword_to_condition:
        if keyword in x:
            return condition
    return '-'

In [27]:
# Derive '도로조건' from the land-spec column '도로접면' via dorotype; the land
# and commercial frames already have a '도로조건' column from their files.
yunrip_df['도로조건'] = yunrip_df['도로접면'].apply(dorotype)

In [28]:
# Tag every row with the dataset it came from via a constant '부동산유형' column.
yunrip_df['부동산유형'] = '연립다세대'
commerce_df['부동산유형'] = '상업업무'
land_df['부동산유형'] = '토지'

In [29]:
def day_modifier(x):
    """Return the day as text, prefixing one-character values with '0'.

    Missing values (as reported by ``pd.isna``) are returned unchanged. Any
    other value is converted with ``str``; text of length one gets a leading
    '0', and everything else is returned as that text.
    """
    if pd.isna(x):
        return x
    day = str(x)
    return day if len(day) != 1 else '0' + day

In [30]:
# Zero-pad the contract day in all three frames so it is two characters wide.
for frame in (yunrip_df, commerce_df, land_df):
    frame['계약일'] = frame['계약일'].apply(day_modifier)

In [31]:
# Convert '계약년월' to text in all three frames so it can be concatenated
# with the day string.
for frame in (yunrip_df, commerce_df, land_df):
    frame['계약년월'] = frame['계약년월'].apply(str)

In [32]:
# '계약날짜' starts as the text concatenation of '계약년월' and '계약일'.
for frame in (yunrip_df, commerce_df, land_df):
    frame['계약날짜'] = frame['계약년월'] + frame['계약일']

In [33]:
# Parse the '%Y%m%d' text in '계약날짜' into datetimes.
for frame in (yunrip_df, commerce_df, land_df):
    frame['계약날짜'] = pd.to_datetime(frame['계약날짜'], format='%Y%m%d')

In [34]:
# For the commercial and land frames, put the parsed date into the slot that
# '계약년월' occupies, drop the now-redundant '계약일' and trailing '계약날짜',
# and rename the slot to '계약날짜'. This keeps the date at the original
# '계약년월' column position.
for frame in (commerce_df, land_df):
    frame['계약년월'] = frame['계약날짜']
    frame.drop(columns=['계약일', '계약날짜'], inplace=True)
    frame.rename(columns={'계약년월':'계약날짜'}, inplace=True)

In [35]:
# The commercial files call the lot column '지번'; rename it to '번지' to match
# the yunrip and land frames.
commerce_df.rename(columns={'지번':'번지'}, inplace=True)

In [36]:
# Columns of the yunrip frame that are written out, in output order.
cols_to_keep = [
    '시군구', '번지', '대지권면적(㎡)', '계약날짜', '거래금액(만원)',
    '건축년도', '건물연식', '토지면적', '지목명', '용도지역',
    '도로접면', '도로조건', '단가(㎡)', '단가(평)', '부동산유형',
]
yunrip_df = yunrip_df.loc[:, cols_to_keep]

In [37]:
# Give the area column the same name, '대지면적(㎡)', that the commercial frame
# already uses, so all three outputs share it.
yunrip_df.rename(columns={'대지권면적(㎡)':'대지면적(㎡)'}, inplace=True)
land_df.rename(columns={'계약면적(㎡)':'대지면적(㎡)'}, inplace=True)

In [38]:
# Subsets of yunrip sales whose '건물연식' is at least 20 and at least 30.
yunrip_age = yunrip_df['건물연식']
yunrip_20_df = yunrip_df.loc[yunrip_age.ge(20)]
yunrip_30_df = yunrip_df.loc[yunrip_age.ge(30)]

In [39]:
# Subsets of commercial sales whose '건물연식' is at least 20 and at least 30.
commerce_age = commerce_df['건물연식']
commerce_20_df = commerce_df.loc[commerce_age.ge(20)]
commerce_30_df = commerce_df.loc[commerce_age.ge(30)]

In [40]:
# Write the yunrip tables. to_csv() does not create missing folders, so the
# output directory is created first (no error if it already exists).
yunrip_out_dir = basedir + '땅값분석/'
os.makedirs(yunrip_out_dir, exist_ok=True)
yunrip_df.to_csv(yunrip_out_dir + 'yunrip.csv', index=False)
yunrip_20_df.to_csv(yunrip_out_dir + 'yunrip_20.csv', index=False)
yunrip_30_df.to_csv(yunrip_out_dir + 'yunrip_30.csv', index=False)

In [41]:
# Write the commercial tables (full set, 20+ and 30+ building-age subsets)
# without the index column.
commerce_out_dir = basedir + '땅값분석/'
for out_name, out_frame in (
    ('commerce.csv', commerce_df),
    ('commerce_20.csv', commerce_20_df),
    ('commerce_30.csv', commerce_30_df),
):
    out_frame.to_csv(commerce_out_dir + out_name, index=False)

In [42]:
# Write the land table without the index column, next to the other outputs.
land_df.to_csv(basedir + '땅값분석/' + 'land.csv', index=False)