In [1]:
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False
pd.options.mode.chained_assignment = None  # default='warn'
plt.rcParams['font.family'] = 'Malgun Gothic'
import datetime

In [2]:
def day_modifier(x):
    # x is a number
    
    if pd.isna(x) == True:
        return x
    else:
        x = str(x)
        if len(x) == 1:
            return '0' + x
        else:
            return x

In [3]:
def landnum_modifier(x):
    # x is a string
    
    if pd.isna(x) == True:
        return x
    else:
        x = str(x).replace('외', '').replace(' ','')
        splitted = x.split('-')
        if len(splitted) == 1:
            return x + '-0'
        else:
            return x

In [4]:
def yunrip_data_prep():
    basedir = './국토교통부_실거래가_공개시스템/연립다세대/매매/'
    filenames = [f for f in os.listdir(basedir) if (f.endswith('.csv'))&('(' in f)]
    
    dfs_list = []
    for i, f in tqdm(enumerate(filenames)):
        df = pd.read_csv(basedir + f, encoding='euc-kr', header=15)
        '''if '해제사유발생일' in df.columns.tolist():
            df = df.drop(columns=['해제사유발생일'])'''

        dfs_list.append(df)
    
    concat_df = pd.concat(dfs_list).reset_index(drop=True)
        
    concat_df['번지'] = concat_df['번지'].apply(landnum_modifier)
    
    concat_df['계약년월'] = concat_df['계약년월'].apply(str)
    concat_df['계약일'] = concat_df['계약일'].apply(str)
    
    concat_df['건물연식'] = concat_df['계약년월'].apply(lambda x: int(x[:4])) - concat_df['건축년도']
    
    concat_df['계약일'] = concat_df['계약일'].apply(day_modifier)
    
    concat_df['계약날짜'] = concat_df['계약년월'].apply(lambda x: x[:4]) + '-' + concat_df['계약년월'].apply(lambda x: x[-2:])\
    + '-' + concat_df['계약일']
    
    concat_df['계약날짜'] = pd.to_datetime(concat_df['계약날짜'], format='%Y-%m-%d')
    
    concat_df['거래금액(만원)'] = concat_df['거래금액(만원)'].apply(lambda x: int(x.replace(',','')))
    concat_df['전용면적단가(만원/㎡)'] = concat_df['거래금액(만원)'] / concat_df['전용면적(㎡)']
    
    concat_df['지번주소'] = concat_df['시군구'] + ' ' + concat_df['번지']
    
    #concat_df['년'] = concat_df['계약날짜'].dt.year
    
    cols_to_drop = ['시군구', '번지', '본번', '부번', '지번주소', '계약년월', '계약일', '도로명', '해제사유발생일']
    
    #concat_df = concat_df[concat_df['년'] >= 2015]
    
    concat_df = concat_df[['지번주소'] + [col for col in concat_df.columns if col not in cols_to_drop] + ['해제사유발생일']]
    
    return concat_df

In [5]:
yunrip_df = yunrip_data_prep().reset_index(drop=True)
print(yunrip_df.shape)
yunrip_df.head()

16it [00:01, 11.63it/s]


(688816, 11)


Unnamed: 0,지번주소,건물명,전용면적(㎡),대지권면적(㎡),거래금액(만원),층,건축년도,건물연식,계약날짜,전용면적단가(만원/㎡),해제사유발생일
0,서울특별시 강남구 개포동 1264-3,(1264-3),53.28,29.23,11500,-1,1992.0,14.0,2006-08-29,215.840841,
1,서울특별시 강남구 개포동 171-13,(171-13),68.08,56.1,56500,2,1988.0,18.0,2006-12-20,829.905993,
2,서울특별시 강남구 개포동 1239-7,강남빌라 가동,52.59,45.0,40500,2,1988.0,18.0,2006-12-09,770.108386,
3,서울특별시 강남구 개포동 1239-7,강남빌라 가동,52.59,45.0,42000,1,1988.0,18.0,2006-12-19,798.630918,
4,서울특별시 강남구 개포동 1239-6,강남빌라 나동,52.75,44.68,22000,2,1988.0,18.0,2006-11-02,417.061611,


In [6]:
idx_list = []
for addr in tqdm(yunrip_df['지번주소'].unique()):
    addr_df = yunrip_df[yunrip_df['지번주소'] == addr]
    
    for date in addr_df['계약날짜'].unique():
        date_df = addr_df[addr_df['계약날짜'] == date]
        
        if date_df.shape[0] >= 10:
            [idx_list.append(idx) for idx in date_df.index]

100%|████████████████████████████████████████████████████████████████████████████| 80381/80381 [37:46<00:00, 35.46it/s]


In [7]:
len(idx_list)

45185

In [8]:
selected_df = yunrip_df.loc[idx_list,:]
print(selected_df.shape)
selected_df.head()

(45185, 11)


Unnamed: 0,지번주소,건물명,전용면적(㎡),대지권면적(㎡),거래금액(만원),층,건축년도,건물연식,계약날짜,전용면적단가(만원/㎡),해제사유발생일
470469,서울특별시 강남구 논현동 6-17,(6-17),50.16,15.96,33043,6,2003.0,14.0,2017-09-15,658.751994,
470470,서울특별시 강남구 논현동 6-17,(6-17),69.5,22.11,46364,4,2003.0,14.0,2017-09-15,667.107914,
470471,서울특별시 강남구 논현동 6-17,(6-17),50.16,15.96,33735,5,2003.0,14.0,2017-09-15,672.547847,
470472,서울특별시 강남구 논현동 6-17,(6-17),56.14,17.86,36849,6,2003.0,14.0,2017-09-15,656.376915,
470473,서울특별시 강남구 논현동 6-17,(6-17),29.81,9.48,20068,4,2003.0,14.0,2017-09-15,673.196914,


In [9]:
selected_df['지번주소'].nunique()

2895

In [10]:
selected_df.head(50)

Unnamed: 0,지번주소,건물명,전용면적(㎡),대지권면적(㎡),거래금액(만원),층,건축년도,건물연식,계약날짜,전용면적단가(만원/㎡),해제사유발생일
470469,서울특별시 강남구 논현동 6-17,(6-17),50.16,15.96,33043,6,2003.0,14.0,2017-09-15,658.751994,
470470,서울특별시 강남구 논현동 6-17,(6-17),69.5,22.11,46364,4,2003.0,14.0,2017-09-15,667.107914,
470471,서울특별시 강남구 논현동 6-17,(6-17),50.16,15.96,33735,5,2003.0,14.0,2017-09-15,672.547847,
470472,서울특별시 강남구 논현동 6-17,(6-17),56.14,17.86,36849,6,2003.0,14.0,2017-09-15,656.376915,
470473,서울특별시 강남구 논현동 6-17,(6-17),29.81,9.48,20068,4,2003.0,14.0,2017-09-15,673.196914,
470474,서울특별시 강남구 논현동 6-17,(6-17),49.67,15.8,33216,3,2003.0,14.0,2017-09-15,668.733642,
470475,서울특별시 강남구 논현동 6-17,(6-17),27.71,8.81,18684,3,2003.0,14.0,2017-09-15,674.269217,
470476,서울특별시 강남구 논현동 6-17,(6-17),84.85,26.99,55360,3,2003.0,14.0,2017-09-15,652.445492,
470477,서울특별시 강남구 논현동 6-17,(6-17),50.22,15.98,33735,4,2003.0,14.0,2017-09-15,671.744325,
470478,서울특별시 강남구 논현동 6-17,(6-17),56.14,17.86,37714,5,2003.0,14.0,2017-09-15,671.784824,


In [11]:
selected_df = selected_df[selected_df['해제사유발생일'].isna()]
print(selected_df.shape)
selected_df.head()

(44864, 11)


Unnamed: 0,지번주소,건물명,전용면적(㎡),대지권면적(㎡),거래금액(만원),층,건축년도,건물연식,계약날짜,전용면적단가(만원/㎡),해제사유발생일
470469,서울특별시 강남구 논현동 6-17,(6-17),50.16,15.96,33043,6,2003.0,14.0,2017-09-15,658.751994,
470470,서울특별시 강남구 논현동 6-17,(6-17),69.5,22.11,46364,4,2003.0,14.0,2017-09-15,667.107914,
470471,서울특별시 강남구 논현동 6-17,(6-17),50.16,15.96,33735,5,2003.0,14.0,2017-09-15,672.547847,
470472,서울특별시 강남구 논현동 6-17,(6-17),56.14,17.86,36849,6,2003.0,14.0,2017-09-15,656.376915,
470473,서울특별시 강남구 논현동 6-17,(6-17),29.81,9.48,20068,4,2003.0,14.0,2017-09-15,673.196914,


In [12]:
selected_df.to_csv('./국토교통부_실거래가_공개시스템/집값분석/일괄구매/일괄구매_전체.csv')

In [13]:
selected_df['단가평균'] = selected_df[['지번주소', '건축년도', '전용면적단가(만원/㎡)']].groupby(['지번주소', '건축년도']).transform('mean')['전용면적단가(만원/㎡)']

In [15]:
selected_df = selected_df.sort_values(['지번주소']).reset_index(drop=True)

In [17]:
selected_df['구'] = selected_df['지번주소'].apply(lambda x: x.split(' ')[1])
selected_df['동'] = selected_df['지번주소'].apply(lambda x: x.split(' ')[2])

In [18]:
selected_df['동'].nunique()

202

In [19]:
selected_df['동2'] = selected_df['구'] + '_' + selected_df['동']

In [20]:
selected_df2 = selected_df[['지번주소', '구', '동', '동2', '건물명', '건축년도', '계약날짜', '단가평균']].drop_duplicates().reset_index(drop=True)
print(selected_df2.shape)
selected_df2.head()

(3082, 8)


Unnamed: 0,지번주소,구,동,동2,건물명,건축년도,계약날짜,단가평균
0,서울특별시 강남구 개포동 1169-10,강남구,개포동,강남구_개포동,에이스빌라,2001.0,2007-04-10,288.782553
1,서울특별시 강남구 개포동 1174-11,강남구,개포동,강남구_개포동,지호빌라B,2016.0,2016-11-29,712.201887
2,서울특별시 강남구 개포동 1174-9,강남구,개포동,강남구_개포동,지호빌라A,2016.0,2016-11-29,730.644702
3,서울특별시 강남구 개포동 1184-0,강남구,개포동,강남구_개포동,몬테빌라,1997.0,2015-02-25,357.477464
4,서울특별시 강남구 개포동 1186-4,강남구,개포동,강남구_개포동,르비양 빌라,2016.0,2017-01-12,726.723569


In [21]:
basedir = './국토교통부_실거래가_공개시스템/집값분석/일괄구매/'
filenames = [f for f in os.listdir(basedir)]
filenames

['일괄구매_전체.csv']

In [22]:
for dong in tqdm(selected_df2['동2'].unique()):
    dong_df = selected_df2[selected_df2['동2'] == dong].sort_values(['계약날짜'], ascending=False).reset_index(drop=True)
    dong_df['지번주소'] = dong_df['지번주소'].apply(lambda x: x.split(' ')[3])
    dong_df = dong_df.drop(columns=['구', '동', '동2'])
    
    dong_df.to_excel(basedir + '{}.xlsx'.format(dong))

100%|████████████████████████████████████████████████████████████████████████████████| 203/203 [00:03<00:00, 65.70it/s]
