In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'

# Read Data

In [3]:
def file_reader(basedir, filename):
    """Load one raw export file into a DataFrame.

    The column header is read from row index 15 for ``.csv`` files and from
    row index 16 for ``.xlsx`` files; rows above the header are skipped.

    Parameters
    ----------
    basedir : str
        Directory prefix. It is concatenated with ``filename`` as-is, so it
        must already end with a path separator.
    filename : str
        Name of the file; must end in ``.csv`` or ``.xlsx``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.xlsx``. (Previously this
        printed 'error' and then failed with an UnboundLocalError.)
    """
    path = basedir + filename

    if filename.endswith('.csv'):
        try:
            return pd.read_csv(path, header=15)
        except UnicodeDecodeError:
            # Not decodable with the default encoding: retry as EUC-KR.
            # Only decode failures are retried, so unrelated errors (missing
            # file, Ctrl-C, ...) are no longer swallowed.
            return pd.read_csv(path, encoding='euc-kr', header=15)

    if filename.endswith('.xlsx'):
        # read_excel takes no `encoding` argument, so the old retry could only
        # replace the real error with a TypeError; read once and let genuine
        # errors surface.
        return pd.read_excel(path, header=16)

    raise ValueError('Unsupported file extension: {}'.format(filename))

In [4]:
def read_data(basedir='./국토교통부_실거래가_공개시스템/'):
    """Read every raw file under ``basedir`` and return six concatenated tables.

    Files are assigned to a table by file-name prefix and loaded with
    ``file_reader``; all files of one group are concatenated and re-indexed
    from 0. Files are read in the order ``os.listdir`` reports them.

    Parameters
    ----------
    basedir : str, optional
        Directory holding the raw files. Must end with a path separator
        because ``file_reader`` concatenates it with the file name. Defaults
        to the directory the notebook has always used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(apart_trade_df, apart_lease_df, multiplex_trade_df,
        multiplex_lease_df, officetel_trade_df, officetel_lease_df)``.

    Raises
    ------
    ValueError
        If no file matches one of the prefixes (same exception type that
        ``pd.concat`` raised for an empty list, with a clearer message).
    """
    # Order matters: it defines the order of the returned tuple.
    prefixes = (
        '아파트(매매)',
        '아파트(전월세)',
        '연립다세대(매매)',
        '연립다세대(전월세)',
        '오피스텔(매매)',
        '오피스텔(전월세)',
    )
    filenames = os.listdir(basedir)

    def load_group(prefix):
        # Load and stack all files whose name starts with `prefix`.
        frames = [
            file_reader(basedir, filename)
            for filename in filenames
            if filename.startswith(prefix)
        ]
        if not frames:
            raise ValueError(
                'No file starting with {!r} found in {!r}'.format(prefix, basedir)
            )
        return pd.concat(frames).reset_index(drop=True)

    return tuple(load_group(prefix) for prefix in prefixes)

In [5]:
%%time
# Load the six raw tables: trade and lease records for apartments, multiplex
# housing and officetels, in the order read_data() returns them.
apart_trade_df, apart_lease_df, multiplex_trade_df, multiplex_lease_df, officetel_trade_df, officetel_lease_df\
= read_data()

  if __name__ == '__main__':
  exec(code, glob, local_ns)


Wall time: 3min 20s


# From previous EDA and Preprocessing

In [6]:
# Keep only apartment lease rows that have a floor-area value ('전용면적(㎡)').
apart_lease_df = apart_lease_df.dropna(subset=['전용면적(㎡)'])

In [7]:
# Missing construction years in the multiplex trade table get the placeholder 1971
# (rationale not visible in this notebook -- presumably from the earlier EDA, TODO confirm).
multiplex_trade_df['건축년도'] = multiplex_trade_df['건축년도'].fillna(1971)
# Multiplex lease rows without a floor ('층') value are discarded.
multiplex_lease_df = multiplex_lease_df.dropna(subset=['층'])

In [8]:
# Replace blank road names ('도로명' equal to a single space) with null, for any table.

def replace_empty_to_null(df):
    """Return a copy of ``df`` whose single-space '도로명' entries are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a '도로명' column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
    """
    df = df.copy()

    # Select rows with a boolean mask instead of index labels: with repeated
    # index labels, label-based assignment would also blank out rows that
    # merely share a label with an empty one.
    df.loc[df['도로명'] == ' ', '도로명'] = np.nan

    return df

In [9]:
# Normalise blank road names ('도로명' == ' ') to NaN in all six tables.
apart_trade_df = replace_empty_to_null(apart_trade_df)
apart_lease_df = replace_empty_to_null(apart_lease_df)
multiplex_trade_df = replace_empty_to_null(multiplex_trade_df)
multiplex_lease_df = replace_empty_to_null(multiplex_lease_df)
officetel_trade_df = replace_empty_to_null(officetel_trade_df)
officetel_lease_df = replace_empty_to_null(officetel_lease_df)

In [10]:
# Missing construction years in the multiplex lease table get the placeholder 1971
# (rationale not visible in this notebook -- presumably from the earlier EDA, TODO confirm).
multiplex_lease_df['건축년도'] = multiplex_lease_df['건축년도'].fillna(1971)

In [11]:
# For rows whose construction year ('건축년도') is null: when other rows at the same
# address carry a year, fill the null from them; otherwise remove the row.
# Additionally reports whether a single address/complex name has two or more years.

def bltyear_fill_and_remove_null(df):
    """Impute missing '건축년도' values per address and drop what cannot be filled.

    An address is the concatenation of '시군구', '번지' and '단지명'. A missing
    year is replaced by the mean of the known years at the same address; rows
    still lacking a year afterwards are removed.

    Changes from the previous implementation:

    * Unfillable rows are always dropped. Previously, when no address could
      be filled, the frame was returned with its nulls intact.
    * The "two or more construction years" message is printed only when an
      address really has more than one distinct known year. Previously a
      single known year also triggered it, because its std is NaN.
    * Years are filled by plain assignment instead of ``fillna(inplace=True)``
      on a column of a slice, and addresses are grouped once instead of
      scanning the whole frame per address.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns '시군구', '번지', '단지명' and '건축년도'. Not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and no null '건축년도'.
    """
    # reset_index returns a new frame, so the caller's object is never touched,
    # and a unique index keeps the label-aligned assignment below unambiguous.
    df = df.reset_index(drop=True)

    year = df['건축년도']
    missing = year.isna()

    if missing.any():
        # NaN as soon as one of the three parts is NaN; such rows can never be
        # matched to another row (same as before) and fall through to the drop.
        addr = df['시군구'] + df['번지'] + df['단지명']

        known = year.notna() & addr.notna()
        by_addr = year[known].groupby(addr[known])
        mean_year = by_addr.mean()
        distinct_years = by_addr.nunique()

        # One message per address that is about to be filled from conflicting years.
        targets = addr[missing].dropna().unique()
        n_conflicts = int((distinct_years.reindex(targets) > 1).sum())
        for _ in range(n_conflicts):
            print('건축년도가 두 가지 이상인 주소가 있음.')

        # Addresses without any known year map to NaN and are dropped below.
        df.loc[missing, '건축년도'] = addr[missing].map(mean_year)

    return df[df['건축년도'].notna()].reset_index(drop=True)

In [12]:
# Fill missing construction years in the two officetel tables from rows that
# share the same address (only these two tables go through this step here).
officetel_trade_df = bltyear_fill_and_remove_null(officetel_trade_df)
officetel_lease_df = bltyear_fill_and_remove_null(officetel_lease_df)

In [13]:
def trade_dfs_change_dtype(df):
    """Return a copy of a trade table with numeric price, floor and year columns.

    '거래금액(만원)' is stringified, stripped of thousands separators and parsed
    with ``pd.to_numeric``; '층' and '건축년도' are cast to int.
    """
    converted = df.copy()

    price_text = converted['거래금액(만원)'].apply(str).str.replace(',', '')
    converted['거래금액(만원)'] = pd.to_numeric(price_text)

    for column in ('층', '건축년도'):
        converted[column] = converted[column].astype('int')

    return converted

In [14]:
# Convert price, floor and construction-year columns of the three trade tables to numeric types.
apart_trade_df = trade_dfs_change_dtype(apart_trade_df)
multiplex_trade_df = trade_dfs_change_dtype(multiplex_trade_df)
officetel_trade_df = trade_dfs_change_dtype(officetel_trade_df)

In [15]:
def lease_dfs_change_dtype(df):
    """Return a copy of a lease table with numeric money, floor and year columns.

    '보증금(만원)' and '월세(만원)' are stringified, stripped of thousands
    separators and parsed with ``pd.to_numeric``; '층' and '건축년도' are cast
    to int.
    """
    converted = df.copy()

    # Stringify both money columns first, then parse them, as the original did.
    money_columns = ('보증금(만원)', '월세(만원)')
    for column in money_columns:
        converted[column] = converted[column].apply(str)
    for column in money_columns:
        converted[column] = pd.to_numeric(converted[column].str.replace(',', ''))

    for column in ('층', '건축년도'):
        converted[column] = converted[column].astype('int')

    return converted

In [16]:
# Convert deposit, monthly-rent, floor and construction-year columns of the three lease tables to numeric types.
apart_lease_df = lease_dfs_change_dtype(apart_lease_df)
multiplex_lease_df = lease_dfs_change_dtype(multiplex_lease_df)
officetel_lease_df = lease_dfs_change_dtype(officetel_lease_df)

In [17]:
def price_per_area_and_year_month(df, mode='trade'):
    """Add price-per-area and contract year/month columns to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs '전용면적(㎡)', '계약년월' and the price column for ``mode``.
        Not modified.
    mode : {'trade', 'lease'}, default 'trade'
        'trade' divides '거래금액(만원)' by the area, 'lease' divides
        '보증금(만원)' by the area.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with '가격/면적', 'year' and 'month' added.

    Raises
    ------
    ValueError
        If ``mode`` is not 'trade' or 'lease'. Previously an unknown mode
        silently produced a frame without '가격/면적', which only failed
        later with a KeyError.
    """
    price_columns = {'trade': '거래금액(만원)', 'lease': '보증금(만원)'}
    if mode not in price_columns:
        raise ValueError(
            "mode must be 'trade' or 'lease', got {!r}".format(mode)
        )

    df = df.copy()

    df['가격/면적'] = df[price_columns[mode]] / df['전용면적(㎡)']

    # '계약년월' is rendered as text and sliced: first four characters are the
    # year, the remainder is the month.
    contract_ym = df['계약년월'].astype('str')
    df['year'] = pd.to_numeric(contract_ym.str[:4])
    df['month'] = pd.to_numeric(contract_ym.str[4:])

    return df

In [18]:
# Add '가격/면적', 'year' and 'month' to the trade tables (default mode 'trade' uses the sale price).
apart_trade_df = price_per_area_and_year_month(apart_trade_df)
multiplex_trade_df = price_per_area_and_year_month(multiplex_trade_df)
officetel_trade_df = price_per_area_and_year_month(officetel_trade_df)

In [19]:
# Same for the lease tables; mode='lease' computes the ratio from the deposit column.
apart_lease_df = price_per_area_and_year_month(apart_lease_df, mode='lease')
multiplex_lease_df = price_per_area_and_year_month(multiplex_lease_df, mode='lease')
officetel_lease_df = price_per_area_and_year_month(officetel_lease_df, mode='lease')

In [20]:
def si_gu(df):
    """Return a copy of ``df`` with '시' and '구' taken from '시군구'.

    '시군구' is split on single spaces; the first token becomes '시' and the
    second becomes '구'. Every value must therefore contain at least two tokens.
    """
    result = df.copy()

    # Split once and reuse the token lists for both new columns.
    tokens = result['시군구'].str.split(' ')
    result['시'] = tokens.apply(lambda parts: parts[0])
    result['구'] = tokens.apply(lambda parts: parts[1])

    return result

In [21]:
# Add '시' and '구' (first two space-separated tokens of '시군구') to all six tables.
apart_trade_df = si_gu(apart_trade_df)
apart_lease_df = si_gu(apart_lease_df)
multiplex_trade_df = si_gu(multiplex_trade_df)
multiplex_lease_df = si_gu(multiplex_lease_df)
officetel_trade_df = si_gu(officetel_trade_df)
officetel_lease_df = si_gu(officetel_lease_df)

In [22]:
def remove_zero_price(df, mode='trade'):
    """Return a re-indexed copy of ``df`` without zero-price rows.

    ``mode='trade'`` filters on '거래금액(만원)', ``mode='lease'`` on
    '보증금(만원)'. Any other mode returns an unfiltered copy.
    """
    if mode == 'trade':
        amount_column = '거래금액(만원)'
    elif mode == 'lease':
        amount_column = '보증금(만원)'
    else:
        # Unknown mode: hand back a plain copy, nothing is filtered.
        return df.copy()

    nonzero = df[amount_column] != 0
    return df[nonzero].reset_index(drop=True)

In [23]:
# Drop zero-price rows: sale price for the trade tables, deposit for the lease tables.
apart_trade_df = remove_zero_price(apart_trade_df)
apart_lease_df = remove_zero_price(apart_lease_df, mode='lease')
multiplex_trade_df = remove_zero_price(multiplex_trade_df)
multiplex_lease_df = remove_zero_price(multiplex_lease_df, mode='lease')
officetel_trade_df = remove_zero_price(officetel_trade_df)
officetel_lease_df = remove_zero_price(officetel_lease_df, mode='lease')

In [24]:
def remove_zero_area(df):
    """Return a re-indexed copy of ``df`` without rows whose '전용면적(㎡)' is 0."""
    has_area = df['전용면적(㎡)'].ne(0)
    return df.loc[has_area].reset_index(drop=True)

In [25]:
# Drop rows whose floor area ('전용면적(㎡)') is 0 from all six tables.
apart_trade_df = remove_zero_area(apart_trade_df)
apart_lease_df = remove_zero_area(apart_lease_df)
multiplex_trade_df = remove_zero_area(multiplex_trade_df)
multiplex_lease_df = remove_zero_area(multiplex_lease_df)
officetel_trade_df = remove_zero_area(officetel_trade_df)
officetel_lease_df = remove_zero_area(officetel_lease_df)

In [26]:
def correct_lease_type(df):
    """Return a copy of ``df`` whose '전월세구분' agrees with '월세(만원)'.

    Rows labelled '월세' with a monthly rent of 0 are relabelled '전세'; after
    that, rows labelled '전세' with a non-zero monthly rent are relabelled '월세'.
    """
    fixed = df.copy()
    no_rent = fixed['월세(만원)'] == 0

    # Pass 1: "monthly" contracts that carry no monthly rent.
    to_deposit = fixed.index[(fixed['전월세구분'] == '월세') & no_rent]
    fixed.loc[to_deposit, '전월세구분'] = '전세'

    # Pass 2 (evaluated on the labels left by pass 1): "deposit" contracts with rent.
    to_monthly = fixed.index[(fixed['전월세구분'] == '전세') & ~no_rent]
    fixed.loc[to_monthly, '전월세구분'] = '월세'

    return fixed

In [27]:
# Make the lease-type label consistent with the monthly-rent amount in the three lease tables.
apart_lease_df = correct_lease_type(apart_lease_df)
multiplex_lease_df = correct_lease_type(multiplex_lease_df)
officetel_lease_df = correct_lease_type(officetel_lease_df)

In [28]:
# Natural log of the price column (sale price for trade tables, deposit for lease tables).
# np.log is applied to the whole column in one vectorised call instead of once per row
# through a Python lambda, which matters on tables of this size.
apart_trade_df['target_log_transformed'] = np.log(apart_trade_df['거래금액(만원)'])
apart_lease_df['target_log_transformed'] = np.log(apart_lease_df['보증금(만원)'])
multiplex_trade_df['target_log_transformed'] = np.log(multiplex_trade_df['거래금액(만원)'])
multiplex_lease_df['target_log_transformed'] = np.log(multiplex_lease_df['보증금(만원)'])
officetel_trade_df['target_log_transformed'] = np.log(officetel_trade_df['거래금액(만원)'])
officetel_lease_df['target_log_transformed'] = np.log(officetel_lease_df['보증금(만원)'])

In [29]:
# Natural log of price per area ('가격/면적'), computed with one vectorised np.log call
# per table instead of a per-row Python lambda.
apart_trade_df['target/area_log_transformed'] = np.log(apart_trade_df['가격/면적'])
apart_lease_df['target/area_log_transformed'] = np.log(apart_lease_df['가격/면적'])
multiplex_trade_df['target/area_log_transformed'] = np.log(multiplex_trade_df['가격/면적'])
multiplex_lease_df['target/area_log_transformed'] = np.log(multiplex_lease_df['가격/면적'])
officetel_trade_df['target/area_log_transformed'] = np.log(officetel_trade_df['가격/면적'])
officetel_lease_df['target/area_log_transformed'] = np.log(officetel_lease_df['가격/면적'])

In [30]:
def get_dong(x):
    """Return the part of ``x`` that precedes its first ASCII digit.

    Parameters
    ----------
    x : str
        Address text, e.g. a '시군구' value.

    Returns
    -------
    str
        ``x`` cut just before the first character in ``0-9``, or ``x``
        unchanged when it contains no digit.

    Notes
    -----
    The previous version tried the digits in the order 0, 1, ..., 9 and cut at
    the first one it found, i.e. at the lowest-valued digit present rather
    than at the leftmost one ('a2b1' gave 'a2b'). Strings holding a single
    distinct digit are unaffected by the fix.
    """
    for position, char in enumerate(x):
        # Explicit ASCII set: str.isdigit() would also accept other Unicode digits.
        if char in '0123456789':
            return x[:position]

    return x

In [31]:
# Derive the '동' column (the '시군구' text before its first digit) on every table, in place.
for frame in (
    apart_trade_df,
    apart_lease_df,
    multiplex_trade_df,
    multiplex_lease_df,
    officetel_trade_df,
    officetel_lease_df,
):
    frame['동'] = frame['시군구'].apply(get_dong)

In [33]:
# Split each lease table by '전월세구분': '전세' rows go to *_deposit_only_df,
# '월세' rows to *_monthly_pay_df; each part gets a fresh 0..n-1 index.
apart_lease_deposit_only_df = apart_lease_df.loc[apart_lease_df['전월세구분'].eq('전세')].reset_index(drop=True)
apart_lease_monthly_pay_df = apart_lease_df.loc[apart_lease_df['전월세구분'].eq('월세')].reset_index(drop=True)

multiplex_lease_deposit_only_df = multiplex_lease_df.loc[multiplex_lease_df['전월세구분'].eq('전세')].reset_index(drop=True)
multiplex_lease_monthly_pay_df = multiplex_lease_df.loc[multiplex_lease_df['전월세구분'].eq('월세')].reset_index(drop=True)

officetel_lease_deposit_only_df = officetel_lease_df.loc[officetel_lease_df['전월세구분'].eq('전세')].reset_index(drop=True)
officetel_lease_monthly_pay_df = officetel_lease_df.loc[officetel_lease_df['전월세구분'].eq('월세')].reset_index(drop=True)

In [32]:
# Find and remove outliers per dong using Tukey's fences (on price).

def find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(df, verbose=False):
    """Drop rows whose log price-per-area lies outside Tukey's fences of their dong.

    For every distinct '동' value:

    1. Each contract year is resampled with replacement (``random_state=42``)
       up to the row count of that dong's busiest year, so every year weighs
       the same in the quantiles.
    2. Q1 and Q3 of 'target/area_log_transformed' are taken from that
       resampled set and the fences are set at 1.5 * IQR beyond them.
    3. Original rows of the dong outside the fences are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns '동', 'year' and 'target/area_log_transformed'.
        Not modified.
    verbose : bool, default False
        Also print per-dong details.

    Returns
    -------
    pandas.DataFrame
        Surviving rows, grouped by dong in order of first appearance, with a
        fresh 0..n-1 index. The total number of removed rows is always printed.
    """
    target = 'target/area_log_transformed'
    frame = df.copy()

    removed_total = 0
    kept_parts = []
    for dong in frame['동'].unique():
        dong_rows = frame[frame['동'] == dong]

        # value_counts() is sorted by count, so entry 0 is the busiest year.
        trades_per_year = dong_rows['year'].value_counts()
        busiest_count = trades_per_year.max()
        if verbose:
            print('{}에서 거래가 가장 많았던 해는 {}년이며, 거래량은 {}회 입니다.'.format(dong, trades_per_year.index[0], busiest_count))

        # Year-balanced sample used only to estimate the quartiles.
        balanced = pd.concat([
            dong_rows[dong_rows['year'] == year].sample(n=busiest_count, replace=True, random_state=42)
            for year in trades_per_year.index
        ])

        lower_q = balanced[target].quantile(0.25)
        upper_q = balanced[target].quantile(0.75)
        fence = (upper_q - lower_q) * 1.5

        values = dong_rows[target]
        beyond_fences = (values < lower_q - fence) | (values > upper_q + fence)
        outlier_index = dong_rows.index[beyond_fences]
        n_outliers = len(outlier_index)

        kept_parts.append(dong_rows.drop(outlier_index))

        if verbose:
            print('{}에서 {}개의 이상치를 찾아서 제거했습니다.'.format(dong, n_outliers))

        removed_total += n_outliers

    print('------------- 총 {}개의 이상치를 찾아서 제거했습니다. ----------------'.format(removed_total))

    return pd.concat(kept_parts).reset_index(drop=True)

In [34]:
# Remove per-dong Tukey outliers from each of the nine tables (three trade tables plus
# the deposit-only / monthly-pay lease splits). Each call prints its own removal total.
apart_trade_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(apart_trade_df)
apart_lease_deposit_only_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(apart_lease_deposit_only_df)
apart_lease_monthly_pay_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(apart_lease_monthly_pay_df)
multiplex_trade_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(multiplex_trade_df)
multiplex_lease_deposit_only_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(multiplex_lease_deposit_only_df)
multiplex_lease_monthly_pay_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(multiplex_lease_monthly_pay_df)
officetel_trade_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(officetel_trade_df)
officetel_lease_deposit_only_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(officetel_lease_deposit_only_df)
officetel_lease_monthly_pay_df = find_and_remove_Tukey_outliers_ver2_price_per_area_log_per_dong(officetel_lease_monthly_pay_df)

------------- 총 48297개의 이상치를 찾아서 제거했습니다. ----------------
------------- 총 14366개의 이상치를 찾아서 제거했습니다. ----------------
------------- 총 1831개의 이상치를 찾아서 제거했습니다. ----------------
------------- 총 29133개의 이상치를 찾아서 제거했습니다. ----------------
------------- 총 9181개의 이상치를 찾아서 제거했습니다. ----------------
------------- 총 3342개의 이상치를 찾아서 제거했습니다. ----------------
------------- 총 4297개의 이상치를 찾아서 제거했습니다. ----------------
------------- 총 2403개의 이상치를 찾아서 제거했습니다. ----------------
------------- 총 732개의 이상치를 찾아서 제거했습니다. ----------------


# EDA & Preprocessing 13

In [38]:
# Inspect the apartment trade table: dimensions, then the first rows.
print(apart_trade_df.shape)
apart_trade_df.head()

(1119895, 20)


Unnamed: 0,시군구,번지,본번,부번,단지명,전용면적(㎡),계약년월,계약일,거래금액(만원),층,건축년도,도로명,가격/면적,year,month,시,구,target_log_transformed,target/area_log_transformed,동
0,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),77.75,201309,8,57000,2,1988,언주로 103,733.118971,2013,9,서울특별시,강남구,10.950807,6.597308,서울특별시 강남구 개포동
1,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),77.75,201312,16,57000,2,1988,언주로 103,733.118971,2013,12,서울특별시,강남구,10.950807,6.597308,서울특별시 강남구 개포동
2,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,67.28,201302,11,55000,5,1987,언주로 3,817.479191,2013,2,서울특별시,강남구,10.915088,6.706225,서울특별시 강남구 개포동
3,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,67.28,201302,22,58250,4,1987,언주로 3,865.78478,2013,2,서울특별시,강남구,10.972499,6.763636,서울특별시 강남구 개포동
4,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,67.28,201305,10,60000,5,1987,언주로 3,891.795482,2013,5,서울특별시,강남구,11.0021,6.793237,서울특별시 강남구 개포동


In [39]:
# Inspect the apartment deposit-only lease table: dimensions, then the first rows.
print(apart_lease_deposit_only_df.shape)
apart_lease_deposit_only_df.head()

(1146198, 22)


Unnamed: 0,시군구,번지,본번,부번,단지명,전월세구분,전용면적(㎡),계약년월,계약일,보증금(만원),월세(만원),층,건축년도,도로명,가격/면적,year,month,시,구,target_log_transformed,target/area_log_transformed,동
0,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),전세,77.75,201101,5,35000,0,7,1988,언주로 103,450.160772,2011,1,서울특별시,강남구,10.463103,6.109605,서울특별시 강남구 개포동
1,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),전세,77.75,201101,18,20000,0,8,1988,언주로 103,257.234727,2011,1,서울특별시,강남구,9.903488,5.549989,서울특별시 강남구 개포동
2,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),전세,77.75,201102,1,24000,0,5,1988,언주로 103,308.681672,2011,2,서울특별시,강남구,10.085809,5.732311,서울특별시 강남구 개포동
3,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),전세,77.75,201102,11,31000,0,9,1988,언주로 103,398.713826,2011,2,서울특별시,강남구,10.341742,5.988244,서울특별시 강남구 개포동
4,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),전세,77.75,201102,24,30500,0,9,1988,언주로 103,392.282958,2011,2,서울특별시,강남구,10.325482,5.971983,서울특별시 강남구 개포동


In [40]:
# Give the trade tables the two columns the lease tables have: a contract-type
# label (constant '매매') and a monthly-rent column (constant 0).
for trade_frame in (apart_trade_df, multiplex_trade_df, officetel_trade_df):
    trade_frame['전월세매매구분'] = '매매'
    trade_frame['월세(만원)'] = 0

In [41]:
# Compare shapes of the apartment trade and deposit-only lease tables
# (the column counts are what is being checked here).
print(apart_trade_df.shape)
print(apart_lease_deposit_only_df.shape)

(1119895, 22)
(1146198, 22)


In [44]:
# Unify column names across tables: the price column becomes '금액(만원)' everywhere,
# and the lease tables' '전월세구분' becomes '전월세매매구분'.
for trade_frame in (apart_trade_df, multiplex_trade_df, officetel_trade_df):
    trade_frame.rename(columns={'거래금액(만원)': '금액(만원)'}, inplace=True)

for lease_frame in (
    apart_lease_deposit_only_df,
    apart_lease_monthly_pay_df,
    multiplex_lease_deposit_only_df,
    multiplex_lease_monthly_pay_df,
    officetel_lease_deposit_only_df,
    officetel_lease_monthly_pay_df,
):
    lease_frame.rename(columns={'보증금(만원)': '금액(만원)', '전월세구분': '전월세매매구분'}, inplace=True)

In [55]:
# Rename '단지명' to '건물명' on the apartment and officetel tables. The multiplex
# tables are not touched here (presumably they already use '건물명' -- TODO confirm).
for frame in (
    apart_trade_df,
    officetel_trade_df,
    apart_lease_deposit_only_df,
    apart_lease_monthly_pay_df,
    officetel_lease_deposit_only_df,
    officetel_lease_monthly_pay_df,
):
    frame.rename(columns={'단지명': '건물명'}, inplace=True)

In [56]:
# Reorder the trade tables' columns to the layout of the apartment deposit-only lease
# table. Selecting by that column list raises KeyError if a trade table lacks any of them.
apart_trade_df = apart_trade_df[apart_lease_deposit_only_df.columns]
multiplex_trade_df = multiplex_trade_df[apart_lease_deposit_only_df.columns]
officetel_trade_df = officetel_trade_df[apart_lease_deposit_only_df.columns]
officetel_trade_df.head()

Unnamed: 0,시군구,번지,본번,부번,건물명,전월세매매구분,전용면적(㎡),계약년월,계약일,금액(만원),월세(만원),층,건축년도,도로명,가격/면적,year,month,시,구,target_log_transformed,target/area_log_transformed,동
0,서울특별시 강남구 개포동,157-9,157,9,\t(157-9)\t,매매,29.25,201310,15,16750,0,3,2011,선릉로4길 19,572.649573,2013,10,서울특별시,강남구,9.726154,6.350274,서울특별시 강남구 개포동
1,서울특별시 강남구 개포동,13-3,13,3,대청타워,매매,35.22,201301,10,15600,0,7,1997,개포로 623,442.930153,2013,1,서울특별시,강남구,9.655026,6.093412,서울특별시 강남구 개포동
2,서울특별시 강남구 개포동,13-3,13,3,대청타워,매매,32.44,201301,10,15500,0,6,1997,개포로 623,477.805179,2013,1,서울특별시,강남구,9.648595,6.169203,서울특별시 강남구 개포동
3,서울특별시 강남구 개포동,13-3,13,3,대청타워,매매,31.91,201301,24,15600,0,23,1997,개포로 623,488.874961,2013,1,서울특별시,강남구,9.655026,6.192107,서울특별시 강남구 개포동
4,서울특별시 강남구 개포동,13-3,13,3,대청타워,매매,43.24,201301,29,22000,0,10,1997,개포로 623,508.788159,2013,1,서울특별시,강남구,9.998798,6.232032,서울특별시 강남구 개포동


In [57]:
# Tag every table with its building type ('건물종류') and build 'trade_type' as
# '<building type>_<contract type>' from '건물종류' and '전월세매매구분'.
for frame, building_type in (
    (apart_trade_df, '아파트'),
    (multiplex_trade_df, '다세대연립'),
    (officetel_trade_df, '오피스텔'),
    (apart_lease_deposit_only_df, '아파트'),
    (apart_lease_monthly_pay_df, '아파트'),
    (multiplex_lease_deposit_only_df, '다세대연립'),
    (multiplex_lease_monthly_pay_df, '다세대연립'),
    (officetel_lease_deposit_only_df, '오피스텔'),
    (officetel_lease_monthly_pay_df, '오피스텔'),
):
    frame['건물종류'] = building_type
    frame['trade_type'] = frame['건물종류'] + '_' + frame['전월세매매구분']

In [58]:
# Show the first rows of the apartment trade table with the new '건물종류' and 'trade_type' columns.
apart_trade_df.head()

Unnamed: 0,시군구,번지,본번,부번,건물명,전월세매매구분,전용면적(㎡),계약년월,계약일,금액(만원),월세(만원),층,건축년도,도로명,가격/면적,year,month,시,구,target_log_transformed,target/area_log_transformed,동,건물종류,trade_type
0,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),매매,77.75,201309,8,57000,0,2,1988,언주로 103,733.118971,2013,9,서울특별시,강남구,10.950807,6.597308,서울특별시 강남구 개포동,아파트,아파트_매매
1,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),매매,77.75,201312,16,57000,0,2,1988,언주로 103,733.118971,2013,12,서울특별시,강남구,10.950807,6.597308,서울특별시 강남구 개포동,아파트,아파트_매매
2,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,매매,67.28,201302,11,55000,0,5,1987,언주로 3,817.479191,2013,2,서울특별시,강남구,10.915088,6.706225,서울특별시 강남구 개포동,아파트,아파트_매매
3,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,매매,67.28,201302,22,58250,0,4,1987,언주로 3,865.78478,2013,2,서울특별시,강남구,10.972499,6.763636,서울특별시 강남구 개포동,아파트,아파트_매매
4,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,매매,67.28,201305,10,60000,0,5,1987,언주로 3,891.795482,2013,5,서울특별시,강남구,11.0021,6.793237,서울특별시 강남구 개포동,아파트,아파트_매매
