In [1]:
import multiprocess as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'
import gc

In [2]:
def bonbubun_allocator(x):
    """Normalise a main/sub building number to an integer string.

    Parameters
    ----------
    x : scalar
        Raw cell value, e.g. ``12``, ``12.0`` or ``'12'``.

    Returns
    -------
    object
        * ``x`` unchanged when ``pd.isna(x)`` is true,
        * ``str(int(x))`` when ``x`` converts to an integer,
        * ``np.nan`` when the conversion fails (e.g. ``'abc'`` or infinity).
    """
    try:
        if pd.isna(x):
            return x
        return str(int(x))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to NaN; interrupts and other
        # unrelated errors are allowed to propagate.
        return np.nan

In [3]:
def buildinggroup_modifier(x):
    """Convert ``x`` to ``int`` when possible, otherwise return it unchanged.

    Parameters
    ----------
    x : object
        Raw cell value.

    Returns
    -------
    object
        ``int(x)`` if the conversion succeeds, else the original ``x``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Values that cannot be converted are passed through untouched;
        # interrupts and unrelated errors are not swallowed.
        return x

In [4]:
def read_coor_data1():
    """Load road-address coordinates from the ``.txt`` files in ``./좌표데이터/``.

    Each file is read as a headerless, ``|``-separated table. The road address
    is assembled as ``"<col 3> <col 4> <col 7> <col 9>-<col 10>"`` (columns 9
    and 10 are normalised with ``bonbubun_allocator``); columns 16 and 17 are
    taken as the x and y coordinates.

    Returns
    -------
    pandas.DataFrame
        Columns ``도로명주소``, ``x좌표``, ``y좌표``. Rows containing any missing
        value are dropped, and for duplicated addresses only the last
        occurrence (in ``os.listdir`` order) is kept.

    Raises
    ------
    ValueError
        From ``pd.concat`` when the directory contains no ``.txt`` file.
    """
    basedir = './좌표데이터/'
    usecols = [3, 4, 7, 9, 10, 16, 17]
    filenames = [f for f in os.listdir(basedir) if f.endswith('.txt')]

    dfs_list = []

    for filename in tqdm(filenames, position=0):
        path = os.path.join(basedir, filename)
        try:
            df = pd.read_csv(path, sep='|', header=None, usecols=usecols)
        except UnicodeDecodeError:
            # The file is not valid in the default encoding: retry as EUC-KR.
            # Any other read error is a real problem and is not hidden.
            df = pd.read_csv(path, sep='|', header=None, encoding='euc-kr', usecols=usecols)

        df['도로명주소'] = (
            df[3] + ' ' + df[4] + ' ' + df[7] + ' '
            + df[9].apply(bonbubun_allocator) + '-' + df[10].apply(bonbubun_allocator)
        )
        df['x좌표'] = df[16]
        df['y좌표'] = df[17]

        dfs_list.append(df[['도로명주소', 'x좌표', 'y좌표']])

    concat_df = (
        pd.concat(dfs_list)
        .dropna()
        .drop_duplicates(subset=['도로명주소'], keep='last')
        .reset_index(drop=True)
    )

    return concat_df

In [5]:
def read_coor_data2():
    """Load road-address coordinates from the ``.txt`` files in ``./좌표데이터2/``.

    Each file is read as a headerless, ``|``-separated table. The road address
    is assembled as ``"<col 1> <col 2> <col 5> <col 7>-<col 8>"`` (columns 7
    and 8 are normalised with ``bonbubun_allocator``); columns 23 and 24 are
    taken as the x and y coordinates.

    Returns
    -------
    pandas.DataFrame
        Columns ``도로명주소``, ``x좌표``, ``y좌표``. Rows containing any missing
        value are dropped, and for duplicated addresses only the last
        occurrence (in ``os.listdir`` order) is kept.

    Raises
    ------
    ValueError
        From ``pd.concat`` when the directory contains no ``.txt`` file.
    """
    basedir = './좌표데이터2/'
    usecols = [1, 2, 5, 7, 8, 23, 24]
    filenames = [f for f in os.listdir(basedir) if f.endswith('.txt')]

    dfs_list = []

    for filename in tqdm(filenames, position=0):
        path = os.path.join(basedir, filename)
        try:
            df = pd.read_csv(path, sep='|', header=None, usecols=usecols)
        except UnicodeDecodeError:
            # 'ansi' is only a codec alias on Windows (the system code page);
            # name the Korean code page explicitly so the fallback is portable.
            # NOTE(review): assumes the files are CP949-encoded -- confirm.
            df = pd.read_csv(path, sep='|', header=None, encoding='cp949', usecols=usecols)

        df['도로명주소'] = (
            df[1] + ' ' + df[2] + ' ' + df[5] + ' '
            + df[7].apply(bonbubun_allocator) + '-' + df[8].apply(bonbubun_allocator)
        )
        df['x좌표'] = df[23]
        df['y좌표'] = df[24]

        dfs_list.append(df[['도로명주소', 'x좌표', 'y좌표']])

    concat_df = (
        pd.concat(dfs_list)
        .dropna()
        .drop_duplicates(subset=['도로명주소'], keep='last')
        .reset_index(drop=True)
    )

    return concat_df

In [6]:
def read_coor_data3():
    """Load the coordinate CSV files stored in ``./좌표데이터3/``.

    Every ``.csv`` file is read with its first column as the index. The
    combined table drops rows with missing values, keeps the last row for each
    duplicated ``전체주소`` and copies ``전체주소`` into both ``지번주소`` and
    ``도로명주소``.

    Returns
    -------
    pandas.DataFrame
        Columns ``지번주소``, ``도로명주소``, ``x좌표``, ``y좌표``.
    """
    basedir = './좌표데이터3/'
    csv_names = [name for name in os.listdir(basedir) if name.endswith('.csv')]

    frames = [
        pd.read_csv(basedir + name, index_col=0)
        for name in tqdm(csv_names, position=0)
    ]

    merged = pd.concat(frames)
    merged = merged.dropna()
    merged = merged.drop_duplicates(subset=['전체주소'], keep='last')
    merged = merged.reset_index(drop=True)

    # The full address serves as both the lot-number and the road-name key.
    for target in ('지번주소', '도로명주소'):
        merged[target] = merged['전체주소'].copy()

    return merged[['지번주소', '도로명주소', 'x좌표', 'y좌표']]

In [7]:
def add_0(x):
    """Append ``'-0'`` to a string that contains no ``'-'``.

    Missing values (per ``pd.isnull``) and strings that already contain a
    hyphen are returned unchanged.
    """
    has_value = pd.isnull(x) == False
    if has_value:
        pieces = x.split('-')
        return x + '-0' if len(pieces) == 1 else x
    return x

In [8]:
def read_coor_data():
    """Combine the three coordinate sources into one lookup table.

    The frames from ``read_coor_data1``, ``read_coor_data2`` and
    ``read_coor_data3`` are stacked in that order. For a road address that
    appears more than once the last occurrence wins, so later sources take
    precedence. The first two sources have no ``지번주소`` column, so their
    rows carry NaN there.

    Returns
    -------
    pandas.DataFrame
        Columns ``지번주소``, ``도로명주소``, ``x좌표``, ``y좌표``.
    """
    coor_df1 = read_coor_data1()
    coor_df2 = read_coor_data2()
    coor_df3 = read_coor_data3()

    # The inputs do not share the same columns; sort=False avoids the pandas
    # FutureWarning about column sorting. The final column order is fixed by
    # the explicit selection below.
    coor_df = (
        pd.concat([coor_df1, coor_df2, coor_df3], sort=False)
        .drop_duplicates(subset=['도로명주소'], keep='last')
        .reset_index(drop=True)
    )
    coor_df = coor_df[['지번주소', '도로명주소', 'x좌표', 'y좌표']]

    return coor_df

In [9]:
# Build the merged coordinate lookup table from the three source folders.
coor_df = read_coor_data()
print(coor_df.shape)
# Preview the first rows of the merged table.
coor_df.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:32<00:00,  1.26s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:40<00:00,  1.60s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 111.49it/s]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


(657309, 4)


Unnamed: 0,지번주소,도로명주소,x좌표,y좌표
0,,서울특별시 관악구 난곡로24가길 18-0,948851.369529,1941318.0
1,,서울특별시 종로구 성균관로15길 33-0,955591.635372,1954533.0
2,,서울특별시 성북구 인촌로7길 70-0,957563.072605,1954475.0
3,,서울특별시 서초구 강남대로91길 5-0,957617.535913,1945960.0
4,,서울특별시 서초구 바우뫼로11길 54-0,958121.727726,1941641.0


In [11]:
# Load the prepared transaction table (version 2.2) from disk.
main_data_df = pd.read_csv('./prepped_data/main_data_ver_2.2.csv')
print(main_data_df.shape)
# Preview the first rows.
main_data_df.head()

(4559866, 27)


Unnamed: 0,시군구,번지,본번,부번,건물명,전월세매매구분,전용면적(㎡),계약년월,계약일,금액(만원),월세(만원),층,건축년도,도로명,가격/면적,year,month,시,구,target_log_transformed,target/area_log_transformed,동,건물종류,trade_type,지번주소,대장구분코드,대장구분명
0,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),매매,77.75,201309,8,57000,0,2,1988,언주로 103,733.118971,2013,9,서울특별시,강남구,10.950807,6.597308,서울특별시 강남구 개포동,아파트,아파트_매매,서울특별시 강남구 개포동 655-2,1,일반
1,서울특별시 강남구 개포동,655-2,655,2,개포2차현대아파트(220),매매,77.75,201312,16,57000,0,2,1988,언주로 103,733.118971,2013,12,서울특별시,강남구,10.950807,6.597308,서울특별시 강남구 개포동,아파트,아파트_매매,서울특별시 강남구 개포동 655-2,1,일반
2,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,매매,67.28,201302,11,55000,0,5,1987,언주로 3,817.479191,2013,2,서울특별시,강남구,10.915088,6.706225,서울특별시 강남구 개포동,아파트,아파트_매매,서울특별시 강남구 개포동 658-1,1,일반
3,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,매매,67.28,201302,22,58250,0,4,1987,언주로 3,865.78478,2013,2,서울특별시,강남구,10.972499,6.763636,서울특별시 강남구 개포동,아파트,아파트_매매,서울특별시 강남구 개포동 658-1,1,일반
4,서울특별시 강남구 개포동,658-1,658,1,개포6차우성아파트1동~8동,매매,67.28,201305,10,60000,0,5,1987,언주로 3,891.795482,2013,5,서울특별시,강남구,11.0021,6.793237,서울특별시 강남구 개포동,아파트,아파트_매매,서울특별시 강남구 개포동 658-1,1,일반


In [12]:
def roadname_modifier(x):
    """Make sure a road-name string carries an explicit ``-<sub>`` suffix.

    Parameters
    ----------
    x : str or missing value
        Road name with building number, e.g. ``'언주로 103'``.

    Returns
    -------
    object
        ``x`` unchanged when it is missing or already contains exactly one
        ``'-'``; ``x + '-0'`` when it contains no ``'-'``.

    Raises
    ------
    ValueError
        If ``x`` contains more than one ``'-'``. The message includes the
        offending value so it can be located in the data.
    """
    if pd.isna(x):
        return x

    hyphen_count = x.count('-')
    if hyphen_count == 0:
        return x + '-0'
    if hyphen_count == 1:
        return x
    raise ValueError("road name contains more than one '-': {!r}".format(x))
            

In [13]:
# Full road address: "<시> <구> <도로명 with an explicit '-<sub>' suffix>".
main_data_df['도로명주소'] = (
    main_data_df['시']
    + ' ' + main_data_df['구']
    + ' ' + main_data_df['도로명'].map(roadname_modifier)
)

In [14]:
#main_data_df.to_csv('main_data_ver_2.3.csv', index=False)

In [13]:
# Transactions whose lot-number address appears in either address column of
# the coordinate table.
selected_df = main_data_df.loc[
    main_data_df['지번주소'].isin(coor_df['지번주소'].unique())
    | main_data_df['지번주소'].isin(coor_df['도로명주소'].unique())
]

In [14]:
# Remaining transactions: those not matched through the lot-number address.
no_jibun_df = main_data_df.drop(index=selected_df.index)

In [15]:
# Second pass: match the leftover transactions through their road address.
reselected_df = no_jibun_df.loc[
    no_jibun_df['도로명주소'].isin(coor_df['지번주소'].unique())
    | no_jibun_df['도로명주소'].isin(coor_df['도로명주소'].unique())
]

In [16]:
# Transactions that matched no coordinate row under either address.
no_coor_df = no_jibun_df.drop(index=reselected_df.index)

In [18]:
# One row per distinct lot-number address that still has no coordinates.
no_coor_df2 = pd.Series(no_coor_df['지번주소'].unique(), name='지번주소').to_frame()

In [19]:
# Save the unmatched addresses without the index column.
no_coor_df2.to_csv(path_or_buf='no_coor_addrs.csv', index=False)

In [20]:
# Preview the coordinate table again before filtering it against the main data.
coor_df.head()

Unnamed: 0,지번주소,도로명주소,x좌표,y좌표
0,,서울특별시 관악구 난곡로24가길 18-0,948851.369529,1941318.0
1,,서울특별시 종로구 성균관로15길 33-0,955591.635372,1954533.0
2,,서울특별시 성북구 인촌로7길 70-0,957563.072605,1954475.0
3,,서울특별시 서초구 강남대로91길 5-0,957617.535913,1945960.0
4,,서울특별시 서초구 바우뫼로11길 54-0,958121.727726,1941641.0


In [22]:
# Reverse direction: coordinate rows whose lot-number address appears in
# either address column of the transaction table.
# (Note: this reuses the name `selected_df` from the earlier pass.)
selected_df = coor_df.loc[
    coor_df['지번주소'].isin(main_data_df['지번주소'].unique())
    | coor_df['지번주소'].isin(main_data_df['도로명주소'].unique())
]

In [23]:
# Coordinate rows not matched through their lot-number address.
no_jibun_df = coor_df.drop(index=selected_df.index)

In [24]:
# Second pass: match the leftover coordinate rows through their road address.
reselected_df = no_jibun_df.loc[
    no_jibun_df['도로명주소'].isin(main_data_df['지번주소'].unique())
    | no_jibun_df['도로명주소'].isin(main_data_df['도로명주소'].unique())
]

In [26]:
# Coordinate rows that correspond to no transaction under either address.
no_addr_df = no_jibun_df.drop(index=reselected_df.index)
print(no_addr_df.shape)

(6274, 4)


In [27]:
# Inspect a few of the coordinate rows that matched no transaction.
no_addr_df.head()

Unnamed: 0,지번주소,도로명주소,x좌표,y좌표
566442,서울특별시 서초구 고무래로,서울특별시 서초구 고무래로,956676.746259,1945089.0
566448,서울특별시 강남구 260,서울특별시 강남구 260,960294.041765,1942320.0
566450,서울특별시 마포구 망원동 521,서울특별시 마포구 망원동 521,947232.206233,1950512.0
566451,서울특별시 강남구 개포로,서울특별시 강남구 개포로,962615.409421,1943838.0
566452,서울특별시 강남구 310,서울특별시 강남구 310,958582.891361,1943629.0


In [30]:
# Remove the coordinate rows that matched no transaction.
# The result is rebound instead of dropped in place, and labels that are
# already gone are ignored, so re-running this cell does not raise KeyError.
print(coor_df.shape)
coor_df = coor_df.drop(index=no_addr_df.index, errors='ignore')
coor_df.shape

(657309, 4)


(651035, 4)

In [31]:
# Save the filtered coordinate table without the index column.
coor_df.to_csv(path_or_buf='coor_data_ver_1.csv', index=False)