In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Load the saved list of addresses to geocode; fetch_coor_data reads its '주소' column.
addr_df = pd.read_csv('addr_to_download.csv')

# Selecting addresses in main data that are not in coordinates data

In [None]:
def select_addr_not_in_coor_data(temp_main_data_df, temp_landnum_coor_df, temp_roadname_coor_df):
    """Return the distinct address pairs of the main data that have no coordinates yet.

    A row of ``temp_main_data_df`` is kept only when BOTH of the following hold:

    * its '지번주소' (lot-number address) does not occur in ``temp_landnum_coor_df['지번주소']``
    * its '도로명주소' (road-name address) does not occur in ``temp_roadname_coor_df['도로명주소']``

    Parameters
    ----------
    temp_main_data_df : DataFrame with '지번주소' and '도로명주소' columns.
    temp_landnum_coor_df : DataFrame with a '지번주소' column.
    temp_roadname_coor_df : DataFrame with a '도로명주소' column.

    Returns
    -------
    DataFrame holding only the two address columns, with duplicate rows removed.
    Surviving rows keep their original index labels. The inputs are only read.
    """
    address_pairs = temp_main_data_df[['지번주소', '도로명주소']]

    has_landnum_coor = address_pairs['지번주소'].isin(temp_landnum_coor_df['지번주소'].unique())
    has_roadname_coor = address_pairs['도로명주소'].isin(temp_roadname_coor_df['도로명주소'].unique())

    # "not A and not B" written as "not (A or B)".
    uncovered = ~(has_landnum_coor | has_roadname_coor)

    return address_pairs[uncovered].drop_duplicates()

In [None]:
%%time
# main_data_df, landnum_coor_df and roadname_coor_df are not created in the cells shown here;
# they must already exist in the kernel (presumably loaded elsewhere — TODO confirm).
addr_not_in_coor_data_df = select_addr_not_in_coor_data(main_data_df, landnum_coor_df, roadname_coor_df)

In [None]:
# select addresses to download

def select_addr_to_download(df):
    """Build the ordered, de-duplicated list of address strings to look up.

    For every distinct '지번주소' (lot-number address), in order of first appearance:

    * each distinct '도로명주소' (road-name address) found on its rows is queued;
    * if any of those rows has a missing '도로명주소', the '지번주소' itself is queued too.

    Rows whose '지번주소' is missing contribute their '도로명주소' values afterwards.
    Finally, missing values are dropped and duplicates removed, keeping the first occurrence.

    Parameters
    ----------
    df : DataFrame with '지번주소' and '도로명주소' columns. It is not modified.

    Returns
    -------
    list of address values, in the order described above.
    """
    candidates = []

    # A single groupby pass replaces re-filtering the whole frame once per address.
    # sort=False keeps groups in first-appearance order (same as Series.unique()),
    # and groupby's default dropna=True skips rows whose '지번주소' is missing.
    groups = df.groupby('지번주소', sort=False)
    for landnum_addr, group in tqdm(groups, position=0):
        roadname_addrs = group['도로명주소']
        candidates.extend(roadname_addrs.unique())
        if roadname_addrs.isna().any():
            # No road-name address on at least one row: fall back to the lot-number address.
            candidates.append(landnum_addr)

    # Rows without a lot-number address can only be looked up by road-name address.
    candidates.extend(df.loc[df['지번주소'].isna(), '도로명주소'].unique())

    # dict.fromkeys de-duplicates in linear time while preserving insertion order.
    return list(dict.fromkeys(addr for addr in candidates if not pd.isnull(addr)))

In [None]:
# Flatten the uncovered address pairs into a de-duplicated list of address strings to look up.
my_addr_to_download = select_addr_to_download(addr_not_in_coor_data_df)

In [None]:
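# One-off export of the address list to 'addr_to_download.csv' (the file loaded into addr_df
# with pd.read_csv). Uncomment both lines to regenerate it.
#addr_to_download_df = pd.DataFrame({'주소':my_addr_to_download})
#addr_to_download_df.to_csv('addr_to_download.csv')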

In [6]:
# download coordinates from an open api

def _lookup_juso_coordinates(address, roadname_apikey, coor_apikey, timeout):
    """Resolve one address string to its (entX, entY) pair via the juso.go.kr open API.

    Two requests are made: an address search (first hit only), then a coordinate
    lookup keyed by the building identifiers returned by that search.

    Raises whatever the HTTP call, JSON decoding, or result indexing raises when
    the address cannot be resolved; the caller treats any such error as "not found".
    """
    # params= lets requests escape the values (spaces, '&', '#', non-ASCII text).
    roadname_result = requests.get(
        'https://www.juso.go.kr/addrlink/addrLinkApi.do',
        params={
            'confmKey': roadname_apikey,
            'currentPage': 1,
            'countPerPage': 1,
            'keyword': address,
            'resultType': 'json',
            'hstryYn': 'Y',
        },
        timeout=timeout,
    )
    roadname_info = roadname_result.json()['results']['juso'][0]

    coor_result = requests.get(
        'https://www.juso.go.kr/addrlink/addrCoordApi.do',
        params={
            'confmKey': coor_apikey,
            'admCd': roadname_info['admCd'],
            'rnMgtSn': roadname_info['rnMgtSn'],
            'udrtYn': roadname_info['udrtYn'],
            'buldMnnm': roadname_info['buldMnnm'],
            'buldSlno': roadname_info['buldSlno'],
            'resultType': 'json',
        },
        timeout=timeout,
    )
    coor_info = coor_result.json()['results']['juso'][0]

    return pd.to_numeric(coor_info['entX']), pd.to_numeric(coor_info['entY'])


def fetch_coor_data(df, roadname_apikey=None, coor_apikey=None, timeout=10):
    """Download coordinates for every distinct address in ``df['주소']``.

    Each address is looked up as-is. If that fails and the address does not end
    with '-1', the lookup is repeated with every '-0' removed from the address.
    Addresses that still cannot be resolved are skipped (best effort).

    Parameters
    ----------
    df : DataFrame with a '주소' column. It is not modified.
    roadname_apikey, coor_apikey : str, optional
        Keys for the address-search and coordinate APIs. When omitted they are read
        from the JUSO_ROADNAME_APIKEY / JUSO_COOR_APIKEY environment variables, and
        finally fall back to the keys that were embedded in this notebook.
        When a key expires, request a new one at
        https://www.juso.go.kr/addrlink/devAddrLinkRequestWrite.do?returnFn=write&cntcMenu=URL
    timeout : float, optional
        Per-request timeout in seconds, so one stalled connection cannot hang the run.

    Returns
    -------
    DataFrame with columns '전체주소', 'x좌표', 'y좌표'. The same frame is also written
    to 'coordinates_data<yymmdd-HHMMSS>.csv' in the current working directory.
    '전체주소' holds the address string that actually resolved, i.e. the '-0'-stripped
    form when the retry was the one that succeeded.
    """
    # NOTE(review): the fallback keys below are credentials committed with the notebook.
    # Prefer passing keys explicitly or via the environment, and consider rotating these.
    if roadname_apikey is None:
        roadname_apikey = os.environ.get('JUSO_ROADNAME_APIKEY',
                                         'devU01TX0FVVEgyMDIwMTIwNzEyNTE1OTExMDUxMTk=')
    if coor_apikey is None:
        coor_apikey = os.environ.get('JUSO_COOR_APIKEY',
                                     'devU01TX0FVVEgyMDIwMTIwNDE3NDIyNDExMDUwNTc=')

    # Missing (NaN) or non-string entries cannot be queried; skip them instead of crashing.
    unique_addresses = [addr for addr in df['주소'].unique() if isinstance(addr, str)]

    addr_list = []
    coor_x_list = []
    coor_y_list = []

    for address in tqdm(unique_addresses, position=0):
        attempts = [address]
        if not address.endswith('-1'):
            # Retry without '-0'. When the address contains no '-0' this repeats the
            # same query once, which matches the previous behaviour.
            attempts.append(address.replace('-0', ''))

        for attempt in attempts:
            try:
                coor_x, coor_y = _lookup_juso_coordinates(attempt, roadname_apikey, coor_apikey, timeout)
            except Exception:
                # Best effort: any failure means "no coordinates for this spelling".
                # Unlike a bare except, KeyboardInterrupt still stops the run.
                continue
            addr_list.append(attempt)
            coor_x_list.append(coor_x)
            coor_y_list.append(coor_y)
            break

    coor_dict = {'전체주소': addr_list, 'x좌표': coor_x_list, 'y좌표': coor_y_list}
    coor_df = pd.DataFrame(coor_dict)
    # pd.datetime is not available in current pandas; use the standard-library datetime module.
    timestamp = datetime.datetime.today().strftime('%y%m%d-%H%M%S')
    coor_df.to_csv('coordinates_data{}.csv'.format(timestamp))

    return coor_df

In [7]:
# Geocode every address in addr_df; this also writes a timestamped coordinates_data*.csv file.
# The recorded run below took about 27 minutes for 3931 addresses.
my_coor_df = fetch_coor_data(addr_df)

100%|██████████████████████████████████████████████████████████████████████████████| 3931/3931 [27:31<00:00,  2.38it/s]
