In [None]:
import re
import warnings
warnings.filterwarnings('ignore')

import os
import json
import time
import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Directory that holds the version-3 train/test CSV files.
data_dir = "../data/new"

train_csv = f"{data_dir}/new_train_ver3.csv"
test_csv = f"{data_dir}/new_test_ver3.csv"

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [None]:
# Tag each frame with its origin so the combined frame can be split again later.
for frame, flag in ((train_data, 0), (test_data, 1)):
    frame['is_test'] = flag

# Stack train on top of test and display how many rows carry each flag.
data = pd.concat([train_data, test_data])
data['is_test'].value_counts()

In [None]:
# Print the column dtypes and non-null counts of the combined frame.
data.info()

In [None]:
def _lot_number_address(row):
    """Return "<구> <동> <본번>[-<부번>]"; the "-<부번>" suffix is omitted when 부번 is 0."""
    base = f"{row['구']} {row['동']} {int(row['본번'])}"
    if row['부번'] != 0:
        return f"{base}-{int(row['부번'])}"
    return base


def _road_name_address(row):
    """Return "<구> <도로명>"."""
    return f"{row['구']} {row['도로명']}"


# Create the '주소1' column (district, neighbourhood and lot number).
data['주소1'] = data.apply(_lot_number_address, axis=1)

# Create the '주소2' column (district and road name).
data['주소2'] = data.apply(_road_name_address, axis=1)

# 1. 부동산 이름 찾기

In [None]:
# Column whose distinct values drive the crawl below.
address_type = '주소1'
unique_address = pd.unique(data[address_type])
print(len(unique_address))

## Select the rows whose '아파트명' is 'unknown', based on the unique values of address_type
# unknown_apartment_rows = data[(data[address_type].isin(unique_address)) & (data['아파트명'] == 'unknown')]
# print(len(unknown_apartment_rows))

# unique_address = unknown_apartment_rows[address_type].unique()
# print(len(unique_address))

In [None]:
# Look up every unique address on the portal search page and, when a building
# name is shown, write it into '아파트명' for all rows with that address.
url = "https://www.naver.com"
browser = webdriver.Chrome()
try:
    browser.get(url)

    for idx, address in enumerate(unique_address):
        # The address is split on whitespace into gu, dong and "bonbun[-bubun]".
        parts = address.split()
        gu_name = parts[0].strip()
        dong_name = parts[1].strip()
        bonbun_bubun = parts[2].strip()
        print(f"{idx:>08} [원본] : 서울특별시 {gu_name} {dong_name} {bonbun_bubun}")

        # Drop a trailing "-0" sub-number. split() yields strings, so the
        # comparison has to be against '0'; comparing with the int 0 never matched.
        bun_splited = bonbun_bubun.split('-')
        if len(bun_splited) > 1 and bun_splited[-1] == '0':
            bonbun_bubun = bun_splited[0]

        if idx == 0:
            # The first query is typed into the search box of the page just loaded.
            search_box = browser.find_element(By.CLASS_NAME, "search_input_box")
            search_inp = search_box.find_element(By.TAG_NAME, "input")
        else:
            # Later queries use the "greenbox" search box (presumably the one on
            # the results page), which must be cleared of the previous query.
            search_box = browser.find_element(By.CLASS_NAME, "greenbox")
            search_inp = search_box.find_element(By.TAG_NAME, "input")
            search_inp.clear()

        search_inp.send_keys(f"{gu_name} {dong_name} {bonbun_bubun}")
        search_inp.send_keys(Keys.RETURN)
        time.sleep(2)  # fixed wait for the result page to render

        # find_elements returns an empty list (instead of raising) when nothing matches.
        container1 = browser.find_elements(By.CLASS_NAME, 'uOjIX')
        build_name1 = browser.find_elements(By.CLASS_NAME, 'dmLin')

        # Compute the row mask once; it is used for both the read and the write.
        address_rows = data[address_type] == address
        o_build_name = data.loc[address_rows, '아파트명'].unique()
        if container1 and build_name1:
            # Reuse the element that was already located instead of searching again.
            build_name = build_name1[0].text

            data.loc[address_rows, '아파트명'] = build_name
            print(f"원본 : {o_build_name}, 크롤링 : {build_name}")

        print()
        time.sleep(1)
finally:
    # Always release the Chrome process, even when a lookup raises; the name
    # `browser` is rebound by a later cell, which would otherwise orphan it.
    browser.quit()

In [None]:
# Number of rows whose apartment name is still 'unknown'. .get() yields 0
# instead of raising KeyError once no 'unknown' value remains.
data['아파트명'].value_counts().get('unknown', 0)

In [None]:
address_type = '주소1'
unique_address = data[address_type].unique()
print(len(unique_address))

# Open the Seoul open-data building search form.
url = "https://openab.seoul.go.kr/build/info.do?gubun=document"
browser = webdriver.Chrome()
browser.get(url)

for idx, address in enumerate(unique_address):
    # The address is split on whitespace into gu, dong and "bonbun[-bubun]".
    parts = address.split()
    gu_name = parts[0].strip()
    dong_name = parts[1].strip()
    bonbun_bubun = parts[2].strip()
    print(f"{idx:>08} [원본] : 서울특별시 {gu_name} {dong_name} {bonbun_bubun}")

    # Drop a trailing "-0" sub-number. split() yields strings, so the
    # comparison has to be against '0'; comparing with the int 0 never matched.
    bun_splited = bonbun_bubun.split('-')
    if len(bun_splited) > 1 and bun_splited[-1] == '0':
        bonbun_bubun = bun_splited[0]

    # Pick the district and neighbourhood in the two <select> elements.
    select_element_gu = browser.find_element(By.ID, 'gu')
    select_gu = Select(select_element_gu)
    select_gu.select_by_value(gu_name)

    select_element_dong = browser.find_element(By.ID, 'dong')
    select_dong = Select(select_element_dong)
    select_dong.select_by_value(dong_name)

    # Type the lot number into the detail field and submit with RETURN.
    inp = browser.find_element(By.ID, "adresDetail")
    inp.send_keys(bonbun_bubun)
    inp.send_keys(Keys.RETURN)

    # NOTE(review): the loop stops after the first address, so only one query
    # is ever submitted — looks like an unfinished experiment; confirm intent.
    break

In [None]:
# Split the combined frame back into its two parts using the helper flag.
from_train = data['is_test'].eq(0)
from_test = data['is_test'].eq(1)

# The flag is dropped from both parts; the test part also loses 'target'.
train_df = data.loc[from_train].drop(columns=['is_test'])
test_df = data.loc[from_test].drop(columns=['is_test', 'target'])

# Persist the version-4 files without the index column.
train_df.to_csv(f'{data_dir}/new_train_ver4.csv', index=False)
test_df.to_csv(f'{data_dir}/new_test_ver4.csv', index=False)

browser.quit()

# 2. 부동산 세부정보 얻기

In [None]:
# Reload the version-4 files written by the previous section.
data_dir = "../data/new"
train_data = pd.read_csv(f"{data_dir}/new_train_ver4.csv")
test_data = pd.read_csv(f"{data_dir}/new_test_ver4.csv")

# Tag each frame with its origin so the combined frame can be split again later.
train_data['is_test'] = 0
test_data['is_test'] = 1

# ignore_index=True gives every row a unique label. Without it the train and
# test frames both contribute labels 0..n, and label-based writes such as
# data.at[index, column] would update several rows at once.
data = pd.concat([train_data, test_data], ignore_index=True)
data['is_test'].value_counts()

In [None]:
# A row counts as incomplete when any of these columns is null or 'unknown'.
columns_to_check = [
    'k-전체동수', 'k-전체세대수', 'k-건설사', 'k-난방방식', '주차대수', 'k-복도유형', 'k-연면적'
]

# Build one "missing" mask per column, then OR them together.
column_masks = [data[name].isnull() | data[name].eq('unknown') for name in columns_to_check]
condition = column_masks[0]
for mask in column_masks[1:]:
    condition = condition | mask

filtered_rows = data.loc[condition]
print(len(filtered_rows))

# Distinct road-name addresses that still need crawling.
unique_address = filtered_rows['주소2'].unique()
print(len(unique_address))

In [None]:
def extract_info(text):
    """Extract the household and building counts from a "N세대 ... 총M개동" string.

    Returns ``(households, buildings)`` as ints taken from the first match,
    or ``(0, 0)`` when the pattern does not occur in ``text``.
    """
    found = re.search(r'(\d+)세대.*총(\d+)개동', text)
    if found is None:
        return 0, 0

    households, buildings = (int(group) for group in found.groups())
    return households, buildings


# Crawl complex details for every address that still has missing fields.
#   crawling_results: address -> dict of scraped values
#   no_results:       search strings for which no complex summary was shown
no_results = []
crawling_results = {}

# Every lookup starts from the same complexes page, so the URL is built once.
url = "https://new.land.naver.com/complexes?ms=37.566427,126.977872,13&a=APT:PRE:ABYG:JGC&e=RETAIL"

for idx, address in enumerate(unique_address):
    # The last row recorded for this road-name address supplies the search terms.
    row = data[data['주소2'] == address].iloc[-1]
    str_num = row['주소1']
    road_name = row['주소2']
    build_name = row['아파트명']
    dong = row['동']
    print(f"{idx:>08}")
    print(f"지번 : {str_num}")
    print(f"도로명 : {road_name}")
    print(f"아파트명 : {build_name}")
    print(f"[원본] - 동 : {row['k-전체동수']}, 세대수 : {row['k-전체세대수']}, 건설사 : {row['k-건설사']}, 난방방식 : {row['k-난방방식']}, 주차대수 : {row['주차대수']}, 복도유형 : {row['k-복도유형']}")

    # A fresh browser is started per address. try/finally guarantees it is
    # closed on every path; previously it was only closed when a result was
    # found, so each miss or exception left a Chrome process running.
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        time.sleep(2)

        search_box = browser.find_element(By.CLASS_NAME, "search_area")
        search_inp = search_box.find_element(By.TAG_NAME, "input")

        if idx > 0:
            search_inp.clear()

        search_inp.send_keys(f"{dong} {build_name}")
        search_inp.send_keys(Keys.RETURN)
        time.sleep(2)

        # find_elements returns an empty list (instead of raising) when the
        # summary panel is absent, i.e. the search did not open a complex.
        if not browser.find_elements(By.ID, "summaryInfo"):
            if build_name == "unknown":
                no_results.append(f"{address}")
            else:
                no_results.append(f"{address} {build_name}")
            continue

        # Open the detail view via the first button of the summary panel.
        container = browser.find_element(By.ID, "summaryInfo")
        btn_box = container.find_element(By.CLASS_NAME, "complex_detail_link")
        btns = btn_box.find_elements(By.TAG_NAME, "button")
        btns[0].click()
        time.sleep(2)

        # First info table: values are read by fixed <td> position.
        container = browser.find_element(By.CLASS_NAME, "detail_box--complex")
        table = container.find_elements(By.CLASS_NAME, "info_table_wrap")[0]
        tbody = table.find_element(By.TAG_NAME, "tbody")
        rows_data = tbody.find_elements(By.TAG_NAME, "td")

        # Kept in its own name so the `dong` used for the search query is not
        # overwritten by the scraped building count.
        dong_rooms = rows_data[0].text
        rooms, dong_count = extract_info(dong_rooms)

        park_text = rows_data[3].text
        if park_text == '-':
            park = 0
        else:
            # Keep the text before the first '(' and drop its last character
            # (presumably a unit suffix — confirm). Works with or without a
            # parenthesised part; the old slicing raised IndexError without one.
            park = park_text.split('(')[0][:-1]

        const = rows_data[6].text
        if const == '-':
            const = 'unknown'

        heat = rows_data[7].text
        if heat == '-':
            heat = 'unknown'
        else:
            # Only the first comma-separated item is kept.
            heat = heat.split(',')[0]

        # Second info table on the page holds the corridor type.
        table = browser.find_elements(By.CLASS_NAME, "info_table_wrap")[1]
        tbody = table.find_element(By.TAG_NAME, "tbody")
        rows_data = tbody.find_elements(By.TAG_NAME, "td")
        corr = rows_data[3].text
        if corr == '-':
            corr = 'unknown'

        crawling_results[address] = {
            'dong': dong_count,
            'rooms': rooms,
            'park': park,
            'const': const,
            'heat': heat,
            'corr': corr
        }

        print(f"[크롤링] - 동 : {dong_count}, 세대수 : {rooms}, 건설사 : {const}, 난방방식 : {heat}, 주차대수 : {park}, 복도유형 : {corr}")
        print()
    finally:
        browser.quit()

In [None]:
# Fill missing fields from crawling_results, matched on the '주소2' address.
#
# The writes are positional (boolean arrays), not label based: `data` may carry
# duplicate index labels after pd.concat, and data.at[label, column] would then
# overwrite every row sharing that label, not just the one being inspected.
#
# Each entry: (target column, key in the crawled dict, sentinel).
# sentinel None  -> the cell is filled when it is NaN;
# sentinel value -> the cell is filled when it equals that value.
fill_rules = [
    ('k-전체동수', 'dong', None),
    ('k-전체세대수', 'rooms', None),
    ('주차대수', 'park', None),
    ('k-건설사', 'const', 'unknown'),
    ('k-난방방식', 'heat', 'unknown'),
    ('k-복도유형', 'corr', 'unknown'),
]

addresses = data['주소2']
has_result = addresses.isin(list(crawling_results)).to_numpy()

for column, key, sentinel in fill_rules:
    if sentinel is None:
        needs_fill = data[column].isna().to_numpy()
    else:
        needs_fill = (data[column] == sentinel).to_numpy()

    target = has_result & needs_fill
    if not target.any():
        continue

    # One replacement per selected row, in row order; values are written as crawled.
    replacements = [crawling_results[addr][key] for addr in addresses[target]]
    data.loc[target, column] = replacements

In [None]:
# Split the combined frame back into its two parts using the helper flag.
from_train = data['is_test'].eq(0)
from_test = data['is_test'].eq(1)

# The flag is dropped from both parts; the test part also loses 'target'.
train_df = data.loc[from_train].drop(columns=['is_test'])
test_df = data.loc[from_test].drop(columns=['is_test', 'target'])

# Persist the version-5 files without the index column.
train_df.to_csv(f'{data_dir}/new_train_ver5.csv', index=False)
test_df.to_csv(f'{data_dir}/new_test_ver5.csv', index=False)

browser.quit()

In [None]:
# url = "https://land.seoul.go.kr/land/wskras/generalInfo.do"
# browser = webdriver.Chrome()
# browser.get(url)

# # 자치구 선택 요소 찾기
# select_element_gu = browser.find_element(By.ID, 'selSgg')
# select_gu = Select(select_element_gu)

# # 자치구의 이름과 value를 저장할 딕셔너리
# gu_dict = {}
# for option in select_gu.options:
#     gu_name = option.text
#     if gu_name != "자치구 선택":
#         value = option.get_attribute('value')
#         gu_dict[gu_name] = value

# # 자치구 별로 동의 옵션 값 가져오기
# gd_dict = {}
# for gu_name, gu_value in gu_dict.items():
#     # 자치구 선택
#     select_gu.select_by_value(gu_value)
#     time.sleep(1)  # 페이지 업데이트를 기다림
    
#     # 동 선택 요소 찾기
#     select_element_dong = browser.find_element(By.ID, 'selBjdong')
#     select_dong = Select(select_element_dong)
    
#     dong_dict = {}
#     for option in select_dong.options:
#         dong_name = option.text
#         if dong_name != "동 선택":
#             value = option.get_attribute('value')
#             dong_dict[dong_name] = value
#     gd_dict[gu_name] = dong_dict

# browser.quit()

In [None]:
# with open(f'{data_dir}/seoul_gu_dong.json', 'w', encoding='utf-8') as f:
#     json.dump(gd_dict, f, ensure_ascii=False, indent=4)

In [None]:
# with open(f'{data_dir}/seoul_gu_dong.json', 'r', encoding='utf-8') as f:
#     loaded_data = json.load(f)

In [None]:
# url = "https://land.seoul.go.kr/land/wskras/generalInfo.do"
# browser = webdriver.Chrome()
# browser.get(url)

# for idx, address in enumerate(unique_address):
#     parts = address.split()
#     gu_name = parts[0].strip()
#     dong_name = parts[1].strip()
#     bonbun_bubun = parts[2].strip()
#     print(f"{idx:>08}")
#     print(f"[원본] : 서울특별시 {gu_name} {dong_name} {bonbun_bubun}")

#     if "-" in bonbun_bubun:
#         buns = bonbun_bubun.split('-')
#         bonbun = buns[0]
#         bubeon = buns[1]
#     else:
#         bonbun = bonbun_bubun
#         bubeon = None

#     if gu_name in loaded_data:
#         dong_dict = loaded_data[gu_name]
#         if dong_name in dong_dict:
#             gu_value = gu_dict[gu_name]
#             dong_value = dong_dict[dong_name]

#             ## 구, 동, 본번-부번
#             select_element_gu = browser.find_element(By.ID, 'selSgg')
#             select_gu = Select(select_element_gu)
#             select_gu.select_by_value(gu_value)
#             time.sleep(1)

#             select_element_dong = browser.find_element(By.ID, 'selBjdong')
#             select_dong = Select(select_element_dong)
#             select_dong.select_by_value(dong_value)

#             bonbeon_element = browser.find_element(By.ID, "bonbeon")
#             bonbeon_element.clear()
#             bonbeon_element.send_keys(bonbun)

#             if not bubeon is None:
#                 bubeon_element = browser.find_element(By.ID, "bubeon")
#                 bubeon_element.clear()
#                 bubeon_element.send_keys(bubeon)
#             else:
#                 bubeon_element = browser.find_element(By.ID, "bubeon")
#                 bubeon_element.clear()

#             btn = browser.find_element(By.ID, "btnSearch")
#             btn.click()
#             time.sleep(1)

#             tab2 = browser.find_element(By.ID, "tab2")
#             tab2.click()
#             time.sleep(2)

#             apart_name = browser.find_element(By.ID, "tdBldNm").text.strip()
#             road_name = browser.find_element(By.ID, "newAddressInfo").text.strip().split('(')[0].strip()
#             total_area = browser.find_element(By.ID, "tdTotArea").text.strip().split(' ')[0].strip()
#             print(f"건물명 : {apart_name}, 도로명 : {road_name}, 연면적 : {total_area}")
#             data.loc[data['주소1'] == address, 'k-연면적'] = total_area

#             if len(apart_name) == 0:
#                 print()
#                 continue

#             sub_url = "https://www.naver.com/"
#             sub_browser = webdriver.Chrome()
#             sub_browser.get(sub_url)
#             time.sleep(2)

#             sub_query = f"{gu_name} {dong_name } {bonbun_bubun}"
#             search_box = sub_browser.find_element(By.CLASS_NAME, "search_input_box")
#             search_element = search_box.find_element(By.TAG_NAME, "input")
#             search_element.send_keys(sub_query)
#             search_element.send_keys(Keys.RETURN)
#             time.sleep(2)

#             container = sub_browser.find_element(By.CLASS_NAME, "uOjIX")
#             if container.find_elements(By.CLASS_NAME, "dmLin"):
#                 apart_name = container.find_element(By.CLASS_NAME, "dmLin").text
#             else:
#                 container = sub_browser.find_element(By.CLASS_NAME, "IGZYm")
#                 lis = container.find_elements(By.CLASS_NAME, 'JvDUP')
#                 for li in lis:
#                     title = li.find_element(By.TAG_NAME, 'span').text
#                     if '아파트' in title:
#                         apart_name = title

#             if apart_name == 'ACROHILLS논현':
#                 apart_name = '아크로힐스논현'

#             sub_url = "https://land.naver.com/"
#             sub_browser.get(sub_url)
#             time.sleep(2)

#             sub_query = f"{dong_name} {apart_name}"
#             print(sub_query)
#             search_element = sub_browser.find_element(By.ID, "queryInputHeader")
#             search_element.send_keys(sub_query)
#             search_element.send_keys(Keys.RETURN)
#             time.sleep(2)

#             summary_info = sub_browser.find_elements(By.ID, 'summaryInfo')
#             if not summary_info:
#                 item_list = sub_browser.find_element(By.CLASS_NAME, "item_list--search")
#                 items = item_list.find_elements(By.CLASS_NAME, "item")

#                 for item_idx, item in enumerate(items):
#                     title = item.find_element(By.CLASS_NAME, 'title').text
#                     if apart_name in title:
#                         target_idx = item_idx
#                         break
                
#                 items[item_idx].click()
#                 time.sleep(2)

#             btn_div = sub_browser.find_element(By.CLASS_NAME, 'complex_detail_link')
#             btns = btn_div.find_elements(By.TAG_NAME, 'button')
#             btn = btns[0].click()
#             time.sleep(2)

#             ## 아파트명
#             title = sub_browser.find_element(By.ID, "complexTitle")
#             apart_name = title.text
#             data.loc[data['주소1'] == address, '아파트명'] = apart_name
#             print(f' -아파트명 : {apart_name}')

#             table = sub_browser.find_element(By.CLASS_NAME, 'info_table_wrap')
#             trs = table.find_elements(By.TAG_NAME, 'tr')
            
#             ## 세대수
#             tr0_data = trs[0].find_element(By.TAG_NAME, 'td').text
#             tr0_splits = tr0_data[:-1].split('(')

#             rooms = tr0_splits[0][:-2]
#             dongs = tr0_splits[1][1]
#             data.loc[data['주소1'] == address, 'k-전체동수'] = dongs
#             data.loc[data['주소1'] == address, 'k-전체세대수'] = rooms
#             print(f' -k-전체동수 : {dongs}')
#             print(f' -k-전체세대수 : {rooms}')

#             ## 건설사
#             tr3_data = trs[3].find_element(By.TAG_NAME, 'td').text
#             data.loc[data['주소1'] == address, 'k-건설사'] = tr3_data
#             print(f' -k-건설사 : {tr3_data}')

#             tr4_data = trs[4].find_element(By.TAG_NAME, 'td').text
#             heat_type = tr4_data.split(',')[0]
#             data.loc[data['주소1'] == address, 'k-난방방식'] = heat_type
#             print(f' -k-난방방식 : {heat_type}')

#             # break
#             print()
#             sub_browser.quit()
# browser.quit()