### 라이브러리 호출

In [None]:
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotVisibleException
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException
import pandas as pd
import warnings
import time
import re

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Chrome launch options: suppress the driver's console logging, run without
# a visible window and request the Korean locale.
# NOTE(review): none of the webdriver.Chrome(...) calls visible in this file
# pass `options`, so these settings may currently have no effect - confirm.
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])
for chrome_flag in ('headless', 'lang=ko_KR'):
    options.add_argument(chrome_flag)

### 검색을 위한 지하철 정보 불러오기

In [None]:
# Station table; the file is read as cp949 and the '역명' column holds the
# station names.
subway = pd.read_csv('./busansubway.csv', encoding='cp949')

# Build one search phrase per station: skip missing names, drop anything from
# the first '(' onwards, and wrap the name as "부산 <name>역" so the search is
# scoped to Busan.
subway_lst = [
    "부산 " + station_name.split('(')[0] + "역"
    for station_name in subway['역명'].dropna()
]

# Food categories that are combined with each station phrase when searching.
categories = [
    '한식', '중식', '카페', '술집', '고기집', '횟집', '해산물', '밥집', '분식',
    '패스트푸드', '파스타', '뷔페', '국물요리', '면요리', '이탈리안', '프렌치', '아시안',
]

In [None]:
# Build the list of "<station> <category>" search URLs, keeping only the
# stations for which the site returns a station-specific result count.
url_lst = []
for station in subway_lst:  # `station`, not `subway`, so the DataFrame is not overwritten
    url = 'https://www.diningcode.com/list.php?query={}'.format(station)

    driver = webdriver.Chrome('./chromedriver')
    try:
        driver.implicitly_wait(4)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Shut the browser down even if loading failed; otherwise one Chrome
        # process is left running per station.
        driver.quit()

    # '#lbl_count' holds the number of results. When the station is unknown
    # the site falls back to all of Busan, which is more than 20,000 entries.
    count_nodes = soup.select('#lbl_count')
    digits = re.sub(r'[^0-9]', '', count_nodes[0].text) if count_nodes else ''

    if not digits:
        # Counter missing or without digits: skip this station instead of
        # aborting the whole crawl.
        print("result count not found, skipping:", url)
    elif int(digits) <= 10000:
        # A plausible station-level count: query it once per category.
        # Only strings are built here (no request), so no delay is needed.
        url_lst.extend(
            'https://www.diningcode.com/list.php?query={}%20{}'.format(station, category)
            for category in categories
        )
    # Counts above 10000 are treated as the whole-city fallback and skipped.

    # Pause between page loads to keep the request rate low.
    time.sleep(3)

### 가게 url 가져오기

In [None]:
# Visit every search URL, expand the result list and collect the link of each
# listed store into store_lst (duplicates are possible at this stage).
store_lst = []
max_more_clicks = 10  # upper bound on "more" button presses per search page

for url in url_lst:
    driver = webdriver.Chrome('./chromedriver')
    try:
        driver.implicitly_wait(2)
        driver.get(url)

        for _ in range(max_more_clicks):
            # Scroll to the bottom repeatedly until the page height stops
            # growing.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            try:
                driver.find_element_by_xpath('//*[@id="div_list_more"]/span').click()
            except Exception:
                # The button is missing or not clickable: stop expanding this
                # page. `Exception` (not a bare except) keeps Ctrl+C working.
                print("빠진다.", url)
                break
            time.sleep(1)

        for res in driver.find_elements_by_css_selector('#div_list > [onmouseenter]'):
            try:
                href = res.find_element_by_css_selector("a").get_attribute('href')
            except NoSuchElementException:
                # A row without a link must not abort the whole crawl.
                continue
            if href:
                store_lst.append(href)
    finally:
        # Always terminate the browser, including when an error occurred.
        driver.quit()

In [None]:
# Remove duplicate links. dict.fromkeys keeps the first occurrence of each URL
# in its original order, so the resulting table is the same on every run
# (iteration order of a set of strings varies between interpreter sessions).
dupl_store = list(dict.fromkeys(store_lst))
df_store = pd.DataFrame({'url': dupl_store})

In [None]:
# Persist the de-duplicated store URLs (index column included) so the detail
# crawl can be run separately from the URL collection.
df_store.to_csv('store_url.csv')

In [None]:
# Reload the saved store URLs. The file is written by DataFrame.to_csv without
# an explicit encoding (UTF-8), so it is read back as UTF-8 rather than cp949;
# the first column is the index that to_csv stored.
store = pd.read_csv('./store_url.csv', encoding='utf-8', index_col=0)

In [None]:
# Scrape the detail page of every store URL into one row of `df`.
columns = ['name', 'category', 'address', 'tel', 'time', 'menu', 'image']


def _first_text(soup, selector):
    """Return the text of the first node matching *selector*, or '' if none."""
    nodes = soup.select(selector)
    return nodes[0].text if nodes else ''


def _business_hours(soup):
    """Return the business-hours rows joined with '|'.

    Each row is the text of its `p.l-txt` followed by the right-stripped text
    of its `p.r-txt`. If any row lacks one of the two, '' is returned for the
    whole field.
    """
    try:
        return '|'.join(
            row.select('li > p.l-txt')[0].text + row.select('li > p.r-txt')[0].text.rstrip()
            for row in soup.select('#div_detail > div.busi-hours.short > ul > li')
        )
    except IndexError:
        return ''


def _menu_items(soup):
    """Return (first line of the 1st <p>, text of the 2nd <p>) per menu row.

    Parsing stops at the first row with fewer than two <p> tags; rows parsed
    before it are kept.
    """
    items = []
    for row in soup.select('#div_detail > div.menu-info.short > ul > li'):
        cells = row.select('p')
        if len(cells) < 2:
            break
        items.append((cells[0].text.split('\n')[0], cells[1].text))
    return items


def _image_src(soup):
    """Return the `src` of the first gallery image, or '' if unavailable."""
    images = soup.select(
        '#div_profile > div.s-list.pic-grade > ul > li.bimg.btn-gallery-open > div > div > img'
    )
    try:
        return images[0]['src']
    except (IndexError, KeyError):
        return ''


rows = []
# Iterate the values of the 'url' column; iterating the DataFrame itself
# would yield its column labels instead of the URLs.
for url in store['url']:
    driver = webdriver.Chrome('./chromedriver')
    try:
        driver.implicitly_wait(1)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always terminate the browser, including when loading failed.
        driver.quit()

    rows.append({
        'name': _first_text(soup, '#div_profile > div.s-list.pic-grade > div.tit-point > p'),
        'category': [
            link.text
            for link in soup.select('#div_profile > div.s-list.pic-grade > div.btxt > a')
        ],
        'address': _first_text(soup, '#div_profile > div.s-list.basic-info > ul > li.locat'),
        'tel': _first_text(soup, '#div_profile > div.s-list.basic-info > ul > li.tel'),
        'time': _business_hours(soup),
        'menu': _menu_items(soup),
        'image': _image_src(soup),
    })
    time.sleep(0.5)

# Build the frame once at the end: DataFrame.append copied the whole frame on
# every call and no longer exists in pandas 2.x.
df = pd.DataFrame(rows, columns=columns)

In [None]:
# Write the scraped store details to disk. 'utf-8-sig' prefixes the file with
# a byte-order mark.
df.to_csv('store_list.csv', encoding='utf-8-sig')