<a href="https://colab.research.google.com/github/m-maekakuchi/tabelog_scraping/blob/main/tabelog_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install the required libraries
!pip install beautifulsoup4 pandas gspread gspread_dataframe


# Authentication: sign the Colab user in, then authorise gspread with the
# default Google credentials obtained after the sign-in.
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default
creds, _ = default()
gc = gspread.authorize(creds)

prefecture = "青森県"   # Enter the name of the prefecture to fetch (must be a key of prefecture_arr).
file_id = "1m_WEhidFawgTiIqVvur7sKd0LmXx0pyw"   # Enter the file ID of the file being executed.

# Maps each prefecture's Japanese name to the lowercase romanised name that is
# inserted into the listing URL further down in this script.
prefecture_arr = { 
  "北海道": "hokkaido",
  "青森県": "aomori",
  "岩手県": "iwate",
  "宮城県": "miyagi",
  "秋田県": "akita",
  "山形県": "yamagata",
  "福島県": "fukushima",
  "茨城県": "ibaraki",
  "栃木県": "tochigi",
  "群馬県": "gunma",
  "埼玉県": "saitama",
  "千葉県": "chiba",
  "東京都": "tokyo",
  "神奈川県": "kanagawa",
  "新潟県": "niigata",
  "富山県": "toyama",
  "石川県": "ishikawa",
  "福井県": "fukui",
  "山梨県": "yamanashi",
  "長野県": "nagano",
  "岐阜県": "gifu",
  "静岡県": "shizuoka",
  "愛知県": "aichi",
  "三重県": "mie",
  "滋賀県": "shiga",
  "京都府": "kyoto",
  "大阪府": "osaka",
  "兵庫県": "hyogo",
  "奈良県": "nara",
  "和歌山県": "wakayama",
  "鳥取県": "tottori",
  "島根県": "shimane",
  "岡山県": "okayama",
  "広島県": "hiroshima",
  "山口県": "yamaguchi",
  "徳島県": "tokushima",
  "香川県": "kagawa",
  "愛媛県": "ehime",
  "高知県": "kochi",
  "福岡県": "fukuoka",
  "佐賀県": "saga",
  "長崎県": "nagasaki",
  "熊本県": "kumamoto",
  "大分県": "oita",
  "宮崎県": "miyazaki",
  "鹿児島県": "kagoshima",
  "沖縄県": "okinawa"
}
# Raises KeyError when `prefecture` is not one of the keys above.
prefecture_eng = prefecture_arr[prefecture]


# Create the spreadsheet, named after the chosen prefecture and today's date
# (YYYYMMDD).
import datetime
from googleapiclient.discovery import build
today = f"{datetime.date.today():%Y%m%d}"
sh_name = f"飲食店リスト_{prefecture}_{today}"
client = gspread.Client(auth=creds)
sh = client.create(sh_name)


# Change the spreadsheet's folder: build the Drive API v3 client used for the
# folder lookups and the move below.
service = build("drive", "v3")

def getFolderID(file_id):
  """Return the ID of the first parent folder of the Drive file `file_id`.

  Queries the module-level Drive `service` for the file's "parents" field.
  Raises KeyError if the response carries no "parents" entry.
  """
  request = service.files().get(fileId=file_id, fields="parents")
  metadata = request.execute()
  parent_ids = metadata["parents"]
  return parent_ids[0]

# File ID of the spreadsheet that was just created.
sh_file_id = sh.id
# Folder the new spreadsheet currently sits in.
pre_folder_id = getFolderID(sh_file_id)
# Folder that holds the file identified by `file_id`.
folder_id = getFolderID(file_id)

# Re-parent the spreadsheet: detach it from its current folder and attach it
# to the target folder in a single Drive update request.
move_request = service.files().update(
  fileId=sh_file_id,
  addParents=folder_id,
  removeParents=pre_folder_id,
)
move_request.execute()


# Set up the title row on the first worksheet.
worksheet = sh.get_worksheet(0)
column_title = [[
  '店名', 'ジャンル', '住所', '点数',
  '価格帯（ディナー）', '価格帯（ランチ）', 'コース',
  '営業時間', '定休日',
  '喫煙可否', '喫煙可否詳細',
  '店舗URL', '電話番号',
]]
worksheet.append_rows(column_title)

# Centre cell contents horizontally and vertically.
# NOTE(review): the range ends at row 100, so rows appended beyond that are
# not covered by this formatting — confirm whether that is intended.
centered = {'horizontalAlignment': 'center', 'verticalAlignment': 'middle'}
worksheet.format("A1:Z100", centered)


# Web scraping
import requests
import re
import os
from bs4 import BeautifulSoup

def fetchHtml(url, timeout=30):
  """Download `url` and return its body parsed into a BeautifulSoup tree.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests waits indefinitely and one stalled connection would
      hang the whole scraping run.

  Returns:
    A BeautifulSoup object built from the response text with 'html.parser'.

  Raises:
    requests.exceptions.RequestException: On connection failure or timeout,
      or (as HTTPError) when the server answers with a 4xx/5xx status.
  """
  response = requests.get(url, timeout=timeout)
  # Fail loudly on an error status instead of parsing the error page as if it
  # were the requested listing or shop page.
  response.raise_for_status()
  return BeautifulSoup(response.text, 'html.parser')
  
def _findDetailCell(detail_tables, label):
  """Return the <td> following the <th> whose text is exactly `label`.

  The tables are searched from last to first so that, when several tables
  contain the heading, the last table wins. Returns None if no table has the
  heading.
  """
  for table in reversed(detail_tables):
    th = table.find('th', string=label)
    if th:
      return th.find_next('td')
  return None


def _cellText(cell, default=""):
  """Return the text of `cell`, or `default` when the element is missing."""
  return cell.text if cell is not None else default


def _splitHoursAndHoliday(cell):
  """Split the opening-hours cell into (opening hours, regular holiday).

  <br> tags are turned into newlines first. Everything before the first
  "定休日" marker is the opening hours (with the "営業時間" label removed);
  everything after it is the holiday text. The holiday is "" when the marker
  is absent.
  """
  for br in cell.find_all("br"):
    br.replace_with("\n")
  # partition() keeps the whole remainder, so a holiday description that
  # itself mentions "定休日" again is not cut short.
  open_hour, _, holiday = cell.text.partition("定休日")
  return open_hour.replace('営業時間', ''), holiday


def _parseSmoking(cell):
  """Return (mark, detail) for the smoking-policy cell.

  The mark is derived from the first line of the cell: '×' if it mentions
  "禁煙", '〇' if it mentions "喫煙" or "分煙", otherwise '-'. The detail is all
  lines joined with os.linesep when there are at least two lines, else "-".
  """
  lines = cell.text.splitlines()
  if lines and lines[0] == "":
    lines.pop(0)
  if not lines:
    # Empty cell: nothing to classify.
    return '-', '-'

  first_line = lines[0]
  if '禁煙' in first_line:
    mark = '×'
  elif '喫煙' in first_line or '分煙' in first_line:
    mark = '〇'
  else:
    mark = '-'

  detail = os.linesep.join(lines) if len(lines) >= 2 else "-"
  return mark, detail


def _scrapeShop(shop):
  """Fetch one shop's page and return its spreadsheet row as a list.

  `shop` is the <a> element of the shop on the listing page. The row has 13
  values: name, genre, address, score, dinner price, lunch price, course,
  opening hours, regular holiday, smoking mark, smoking detail, shop URL and
  phone number. Elements that are missing from the page yield "" ("-" for
  the course) instead of raising.
  """
  shop_name = shop.text
  shop_link = shop["href"]
  soup = fetchHtml(shop_link)

  detail_tables = soup.find_all('table', class_='c-table c-table--form rstinfo-table__table')

  # Genre
  genre = _cellText(_findDetailCell(detail_tables, 'ジャンル'))

  # Address: concatenation of every <span> inside the address element
  address_tag = soup.find(class_='rstinfo-table__address')
  if address_tag is not None:
    address = "".join(span.text for span in address_tag.find_all('span'))
  else:
    address = ""

  # Score
  score = _cellText(soup.find(class_='rdheader-rating__score-val-dtl'))

  # Price ranges: first entry is used as dinner, second as lunch
  prices = [tag.text for tag in soup.find_all(class_='rdheader-budget__price-target')]
  price_dinner = prices[0] if len(prices) > 0 else ""
  price_lunch = prices[1] if len(prices) > 1 else ""

  # Course
  course = _cellText(_findDetailCell(detail_tables, 'コース'), "-")

  # Opening hours and regular holiday
  hours_cell = _findDetailCell(detail_tables, '営業時間')
  if hours_cell is not None:
    open_hour, holiday = _splitHoursAndHoliday(hours_cell)
  else:
    open_hour, holiday = "", ""

  # Smoking policy and its details
  smoking_cell = _findDetailCell(detail_tables, '禁煙・喫煙')
  if smoking_cell is not None:
    smoking, smoking_detail = _parseSmoking(smoking_cell)
  else:
    smoking, smoking_detail = "", ""

  # Phone number: the cell after any <th> mentioning both "予約" and
  # "お問い合わせ"; the last such heading on the page wins. The page is
  # scanned once rather than once per detail table.
  tel = ""
  for th in soup.find_all("th"):
    if "予約" in th.text and "お問い合わせ" in th.text:
      tel = _cellText(th.find_next('td'), tel)

  return [
    shop_name,
    genre,
    address,
    score,
    price_dinner,
    price_lunch,
    course,
    open_hour,
    holiday,
    smoking,
    smoking_detail,
    shop_link,
    tel,
  ]


def extractData(page_link):
  """Scrape every shop linked from one listing page into the worksheet.

  Fetches `page_link`, follows each shop link found on it, and appends one
  row per shop to the module-level `worksheet`. Rows are collected and
  written with a single append_rows call per listing page to keep the number
  of Sheets write requests low.

  Args:
    page_link: URL of a listing page.

  Raises:
    Whatever fetchHtml or worksheet.append_rows raises. Rows scraped before a
    failure are still written.
  """
  soup = fetchHtml(page_link)

  # <a> elements of the shops on this page
  shops = soup.find_all("a", class_="list-rst__rst-name-target cpy-rst-name")

  rows = []
  try:
    for shop in shops:
      rows.append(_scrapeShop(shop))
  finally:
    # Flush what was collected even if a later shop failed.
    if rows:
      worksheet.append_rows(rows)



# Listing URL for the chosen prefecture
url = f"https://tabelog.com/{prefecture_eng}/rstLst/cond04-00-07/"
soup = fetchHtml(url)

# Pagination <a> elements, present when the listing spans several pages.
# NOTE(review): only the links found on this first page are visited — confirm
# that they cover every result page, including the first one.
pages = soup.find_all("a", class_="c-pagination__num")
page_links = [page["href"] for page in pages] or [url]
for page_link in page_links:
  extractData(page_link)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
