In [1]:
# imports
import re
from pathlib import Path
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Search settings: the business genre and the area to look up.
AREA = "東京都"
GENRE = "工務店"
# Extra URL suffix used to narrow the search, e.g. "/?cate=M26&area=13".
ADD_URL = "/?cate=M26&area=13"
# Search URL built from the genre and area (joined with an encoded space, %20).
BASE_URL = f"https://www.mapion.co.jp/s/q={GENRE}%20{AREA}/t=spot"

In [3]:
# Work out how many result pages exist and build the list of page URLs to scrape.
RESULTS_PER_PAGE = 20     # divisor that turns the hit count into a page count
                          # (presumably the site lists 20 results per page — confirm)
MAX_SCRAPE_PAGES = 100    # maximum number of pages that can be scraped
REQUEST_TIMEOUT_SEC = 10  # without a timeout, requests may wait on a stalled server forever

response = requests.get(BASE_URL + ADD_URL, timeout=REQUEST_TIMEOUT_SEC)
response.raise_for_status()  # fail here with a clear HTTP error instead of a parse error below
sleep(1)  # wait one second to avoid putting load on the server
soup = BeautifulSoup(response.text, "html.parser")

# The hit count is read from the second <span> of the first <p class="subTitle">,
# taking the text that precedes the counter suffix "件".
try:
    count_text = soup.find_all("p", class_="subTitle")[0].find_all("span")[1].text
except IndexError as exc:
    raise RuntimeError(
        "Could not locate the hit count on the first result page; "
        "the page layout may have changed."
    ) from exc

# Drop thousands separators (ASCII and full-width commas) and whitespace so that
# a value such as "1,620" parses instead of raising ValueError.
count_digits = re.sub(r"[,，\s]", "", count_text.split("件")[0])
if not re.fullmatch(r"\d+", count_digits):
    raise RuntimeError(f"Unexpected hit-count text: {count_text!r}")
total_hits = int(count_digits)

# Ceiling division; yields 0 pages when there are no hits.
total_pages = (total_hits - 1) // RESULTS_PER_PAGE + 1
# Never request more pages than the scraping limit allows.
max_pages = min(total_pages, MAX_SCRAPE_PAGES)
# Build the list of URLs to scrape (pages are numbered from 1).
urls = [f"{BASE_URL}/p={page}{ADD_URL}" for page in range(1, max_pages + 1)]

In [4]:
# Scrape every result page and collect company name, address and phone number.

# Phone-number pattern: either hyphenated groups starting with 0
# (0 + 1-4 digits, 1-4 digits, 3-4 digits) or 10-11 digits starting with 0.
pattern = r"^0\d{1,4}-\d{1,4}-\d{3,4}$|^0\d{9,10}$"
phone_re = re.compile(pattern)  # compiled once instead of on every card
PLACEHOLDER = " - "             # value stored when a field cannot be found
PAGE_TIMEOUT_SEC = 10           # without a timeout a stalled server blocks the loop forever


def _first_text(*getters):
    """Return the stripped text of the first getter that yields an element.

    Each getter is a zero-argument callable returning a BeautifulSoup element.
    A getter that returns None (``.text`` raises AttributeError) or indexes past
    the end of a result list (IndexError) is skipped. Only these two errors are
    caught, so KeyboardInterrupt and genuine bugs are no longer swallowed.
    Returns PLACEHOLDER when every getter fails.
    """
    for getter in getters:
        try:
            return getter().text.strip()
        except (AttributeError, IndexError):
            continue
    return PLACEHOLDER


def _parse_card(card):
    """Extract name, address and phone from one <dl> card as a record dict."""
    # Company name: text of the first link, with full-width spaces normalised.
    name = _first_text(lambda: card.find("a")).replace("\u3000", " ")
    # Address: third <dd>, falling back to <li class="dataAdr">.
    # Both paths are stripped now (the primary path previously was not).
    address = _first_text(
        lambda: card.find_all("dd")[2],
        lambda: card.find("li", class_="dataAdr"),
    )
    # Phone: fourth <dd>, falling back to <li class="dataTel">.
    phone = _first_text(
        lambda: card.find_all("dd")[3],
        lambda: card.find("li", class_="dataTel"),
    )
    return {"会社名": name, "住所": address, "電話番号": phone}


results = []
failed_urls = []  # pages that could not be fetched or parsed, kept for inspection/retry
for url in tqdm(urls):
    try:
        response = requests.get(url, timeout=PAGE_TIMEOUT_SEC)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One bad page should not discard everything scraped so far.
        failed_urls.append(url)
        tqdm.write(f"skipped {url}: {exc}")
        continue
    finally:
        sleep(1)  # wait one second after every request to reduce server load

    soup = BeautifulSoup(response.text, "html.parser")
    section = soup.find("div", id="NumberSection")
    if section is None:
        # Previously this crashed with AttributeError on section.find_all.
        failed_urls.append(url)
        tqdm.write(f"skipped {url}: result section not found")
        continue

    for card in section.find_all("dl"):
        record = _parse_card(card)
        # Keep only cards whose phone field looks like a phone number.
        if phone_re.match(record["電話番号"]):
            results.append(record)

  0%|          | 0/81 [00:00<?, ?it/s]

100%|██████████| 81/81 [01:37<00:00,  1.21s/it]


In [5]:
# Remove duplicate rows, printing the duplicate count before and after.
scraped = pd.DataFrame(results)
print(scraped.duplicated().sum())
df = scraped.drop_duplicates().reset_index(drop=True)
print(df.duplicated().sum())

15
0


In [6]:
# Write the de-duplicated results to a CSV file named after the genre and area.
# The output directory is created first: to_csv does not create missing parent
# directories and raises OSError on a fresh checkout without data/.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f"{GENRE}_{AREA}.csv", index=False)