In [None]:
import requests
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from loguru import logger
import pandas as pd
from selenium import webdriver

In [None]:
# Suumo search-result URL to scrape; fill this in before running the cells below.
# Pagination appends "&page=N" to this value, so it should already contain a
# query string.
URL = ""

In [None]:
class SuumoParser():
    """Stateless helpers that extract listing details from a parsed Suumo room page.

    Each public parser accepts a BeautifulSoup-style object (only ``find_all``
    and ``.text`` are used) and returns a plain ``dict``.
    """

    @staticmethod
    def _cells_to_dict(headers, cells):
        """Pair header cells with value cells positionally into ``{header: value}``."""
        return {th.text: td.text.strip() for th, td in zip(headers, cells)}

    @staticmethod
    def _man_yen_to_yen(amount):
        """Convert an amount expressed in units of 10,000 yen to integer yen.

        Args:
            amount: Numeric text such as ``'8.12'``, or ``'-'`` for "none".

        Returns:
            int: The amount in yen; ``0`` when ``amount`` is ``'-'``.

        Raises:
            ValueError: if ``amount`` is neither ``'-'`` nor a valid number.
        """
        if amount == '-':
            return 0
        # round(), not int(): float('8.12') * 10000 evaluates to
        # 81199.99999999999, which truncation would turn into 81199.
        return round(float(amount) * 10000)

    @staticmethod
    def data_table_parser(soup):
        """Return the first ``data_table table_gaiyou`` table as ``{header: value}``.

        Headers (``th``) and values (``td``) are paired in document order and
        values are whitespace-stripped.

        Raises:
            IndexError: if the page contains no such table.
        """
        data_table = soup.find_all("table", class_="data_table table_gaiyou")[0]
        return SuumoParser._cells_to_dict(
            data_table.find_all("th"),
            data_table.find_all("td"),
        )

    @staticmethod
    def main_table_parser(soup):
        """Return the first ``property_view_table`` table as ``{header: value}``.

        Two entries are post-processed:

        * ``駅徒歩`` (station access): one item per non-blank line, keeping the
          segment after the first ``/`` (or the whole line when it has no
          ``/``), joined with ``,``.
        * ``専有面積`` (area): the ``m2`` suffix is removed and the value is
          converted to ``float``.

        Raises:
            IndexError: if the page contains no such table.
            KeyError: if the table lacks the ``駅徒歩`` or ``専有面積`` row.
            ValueError: if the area is not numeric.
        """
        main_table = soup.find_all("table", class_="property_view_table")[0]
        data = SuumoParser._cells_to_dict(
            main_table.find_all("th", class_="property_view_table-title"),
            main_table.find_all("td", class_="property_view_table-body"),
        )

        # Post-processing
        stations = []
        for line in data["駅徒歩"].split("\n"):
            if not line.strip():
                # Blank lines between entries carry no station information.
                continue
            parts = line.split("/")
            stations.append(parts[1] if len(parts) > 1 else line.strip())
        data["駅徒歩"] = ",".join(stations)

        data["専有面積"] = float(data["専有面積"].replace("m2", ""))
        return data

    @staticmethod
    def cost_html_parser(soup):
        """Extract the four cost figures of a listing as integer yen.

        The first ``property_view_note-list`` block supplies ``rental_fee`` and
        ``common_fee`` (its first two ``span`` elements); the second supplies
        ``deposit`` and ``key_money``. Labels and currency suffixes are
        stripped, and ``'-'`` is read as 0.

        Returns:
            dict: keys ``rental_fee``, ``common_fee``, ``deposit``,
            ``key_money`` (in that order), all ``int``.

        Raises:
            IndexError: if the expected blocks or spans are missing.
            ValueError: if a figure is not numeric.
        """
        notes = soup.find_all("div", class_="property_view_note-list")
        first_spans = notes[0].find_all("span")
        rental_text = first_spans[0].text
        common_text = first_spans[1].text
        second_spans = notes[1].find_all("span")
        deposit_text = second_spans[0].text
        key_money_text = second_spans[1].text

        # The common fee is quoted in plain yen; the other three in 万円.
        common_text = common_text.replace('管理費・共益費:\xa0', '').replace('円', '')

        return {
            "rental_fee": SuumoParser._man_yen_to_yen(
                rental_text.replace('万円', '')
            ),
            "common_fee": 0 if common_text == '-' else int(common_text),
            "deposit": SuumoParser._man_yen_to_yen(
                deposit_text.replace('敷金:\xa0', '').replace('万円', '')
            ),
            "key_money": SuumoParser._man_yen_to_yen(
                key_money_text.replace('礼金:\xa0', '').replace('万円', '')
            ),
        }

In [None]:
# Walk the paginated search results and collect the link of every listed room.
MAX_PAGES = 9  # upper bound on the number of result pages fetched per run

hrefs = []
for i in range(1, MAX_PAGES + 1):
    # Page 1 is the bare search URL; later pages append a `page` query parameter.
    page_offset = "" if i == 1 else f"&page={i}"

    logger.info(f"Getting page {i}")
    paging_url = URL + page_offset
    # Bound the wait so an unresponsive server cannot hang the notebook.
    response = requests.get(paging_url, timeout=30)
    if not response.ok:
        # An error page has no room links; report it instead of treating it
        # as the normal end of the results.
        logger.warning(f"Page {i} returned HTTP {response.status_code}: stop")
        break

    soup = BeautifulSoup(response.text, "html.parser")
    room_links = soup.find_all("a", class_="js-cassette_link_href cassetteitem_other-linktext")
    if not room_links:
        logger.info(f"There is no more room in page {i}: stop")
        break
    hrefs.extend(room_link.attrs["href"] for room_link in room_links)

logger.info(f"Found total {len(hrefs)} hrefs")

In [None]:
# Fetch every collected room page and flatten its details into one record.
# Hrefs are joined onto the search URL's scheme and host (this assumes they
# are site-relative paths).
parsed_uri = urlparse(URL)  # the constant is `URL`; lowercase `url` is never defined
base_url = f"{parsed_uri.scheme}://{parsed_uri.netloc}"
data = []
for href in hrefs:
    room_url = base_url + href
    logger.info(f"Scrape {room_url}")
    # Bound the wait, and fail with a clear HTTP error rather than an obscure
    # IndexError from the parsers when the page could not be fetched.
    response = requests.get(room_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    title = soup.find_all("h1", class_="section_h1-header-title")[0].text

    cost_data = SuumoParser.cost_html_parser(soup)
    main_table_data = SuumoParser.main_table_parser(soup)
    data_table_data = SuumoParser.data_table_parser(soup)

    # Features come as one "、"-separated string; sort them for a stable order.
    features_text = soup.find_all("div", class_="bgc-wht ol-g")[0].text
    features = sorted(features_text.strip().split("、"))

    room_data = {
        "title": title,
        **cost_data,
        **main_table_data,
        **data_table_data,
        "features": features
    }

    data.append(room_data)

In [None]:
# Build the listing table, add the monthly total (common fee + rent), move it
# next to the title, and order rows from the highest total downwards.
df = pd.DataFrame(data)
df = df.assign(total_fee=df["common_fee"] + df["rental_fee"])
# `title` is the first column and `total_fee` the last; everything in between
# keeps its original order.
columns = ["title", "total_fee", *df.columns[1:-1]]
df = df[columns].sort_values(by=["total_fee", "専有面積", "間取り"], ascending=False)

In [None]:
# Write the sorted listings to a CSV in the working directory, without the index.
OUTPUT_CSV = "suumo_watcher.csv"
df.to_csv(OUTPUT_CSV, index=False)