In [6]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import time
import pandas as pd
import re
import tqdm

def extract_number(text, return_type=float):
    """Extract the first number found in ``text``.

    Args:
        text: String to search. The first run of digits, optionally followed
            by a decimal part (e.g. ``"7.5"``), is used.
        return_type: Pass ``int`` to get an integer (the parsed value is
            truncated via ``int(float(...))``); any other value yields a float.

    Returns:
        The parsed number, or a zero of the requested type (``0`` for ``int``,
        ``0.0`` otherwise) when ``text`` contains no digits.
    """
    matched_text = re.search(r'\d+(\.\d+)?', text)
    # Decide the result type once so the match and no-match paths agree.
    wants_int = return_type is int
    if not matched_text:
        return 0 if wants_int else 0.0
    value = float(matched_text.group())
    # Go through float first so strings such as "12.9" can still become an int.
    return int(value) if wants_int else value

# Scraped room records, one dict per room row.
d_list = []
# SUUMO listing URL template; {} is replaced by the page number.
url = 'https://suumo.jp/chintai/tokyo/sc_koto/?page={}'

for i in range(1,10):
    target_url = url.format(i)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(target_url, timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    r.raise_for_status()

    # Pause one second between page requests.
    time.sleep(1)
    soup = BeautifulSoup(r.text,"html.parser")

    # Each 'cassetteitem' div is one building with its table of rooms.
    contents = soup.find_all('div', class_='cassetteitem')

    for content in contents:
        detail = content.find('div', class_='cassetteitem-detail')
        table = content.find('table', class_='cassetteitem_other')

        # Title, with full-width spaces removed.
        title_text = detail.find('div', class_='cassetteitem_content-title').text
        title = title_text.replace('　', '')

        # Address.
        address = detail.find('li', class_='cassetteitem_detail-col1').text

        # Access info: trim surrounding whitespace, then turn newlines into commas.
        access_text = detail.find('li', class_='cassetteitem_detail-col2').text
        cleaned_access_text = access_text.strip()
        access = cleaned_access_text.replace('\n', ',')

        # Building age and total floor count come from one text field.
        # '築0年' and '新築' are both normalised to '築1年' before the digits are read.
        spaced_age_and_story = detail.find('li', class_='cassetteitem_detail-col3').text
        age_and_story = spaced_age_and_story.replace('築0年', '築1年').replace('新築', '築1年')
        numbers = re.findall(r'\d+', age_and_story)
        if len(numbers) == 2:
            age = int(numbers[0])
            story = int(numbers[1])
        elif len(numbers) == 3:
            # The middle number is skipped (per the original note, basement
            # floors are left out as unimportant).
            age = int(numbers[0])
            story = int(numbers[2])
        else:
            age = 0
            story = 0

        # Room rows live in the <tr> elements of the building's table.
        trtags = table.find_all('tr', class_='js-cassette_link')
        for trtag in trtags:
            original_floor_data, price, first_fee, capacity = trtag.find_all('td')[2:6]

            # Floor: use the first number in the cell (per the original note, the
            # lowest floor for multi-floor rooms); default to 1 when there is none.
            floor_text = original_floor_data.text
            matched_floor_number = re.search(r'\d+', floor_text)
            if matched_floor_number:
                floor = int(matched_floor_number.group())
            else:
                floor = 1

            # Run each text through extract_number so every value is a float.
            fee_text, management_fee_text = [li.text for li in price.find_all('li')]
            fee = extract_number(fee_text)
            management_fee = extract_number(management_fee_text)/10000 # express the management fee in units of 10,000 yen as well

            deposit_text, gratuity_text = [li.text for li in first_fee.find_all('li')]
            deposit = extract_number(deposit_text)
            gratuity = extract_number(gratuity_text)

            madori, menseki_text = [li.text for li in capacity.find_all('li')]
            menseki = extract_number(menseki_text)

            d = {
                'title': title,
                'address': address,
                'access': access,
                'age': age,
                'story': story,
                'floor': floor,
                'fee': fee,
                'management_fee': management_fee,
                'deposit': deposit,
                'gratuity': gratuity,
                'madori': madori,
                'menseki': menseki,
            }
            d_list.append(d)



In [7]:
# Fetch Airdoor listings and append them to the shared d_list.
url = 'https://airdoor.jp/list?si=d-131083&p={}'

for i in range(1,10):
    target_url = url.format(i)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(target_url, timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    r.raise_for_status()
    time.sleep(1) # pause one second between page requests
    soup = BeautifulSoup(r.text,"html.parser")

    # Each property panel is one building with its list of rooms.
    contents = soup.find_all('div', {'class': 'PropertyPanel_propertyPanel__MqCpF'})

    for content in contents:

        # Title: drop any 【...】 tag, then remove full-width spaces.
        title_with_brackets = content.find('div', {'class': 'PropertyPanelBuilding_buildingTitle__NbWmb'}).text
        title_with_space = re.sub(r'【.*?】', '', title_with_brackets)
        title = title_with_space.replace('　', '')

        # Address.
        divs = content.find_all('div', {'class': 'PropertyPanelBuilding_buildingInformationSection__AMRsh'})
        first_div = divs[0]
        address = first_div.find('p', {'class': 'is-mt5'}).text

        # Access: text of every <p> without a class attribute, joined by commas.
        p_tags_without_class = first_div.find_all('p', {'class': False})
        texts = [p.get_text() for p in p_tags_without_class]
        access = ', '.join(texts)

        # Building age: taken from the parenthesised part of the first <p>;
        # a missing value, '新築' and '築0年' all end up as '築1年'.
        second_div = divs[1]
        p_tags = second_div.find_all('p')
        age_all_text = p_tags[0].get_text()
        match = re.search(r'\((.*?)\)', age_all_text)
        if match:
            age_text = match.group(1)
        else:
            age_text = '築0年'
        age_text = age_text.replace('新築', '築1年').replace('築0年', '築1年')
        age = extract_number(age_text, int)

        # Total number of floors.
        story_text = p_tags[1].get_text()
        story = extract_number(story_text, return_type=int)

        # Per room: floor, layout and area.
        roomItems = content.find_all('a', {'class': 'PropertyPanelRoom_roomItem__3bVhC'})
        for roomItem in roomItems:
            p_tag_text = roomItem.find('span', {'class': 'is-ml5'}).text
            room_number_text, madori, menseki_text, hogaku = [part.strip() for part in p_tag_text.split('/')]

            # Floor: when the first number has more than two digits, everything
            # except its last two digits is used as the floor; otherwise 1.
            numbers = re.findall(r'\d+', room_number_text)
            if numbers and len(numbers[0]) > 2:
                floor_numbers = numbers[0][:-2]
                floor = int(floor_numbers)
            else:
                floor = 1

            # Area.
            menseki = extract_number(menseki_text)

            # Rent and management fee: strip thousands separators, then divide
            # by 10000 to express both in units of 10,000 yen.
            div_text = roomItem.find('div', {'class': 'PropertyPanelRoom_rentPrice__HO4Jp'}).text
            sprit_text = div_text.split()
            fee_without_comma = sprit_text[0].replace(',', '')
            fee = extract_number(fee_without_comma) / 10000
            management_fee_without_comma = sprit_text[1].replace(',', '')
            management_fee = extract_number(management_fee_without_comma) / 10000

            # Deposit and key money: '無料' is rewritten to '0万円' before parsing.
            div = roomItem.find('div', {'class': 'PropertyPanelRoom_initialPrices__naYEA'})
            li = div.find_all('li')
            deposit_text = li[0].text.replace('無料', '0万円')
            deposit = extract_number(deposit_text)
            gratuity_text = li[1].text.replace('無料', '0万円')
            gratuity = extract_number(gratuity_text)

            d = {
                'title': title,
                'address': address,
                'access': access,
                'age': age,
                'story': story,
                'floor': floor,
                'fee': fee,
                'management_fee': management_fee,
                'deposit': deposit,
                'gratuity': gratuity,
                'madori': madori,
                'menseki': menseki,
            }

            d_list.append(d)

In [8]:
# Build one DataFrame from every scraped room record collected in d_list.
df = pd.DataFrame(d_list)

In [9]:
# Show (row count, column count) of the combined frame.
df.shape

(1142, 12)

In [None]:
# Count the rows left after removing rows that repeat in every column.
df_c = df.drop_duplicates(keep='first')
df_c.shape[0]

In [None]:
# Find rooms that match on every listed attribute except management_fee:
# first drop repeats across the ten columns below, then keep only the groups
# that still hold more than one row once management_fee is left out of the key.
df_1 = df.drop_duplicates(
    subset=['address', 'age', 'story', 'floor', 'fee', 'management_fee', 'deposit', 'gratuity', 'madori', 'menseki']
)
filtered_df = (
    df_1
    .groupby(['address', 'age', 'story', 'floor', 'fee', 'deposit', 'gratuity', 'madori', 'menseki'])
    .filter(lambda group: len(group) > 1)
)
filtered_df

In [None]:
# Count the rows left when duplicates are judged on these nine columns
# (title, access and management_fee are not part of the key).
df_2 = df.drop_duplicates(
    subset=['address', 'age', 'story', 'floor', 'fee', 'deposit', 'gratuity', 'madori', 'menseki'],
    keep='first',
)
df_2.shape[0]

In [None]:
# Inspect every row whose area (menseki) equals 51.90.
df_3 = df.loc[df['menseki'].eq(51.90)]
df_3

In [10]:
# De-duplicate on the building/room identity columns only, keeping the first
# occurrence of each combination, then show the resulting shape.
df_dropped = df.drop_duplicates(
    subset=['address', 'age', 'story', 'floor', 'madori', 'menseki'],
    keep='first',
)
df_dropped.shape

(888, 12)

In [11]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Credential settings for the Google service account.
# NOTE(review): this is an absolute, user-specific path; consider loading it
# from configuration so the notebook runs on other machines.
SP_CREDENTIAL_FILE = '/Users/ryosukeinoue/Documents/31_Tech0/Step3/scraping-techone-8842efd0d979.json'
SP_SCOPE = [
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/spreadsheets'
]
SP_SHEET_KEY = '1wi_aezq9wK88OtUprgOE40R4AlHGEoVUYmT31HbpzCo'
credentials = ServiceAccountCredentials.from_json_keyfile_name(SP_CREDENTIAL_FILE, SP_SCOPE)
# Initialise the gspread client.
gc = gspread.authorize(credentials)
# Open the spreadsheet and select its first worksheet.
spreadsheet = gc.open_by_key(SP_SHEET_KEY)
worksheet = spreadsheet.sheet1
# Upload the de-duplicated frame (df_dropped), not the raw df, so the rows
# match the header and the duplicate removal actually takes effect.
data = df_dropped.values.tolist()
# Write header and rows in a single call anchored at A1. Appending the header
# separately would place it after any existing rows, so it would only land in
# row 1 when the sheet is empty.
worksheet.update('A1', [df_dropped.columns.tolist()] + data)