In [1]:

import requests
from bs4 import BeautifulSoup
from pprint import pprint
import time
import pandas as pd
import re
from tqdm import tqdm
import streamlit as st
import sqlite3
import numpy as np


# In[2]:


def extract_number(text, return_type=float):
    """Extract the first number found in ``text`` and return it as float or int.

    Parameters
    ----------
    text : str
        Text containing a number, e.g. ``"築12年"`` or ``"150,000円"``.
        ``None`` is treated as empty text and any other non-string value is
        converted with ``str()`` so that missing cells do not raise.
    return_type : type
        Pass ``int`` to get an integer (the fractional part is truncated);
        any other value yields a float. Defaults to ``float``.

    Returns
    -------
    int or float
        The first number in ``text``. When no number is present, ``0`` is
        returned for ``return_type=int`` and ``0.0`` otherwise.
    """
    if text is None:
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # The first alternative accepts comma-grouped thousands ("150,000") so the
    # value is not cut off at the first comma; the negative lookahead rejects
    # malformed groupings such as "1,2345", which fall back to the plain form.
    matched_text = re.search(
        r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?', text
    )
    if not matched_text:
        return 0 if return_type == int else 0.0

    number = matched_text.group().replace(',', '')
    if return_type == int:
        # Go through float first so that "12.5" does not make int() raise.
        return int(float(number))
    return float(number)


# In[15]:


# Fetch listing data from Airdoor.
# Each room found on the result pages becomes one dict in d_list, and the
# collected records are turned into the DataFrame df_airdoor at the end.
d_list = []
url = 'https://airdoor.jp/list?si=d-131083&p={}'

for i in tqdm(range(1, 15)):
    target_url = url.format(i)
    # A timeout keeps the loop from hanging on a stalled connection, and
    # raise_for_status() stops on an HTTP error instead of parsing an error page.
    r = requests.get(target_url, timeout=30)
    r.raise_for_status()
    time.sleep(1)  # wait one second between requests
    soup = BeautifulSoup(r.text, "html.parser")
    # find_all() returns an empty result for a page without listings, so the
    # loop below is simply skipped (the former "or None" made it raise TypeError).
    contents = soup.find_all('div', {'class': 'PropertyPanel_propertyPanel__8oJ13'})
    for content in contents:
        # Building title
        title = content.find('div', {'class': 'PropertyPanelBuilding_buildingTitle__tuPqN'}).get_text(strip=True) or None
        # Address and access lines
        building_info = content.find_all('div', {'class': 'PropertyPanelBuilding_buildingInformationSection__deSLp'})
        address = building_info[0].find('p', {'class': 'is-mt5'}).get_text(strip=True) or None
        access = ', '.join(p.get_text() for p in building_info[0].find_all('p', {'class': False})) or None
        # Building age (text inside parentheses) and number of stories
        p_tags = building_info[1].find_all('p')
        age_match = re.search(r'\((.*?)\)', p_tags[0].get_text())
        # Fall back to '築0年' both when there are no parentheses and when they are empty.
        age = (age_match.group(1) if age_match else '') or '築0年'
        story = p_tags[1].get_text(strip=True)
        # Per-room data: room number, layout, area
        roomItems = content.find_all('a', {'class': 'PropertyPanelRoom_roomItem__95jRr'})
        for roomItem in roomItems:
            p_tag_text = roomItem.find('span', {'class': 'is-ml5'}).get_text(strip=True)
            # Only the first three '/'-separated parts are used; anything after
            # them (the orientation in the original four-part unpacking) is ignored.
            room_number, madori, menseki, *_ = [part.strip() for part in p_tag_text.split('/')]
            # Floor: the first digit run of the room number without its last two
            # digits; defaults to '1' when that run has two digits or fewer.
            digit_runs = re.findall(r'\d+', room_number)
            floor = digit_runs[0][:-2] if digit_runs and len(digit_runs[0]) > 2 else '1'
            # Rent and management fee (whitespace-separated in the price element)
            div_text = roomItem.find('div', {'class': 'PropertyPanelRoom_rentPrice__XdPUp'}).text
            price_parts = div_text.split()
            fee = (price_parts[0].replace(',', '') if len(price_parts) > 0 else '') or '0円'
            management_fee = (price_parts[1].replace(',', '') if len(price_parts) > 1 else '') or '0円'
            # Deposit and key money (first and second <li> of the initial-price element)
            div = roomItem.find('div', {'class': 'PropertyPanelRoom_initialPrices__d90C3'})
            li_texts = [li.get_text(strip=True) for li in div.find_all('li')]
            deposit = (li_texts[0] if len(li_texts) > 0 else '') or '0円'
            gratuity = (li_texts[1] if len(li_texts) > 1 else '') or '0円'
            d = {
                'title': title,
                'address': address,
                'access': access,
                'age': age,
                'story': story,
                'floor': floor,
                'room_number': room_number,
                'fee': fee,
                'management_fee': management_fee,
                'deposit': deposit,
                'gratuity': gratuity,
                'madori': madori,
                'menseki': menseki,
            }
            d_list.append(d)
df_airdoor = pd.DataFrame(d_list)


# In[16]:


# Remove bracketed tags of the form 【...】 from the building title.
df_airdoor['title'] = df_airdoor['title'].str.replace(r'【.*?】', '', regex=True)

# Rent and management fee: extract the number, then divide by 10000
# (presumably converting yen into units of 10,000 yen).
for yen_col in ('fee', 'management_fee'):
    df_airdoor[yen_col] = df_airdoor[yen_col].apply(extract_number) / 10000

# Deposit and key money: "無料" is rewritten to "0円" before the number is extracted.
for initial_col in ('deposit', 'gratuity'):
    normalized_initial = df_airdoor[initial_col].apply(lambda x: "0円" if x in ["無料"] else x)
    df_airdoor[initial_col] = normalized_initial.apply(extract_number)

# Building age: "新築" and "築0年" are both counted as "築1年".
normalized_age = df_airdoor['age'].apply(lambda x: "築1年" if x in ["新築", "築0年"] else x)
df_airdoor['age'] = normalized_age.apply(extract_number)

# Remaining text columns only need the first number pulled out.
for numeric_col in ('story', 'floor', 'menseki'):
    df_airdoor[numeric_col] = df_airdoor[numeric_col].apply(extract_number)


# In[17]:


# Split each "access" text into line, station and walking minutes, stored in
# access{i}_1, access{i}_2 and access{i}_3. At most MAX_ACCESS_COUNT entries
# are kept per row, matching the columns created below.
MAX_ACCESS_COUNT = 2

# Add the (initially empty) target columns to df_airdoor.
for i in range(1, MAX_ACCESS_COUNT + 1):
    for j in range(1, 4):
        df_airdoor[f'access{i}_{j}'] = ''

# Parse the text row by row and fill the columns.
for index, row in df_airdoor.iterrows():
    # "access" is None when no access line was scraped; leave such rows empty.
    if not isinstance(row['access'], str):
        continue
    # Slice with the same limit used for column creation: a larger slice would
    # let .at[] silently create extra access3_* columns on a third match.
    accesses = row['access'].split(',')[:MAX_ACCESS_COUNT]
    for i, access in enumerate(accesses, start=1):
        # Expected shape: "<line> <station> 徒歩<minutes>分"
        match = re.match(r'(.+?)\s+(.+?)\s+徒歩(\d+)分', access.strip())
        if match:
            df_airdoor.at[index, f'access{i}_1'] = match.group(1)
            df_airdoor.at[index, f'access{i}_2'] = match.group(2)
            df_airdoor.at[index, f'access{i}_3'] = match.group(3)


# In[26]:


# Combine the scraped sources here (and drop duplicate listings if needed).
# Only one source exists, so df_scraped refers to the same frame as df_airdoor.
from datetime import datetime

df_scraped = df_airdoor

# Attach the columns shared by every scraped record.
timestamp_format = '%Y-%m-%d %H:%M'
current_time = datetime.now().strftime(timestamp_format)
df_scraped['scraped_date_time'] = current_time
# Placeholder columns, filled with None at this stage.
for placeholder_col in ('daily_decreased_room', 'weekly_decreased_room', 'evaluation_score'):
    df_scraped[placeholder_col] = None


100%|██████████| 14/14 [00:25<00:00,  1.82s/it]


In [2]:
# Load the whole techone_db table into df_db.
db_path = 'techone_2.db'
conn = sqlite3.connect(db_path)
try:
    query = 'SELECT * FROM techone_db;'
    df_db = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query raises (e.g. missing table).
    conn.close()

In [3]:
# Keep only the rows of df_db that carry the largest scraped_date_time value.
last_datetime = df_db['scraped_date_time'].max()
is_latest_row = df_db['scraped_date_time'] == last_datetime
df_db_1 = df_db[is_latest_row]

In [12]:
# Rooms present in df_db_1 but absent from df_scraped go into df_unique.
# A left merge with indicator=True marks those rows as 'left_only'.
room_key_cols = ['title', 'address', 'room_number']
df_marged = df_db_1.merge(
    df_scraped[room_key_cols],
    on=room_key_cols,
    how='left',
    indicator=True,
)
df_unique = df_marged.loc[lambda frame: frame['_merge'] == 'left_only']
df_unique.shape

(26, 24)

In [14]:
# Display the column labels of df_unique (inspection only; nothing is modified).
df_unique.columns

Index(['title', 'address', 'access', 'age', 'story', 'floor', 'room_number',
       'fee', 'management_fee', 'deposit', 'gratuity', 'madori', 'menseki',
       'access1_1', 'access1_2', 'access1_3', 'access2_1', 'access2_2',
       'access2_3', 'scraped_date_time', 'daily_decreased_room',
       'weekly_decreased_room', 'evaluation_score', '_merge'],
      dtype='object')

In [None]:
# Build df_decreased_room: one row per (title, address, scraped_date_time)
# with the number of df_unique rows in that group stored in "count".
removed_group_keys = ['title', 'address', 'scraped_date_time']
removed_group_sizes = df_unique.groupby(removed_group_keys).size()
df_decreased_room = removed_group_sizes.reset_index(name='count')

In [16]:
# Append df_decreased_room to the techone_decreased_room table.
conn = sqlite3.connect('techone_2.db')
try:
    df_decreased_room.to_sql('techone_decreased_room', conn, if_exists='append', index=False)
finally:
    # Close the connection even when the write raises.
    conn.close()

In [17]:
# Read the whole techone_decreased_room table into df_decreased_room_all.
db_path = 'techone_2.db'
conn = sqlite3.connect(db_path)
try:
    query = 'SELECT * FROM techone_decreased_room;'
    df_decreased_room_all = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query raises.
    conn.close()

In [25]:
# Sum "count" over all records, grouped by title and address.
count_by_building = df_decreased_room_all.groupby(['title', 'address'])['count']
df_all_time_decreased_room = count_by_building.sum().reset_index(name='total_count')
df_all_time_decreased_room

Unnamed: 0,title,address,total_count
0,MAXIV西大島DUE,東京都江東区大島２丁目,1
1,ウィルテラス西大島,東京都江東区大島３丁目,1
2,エスティメゾン豊洲レジデンスＡ,東京都江東区豊洲３丁目,1
3,キャナルスクウェア豊洲,東京都江東区豊洲１丁目,1
4,ケーディーエックスレジデンス豊洲,東京都江東区豊洲１丁目,1
5,ザ・パークハビオ亀戸,東京都江東区大島３丁目,1
6,シーズンフラッツ木場,東京都江東区木場３丁目,3
7,パークアクシス亀戸中央公園,東京都江東区亀戸７丁目,1
8,パークアクシス豊洲,東京都江東区豊洲１丁目,1
9,パークアクシス豊洲キャナル,東京都江東区豊洲６丁目,1


In [26]:
# Rate each building 1-3 by where its total_count falls relative to the
# largest total_count, and store the result in the "evaluation" column.
# NOTE(review): the earlier comments described the first tier as "up to and
# including 1/3 of the max" and the second as "above 1/3"; the comparisons
# actually put a value exactly equal to max/3 into tier 2. Behaviour is kept
# as coded here - confirm which boundary is intended.
max_total_count = df_all_time_decreased_room['total_count'].max()
lower_threshold = max_total_count/3
upper_threshold = max_total_count*2/3
total_counts = df_all_time_decreased_room['total_count']

# Tier conditions, in the same order as `values` below.
conditions = [
    total_counts < lower_threshold,  # strictly below 1/3 of the max
    (total_counts >= lower_threshold) & (total_counts < upper_threshold),  # from 1/3 (inclusive) to 2/3 (exclusive)
    total_counts >= upper_threshold,  # 2/3 of the max or more
]
# Score assigned for each condition.
values = [1, 2, 3]
# numpy.select picks the score of the first matching condition; NaN if none match.
df_all_time_decreased_room['evaluation'] = np.select(conditions, values, default=np.nan)

3

In [None]:
# Attach each building's evaluation to the scraped rooms (left join on title
# and address), copy it into evaluation_score, then drop the helper column.
building_scores = df_all_time_decreased_room[['title', 'address', 'evaluation']]
df_scraped_with_score = (
    df_scraped
    .merge(building_scores, on=['title', 'address'], how='left')
    .assign(evaluation_score=lambda frame: frame['evaluation'])
    .drop(columns=['evaluation'])
)
df_scraped_with_score

In [30]:
# Display (rows, columns) of df_scraped_with_score (inspection only).
df_scraped_with_score.shape

(441, 23)

In [31]:
# Append df_scraped_with_score to the techone_db table.
conn = sqlite3.connect('techone_2.db')
try:
    df_scraped_with_score.to_sql('techone_db', conn, if_exists='append', index=False)
finally:
    # Close the connection even when the write raises.
    conn.close()