In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
import sys
import re
import os

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Start up

In [2]:
# Launch a Chrome session with the chromedriver binary bundled for this OS.
# The paths are relative to the notebook's working directory.
if sys.platform == 'darwin':
    driver = webdriver.Chrome('../webdriver/chromedriver_mac')
elif sys.platform == 'win32':
    driver = webdriver.Chrome('../webdriver/chromedriver_win.exe')
else:
    # Without this branch `driver` would stay undefined and the next cell
    # would fail with an unhelpful NameError.
    raise RuntimeError(f'No bundled chromedriver for platform {sys.platform!r}')

In [3]:
# Load the Bungol site in the Selenium-controlled browser window.
driver.get('https://bungol.ca')

In [4]:
# Sign in to Bungol.
# Credentials come from the environment so they are never stored in the
# notebook: set BUNGOL_USERNAME and BUNGOL_PASSWORD before starting Jupyter.
missing = [name for name in ('BUNGOL_USERNAME', 'BUNGOL_PASSWORD') if not os.environ.get(name)]
if missing:
    raise RuntimeError('Set these environment variables before logging in: ' + ', '.join(missing))

# Open the login form.
login = driver.find_element_by_link_text('Login/Signup')
login.click()

username = driver.find_element_by_name('username')
password = driver.find_element_by_name('password')

username.send_keys(os.environ['BUNGOL_USERNAME'])
password.send_keys(os.environ['BUNGOL_PASSWORD'])
password.send_keys(Keys.RETURN)  # pressing Return submits the form

# Gather properties

Choose an area and set the filter conditions on Bungol before you run the cells below.

In [94]:
# Collect the listing URLs behind every marker currently shown on the map.
urls = []

# Click the sidebar toggle unless its class already marks the sidebar as closed
# (presumably so the sidebar does not cover map markers -- TODO confirm).
leftSidebarClose = driver.find_element_by_id('leftSidebarClose')
if 'leftSidebarClosed' not in leftSidebarClose.get_attribute('class'):
    leftSidebarClose.click()

icons = driver.find_elements_by_css_selector('div.leaflet-marker-icon')
for icon in icons:
    # Hover over the marker, then read the links out of the #popup element.
    ActionChains(driver).move_to_element(icon).perform()
    sleep(1)  # give the popup time to render

    popup = driver.find_element_by_id('popup')
    for anchor in popup.find_elements_by_tag_name('a'):
        href = anchor.get_attribute('href')
        # Skip anchors without a target and links that were already collected,
        # so the detail scraper never receives None or visits a page twice.
        if href and href not in urls:
            urls.append(href)

    # Clear the popup
    driver.execute_script('arguments[0].innerHTML = ""', popup)

In [95]:
# Number of listing URLs collected from the map markers.
len(urls)

31

In [96]:
# Scrape every listing page in `urls` into one pandas Series per listing.
details = []

# Baseline costs: 70% of aunty's home
# (presumably monthly amounts -- they are added to monthly fees later; confirm)
standardExpenses = pd.Series({
    'Water': 56.21,
    'Heat': 98.41,
    'Insurance': 71.69,
    'CAC': 0,             # included in hydro
    'Hydro': 106.6
}) * 0.7

# Variables whose format is <div id="#...">value</div>
plainFields = {
    'Address': '#listingAddress',
    'PropertyType': '#listingPropertyType',
    'AvgHouseholdIncome': '#listingCommunityAverageIncome',
    'AvgHouseholdSize': '#listingCommunityHouseholdSize'
}

# Variables whose format is <div id="#...">label: value</div>
labelledFields = {
    'ListPrice': '#listingListPrice',
    'ListingDate': '#listingContractDate',
    'SoldDate': '#listingEndDate',
    'SquareFootage': '#listingSummarySqFt',
    'MaintenanceFee': '#listingSummaryMaintenanceFees',
    'PropertyTax': '#listingSummaryTaxes',
}

# Rows of the condo-fee table. 'Taxes' only sets a flag; every other item is
# an expense that drops to 0 when the table marks it as covered.
condoFeeItems = ['Water', 'Heat', 'Insurance', 'CAC', 'Hydro', 'Parking', 'Taxes']

for i, url in enumerate(urls):
    # Progress report every 10 listings
    if i % 10 == 0 and i > 0:
        print(f'{i} / {len(urls)}')

    driver.get(url)
    # Open the Community tab and wait (up to 5 s, polling every second) until
    # the average-income element contains a ',' before reading the page.
    driver.find_element_by_link_text('Community').click()
    wait = WebDriverWait(driver, 5, poll_frequency=1)
    wait.until(EC.text_to_be_present_in_element((By.ID, 'listingCommunityAverageIncome'), ','))

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # The Series only holds text at first, so give it an explicit object dtype
    # instead of relying on the implicit dtype of an empty Series.
    s = pd.Series(dtype=object)

    for variable, selector in plainFields.items():
        s[variable] = soup.select(selector)[0].get_text()

    for variable, selector in labelledFields.items():
        pieces = soup.select(selector)[0].get_text().split(':')
        # Anything that is not exactly "label: value" is recorded as missing.
        s[variable] = pieces[1] if len(pieces) == 2 else None

    # Some special cases
    # "<status> - <price>"; the price part may be absent
    pieces = soup.select('#listingStatus')[0].get_text().split(' - ')
    s['Status'], s['Price'] = pieces if len(pieces) == 2 else (pieces[0], None)

    # "<bedrooms>|<bathrooms>|<parking spots>"
    pieces = soup.select('#listingBedBath')[0].get_text().split('|')
    s['Bedrooms'], s['Bathrooms'], s['ParkingSpots'] = pieces if len(pieces) == 3 else (None, None, None)

    s = s.str.strip()

    # Monthly expense and condo fees, if applicable
    expenses = standardExpenses.copy()

    # Use the plural find_elements_*: it returns an empty list when the table
    # is missing, whereas find_element_* raises NoSuchElementException and
    # never returns None.
    if not driver.find_elements_by_id('listingCondoFeesTable'):
        s['IsTaxesIncluded'] = False
        expenses['Parking'] = 0
    else:
        for item in condoFeeItems:
            cell = driver.find_element_by_id(f'listingCondoFeesTable{item}')
            # A 'bg-success' class marks the item as covered by the condo fee.
            isCovered = 'bg-success' in (cell.get_attribute('class') or '')

            if item == 'Taxes':
                s['IsTaxesIncluded'] = isCovered
            elif isCovered:
                expenses[item] = 0

    s['UncoveredExpenses'] = expenses.sum()
    s['URL'] = url
    details.append(s)

10 / 31
20 / 31
30 / 31


In [97]:
# Combine the per-listing Series into one table: one row per scraped listing.
df = pd.DataFrame(details)

In [98]:
# Split "<unit> - <street address>" into separate Unit and Address columns.
parts = df['Address'].str.extract(r'(.\d+) - (\d.+)', expand=True)
parts.columns = ['Unit', 'Address']

# Rows without a unit prefix yield NaN in both captures. Fall back to a Unit
# stored by an earlier run of this cell, otherwise to '', so that re-running
# the cell does not wipe the units it extracted the first time.
previousUnit = df['Unit'] if 'Unit' in df.columns else ''
df['Unit'] = parts['Unit'].fillna(previousUnit)
df['Address'] = parts['Address'].combine_first(df['Address'])

In [99]:
# Shorten the property-type labels. `replace` (unlike `map`) leaves labels
# that are not listed here untouched instead of turning them into NaN, which
# also makes the cell safe to re-run.
df['PropertyType'] = df['PropertyType'].replace({
    'Condo Apartment': 'Condo',
    'Townhouse (Condo)': 'Townhouse',
    'Detached House': 'Detached'
})

In [100]:
# Parse the date columns one column at a time (not row by row), so each
# column gets a single consistent datetime dtype.
cols = ['ListingDate', 'SoldDate']
df[cols] = df[cols].apply(pd.to_datetime)

# Days between listing and sale, as a float (NaN when either date is missing).
df['DaysOnMarket'] = (df['SoldDate'] - df['ListingDate']) / pd.Timedelta(days=1)

In [101]:
# Strip everything except digits and '.' (currency symbols, thousands
# separators, ...) and convert the columns to numbers.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text unchanged.
cols = ['ListPrice', 'Price', 'MaintenanceFee', 'PropertyTax', 'AvgHouseholdIncome', 'AvgHouseholdSize']
df[cols] = df[cols].apply(lambda col: col.str.replace(r'[^0-9\.]', '', regex=True)).apply(pd.to_numeric)

In [102]:
# Average income per person: household income divided by household size.
df['AvgPersonalIncome'] = df['AvgHouseholdIncome'] / df['AvgHouseholdSize']

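The property tax reported on a listing tends to be lower than the real amount. We therefore take the *greater* of the reported property tax and the tax calculated from the list price at Toronto's property tax rate of 0.51% per year, divided by 12 to get a monthly amount.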

In [103]:
# Use whichever is larger: the tax scraped from the listing or the tax
# computed from the list price (0.51% per year, spread over 12 months).
# The row-wise max ignores NaN, so a missing value falls back to the other.
reportedTax = df['PropertyTax']
calculatedTax = df['ListPrice'] * 0.0051 / 12
taxCandidates = pd.concat(
    [reportedTax, calculatedTax],
    axis=1,
    keys=['ReportedTax', 'CalculatedTax'],
)
df['PropertyTax'] = taxCandidates.max(axis=1)

In [104]:
# Maintenance fee plus the expenses it does not cover; a missing value counts as 0.
baseExpenses = df['MaintenanceFee'].fillna(0) + df['UncoveredExpenses'].fillna(0)

# If maintenance fee doesn't include property tax, add that in
taxesExcluded = df['IsTaxesIncluded'].eq(False)
df['MonthlyExpenses'] = baseExpenses + df['PropertyTax'].where(taxesExcluded, 0)

# Monthly expenses net of property tax and of the full baseline expense total.
df['TrueMaintenanceFee'] = (
    df['MonthlyExpenses']
    .sub(df['PropertyTax'])
    .sub(standardExpenses.sum())
)

In [105]:
def parse_bedrooms(value):
    """Return the total bedroom count for a scraped bedroom string.

    Accepts a single integer or integers joined by '+' (e.g. '2+1' -> 3),
    which gives the same result `eval` would for such strings. The text comes
    from a scraped web page, so it is parsed explicitly instead of being
    passed to `eval`, which would execute arbitrary expressions.

    Non-string values (missing data, or numbers from an earlier run of this
    cell) are returned unchanged; strings in any other format become NaN.
    """
    if not isinstance(value, str):
        return value
    if re.fullmatch(r'\s*\d+(?:\s*\+\s*\d+)*\s*', value) is None:
        return np.nan
    return sum(int(count) for count in re.findall(r'\d+', value))


df['Bedrooms'] = df['Bedrooms'].apply(parse_bedrooms)

cols = ['Bathrooms', 'ParkingSpots']
df[cols] = df[cols].apply(pd.to_numeric)

In [106]:
def make_hyperlink(cell):
    """Render a URL as an HTML "Link" anchor that opens in a new tab.

    The value is HTML-escaped before being placed in the href attribute, so a
    URL containing '&', quotes or angle brackets cannot break out of the
    attribute or produce invalid markup.

    Args:
        cell: The URL to link to; it is converted with str() first.

    Returns:
        The anchor element as an HTML string.
    """
    from html import escape  # local import keeps this cell self-contained

    return f'<a target="_blank" href="{escape(str(cell), quote=True)}">Link</a>'
    
# Columns shown in the summary table, in display order.
summaryColumns = [
    'PropertyType', 'Unit', 'Address', 'Status', 'SquareFootage',
    'ListPrice', 'MonthlyExpenses', 'TrueMaintenanceFee', 'URL',
]

# Per-column display formats; the URL column is rendered as a clickable link.
displayFormats = {
    'ListPrice': '{:,.0f}',
    'PropertyTax': '{:,.2f}',
    'MonthlyExpenses': '{:,.0f}',
    'TrueMaintenanceFee': '{:,.0f}',
    'AvgPersonalIncome': '{:,.0f}',
    'URL': make_hyperlink
}

# Lowest monthly expenses first, with a fresh 0..n-1 index.
cheapestFirst = df[summaryColumns].sort_values('MonthlyExpenses').reset_index(drop=True)
result = cheapestFirst.style.format(displayFormats)
result

Unnamed: 0,PropertyType,Unit,Address,Status,SquareFootage,ListPrice,MonthlyExpenses,TrueMaintenanceFee,URL
0,Condo,#212,"4200 Bathurst St S, North York",For Sale,800-899,534000,753,293,Link
1,Condo,#701,"2522 Keele St, North York",For Sale,800-899,499000,777,332,Link
2,Condo,#420,"565 Wilson Ave, North York",For Sale,700-799,549990,836,369,Link
3,Townhouse,,"27 Streamdale Crt, North York",For Sale,1400-1599,599000,837,350,Link
4,Townhouse,#9,"55 Cedarcroft Blvd, North York",For Sale,1200-1399,595000,872,386,Link
5,Condo,#1105,"1 De Boers Dr, North York",For Sale,800-899,599990,876,388,Link
6,Condo,#411,"525 Wilson Ave, North York",For Sale,800-899,599900,892,404,Link
7,Townhouse,#757,"5 Everson Dr, North York",Offer Pending,900-999,590000,910,426,Link
8,Condo,#205,"8 Pemberton Ave, North York",Offer Pending,900-999,550000,946,479,Link
9,Condo,#910,"16 Harrison Garden Blvd, North York",For Sale,800-899,599900,950,462,Link


# Export

In [107]:
# result.to_excel('./result.xlsx', index=False)

In [108]:
# Total of the baseline expense estimates (after the 0.7 scaling).
standardExpenses.sum()

233.03699999999998