In [22]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
import sys
import re
import os

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Start up

In [2]:
if sys.platform == 'darwin':
    driver = webdriver.Chrome('../webdriver/chromedriver_mac')
elif sys.platform == 'win32':
    driver = webdriver.Chrome('../webdriver/chromedriver_win.exe')

In [3]:
driver.get('https://bungol.ca')

In [4]:
login = driver.find_element_by_link_text('Login/Signup')
login.click()

username = driver.find_element_by_name('username')
password = driver.find_element_by_name('password')

username.send_keys('scottw')
password.send_keys('zojzes-dycpoQ-2nidwy')
password.send_keys(Keys.RETURN)

# Gather properties

Choose an area and set filter conditions in bungol before you run the cells below

In [119]:
urls = []

leftSidebarClose = driver.find_element_by_id('leftSidebarClose')
if not ('leftSidebarClosed' in leftSidebarClose.get_attribute('class')):
    leftSidebarClose.click()

icons = driver.find_elements_by_css_selector('div.leaflet-marker-icon')
for icon in icons:
    actions = ActionChains(driver)
    actions.move_to_element(icon).perform()
    sleep(1)

    popup = driver.find_element_by_id('popup')
    urls += [a.get_attribute('href') for a in popup.find_elements_by_tag_name('a')]
    
    # Clear the popup
    driver.execute_script('arguments[0].innerHTML = ""', popup)

In [120]:
len(urls)

16

In [121]:
details = []

for i, url in enumerate(urls):
    if i % 10 == 0 and i > 0:
        print(f'{i} / {len(urls)}')
        
    driver.get(url)
    driver.find_element_by_link_text('Community').click()
    wait = WebDriverWait(driver, 5, poll_frequency=1)
    wait.until(EC.text_to_be_present_in_element((By.ID, 'listingCommunityAverageIncome'), ','))
    
    soup = BeautifulSoup(driver.page_source)
    s = pd.Series()

    # Variables whose format is <div id="#...">value</div>
    info = {
        'Address': '#listingAddress',
        'PropertyType': '#listingPropertyType',
        'AvgHouseholdIncome': '#listingCommunityAverageIncome',
        'AvgHouseholdSize': '#listingCommunityHouseholdSize'
    }
    
    for variable, selector in info.items():
        value = soup.select(selector)[0].get_text()
        s[variable] = value

    # Variables whose format is <div id="#...">label: value</div>
    info = {
        'ListPrice': '#listingListPrice',
        'ListingDate': '#listingContractDate',
        'SoldDate': '#listingEndDate',
        'SquareFootage': '#listingSummarySqFt',
        'MaintenanceFee': '#listingSummaryMaintenanceFees',
        'PropertyTax': '#listingSummaryTaxes',
    }

    for variable, selector in info.items():
        tmp = soup.select(selector)[0].get_text().split(':')
        value = tmp[1] if len(tmp) == 2 else None
        s[variable] = value

    # Some special cases
    tmp = soup.select('#listingStatus')[0].get_text().split(' - ')
    s['Status'], s['Price'] = tmp if len(tmp) == 2 else (tmp[0], None)

    tmp = soup.select('#listingBedBath')[0].get_text().split('|')
    s['Bedrooms'], s['Bathrooms'], s['ParkingSpots'] = tmp if len(tmp) == 3 else (None, None, None)
    
    s = s.str.strip()
    
    # Monthly expense and condo fees, if applicable
    expenses = pd.Series({
        'Water': 38.47,
        'Heat': 104.9,
        'Insurance': 63.72,
        'CAC': 0,             # included in hydro
        'Hydro': 98.46,
        'Parking': 200
    })
    
    if driver.find_element_by_id('listingCondoFeesTable') is None:
        s['IsTaxesIncluded'] = False
        expenses['Parking'] = 0
    else:
        for item in ['Water', 'Heat', 'Insurance', 'CAC', 'Hydro', 'Parking', 'Taxes']:
            tmp = driver.find_element_by_id(f'listingCondoFeesTable{item}')
            isCovered = 'bg-success' in tmp.get_attribute('class')

            if item == 'Taxes':
                s['IsTaxesIncluded'] = isCovered
            elif isCovered:
                expenses[item] = 0

    s['UncoveredExpenses'] = expenses.sum()
    s['URL'] = url
    details.append(s)

10 / 16


In [122]:
df = pd.DataFrame(details)

In [123]:
s = df['Address'].str.extract(r'(.\d+) - (\d.+)', expand=True)
s.columns = ['Unit', 'Address']

df['Unit'] = s['Unit'].fillna('')
df['Address'] = s['Address'].combine_first(df['Address'])

In [124]:
df['PropertyType'] = df['PropertyType'].map({
    'Condo Apartment': 'Condo',
    'Townhouse (Condo)': 'Townhouse',
    'Detached House': 'Detached'
})

In [125]:
cols = ['ListingDate', 'SoldDate']
df[cols] = df[cols].apply(pd.to_datetime, axis=1)
df['DaysOnMarket'] = (df['SoldDate'] - df['ListingDate']) / pd.Timedelta(days=1)

In [126]:
cols = ['ListPrice', 'Price', 'MaintenanceFee', 'PropertyTax', 'AvgHouseholdIncome', 'AvgHouseholdSize']
df[cols] = df[cols].apply(lambda col: col.str.replace(r'[^0-9\.]', '')).apply(pd.to_numeric)

In [127]:
df['AvgPersonalIncome'] = df['AvgHouseholdIncome'] / df['AvgHouseholdSize']

The reported tax on the listing tends to be lower than it really is. We take the *greater* of the reported property tax and Toronto's property tax rate of 0.51%.

In [128]:
df['PropertyTax'] = pd.DataFrame({
    'ReportedTax': df['PropertyTax'],
    'CalculatedTax': df['ListPrice'] * 0.0051 / 12
}).max(axis=1)

In [129]:
df['MonthlyExpenses'] = df['MaintenanceFee'].fillna(0) + df['UncoveredExpenses'].fillna(0)

cond = df['IsTaxesIncluded'] == False
df.loc[cond, 'MonthlyExpenses'] += df.loc[cond, 'PropertyTax']

In [130]:
df['Bedrooms'] = df['Bedrooms'].apply(eval)

cols = ['Bathrooms', 'ParkingSpots']
df[cols] = df[cols].apply(pd.to_numeric)

In [131]:
def make_hyperlink(cell):
    return f'<a target="_blank" href="{cell}">Link</a>'
    
result = df[['PropertyType', 'Unit', 'Address', 'Status', 'SquareFootage', 'ListPrice', 'MonthlyExpenses', 'Bedrooms', 'Bathrooms', 'ParkingSpots', 'URL']] \
            .query('MonthExpenses <= 800') \
            .sort_values('MonthlyExpenses').reset_index(drop=True) \
            .style.format({
                'ListPrice': '{:,.0f}',
                'PropertyTax': '{:,.2f}',
                'MonthlyExpenses': '{:,.0f}',
                'AvgPersonalIncome': '{:,.0f}',
                'URL': make_hyperlink
            })
result

Unnamed: 0,PropertyType,Unit,Address,Status,SquareFootage,ListPrice,MonthlyExpenses,Bedrooms,Bathrooms,ParkingSpots,URL
0,Condo,#2812,"7895 Jane St, Vaughan",Offer Pending,500-599,429900,320,2,1,1,Link
1,Condo,#1913,"7895 Jane St, Vaughan",For Sale,600-699,550000,371,2,2,1,Link
2,Condo,#307,"3600 Highway 7 Rd, Vaughan",Offer Pending,600-699,509000,810,2,1,1,Link
3,Condo,#2505,"3600 Highway 7, Vaughan",For Sale,800-899,594990,844,2,2,1,Link
4,Condo,#2902,"3700 Highway 7 Rd, Vaughan",For Sale,800-899,599999,852,2,2,1,Link
5,Condo,#614,"7895 Jane St, Vaughan",For Sale,600-699,530000,853,2,2,1,Link
6,Condo,#402,"2900 Highway 7 Rd, Vaughan",For Sale,800-899,549900,897,2,2,1,Link
7,Condo,#1002,"2910 Highway 7 Ave W, Vaughan",For Sale,800-899,576000,914,2,2,1,Link
8,Condo,#419,"2910 Highway 7 Rd, Vaughan",Offer Pending,800-899,529000,915,2,2,1,Link
9,Condo,#2307,"2910 Highway 7 Rd W, Vaughan",For Sale,700-799,549900,921,2,2,1,Link


# Export

In [1]:
result.to_excel('./result.xlsx', index=False)

NameError: name 'df' is not defined