In [59]:
import os
import re
import sys
from getpass import getpass
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# Start up

In [11]:
# Start Chrome through the chromedriver binary that matches the host OS.
# sys.platform is 'darwin' on macOS and 'win32' on Windows (never 'windows'),
# so the Windows check matches on the 'win' prefix.
if sys.platform == 'darwin':
    driver = webdriver.Chrome('../webdriver/chromedriver')
elif sys.platform.startswith('win'):
    driver = webdriver.Chrome('../webdriver/chromedriver.exe')
else:
    # Fail here with a clear message instead of leaving `driver` undefined
    # and hitting a NameError in a later cell.
    raise RuntimeError(
        f'No chromedriver path configured for platform {sys.platform!r}'
    )

In [12]:
# Point the browser session at the site's landing page.
BUNGOL_URL = 'https://bungol.ca'
driver.get(BUNGOL_URL)

In [13]:
# Credentials must never be stored in the notebook. They are taken from the
# BUNGOL_USERNAME / BUNGOL_PASSWORD environment variables when set, otherwise
# the user is prompted (the password prompt does not echo).
# NOTE(review): a password was previously committed in this cell and should be
# considered compromised; rotate it.
bungol_username = os.environ.get('BUNGOL_USERNAME') or input('Bungol username: ')
bungol_password = os.environ.get('BUNGOL_PASSWORD') or getpass('Bungol password: ')

# Open the login form.
login = driver.find_element_by_link_text('Login/Signup')
login.click()

username = driver.find_element_by_name('username')
password = driver.find_element_by_name('password')

# Fill in the form and submit it by pressing Return in the password field.
username.send_keys(bungol_username)
password.send_keys(bungol_password)
password.send_keys(Keys.RETURN)

# Gather properties

Choose an area and set the filter conditions in Bungol before you run the cells below.

In [108]:
urls = []

# Click the sidebar toggle unless its class list already marks it as closed.
sidebar_toggle = driver.find_element_by_id('leftSidebarClose')
if 'leftSidebarClosed' not in sidebar_toggle.get_attribute('class'):
    sidebar_toggle.click()

# Hover over every map marker and harvest the links shown in the popup.
for marker in driver.find_elements_by_css_selector('div.leaflet-marker-icon'):
    ActionChains(driver).move_to_element(marker).perform()
    sleep(1)  # presumably gives the popup time to render - TODO confirm

    popup = driver.find_element_by_id('popup')
    urls.extend(
        link.get_attribute('href')
        for link in popup.find_elements_by_tag_name('a')
    )

    # Empty the popup before moving on to the next marker.
    driver.execute_script('arguments[0].innerHTML = ""', popup)

In [109]:
# Number of links collected from the marker popups.
len(urls)

152

In [122]:
# Selectors for fields whose element text is the value itself.
PLAIN_FIELDS = {
    'Address': '#listingAddress',
    'PropertyType': '#listingPropertyType',
}

# Selectors for fields whose element text has the form "label: value".
LABELLED_FIELDS = {
    'ListPrice': '#listingListPrice',
    'ListingDate': '#listingContractDate',
    'SoldDate': '#listingEndDate',
    'SquareFootage': '#listingSummarySqFt',
    'MaintenanceFee': '#listingSummaryMaintenanceFees',
    'PropertyTax': '#listingSummaryTaxes',
}

# Amount counted for each expense when the condo fee does not cover it.
# NOTE(review): presumably monthly dollar estimates - confirm where these
# figures come from.
BASE_EXPENSES = pd.Series({
    'Water': 38.47,
    'Heat': 104.9,
    'Insurance': 63.72,
    'CAC': 0,             # included in hydro
    'Hydro': 98.46,
    'Parking': 200,
}, dtype='float')


def _element_text(soup, selector):
    """Return the text of the first element matching `selector`, or None if there is none."""
    element = soup.select_one(selector)
    return element.get_text() if element is not None else None


def _split(text, separator):
    """Split `text` on `separator`; a missing element (None) yields an empty list."""
    return [] if text is None else text.split(separator)


details = []

for url in urls:
    driver.get(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    s = pd.Series(dtype='object')

    for variable, selector in PLAIN_FIELDS.items():
        s[variable] = _element_text(soup, selector)

    # Keep the part after the colon; anything that is not exactly
    # "label: value" is recorded as missing.
    for variable, selector in LABELLED_FIELDS.items():
        parts = _split(_element_text(soup, selector), ':')
        s[variable] = parts[1] if len(parts) == 2 else None

    # Status is either "<status>" or "<status> - <price>".
    parts = _split(_element_text(soup, '#listingStatus'), ' - ')
    if len(parts) != 2:
        parts = (parts[0] if parts else None, None)
    s['Status'], s['Price'] = parts

    # Expected form: "<bedrooms>|<bathrooms>|<parking spots>".
    parts = _split(_element_text(soup, '#listingBedBath'), '|')
    s['Bedrooms'], s['Bathrooms'], s['ParkingSpots'] = (
        parts if len(parts) == 3 else (None, None, None)
    )

    s = s.str.strip()

    # Condo fee coverage, if applicable. find_elements_* returns an empty list
    # when nothing matches, whereas find_element_* raises NoSuchElementException.
    # Listings without the condo-fee table are skipped and not added to `details`.
    # NOTE(review): confirm that dropping such listings (rather than keeping
    # them with missing expense data) is the desired behavior.
    if not driver.find_elements_by_id('listingCondoFeesTable'):
        continue

    expenses = BASE_EXPENSES.copy()

    for item in ['Water', 'Heat', 'Insurance', 'CAC', 'Hydro', 'Parking', 'Taxes']:
        cell = driver.find_element_by_id(f'listingCondoFeesTable{item}')
        # The 'bg-success' class on the cell is taken to mean the item is covered.
        isCovered = 'bg-success' in cell.get_attribute('class')

        if item == 'Taxes':
            s['IsTaxesIncluded'] = isCovered
        elif isCovered:
            expenses[item] = 0

    s['UncoveredExpenses'] = expenses.sum()
    details.append(s)

In [123]:
# Combine the per-listing Series collected in `details` into a single DataFrame.
df = pd.DataFrame(details)

In [124]:
# Parse the two date columns. pd.to_datetime is applied once per column
# (the default axis) rather than once per row.
cols = ['ListingDate', 'SoldDate']
df[cols] = df[cols].apply(pd.to_datetime)

# Elapsed time between listing and sale, expressed in (fractional) days.
df['DaysOnMarket'] = (df['SoldDate'] - df['ListingDate']) / pd.Timedelta(days=1)

In [125]:
# Strip everything except digits and the decimal point, then convert to numbers.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged.
cols = ['ListPrice', 'Price', 'MaintenanceFee', 'PropertyTax']
df[cols] = (
    df[cols]
    .apply(lambda col: col.str.replace(r'[^0-9\.]', '', regex=True))
    .apply(pd.to_numeric)
)

The reported tax on the listing looks lower than it really is. We take the GREATER of the reported property tax and the tax calculated from Toronto's property tax rate (0.51% of the list price per year, divided by 12 to get a monthly amount).

In [126]:
# Use the larger of the reported tax and the tax derived from the list price
# (annual rate applied to the list price, divided over 12 months).
TAX_RATE = 0.0051
calculated_tax = df['ListPrice'] * TAX_RATE / 12
tax_candidates = pd.concat(
    {'ReportedTax': df['PropertyTax'], 'CalculatedTax': calculated_tax},
    axis=1,
)
df['PropertyTax'] = tax_candidates.max(axis=1)

In [127]:
# Maintenance fee plus the expenses the condo fee does not cover.
df['MonthlyExpenses'] = df['MaintenanceFee'].add(df['UncoveredExpenses'])

In [128]:
# Add the property tax for listings whose condo fee does not include it.
# MonthlyExpenses is rebuilt from its base columns instead of being
# incremented in place, so re-running this cell cannot add the tax twice.
cond = df['IsTaxesIncluded'] == False
df['MonthlyExpenses'] = (
    df['MaintenanceFee']
    + df['UncoveredExpenses']
    + df['PropertyTax'].where(cond, 0)
)

In [129]:
def _count_bedrooms(value):
    """Return the bedroom count for a scraped value such as '2' or '2+1'.

    Terms separated by '+' are summed. Values that are not strings (already
    numeric or missing) are returned unchanged, and an empty string becomes
    NaN. Raises ValueError for any other text.

    This replaces eval(), which must not be run on text scraped from a web page.
    """
    if not isinstance(value, str):
        return value
    if not value.strip():
        return np.nan
    try:
        return sum(int(term) for term in value.split('+'))
    except ValueError:
        raise ValueError(f'Unrecognised bedroom count: {value!r}') from None


df['Bedrooms'] = df['Bedrooms'].apply(_count_bedrooms)

cols = ['Bathrooms', 'ParkingSpots']
df[cols] = df[cols].apply(pd.to_numeric)

In [131]:
# Columns shown in the summary, ordered from cheapest to most expensive
# monthly expenses.
summary_columns = [
    'PropertyType', 'Address', 'Status', 'SquareFootage', 'ListPrice',
    'PropertyTax', 'MonthlyExpenses', 'Bedrooms', 'Bathrooms', 'ParkingSpots',
]
number_formats = {
    'ListPrice': '{:,.0f}',
    'MonthlyExpenses': '{:,.2f}',
}

summary = df[summary_columns].sort_values('MonthlyExpenses').reset_index(drop=True)
summary.style.format(number_formats)

Unnamed: 0,PropertyType,Address,Status,SquareFootage,ListPrice,PropertyTax,MonthlyExpenses,Bedrooms,Bathrooms,ParkingSpots
0,Condo Apartment,"#1114 - 3237 Bayview Ave, North York",For Sale,600-699,515000,218.875,467.93,2,2,1
1,Condo Apartment,"#715 - 872 Sheppard Ave W, North York",For Sale,800-899,549000,233.325,595.83,3,2,1
2,Condo Apartment,"#1907 - 205 Hilda Ave, North York",For Sale,900-999,519000,220.575,630.05,2,2,1
3,Townhouse (Condo),"#117 - 19 Coneflower Cres, North York",For Sale,700-799,495000,210.375,679.2,3,1,1
4,Condo Apartment,"#1711 - 205 Hilda Ave, North York",For Sale,1200-1399,549999,233.75,693.22,3,2,1
5,Condo Apartment,"#1211 - 233 Beecroft Rd, North York",For Sale,600-699,483999,205.7,702.17,2,1,1
6,Condo Apartment,"#708 - 3237 Bayview Ave, North York",For Sale,600-699,515000,218.875,708.81,2,2,1
7,Condo Apartment,"#608 - 3237 Bayview Ave, North York",For Sale,600-699,499000,212.075,715.0,2,2,1
8,Townhouse (Condo),"#356 - 3 Everson Dr, North York",For Sale,800-899,509000,216.325,744.8,2,1,1
9,Condo Apartment,"#108 - 26 Olive Ave, North York",Offer Pending,600-699,480000,204.0,748.47,2,1,1
