In [376]:
import os
from getpass import getpass
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# Start up

In [377]:
# Start a Chrome session controlled by Selenium; the argument is the path to the
# chromedriver binary, relative to the notebook's working directory.
driver = webdriver.Chrome('../webdriver/chromedriver.exe')

In [378]:
# Load the Bungol home page in the automated browser.
driver.get('https://bungol.ca')

In [379]:
# Open the login form.
login = driver.find_element_by_link_text('Login/Signup')
login.click()

username = driver.find_element_by_name('username')
password = driver.find_element_by_name('password')

# Credentials are read from the environment, falling back to an interactive
# prompt, so that no secret is ever stored in the notebook (which is shared
# and committed together with its outputs).
bungol_username = os.environ.get('BUNGOL_USERNAME') or input('Bungol username: ')
bungol_password = os.environ.get('BUNGOL_PASSWORD') or getpass('Bungol password: ')

username.send_keys(bungol_username)
password.send_keys(bungol_password)
# Pressing RETURN in the password field submits the form.
password.send_keys(Keys.RETURN)

Now go to the website in the browser window and set the location, filtering criteria, etc. before coming back here.

# Gather properties

In [383]:
# Listing links harvested from the popup of every map marker.
urls = []

# Click the sidebar toggle unless its class list already contains 'leftSidebarClosed'.
sidebar_toggle = driver.find_element_by_id('leftSidebarClose')
if 'leftSidebarClosed' not in sidebar_toggle.get_attribute('class'):
    sidebar_toggle.click()

for marker in driver.find_elements_by_css_selector('div.leaflet-marker-icon'):
    # Hover over the marker, then pause for a second
    # (presumably to give the popup time to render - TODO confirm).
    ActionChains(driver).move_to_element(marker).perform()
    sleep(1)

    popup = driver.find_element_by_id('popup')
    for anchor in popup.find_elements_by_tag_name('a'):
        urls.append(anchor.get_attribute('href'))
    # Empty the popup before moving on
    # (presumably so stale links are not collected again on the next hover - TODO confirm).
    driver.execute_script('arguments[0].innerHTML = ""', popup)

In [384]:
# Number of listing URLs collected from the marker popups.
len(urls)

56

# Property detail

In [392]:
# One pandas Series per scraped listing.
details = []

# Condo-fee cells are looked up by the id 'listingCondoFeesTable<item>'.
CONDO_FEE_ITEMS = ['Water', 'Heat', 'Insurance', 'CAC', 'Hydro', 'Parking', 'Taxes']


def _element_text(soup, selector):
    """Return the text of the first element matching `selector`, or None if nothing matches."""
    matches = soup.select(selector)
    return matches[0].get_text() if matches else None


def _value_after_label(text):
    """Return the value part of a 'label: value' string, or None if there is no ':'.

    Only the first ':' separates label from value, so a value that itself
    contains a colon is kept intact.
    """
    if text is None:
        return None
    _, colon, value = text.partition(':')
    return value if colon else None


def _condo_fee_coverage(driver):
    """Return {item: True/False/None} for the condo-fee table on the current page.

    Returns None when the page has no condo-fee table. For each item the
    value is False if its cell carries 'bg-danger', True if it carries
    'bg-success', and None otherwise (including when the cell is missing).
    """
    # find_elements_* returns an empty list when nothing matches, whereas
    # find_element_* raises NoSuchElementException and never returns None.
    if not driver.find_elements_by_id('listingCondoFeesTable'):
        return None

    coverage = {}
    for item in CONDO_FEE_ITEMS:
        cells = driver.find_elements_by_id(f'listingCondoFeesTable{item}')
        classes = (cells[0].get_attribute('class') or '') if cells else ''
        if 'bg-danger' in classes:
            coverage[item] = False
        elif 'bg-success' in classes:
            coverage[item] = True
        else:
            coverage[item] = None
    return coverage


# Only the first three listings are scraped here.
for url in urls[:3]:
    driver.get(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    record = {}

    # Variables whose format is <div id="#...">value</div>
    plain_fields = {
        'Address': '#listingAddress',
        'PropertyType': '#listingPropertyType',
    }
    for variable, selector in plain_fields.items():
        record[variable] = _element_text(soup, selector)

    # Variables whose format is <div id="#...">label: value</div>
    labelled_fields = {
        'ListPrice': '#listingListPrice',
        'ListingDate': '#listingContractDate',
        'SoldDate': '#listingEndDate',
        'SquareFootage': '#listingSummarySqFt',
        'MaintenanceFee': '#listingSummaryMaintenanceFees',
        'PropertyTax': '#listingSummaryTaxes',
    }
    for variable, selector in labelled_fields.items():
        record[variable] = _value_after_label(_element_text(soup, selector))

    # Status is rendered as "status - price"; the price part may be absent.
    status_text = _element_text(soup, '#listingStatus')
    if status_text is None:
        record['Status'], record['Price'] = None, None
    else:
        status_parts = status_text.split(' - ')
        record['Status'], record['Price'] = (
            status_parts if len(status_parts) == 2 else (status_parts[0], None)
        )

    # Bedrooms, bathrooms and parking spots share one '|'-separated element.
    bed_bath_text = _element_text(soup, '#listingBedBath')
    bed_bath_parts = bed_bath_text.split('|') if bed_bath_text is not None else []
    record['Bedrooms'], record['Bathrooms'], record['ParkingSpots'] = (
        bed_bath_parts if len(bed_bath_parts) == 3 else (None, None, None)
    )

    # Strip whitespace from the text fields; this must happen before the
    # non-string Coverage entry is added.
    listing = pd.Series(record, dtype=object).str.strip()

    # Condo fee coverage, if applicable. A listing without the table is still
    # kept, with Coverage set to None.
    listing['Coverage'] = _condo_fee_coverage(driver)
    details.append(listing)

In [393]:
# Assemble the per-listing Series collected in `details` into one table (one row per listing).
df = pd.DataFrame(details)

In [394]:
# Parse the date columns. The conversion is applied column by column (the
# default axis) rather than row by row, so each column is parsed in one
# vectorised call and on its own terms, matching how the numeric columns
# are converted.
cols = ['ListingDate', 'SoldDate']
df[cols] = df[cols].apply(pd.to_datetime)

In [404]:
# Turn the money columns into numbers: drop every character that is not a
# digit or a dot, then parse what is left.
cols = ['ListPrice', 'Price', 'MaintenanceFee', 'PropertyTax']
df[cols] = (
    df[cols]
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave the text untouched.
    .apply(lambda col: col.str.replace(r'[^0-9\.]', '', regex=True))
    .apply(pd.to_numeric)
)

In [408]:
def _count_bedrooms(text):
    """Return the total of a bedroom string such as '4' or '4+1' as an int.

    Non-string values (missing values, or numbers from an earlier run of this
    cell) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Sum the '+'-separated parts explicitly instead of calling eval() on
    # text scraped from a web page.
    return sum(int(part) for part in text.split('+'))


df['Bedrooms'] = df['Bedrooms'].apply(_count_bedrooms)

cols = ['Bathrooms', 'ParkingSpots']
df[cols] = df[cols].apply(pd.to_numeric)

In [409]:
# Inspect the column dtypes to check the type conversions.
df.dtypes

Address                   object
PropertyType              object
ListPrice                  int64
ListingDate       datetime64[ns]
SoldDate          datetime64[ns]
SquareFootage             object
MaintenanceFee           float64
PropertyTax                int64
Status                    object
Price                    float64
Bedrooms                   int64
Bathrooms                  int64
ParkingSpots               int64
Coverage                  object
dtype: object

In [401]:
# Display the listings table.
# NOTE(review): the execution count (In [401]) is lower than those of the numeric
# conversion cells (In [404], In [408]), so the saved output may predate them - re-run to refresh.
df

Unnamed: 0,Address,PropertyType,ListPrice,ListingDate,SoldDate,SquareFootage,MaintenanceFee,PropertyTax,Status,Price,Bedrooms,Bathrooms,ParkingSpots,Coverage
0,"14 Hoover Cres, North York",Semi-Detached/Link,716000,2019-05-27,NaT,1500-2000,,214,Offer Pending,,4,2,3,"{'Water': None, 'Heat': None, 'Insurance': Non..."
1,"67 Hoover Cres, North York",Detached House,869000,2019-05-08,2019-06-07,No data,,293,Sold,835000.0,4+1,2,6,"{'Water': None, 'Heat': None, 'Insurance': Non..."
2,"380 Cook Rd, North York",Semi-Detached/Link,819000,2019-04-17,2019-05-24,2000-2500,,319,Sold,780000.0,4+4,5,2,"{'Water': None, 'Heat': None, 'Insurance': Non..."


In [407]:
def stylize(row):
    """Return one CSS declaration per cell of `row`.

    Every cell gets 'color: lightgray' when the row's 'Status' is 'Sold',
    and an empty style otherwise.
    """
    if row['Status'] == 'Sold':
        css = 'color: lightgray'
    else:
        css = ''
    # The same declaration is repeated once for each cell in the row.
    return np.repeat(css, len(row))
    
# Render the table with `stylize` applied to each row (axis=1).
df.style.apply(stylize, axis=1)

Unnamed: 0,Address,PropertyType,ListPrice,ListingDate,SoldDate,SquareFootage,MaintenanceFee,PropertyTax,Status,Price,Bedrooms,Bathrooms,ParkingSpots,Coverage
0,"14 Hoover Cres, North York",Semi-Detached/Link,716000,2019-05-27 00:00:00,NaT,1500-2000,,214,Offer Pending,,4,2,3,"{'Water': None, 'Heat': None, 'Insurance': None, 'CAC': None, 'Hydro': None, 'Parking': None, 'Taxes': None}"
1,"67 Hoover Cres, North York",Detached House,869000,2019-05-08 00:00:00,2019-06-07 00:00:00,No data,,293,Sold,835000.0,4+1,2,6,"{'Water': None, 'Heat': None, 'Insurance': None, 'CAC': None, 'Hydro': None, 'Parking': None, 'Taxes': None}"
2,"380 Cook Rd, North York",Semi-Detached/Link,819000,2019-04-17 00:00:00,2019-05-24 00:00:00,2000-2500,,319,Sold,780000.0,4+4,5,2,"{'Water': None, 'Heat': None, 'Insurance': None, 'CAC': None, 'Hydro': None, 'Parking': None, 'Taxes': None}"
