In [1]:
import pip
import time
import os
import numpy as np
import pandas as pd
import datetime as dt
from sqlalchemy import create_engine
import re

import sys
from operator import itemgetter

from bs4 import BeautifulSoup as bs

# Make sure selenium is importable BEFORE any of its sub-modules are imported.
# Previously the sub-module imports ran first, so a missing selenium raised
# ImportError before this fallback could ever be reached.
try:
    from selenium import webdriver
except ImportError:
    # pip.main() was removed from pip's public API (pip 10); running pip as a
    # module of the current interpreter is the supported way to install.
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'selenium'])
    from selenium import webdriver

from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from IPython.display import clear_output
pd.options.display.float_format = '{:,.0f}'.format

def clear():
    """Clear the terminal screen.

    Uses ``cls`` on Windows and ``clear`` on every other platform, so the
    helper no longer emits a "command not found" error outside Windows.
    """
    os.system('cls' if os.name == 'nt' else 'clear')

# Progress bar function unrelated to web scraping
def update_progress(progress, desc_string):
    """Redraw a one-line text progress bar in the cell output.

    Parameters
    ----------
    progress : int or float
        Completed fraction. Ints are converted to float, any other
        non-float value is treated as 0, and the result is clamped to the
        range 0..1.
    desc_string : str
        Free text shown after the "Last Post:" label.

    The previous cell output is cleared (``clear_output(wait=True)``) and the
    new bar is printed; nothing is returned.
    """
    bar_length = 60

    # Normalise the input to a number between 0 and 1.
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    # clear()
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}% Last Post: {2}".format(bar, fraction * 100, desc_string))


# Folder that contains chromedriver.exe. Set the ZOLO_CHROME_DIR environment
# variable to point somewhere else without editing the notebook; the original
# location stays the default so existing setups keep working.
chrome_dir = os.environ.get('ZOLO_CHROME_DIR', r'D:\Zolo Data')

# Chrome launch flags used when the browser session is created.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")



In [9]:
def listing_summary(listing, idx):
    """Build a one-row DataFrame describing a single listing card.

    Parameters
    ----------
    listing : selenium WebElement
        One listing card (callers pass the ``<article>`` elements found in
        the listings wrapper).
    idx : int
        Position of the card on the page. Only used as a fallback when the
        address link or price cannot be found inside the card itself.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns Address, Neighbourhood, Price, Beds,
        Baths, Sqft, BuiltIn and Url. Every value is stored as a string
        because the record passes through ``np.array``.

    Raises
    ------
    Exception
        Lookup or shape errors (e.g. a card whose detail list does not yield
        exactly four values) propagate to the caller, which skips the card.
    """
    leading_junk = re.compile(r"^\W+")
    cols = ['Address','Neighbourhood','Price','Beds','Baths','Sqft','BuiltIn','Url']

    def scoped(xpath):
        # An XPath that starts with "//" is evaluated against the whole
        # document even when called on an element, so prefix it with "." to
        # stay inside this card. Fall back to the old document-wide lookup
        # by position if the card itself has no match.
        hits = listing.find_elements(By.XPATH, '.' + xpath)
        if hits:
            return hits[0]
        return listing.find_elements(By.XPATH, xpath)[idx]

    addr_link = scoped('//a[@class="address link-primary"]')
    listing_url = addr_link.get_attribute('href')
    listing_addr = addr_link.text

    try:
        listing_nbhood = leading_junk.sub("", listing.find_element(By.CLASS_NAME, 'neighbourhood').text.strip())
    except Exception:
        # Neighbourhood is optional on a card; keep the row with a blank value.
        listing_nbhood = ''

    listing_price = int(scoped('//span[@itemprop="price"]').get_attribute('value'))

    # The details container is the first <div> whose class mentions "--details".
    details_div = next(div for div in listing.find_elements(By.TAG_NAME, 'div')
                       if '--details' in (div.get_attribute('class') or ''))
    detail_items = details_div.find_element(By.TAG_NAME, 'ul').find_elements(By.TAG_NAME, 'li')

    # Drop the price entry (already captured above) and strip decoration.
    listing_dets = [leading_junk.sub("", li.text).replace('Built in ','')
                    for li in detail_items
                    if 'price' not in (li.get_attribute('class') or '')]

    record = [listing_addr, listing_nbhood, listing_price] + listing_dets + [listing_url]

    # np.array turns the mixed record into strings, matching the text values
    # already stored in the listings table.
    listing_df = pd.DataFrame(np.array(record).reshape(-1, len(record)), columns = cols)

    return listing_df

def get_page_listings(site, browser):
    """Scrape every listing card on one results page.

    Parameters
    ----------
    site : str
        URL of the results page to load.
    browser : selenium WebDriver
        Open browser session used to load the page.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed card (see ``listing_summary``), with
        a fresh 0..n-1 index. Empty when no card could be parsed.

    Raises
    ------
    selenium TimeoutException
        If the ``listings-wrapper`` element is not visible within 5 seconds.
    """
    browser.get(site)

    browser.delete_all_cookies()

    wrapper = WebDriverWait(browser, 5).until(
        EC.visibility_of_element_located((By.CLASS_NAME, 'listings-wrapper')))
    cards = wrapper.find_elements(By.TAG_NAME, 'article')

    # Collect the per-card frames and concatenate once; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for idx, listing in enumerate(cards):
        try:
            rows.append(listing_summary(listing, idx))
        except Exception:
            # Best effort: a card that cannot be parsed is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            continue

    if not rows:
        return pd.DataFrame()

    return pd.concat(rows, ignore_index = True)

def neighbourhood_listings(browser, site):
    """Scrape all result pages of one search URL and store them.

    Parameters
    ----------
    browser : selenium WebDriver
        Open browser session.
    site : str
        Base search URL; pages are requested as ``site + '/page-N'``.

    Returns
    -------
    pandas.DataFrame
        De-duplicated listings from every page, with a ``PullDate`` column
        set to today's date.

    Side effects
    ------------
    Appends the rows to the ``listings`` table through the module-level
    ``zolo_engine`` and redraws the progress bar after each page.
    """
    browser.get(site)

    browser.delete_all_cookies()

    # The pagination links live in the supplementary nav section. A search
    # with a single page of results may have no section or no numbered links,
    # so fall back to one page instead of failing on max() of an empty list.
    navs = browser.find_elements(By.XPATH, '//section[@class="supplementary-nav xs-mt5 xs-mb6 sm-mt5 sm-mb6 xs-flex xs-flex-column xs-flex-align-center"]')
    page_numbers = []
    for nav in navs[:1]:
        for link in nav.find_elements(By.TAG_NAME, 'a'):
            label = link.text
            if label.isnumeric():
                page_numbers.append(int(label))
    max_pages = max(page_numbers, default = 1)

    page_frames = []

    start_time = time.time()

    for i in range(1, max_pages + 1):
        curr_site = site + '/page-{}'.format(i)
        page_frames.append(get_page_listings(curr_site, browser))

        run_time = round(time.time() - start_time, 2)

        print(site)
        update_progress((i)/max_pages, '{0} Seconds {1} '.format(run_time, 'page-{}'.format(i)))

    # Concatenate once at the end (DataFrame.append was removed in pandas 2.0).
    page_frames = [frame for frame in page_frames if not frame.empty]
    listings_df = pd.concat(page_frames) if page_frames else pd.DataFrame()

    listings_df = listings_df.drop_duplicates().reset_index(drop = True)
    listings_df['PullDate'] = dt.datetime.today().date()

    # Writing an empty frame could create a `listings` table that only has a
    # PullDate column, so only write when something was scraped.
    if not listings_df.empty:
        listings_df.to_sql('listings',
                           con = zolo_engine,
                           index = False,
                           if_exists = 'append')

    return listings_df

def listing_details(browser, url):
    """Scrape the summary facts and the price-history table of one listing.

    Parameters
    ----------
    browser : selenium WebDriver
        Open browser session.
    url : str
        Listing page to load.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``summary``: one row whose columns are the cleaned labels of the
        ``<dl>`` entries in the listing-content section.
        ``price_df``: one row per data row of the trends table, with columns
        taken from the most recent multi-cell header row (positional labels
        if no header row was seen). Empty when the table has no data rows.

    Raises
    ------
    IndexError / selenium NoSuchElementException
        If the listing-content section or the trends table is missing.
    """
    leading_junk = re.compile(r"^\W+")

    def clean_cols(c):
        # Any label mentioning MLS collapses to 'MLS'; otherwise strip
        # leading non-word characters.
        if 'MLS' in c:
            return 'MLS'
        return leading_junk.sub("", c)

    browser.get(url)

    summary = browser.find_elements(By.XPATH, '//section[@class="section-listing-content min-width-0"]')[0].find_element(By.TAG_NAME, 'div')
    summary = pd.DataFrame([(dl.find_element(By.CLASS_NAME, 'column-label').text,
                             dl.find_element(By.CLASS_NAME, 'column-value').text)
                            for dl in summary.find_elements(By.TAG_NAME, 'dl')]).T
    # Row 0 holds the labels and row 1 the values: promote the labels to headers.
    summary.columns = summary.iloc[0,:]
    summary = summary.iloc[1:,:]
    summary.columns = [clean_cols(label) for label in summary.columns]

    pricing = browser.find_element(By.XPATH, '//section[@class="listing-trends-table"]')
    pricing = bs(pricing.find_element(By.TAG_NAME, 'table').get_attribute('outerHTML'),'lxml')

    headers = None   # stays None until a multi-cell header row is seen
    price_rows = []

    for tr in pricing.find_all('tr'):
        header_cells = tr.find_all('th')
        if len(header_cells) > 1:
            headers = [clean_cols(th.text) for th in header_cells]
        elif len(header_cells) == 1:
            # Single-header rows are skipped, as before.
            continue
        else:
            cells = [td.text.strip().replace('\n',' ') for td in tr.find_all('td')]
            if not cells:
                # A <tr> without any cells carries no data.
                continue
            price_rows.append(pd.DataFrame([cells], columns = headers))

    # Concatenate once (DataFrame.append was removed in pandas 2.0).
    price_df = pd.concat(price_rows, ignore_index = True) if price_rows else pd.DataFrame()

    return summary, price_df

In [66]:
# SQLite database file (zolo.db, relative to the working directory) that
# receives the scraped tables.
zolo_engine = create_engine('sqlite:///zolo.db', echo=False)

# Build the driver path with os.path.join instead of a hard-coded backslash.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object -- confirm the installed selenium version before upgrading.
browser = webdriver.Chrome(executable_path = os.path.join(chrome_dir, "chromedriver.exe"), options=options)


In [4]:
# Search pages (3-bedroom filter) to scrape, in the order they are visited.
sites = [
    'https://www.zolo.ca/vancouver-real-estate/3-bedroom',
    'https://www.zolo.ca/richmond-real-estate/3-bedroom',
    'https://www.zolo.ca/delta-real-estate/3-bedroom',
    'https://www.zolo.ca/tsawwassen-real-estate/3-bedroom',
    'https://www.zolo.ca/white-rock-real-estate/3-bedroom',
    'https://www.zolo.ca/surrey-real-estate/3-bedroom',
]

# Per-city names for the same URLs, unpacked in list order.
(van_site,
 rich_site,
 delt_site,
 tsaw_site,
 wrock_site,
 surr_site) = sites

In [10]:
# Scrape every configured search URL in turn. neighbourhood_listings appends
# each result set to the `listings` table itself, so `site_listings` only
# holds the frame of the last URL once the loop finishes.
for site in sites:
    site_listings = neighbourhood_listings(browser, site)

Progress: [############################################################] 100.0% Last Post: 511.21 Seconds page-70 


In [14]:
# Listings with BuiltIn > 2010, more than 3000 sqft and a price under 8,000,000.
# String literals use single quotes: in SQL, double quotes denote identifiers
# and SQLite only accepts them as strings as a legacy fallback.
listings_query = """
SELECT *
FROM listings
WHERE BuiltIn > 2010
AND CAST(REPLACE(Sqft, ' sqft', '') AS INT) > 3000
AND CAST(Price AS INT) < 8000000
"""

listings_df = pd.read_sql(listings_query,
                          con = zolo_engine,
                          parse_dates = ['PullDate'])

# PullDate is parsed into datetime values, so compare against a Timestamp at
# midnight today. Newer pandas releases no longer coerce a datetime.date for
# this comparison, which would make the filter match no rows at all.
pull_date_today = pd.Timestamp.today().normalize()
listings_df = listings_df[listings_df['PullDate'] == pull_date_today].reset_index(drop = True)

# Addresses look like "<street>, <city>, <province>"; the city is the second part.
listings_df['City'] = listings_df.Address.str.split(', ').str[1]

In [17]:
# Visit every selected listing and collect its summary facts and price
# history. Frames are gathered in lists and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
detail_frames = []
price_frames = []

start_time = time.time()

for i, (idx, row) in enumerate(listings_df.iterrows(), start = 1):
    curr_sum, curr_price = listing_details(browser, row.Url)
    curr_sum['Url'] = row.Url
    curr_price['Url'] = row.Url

    detail_frames.append(curr_sum)
    price_frames.append(curr_price)

    run_time = round(time.time() - start_time, 2)

    update_progress((i)/len(listings_df), '{0} Seconds {1} '.format(run_time, row.Address))

details_df = pd.concat(detail_frames, ignore_index = True) if detail_frames else pd.DataFrame()
prices_df = pd.concat(price_frames, ignore_index = True) if price_frames else pd.DataFrame()

# The price table has a column with a blank header; drop it when present
# instead of raising KeyError when it is absent.
prices_df = prices_df.drop(columns = [''], errors = 'ignore')

details_df['PullDate'] = dt.datetime.today().date()
prices_df['PullDate'] = dt.datetime.today().date()

# Skip the write for an empty frame so no PullDate-only table gets created.
if not details_df.empty:
    details_df.to_sql('listing_details',
                      con = zolo_engine,
                      index = False,
                      if_exists = 'append')
if not prices_df.empty:
    prices_df.to_sql('price_history',
                     con = zolo_engine,
                     index = False,
                     if_exists = 'append')

Progress: [############################################################] 100.0% Last Post: 817.11 Seconds 17962 67 Avenue, Surrey, BC 


In [23]:
# Neighbourhood names kept for Vancouver / Surrey listings.
van_sur_hoods = [
    'Fraser Heights', 'Cedar Hills', 'Crescent Bch Ocean Pk.',
    'Panorama Ridge', 'White Rock', 'Guildford',
    'Royal Heights', 'MacKenzie Heights', 'Fraser VE',
    'Marpole', 'Arbutus', 'S.W. Marine',
    'South Vancouver', 'Cambie', 'Yaletown',
    'Dunbar', 'Point Grey', 'Kerrisdale',
    'Kitsilano', 'Quilchena', 'South Cambie',
    'Killarney VE', 'South Granville', 'Shaughnessy',
    'Fraserview VE', 'Southlands', 'Oakridge VW',
]

# Every neighbourhood seen in any other city is accepted as well, so the
# explicit list above only restricts Vancouver and Surrey.
hoods = van_sur_hoods + (
    listings_df.loc[~listings_df.City.isin(['Vancouver','Surrey']), 'Neighbourhood']
    .drop_duplicates()
    .tolist()
)

In [57]:
def _pulled_today(frame):
    """Return the rows of `frame` whose PullDate is today, with a fresh index."""
    # PullDate is parsed into datetime values, so compare against a Timestamp
    # at midnight today. Newer pandas releases no longer coerce a
    # datetime.date for this comparison, which would match no rows at all.
    today = pd.Timestamp.today().normalize()
    return frame[frame['PullDate'] == today].reset_index(drop = True)


# Listings with BuiltIn > 2010, more than 3000 sqft and a price under 8,000,000.
# String literals use single quotes: in SQL, double quotes denote identifiers.
listings_query = """
SELECT *
FROM listings
WHERE BuiltIn > 2010
AND CAST(REPLACE(Sqft, ' sqft', '') AS INT) > 3000
AND CAST(Price AS INT) < 8000000
"""

details_query = """
SELECT *
FROM listing_details
"""

price_query = """
SELECT *
FROM price_history
"""

# The listings table used to be read and filtered twice in this cell with
# identical code; one load is enough.
listings_df = _pulled_today(pd.read_sql(listings_query,
                                        con = zolo_engine,
                                        parse_dates = ['PullDate']))

listings_df['City'] = listings_df.Address.str.split(', ').str[1]
listings_df['Price'] = listings_df.Price.astype(float)
# Price per square foot; Sqft looks like "5251 sqft", so keep the leading number.
listings_df['SqftDollar'] = listings_df.Price/pd.to_numeric(listings_df.Sqft.str.split(' ').str[0])

listings_df = listings_df[listings_df.Neighbourhood.isin(hoods)].reset_index(drop = True)

details_df = _pulled_today(pd.read_sql(details_query,
                                       con = zolo_engine,
                                       parse_dates = ['PullDate']))

prices_df = _pulled_today(pd.read_sql(price_query,
                                      con = zolo_engine,
                                      parse_dates = ['PullDate']))


In [26]:
# Collect the "Building" fact block of every selected listing.
building_frames = []
# Listings whose building block could not be parsed; kept for inspection
# instead of being discarded silently.
building_info_failed_urls = []

# Started once, so the bar reports total elapsed time like the other loops.
start_time = time.time()

for n_done, (idx, row) in enumerate(listings_df.iterrows(), start = 1):
    browser.get(row.Url)

    try:
        candidates = [el for el in browser.find_elements(By.XPATH, "//*[contains(text(), 'Building')]")
                      if 'Type' not in el.text]
        # The second remaining match is used; its parent holds the label/value columns.
        columns = candidates[1].find_element(By.XPATH, '..').find_elements(By.CLASS_NAME, 'column')
        building_info = pd.DataFrame([(col.find_element(By.CLASS_NAME, 'column-label').get_attribute('innerHTML'),
                                       col.find_element(By.CLASS_NAME, 'column-value').find_element(By.TAG_NAME, 'span').get_attribute('innerHTML'))
                                      for col in columns]).T
        building_info.columns = building_info.iloc[0,:]
        building_info = building_info.iloc[1:,:]
        # Repeated labels would break the concatenation below; keep the first.
        building_info = building_info.loc[:, ~building_info.columns.duplicated()]
        building_info['Url'] = row.Url

        building_frames.append(building_info)
    except Exception:
        # Best effort: pages without a parsable block are skipped but recorded.
        building_info_failed_urls.append(row.Url)

    run_time = round(time.time() - start_time, 2)
    # Count finished listings (1..n) so the bar reaches 100% on the last one;
    # the zero-based index stopped just short of it.
    update_progress(n_done/len(listings_df), '{0} Seconds {1} '.format(run_time, row.City))

# Concatenate once (DataFrame.append was removed in pandas 2.0).
building_infos = pd.concat(building_frames) if building_frames else pd.DataFrame()

Progress: [############################################################] 99.9% Last Post: 0.18 Seconds Surrey 


In [35]:
# Columns kept from the listings + details join.
listings_cols = ['Address', 'Neighbourhood', 'Price', 'Beds', 'Baths', 'Sqft', 'BuiltIn',
                 'Url', 'PullDate', 'City', 'SqftDollar', 'Days on Site',
                 'Lot Size', 'Size','Style', 'Taxes',
                 'Type', 'Walk Score', 'Year Built']

#listings_df = listings_df[listings_df.City.isin(['Vancouver','Surrey']) == False]

# Both frames carry a PullDate column. Merging them as-is renames it to
# PullDate_x / PullDate_y, and selecting 'PullDate' then raises KeyError.
# Drop from the details side every non-key column the listings already have.
shared_cols = [col for col in details_df.columns
               if col in listings_df.columns and col != 'Url']
listing_detail = listings_df.merge(details_df.drop(columns = shared_cols),
                                   how = 'inner',
                                   on = 'Url')

# Attach the scraped "View" value to each listing, cheapest per sqft first.
pref = building_infos[['View','Url']].merge(listing_detail[listings_cols].sort_values('SqftDollar'),
                                            how = 'inner',
                                            on = 'Url')

#pref = pref[pref.View.isnull() == False].sort_values('SqftDollar').reset_index(drop = True)
#pref.View = pref.View.str.lower()

#pref.to_csv('zolo_prefs_2.csv')

Unnamed: 0,Address,Neighbourhood,Price,Beds,Baths,Sqft,BuiltIn,Url,PullDate,City,SqftDollar
376,"12121 101a Avenue, Surrey, BC",Cedar Hills,1500000,7 bed,8 bath,5251 sqft,2013,https://www.zolo.ca/surrey-real-estate/12121-1...,2020-04-04,Surrey,286
316,"10032 174a Street, Surrey, BC",Fraser Heights,1599888,11 bed,10 bath,5590 sqft,2020,https://www.zolo.ca/surrey-real-estate/10032-1...,2020-04-04,Surrey,286
325,"12129 100 Avenue, Surrey, BC",Cedar Hills,1249000,4 bed,3 bath,4357 sqft,9999,https://www.zolo.ca/surrey-real-estate/12129-1...,2020-04-04,Surrey,287
391,"10229 145 Street, Surrey, BC",Guildford,1951000,10 bed,8 bath,6719 sqft,2019,https://www.zolo.ca/surrey-real-estate/10229-1...,2020-04-04,Surrey,290
317,"12376 103a Avenue, Surrey, BC",Cedar Hills,1699000,7 bed,6 bath,5834 sqft,2019,https://www.zolo.ca/surrey-real-estate/12376-1...,2020-04-04,Surrey,291
392,"11679 96 Avenue, Surrey, BC",Royal Heights,1789000,9 bed,8 bath,6042 sqft,2017,https://www.zolo.ca/surrey-real-estate/11679-9...,2020-04-04,Surrey,296
375,"10024 174a Street, Surrey, BC",Fraser Heights,1568000,9 bed,7 bath,5294 sqft,2019,https://www.zolo.ca/surrey-real-estate/10024-1...,2020-04-04,Surrey,296
380,"17375 100 Avenue, Surrey, BC",Fraser Heights,1688888,8 bed,9 bath,5649 sqft,2019,https://www.zolo.ca/surrey-real-estate/17375-1...,2020-04-04,Surrey,299
345,"17716 102 Avenue, Surrey, BC",Fraser Heights,1348800,7 bed,6 bath,4454 sqft,2011,https://www.zolo.ca/surrey-real-estate/17716-1...,2020-04-04,Surrey,303
313,"10050 172 Street, Surrey, BC",Fraser Heights,1649000,9 bed,8 bath,5355 sqft,2020,https://www.zolo.ca/surrey-real-estate/10050-1...,2020-04-04,Surrey,308


In [63]:
# Review listings manually, cheapest price per square foot first.
pref = listings_df.sort_values('SqftDollar').reset_index(drop = True)

# 0 marks "not reviewed yet". The column is created with object dtype so the
# typed answers can be stored without changing the dtype of an int column.
pref['Preference'] = pd.Series(0, index = pref.index, dtype = object)

for idx, row in pref.iterrows():

    # Skip rows that already have an answer and all Surrey / Vancouver listings.
    if row.Preference != 0 or row.City == 'Surrey' or row.City == 'Vancouver':
        continue
    browser.get(row.Url)

    # Ignore stray whitespace so that e.g. "quit " still ends the session.
    x = input('Preference of Property (y or n): ').strip()

    if x.lower() == 'quit':
        break
    pref.loc[idx, 'Preference'] = x
    clear_output(wait = True)


Prefernce of Property (y or n): quit


In [72]:
# Resume the manual review: rows whose Preference is still 0 are shown again.
# Make sure the column can hold text answers without a dtype change.
pref['Preference'] = pref['Preference'].astype(object)

for idx, row in pref.iterrows():

    # Skip rows that already have an answer and all Surrey / Vancouver listings.
    if row.Preference != 0 or row.City == 'Surrey' or row.City == 'Vancouver':
        continue
    browser.get(row.Url)

    # Ignore stray whitespace so that e.g. "quit " still ends the session.
    x = input('Preference of Property (y or n): ').strip()

    if x.lower() == 'quit':
        break
    pref.loc[idx, 'Preference'] = x
    clear_output(wait = True)

Prefernce of Property (y or n): quit


In [73]:
# Write the reviewed preferences to a CSV whose name carries today's date.
pref_file_date = dt.datetime.today().strftime('%Y-%m-%d')
pref.to_csv('zolo_pref_{}.csv'.format(pref_file_date))