## Redfin Web Scraping Project

#### Minh (Mark) Le

### Import libraries

In [11]:
from bs4 import BeautifulSoup
import requests
import time
import datetime
from time import sleep
import smtplib
import random
import os
import pandas as pd

In [12]:
## Import Selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.remote.webelement import WebElement

### Parsing with BeautifulSoup

#### Get Information about Properties in Greater Vancouver Area

In [None]:
## Loop through the result pages to collect information about properties

In [3]:
## Get the number of result pages for a given city name
## Split the name on the comma and strip whitespace from each part (city, province)
def get_number_of_pages(city_name):
    """Return the number of result pages Redfin reports for a city.

    Args:
        city_name: City and province separated by a comma, for example
            ``'Vancouver, BC'``. Both parts are stripped and lower-cased
            before being placed in the URL.

    Returns:
        The integer parsed from the fifth whitespace-separated token of the
        ``viewingPage`` element on the city's search page.

    Raises:
        IndexError: If ``city_name`` contains no comma.
        requests.exceptions.RequestException: If the request fails or times out.
        ValueError: If the ``viewingPage`` element is missing or its text does
            not have a numeric fifth token.
    """
    name_parts = city_name.split(',')
    province = name_parts[1].strip().lower()
    city = name_parts[0].strip().lower()
    url = f'https://www.redfin.ca/{province}/{city}'
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9",
        'referer': 'https://www.redfin.ca/',
    }
    # A timeout keeps a stalled connection from blocking the scraper forever.
    response = requests.get(url=url, headers=header, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')

    pager = soup.find('div', {'class': 'viewingPage'})
    if pager is None:
        # Without this check the failure shows up as an AttributeError on None.
        raise ValueError(
            f'No "viewingPage" element found at {url} '
            f'(HTTP status {response.status_code})'
        )

    tokens = pager.get_text().split()
    if len(tokens) < 5:
        raise ValueError(f'Unexpected pager text at {url}: {pager.get_text()!r}')

    # The page total is the fifth token of the pager text.
    return int(tokens[4])

In [4]:
def get_page_url(city_name):
    """Build the Redfin search URL for a city.

    The URL depends only on ``city_name``, so it is built directly with no
    network access.

    Args:
        city_name: City and province separated by a comma, for example
            ``'Vancouver, BC'``. Both parts are stripped and lower-cased.

    Returns:
        A string of the form ``'https://www.redfin.ca/<province>/<city>/'``.

    Raises:
        IndexError: If ``city_name`` contains no comma.
    """
    name_parts = city_name.split(',')
    province = name_parts[1].strip().lower()
    city = name_parts[0].strip().lower()
    return f'https://www.redfin.ca/{province}/{city}/'

In [None]:
# Navigating to listings

In [None]:
# Scrape the property listings, using Selenium to click through each results page instead of looping over page URLs

In [None]:
def property_listing(city_name):
    """Scrape the listing cards for a city and append them to a CSV file.

    A Chrome browser driven by Selenium pages through the results by clicking
    the "next" button. After each click the browser's current URL is fetched
    with ``requests`` and parsed with BeautifulSoup. Price, address and home
    stats of every listing card are appended to ``data_<city_name>.csv``; the
    header row is written only when that file does not exist yet.

    Args:
        city_name: City and province separated by a comma, for example
            ``'Vancouver, BC'``.

    Returns:
        None. Results are written to disk.
    """
    base_url = get_page_url(city_name)
    # Fetch the page count once; asking again on every iteration costs an
    # extra HTTP request per page.
    total_pages = get_number_of_pages(city_name)

    # Browser-like request headers (Chrome on macOS).
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9",
    }
    csv_path = f'data_{city_name}.csv'
    next_button_xpath = '//*[@id="results-display"]/div[5]/div/div[3]/button[2]'

    driver = webdriver.Chrome('chromedriver.exe')
    try:
        driver.get(base_url)
        url = base_url

        for page_number in range(1, total_pages + 1):
            print(page_number)
            print(base_url)

            result = requests.get(url=url, headers=header, timeout=30)
            soup = BeautifulSoup(result.content, 'lxml')
            sleep(random.randint(20, 30))

            data = {
                'price_list': [],
                'address': [],
                'home_stat': []
            }
            for listing in soup.find_all('div', {'class': 'v2 interactive'}):
                try:
                    price_list = listing.find('span', {'class': 'homecardV2Price'}).get_text()
                    address = listing.find('div', {'class': 'link-and-anchor'}).text
                    home_stat = listing.find('div', {'class': 'HomeStatsV2 font-size-small'}).text
                except AttributeError:
                    # find() returned None: the card lacks one of the fields.
                    # Skip the whole card so the columns stay aligned.
                    continue
                data['price_list'].append(price_list)
                data['address'].append(address)
                data['home_stat'].append(home_stat)

            # Write the header row only the first time the file is created.
            write_header = not os.path.isfile(csv_path)
            pd.DataFrame(data).to_csv(csv_path, mode='a', header=write_header, index=False, encoding='utf-8')
            sleep(random.randint(20, 30))

            # Scroll down so the pagination controls are in view.
            driver.execute_script("window.scrollTo(0, window.scrollY + 15400);")
            sleep(random.randint(10, 15))

            if page_number == total_pages:
                break
            driver.find_element(By.XPATH, next_button_xpath).click()
            sleep(random.randint(20, 30))
            url = driver.current_url
    finally:
        # Always close the browser, including when a step above raises.
        driver.quit()

#### User defines city name and province of the area they want to scrape

In [49]:
# Cities to scrape, each written as '<city>, <province code>'.
city_name_list = [
    'Vancouver, BC',
    'Toronto, ON',
]

In [35]:
# no_pages = []
#
# [no_pages.append(get_number_of_pages(city_name)) for index, city_name in enumerate(city_name_list)]
#
# # for index, job_name in enumerate(jobs_titles_list):
# #     no_pages.append(get_no_pages(job_name))
#
# no_pages[index]

### Off market - Sold property

In [None]:
def get_number_of_pages_sold(city_name):
    """Return the number of result pages for a city's sold listings (1-year filter).

    Args:
        city_name: City and province separated by a comma, for example
            ``'Vancouver, BC'``. Both parts are stripped and lower-cased
            before being placed in the URL.

    Returns:
        The integer parsed from the fifth whitespace-separated token of the
        ``viewingPage`` element on the ``include=sold-1yr`` search page.

    Raises:
        IndexError: If ``city_name`` contains no comma.
        requests.exceptions.RequestException: If the request fails or times out.
        ValueError: If the ``viewingPage`` element is missing or its text does
            not have a numeric fifth token.
    """
    name_parts = city_name.split(',')
    province = name_parts[1].strip().lower()
    city = name_parts[0].strip().lower()
    url = f'https://www.redfin.ca/{province}/{city}/filter/include=sold-1yr/'
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9",
        'referer': 'https://www.redfin.ca/',
    }
    # A timeout keeps a stalled connection from blocking the scraper forever.
    response = requests.get(url=url, headers=header, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')

    pager = soup.find('div', {'class': 'viewingPage'})
    if pager is None:
        # Without this check the failure shows up as an AttributeError on None.
        raise ValueError(
            f'No "viewingPage" element found at {url} '
            f'(HTTP status {response.status_code})'
        )

    tokens = pager.get_text().split()
    if len(tokens) < 5:
        raise ValueError(f'Unexpected pager text at {url}: {pager.get_text()!r}')

    # The page total is the fifth token of the pager text.
    return int(tokens[4])

In [None]:
def sold_property(city_name):
    """Scrape sold-listing cards for a city and append them to a CSV file.

    Selenium opens redfin.ca, searches for ``city_name``, applies the
    "sold / 1 year" filter and signs in with the module-level ``username`` and
    ``password``. It then pages through the results with the "next" button.
    For each page the browser's current URL is fetched with ``requests`` and
    parsed with BeautifulSoup. Price, address, home stats and sold date of
    every card are appended to ``data_<city_name>.csv``.

    Args:
        city_name: City and province separated by a comma, for example
            ``'Vancouver, BC'``.

    Returns:
        None. Results are written to disk.
    """
    # Browser-like request headers (Chrome on macOS).
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Shared XPath prefix of the elements inside the login dialog.
    login_form = '/html/body/div[8]/div/div[2]/div/div/div/div[2]/div/div/div/div[1]/div/div'
    next_button_xpath = '//*[@id="results-display"]/div[5]/div/div[3]/button[2]'
    sold_pill_class = 'Pill Pill--sold padding-vert-smallest padding-horiz-smaller font-size-smaller font-weight-bold font-color-white HomeSash margin-right-smaller margin-top-smallest'
    # The header check below must test the same file that is written, or the
    # header-row decision is based on the wrong file.
    csv_path = f'data_{city_name}.csv'

    driver = webdriver.Chrome('chromedriver.exe')
    try:
        driver.get('https://www.redfin.ca/')
        sleep(random.randint(10, 15))

        # Search for the city from the home page.
        search_bar = driver.find_element(By.XPATH, '//*[@id="search-box-input"]')
        search_bar.clear()
        search_bar.send_keys(city_name)
        search_bar.send_keys(Keys.RETURN)
        sleep(random.randint(5, 8))

        # Open the drop-down menu.
        driver.find_element(By.XPATH, '//*[@id="sidepane-header"]/div/div[1]/form/div[1]/div').click()
        sleep(random.randint(2, 5))
        # Select "sold".
        driver.find_element(By.XPATH, '//*[@id="solds-expandable-segment"]/div[1]/div/span/span/div/div/label/span').click()
        sleep(random.randint(2, 5))
        # Choose the 1-year window.
        driver.find_element(By.XPATH, '//*[@id="solds-expandable-segment"]/div[1]/div/div[5]/span/span/div/div/label/span').click()
        # Click "done".
        driver.find_element(By.XPATH, '//*[@id="sidepane-header"]/div/div[1]/form/div[1]/div[2]/div[1]/div/div[2]/div/button[2]').click()
        sleep(random.randint(3, 5))

        # Open the login dialog and enter the e-mail address.
        driver.find_element(By.XPATH, '//*[@id="header-content"]/header[2]/div[2]/div[5]/button').click()
        sleep(random.randint(3, 5))
        driver.find_element(By.XPATH, login_form + '/form/div/div[1]/div/span/span').click()
        sleep(random.randint(1, 2))
        driver.find_element(By.XPATH, login_form + '/form/div/div[1]/div/span/span/div/input').send_keys(username)
        sleep(random.randint(1, 2))
        # Click "continue with email".
        driver.find_element(By.XPATH, login_form + '/form/div/div[1]/button').click()
        sleep(random.randint(3, 5))
        # Click "sign in with email instead" and enter the password.
        driver.find_element(By.XPATH, login_form + '/div[3]/button').click()
        sleep(random.randint(1, 2))
        driver.find_element(By.XPATH, login_form + '/form/div/div[2]/span/span/div/input').send_keys(password)
        sleep(random.randint(1, 2))
        # Click "continue with email" to submit.
        driver.find_element(By.XPATH, login_form + '/form/div/div[4]/button').click()

        # Fetch the page count once; asking again on every iteration costs an
        # extra HTTP request per page.
        total_pages = get_number_of_pages_sold(city_name)

        for page_number in range(1, total_pages + 1):
            # NOTE(review): this request passes only headers, not the browser's
            # cookies, so it does not appear to reuse the Selenium login session.
            # Parsing driver.page_source may be what is intended; confirm.
            result = requests.get(url=driver.current_url, headers=header, timeout=30)
            soup = BeautifulSoup(result.content, 'lxml')
            sleep(random.randint(20, 30))

            data = {
                'price_list': [],
                'address': [],
                'home_stat': [],
                'date_sold': []
            }
            for listing in soup.find_all('div', {'class': 'v2 interactive'}):
                try:
                    price_list = listing.find('span', {'class': 'homecardV2Price'}).get_text()
                    address = listing.find('div', {'class': 'link-and-anchor'}).text
                    home_stat = listing.find('div', {'class': 'HomeStatsV2 font-size-small'}).text
                    date_sold = listing.find('div', {'class': sold_pill_class}).text
                except AttributeError:
                    # find() returned None: the card lacks one of the fields.
                    # Skip the whole card so the columns stay aligned.
                    continue
                data['price_list'].append(price_list)
                data['address'].append(address)
                data['home_stat'].append(home_stat)
                data['date_sold'].append(date_sold)

            # Write the header row only the first time the file is created.
            write_header = not os.path.isfile(csv_path)
            pd.DataFrame(data).to_csv(csv_path, mode='a', header=write_header, index=False, encoding='utf-8')
            sleep(random.randint(20, 30))

            # Scroll down so the pagination controls are in view.
            driver.execute_script("window.scrollTo(0, window.scrollY + 15400);")
            sleep(random.randint(10, 15))

            if page_number == total_pages:
                break
            driver.find_element(By.XPATH, next_button_xpath).click()
            sleep(random.randint(20, 30))
    finally:
        # Always close the browser, including when a step above raises.
        driver.quit()

In [61]:
# Redfin login credentials are read from the environment so that no secret is
# stored in the notebook. Set REDFIN_USERNAME and REDFIN_PASSWORD before
# running the scraper; each falls back to an empty string when unset.
username = os.environ.get('REDFIN_USERNAME', '')
password = os.environ.get('REDFIN_PASSWORD', '')

In [62]:
## Main scraping loop: collect sold-property data for every configured city
for city_name in city_name_list:
    # Active-listing scrape is switched off; uncomment to run it as well.
    # property_listing(city_name)
    sold_property(city_name)

  driver = webdriver.Chrome('chromedriver.exe')


PermissionError: [Errno 13] Permission denied: 'data_Toronto, ON.csv'

### Exploratory Data Analysis

In [None]:
# Load each city's scraped CSV into a DataFrame, keyed by city name.
# Entries are assigned directly instead of going through exec(): a generated
# name such as `df_Vancouver, BC` contains a comma, which Python parses as
# tuple unpacking rather than as one variable.
# NOTE(review): create_df is not defined in this part of the notebook;
# presumably it reads the CSV at the given path. Confirm.
dfdict = {}
for city_name in city_name_list:
    dfdict[city_name] = create_df(f'data_{city_name}.csv')