## project luther

Nathaniel Kelley

01/25/18

## imports and driver setup

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import re
from selenium.webdriver.chrome.options import Options
import pandas as pd


# location of the chromedriver binary that the scraping cells hand to webdriver.Chrome
chromedriver = "/home/nate/Bench/chromedriver"

# publish the same path through the process environment
os.environ.update({"webdriver.chrome.driver": chromedriver})

# scrape a single listing page

In [2]:
# first run of digits (optional thousands separators / decimals) in a price label
_PRICE_NUMBER = re.compile(r'\d[\d,]*(?:\.\d+)?')

# the word ending in "tached" that precedes " G" in the amenities text
_GARAGE_KIND = re.compile(r'\w*tached G')


def _leading_float(text):
    '''
    parse the first whitespace-delimited token of text as a float,
    ignoring thousands separators

    returns float(0) when text is empty or the token is not numeric
    '''
    try:
        return float(text.split()[0].replace(',', ''))
    except (ValueError, IndexError):
        return float(0)


def _parse_price(text):
    '''
    pull the dollar amount out of the price block, whatever label follows it

    returns float(0) when the block holds no number at all
    '''
    found = _PRICE_NUMBER.search(text)
    if found is None:
        return float(0)
    return float(found.group(0).replace(',', ''))


def _year_and_lot(key_details):
    '''
    read (year_built, lot_sqft) from the key details text

    relies on line position: after the word 'Built', the next line is
    taken as the year and the third line after it as the lot size
    either value is float(0) when it is missing or not numeric
    '''
    try:
        after_built = key_details.split('Built')[1].split('\n')
        year_built = float(after_built[1])
    except (IndexError, ValueError):
        # without a usable year the layout is unknown, so the lot size is not trusted either
        return float(0), float(0)

    lot_sqft = _leading_float(after_built[3]) if len(after_built) > 3 else float(0)
    return year_built, lot_sqft


def _garage_type(amenities):
    '''
    return the lower-cased word ending in "tached" that precedes " G"
    in the amenities text, or 'none' when there is no such word
    '''
    found = _GARAGE_KIND.search(amenities)
    if found is None:
        return 'none'
    return found.group(0).lower().split(' ')[0]


def _yearly_tax(driver, amenities):
    '''
    yearly property tax: the amount listed after 'Taxes:' in the amenities
    text when present, otherwise 12x the monthly figure shown in the
    mortgage calculator summary, otherwise float(0)
    '''
    listed = re.search(r'Taxes:(.*?)Tax', amenities)
    if listed is not None:
        try:
            return float(listed.group(1).replace(' ', '').replace('$', '').replace(',', ''))
        except ValueError:
            pass  # fall through to the mortgage calculator

    try:
        summary = driver.find_element_by_xpath('//*[@class="MortgageCalculatorSummary"]').text
        monthly = re.search(r'Taxes(.*?)H', summary.replace('\n', ' ')).group(1)
        return 12 * float(monthly.strip().replace('$', '').replace(',', ''))
    except Exception:
        # element missing or text not in the expected shape; Exception (not a bare
        # except) so that Ctrl-C still interrupts a long scrape
        return float(0)


def house_info(driver):

    '''
    FOR: https://www.redfin.com

    call once on the webpage for an individual house

    variables scraped include:
            address, home price, beds, baths, house size by sqft, lot size by sqft,
            year built, garage type, yearly property tax

    returns {full_address: [home_price, beds, baths, house_sqft, lot_sqft,
                            year_built, garage_type, yearly_tax]}
    keyed by address, so any address scraped twice won't be duplicated
    when the result is merged into another dict

    numeric fields that cannot be parsed come back as float(0) and a missing
    garage as 'none'; the address, price, beds, baths, sqft and key details
    elements themselves must exist, otherwise the driver's lookup error propagates
    '''

    address = driver.find_element_by_xpath('//*[@class="street-address"]').text
    csz = driver.find_element_by_xpath('//*[@class="citystatezip"]').text
    full_address = address + ', ' + csz

    home_price = _parse_price(
        driver.find_element_by_xpath('//*[@class="info-block price"]').text)

    beds = _leading_float(
        driver.find_element_by_xpath('//*[@data-rf-test-id="abp-beds"]').text)

    baths = _leading_float(
        driver.find_element_by_xpath('//*[@data-rf-test-id="abp-baths"]').text)

    house_sqft = _leading_float(
        driver.find_element_by_xpath('//*[@class="info-block sqft"]').text)

    year_built, lot_sqft = _year_and_lot(
        driver.find_element_by_xpath('//*[@class="keyDetailsList"]').text)

    # the amenities section is optional; with no text the garage is 'none'
    # and the tax lookup goes straight to the mortgage calculator
    try:
        amenities = driver.find_element_by_xpath('//*[@class="amenities-container"]').text
        amenities = amenities.replace('\n', ' ')
    except Exception:
        amenities = ''

    garage_type = _garage_type(amenities)
    yearly_tax = _yearly_tax(driver, amenities)

    return {full_address: [home_price, beds, baths,
                           house_sqft, lot_sqft, year_built,
                           garage_type, yearly_tax]}

# scraping every listing on a results page

In [3]:
def page_pull(driver):

    '''
    for page one

    open house, scrape, close, repeat

    walks listing cards MapHomeCard_0 .. MapHomeCard_19 on the results page
    currently loaded in driver; each card's button opens the listing in a
    second window, which is scraped with house_info and then closed

    results are merged into the notebook-level house_pull dict

    if a card's button cannot be found under any of the three known layouts
    (for example when the page has fewer than 20 cards) the driver's lookup
    error from the last attempt propagates to the caller
    '''

    # the button sits in a different div depending on the card layout
    button_paths = (
        '//*[@id="MapHomeCard_{}"]/div/div[2]/div[3]/a/button',
        '//*[@id="MapHomeCard_{}"]/div/div[2]/div[4]/a/button',
        '//*[@id="MapHomeCard_{}"]/div/div[2]/div[2]/a/button',
    )

    for i in range(20):

        search = None
        for path in button_paths[:-1]:
            try:
                search = driver.find_element_by_xpath(path.format(i))
                break
            except Exception:
                # Exception rather than a bare except so Ctrl-C still stops the scrape
                continue
        if search is None:
            # last layout: let the lookup error surface if this one fails too
            search = driver.find_element_by_xpath(button_paths[-1].format(i))

        search.click()
        time.sleep(1)

        driver.switch_to.window(driver.window_handles[1])

        try:
            house_pull.update(house_info(driver))
            time.sleep(1)
        finally:
            # always close the listing window and return to the results page,
            # otherwise a failed scrape leaves the driver parked on a stray tab
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

In [4]:
def page_over_pull(driver, page=None):

    '''
    for page i in (2, n)

    open house, scrape, close, repeat

    page is the results page number currently loaded in driver; its cards
    are MapHomeCard_{20*(page-1)} .. MapHomeCard_{20*page - 1}
    when page is omitted the notebook-level loop variable j is used, which
    is how the existing scraping loop calls this function

    each card's button opens the listing in a second window, which is
    scraped with house_info and then closed; results are merged into the
    notebook-level house_pull dict

    if a card's button cannot be found under any of the three known layouts
    the driver's lookup error from the last attempt propagates to the caller
    '''

    if page is None:
        # legacy call style: fall back to the global loop counter
        page = j

    # the button sits in a different div depending on the card layout
    button_paths = (
        '//*[@id="MapHomeCard_{}"]/div/div[2]/div[3]/a/button',
        '//*[@id="MapHomeCard_{}"]/div/div[2]/div[4]/a/button',
        '//*[@id="MapHomeCard_{}"]/div/div[2]/div[2]/a/button',
    )

    for i in range(20*(page-1), 20*page):

        search = None
        for path in button_paths[:-1]:
            try:
                search = driver.find_element_by_xpath(path.format(i))
                break
            except Exception:
                # Exception rather than a bare except so Ctrl-C still stops the scrape
                continue
        if search is None:
            # last layout: let the lookup error surface if this one fails too
            search = driver.find_element_by_xpath(button_paths[-1].format(i))

        search.click()
        time.sleep(1)

        driver.switch_to.window(driver.window_handles[1])

        try:
            house_pull.update(house_info(driver))
            time.sleep(1)
        finally:
            # always close the listing window and return to the results page,
            # otherwise a failed scrape leaves the driver parked on a stray tab
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

# scraping all the pages

In [5]:
# accumulator for scraped listings: the page-scraping functions merge each
# house_info result (address -> stats) into this dict

house_pull = dict()


In [12]:
# scrape page 1


# initially not included in whole loop because not 'https://...type=house/page-1'
# left on its own for page 1 update-scrapes

# Seattle:
# https://www.redfin.com/city/16163/WA/Seattle/filter/sort=lo-days,property-type=house
# Denver:
# https://www.redfin.com/city/5155/CO/Denver/filter/sort=lo-days,property-type=house
# Denver test for more data:
# https://www.redfin.com/city/5155/CO/Denver/filter/sort=lo-days,property-type=house+condo+townhouse,include=forsale+construction
# goto then select areas

options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(chromedriver, chrome_options=options)

try:
    driver.get('https://www.redfin.com/city/5155/CO/Denver/filter/sort=lo-days,property-type=house+condo+townhouse,include=forsale+construction,viewport=39.70835:39.62378:-104.95502:-105.11776')

    page_pull(driver)
finally:
    # shut Chrome down even when page_pull raises (e.g. the page has fewer
    # than 20 cards), so no browser process is left running
    driver.quit()

In [13]:
# scrape pages 2 thru n-1


# can identify n, or just plug in high number and let it error off
# recommend checking n to avoid extra dummy houses at end


# Seattle:
# https://www.redfin.com/city/16163/WA/Seattle/filter/sort=lo-days,property-type=house/page-{}
# Denver:
# https://www.redfin.com/city/5155/CO/Denver/filter/sort=lo-days,property-type=house/page-{}
# Denver test for more data:
# https://www.redfin.com/city/5155/CO/Denver/filter/sort=lo-days,property-type=house+condo+townhouse,include=forsale+construction/page-{}

options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(chromedriver, chrome_options=options)

n = 8
city = 'https://www.redfin.com/city/5155/CO/Denver/filter/sort=lo-days,property-type=house+condo+townhouse,include=forsale+construction,viewport=39.70835:39.62378:-104.95502:-105.11776/page-{}'

try:
    # the loop variable must stay named j: page_over_pull(driver) reads it
    # from the notebook namespace to work out which cards are on the page
    for j in range(2, n):
        driver.get(city.format(j))
        time.sleep(1)
        page_over_pull(driver)
finally:
    # the loop is allowed to "error off" past the last page; quit Chrome
    # regardless so the browser is not left open after the exception
    driver.quit()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="MapHomeCard_134"]/div/div[2]/div[2]/a/button"}
  (Session info: chrome=64.0.3282.119)
  (Driver info: chromedriver=2.35.528139 (47ead77cb35ad2a9a83248b292151462a66cd881),platform=Linux 4.10.0-42-generic x86_64)


In [22]:
# number of distinct addresses collected in house_pull so far
len(house_pull)

581

In [14]:
# load the previously saved Denver scrape so this session's listings can be merged into it
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files you created or trust
old_dict = pd.read_pickle("/home/nate/ds/metis/class_work/projects/project_luther/data/den_raw.pkl")

In [16]:
# number of entries in old_dict
# NOTE(review): execution counts show this cell ran after the update cell below, so the figure may already include the merge -- confirm
len(old_dict)

1172

In [15]:
# merge this session's listings into the saved ones, in place;
# an address present in both keeps the freshly scraped values
old_dict.update(house_pull)

In [17]:
# file names:
# Seattle: sea_raw.pkl
# Denver: den_raw.pkl


import pickle

# write the merged dict next to the notebook (path is relative to the
# working directory); the with-block closes the file even if dump raises
with open("den_raw_new.pkl", "wb") as pickle_out:
    pickle.dump(old_dict, pickle_out)

In [20]:
# read the merged Denver file back in
# NOTE(review): the dump cell writes "den_raw_new.pkl" to a relative path, so this absolute
# path only finds that file if the notebook's working directory is .../project_luther/code -- confirm
den_raw = pd.read_pickle('/home/nate/ds/metis/class_work/projects/project_luther/code/den_raw_new.pkl')

In [42]:
# load the saved Seattle scrape
# NOTE(review): presumably a DataFrame, given the commented-out transpose/to_dict cell that follows -- confirm
sea_raw = pd.read_pickle('/home/nate/ds/metis/class_work/projects/project_luther/data/sea_raw.pkl')

In [75]:
# df to dict, with index as key

# sea_raw = sea_raw.transpose()

# sea_raw = sea_raw.to_dict(orient='list')

In [21]:
# number of entries in the reloaded Denver data
len(den_raw)

1172