In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import re
import time
import os
from selenium import webdriver

In [3]:
def random_user_agent():
    try:
        ua = UserAgent()
        return ua.random
    except:
        default_ua = 'Mozilla/5.0 (Macintosh; Intel Mac O21S X 10_12_3) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/58.0.3029.110 Safari/537.36'
        return default_ua

def get_soup(url):
    
    headers = {'User-Agent': random_user_agent()}
    response = requests.get(url, headers=headers)
    results = response.content
    if not response.status_code == 404:
        soup = BeautifulSoup(results, 'lxml')
    return soup

def extract_num(text):
    """
    A helper function that extract any number from a text 

    Parameters
    ----------
    text : str
        a string of text that might contains numbers 

    Returns
    -------
    num : float
        the number extracted from the text 

    >>> _extract_num('$1000 per month')
    1000.0
    """
    try:
        # pattern to find any number (int or float)
        pattern = r'[-+]?\d*\.\d+|\d+'
        result = re.findall(pattern, text)[0]
        return float(result)
    except:
        return np.nan

def soup_attempts(url, total_attempts=5):

    soup = get_soup(url)

    if soup:
        return soup
    else:
        attempts = 0
        while attempts < total_attempts:
            time.sleep(3)
            soup = get_soup(url)
            if soup:
                return soup
        raise ValueError(f'FAILED to get soup for apt url {url}')

In [3]:
url = 'https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565'
soup = soup_attempts(url)

In [11]:
soup.find('figure', class_='getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN')

<figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/68020238/box/800x800"/></figure>

In [12]:
soup.find('div', class_='carousel-item')

<div class="carousel-item active"><div aria-label="slide image" aria-pressed="false" class="getCarouselSlides__SlideImageWrapper-cel6pe-1 kRBdIV" role="button" tabindex="0"><figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/68020238/box/800x800"/></figure></div><div class="carousel-caption d-none d-md-block"><h3></h3><p>caption</p></div></div>

In [13]:
soup.find('figure', class_='getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN')

<figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/68020238/box/800x800"/></figure>

In [14]:
soup.find('div', attrs={'data-name': 'col-md-8'})

<div class="sc-htpNat Col-sc-1wb67sp-0 MainListingInfo__LeftCol-sc-1fxwvn8-1 cKZOWW" data-name="col-md-8"><h1>30 Bartlett Street</h1><div class="MainListingInfo__UnitTypeAndStatusContainer-sc-1fxwvn8-8 djClTt">Commercial<!-- --> | <!-- -->for Sale</div><div class="MainListingInfo__NeighborhoodNameLink-sc-1fxwvn8-9 eNUIut"><a href="/nyc-real-estate/neighborhoods/williamsburg" id="neighborhood-name-link">Williamsburg</a></div><div class="MainListingInfo__CrossStreets-sc-1fxwvn8-10 fBSNno">Between Harrison Avenue and Throop Avenue</div><strong class="MainListingInfo__WebId-sc-1fxwvn8-4 haTtwu">WEB ID: <!-- -->5151565</strong></div>

In [6]:
url_all = 'https://www.corcoran.com/nyc-real-estate/for-sale/search?neighborhoods=battery-park-city%2Cbeekman%2Ccentral-park-south%2Cchelsea-hudson-yards%2Cchinatown%2Cclinton%2Ceast-harlem%2Ceast-village%2Cfinancial-district%2Cflatiron%2Cgramercy%2Cgreenwich-village%2Chamilton-heights%2Charlem%2Cinwood%2Clower-east-side%2Cmidtown-east%2Cmidtown-west%2Cmorningside-heights%2Cmurray-hill%2Croosevelt-island%2Csoho-nolita%2Csutton-area%2Ctribeca%2Cupper-east-side%2Cupper-west-side%2Cwashington-heights%2Cwest-village%2Ccarnegie-hill%2Ckips-bay%2Cno-mad%2Cbath-beach%2Cbensonhurst%2Cbay-ridge%2Cbedford-stuyvesant%2Cbergen-beach%2Cboerum-hill%2Cborough-park%2Cbrighton-beach%2Cbrooklyn-heights%2Cbrownsville%2Cbushwick%2Ccanarsie%2Ccarroll-gardens%2Cclinton-hill%2Ccobble-hill%2Cconey-island%2Ccrown-heights%2Ccypress-hill%2Cditmas-park%2Cdowntown-brooklyn%2Cdyker-heights%2Ceast-flatbush%2Ceast-new-york%2Cflatbush%2Cflatlands%2Cfort-greene%2Cgowanus%2Cgravesend%2Cgreenpoint%2Cgreenwood%2Ckensington%2Clefferts-gardens%2Cmanhattan-beach%2Cmapleton%2Cmarine-park%2Cmidwood%2Cmill-basin%2Cnew-lots%2Cpark-slope%2Cprospect-heights%2Cprospect-park-south%2Cred-hook%2Csea-gate%2Csheepshead-bay%2Cspring-creek%2Cstarrett-city%2Csunset-park%2Cdumbo-vinegar-hill%2Cweeksville%2Cwilliamsburg%2Cwindsor-terrace%2Cocean-parkway%2Cgerritsen-beach%2Cbrooklyn-navy-yard%2Ccolumbia-waterfront%2Castoria%2Cbelle-harbor%2Cforest-hills%2Cjackson-heights%2Ckew-gardens%2Clong-island-city%2Csunnyside%2Cwoodside%2Cridgewood%2Cflushing%2Cfresh-meadows%2Cjamaica%2Csouth-jamaica%2Cmaspeth%2Cglendale%2Cmiddle-village%2Cwoodhaven%2Celmhurst%2Ceast-elmhurst%2Ccorona%2Ccollege-point%2Cwhitestone%2Cqueens-village%2Cbellerose%2Chollis%2Cst-albans%2Ccambria-heights%2Cozone-park%2Csouth-ozone-park%2Choward-beach%2Crichmond-hills%2Cspringfield-gardens%2Claurelton%2Crockaway-beach%2Cbriarwood%2Cbroad-channel%2Cfloral-park%2Cglen-oaks%2Cjamaica-hills%2Ckew-gardens-hills%2Clittle-neck%2Cnew-hyde-park%2Crego-park%2Crochdale%2Crockaway%2Crosedale%2Criverdale%2Callerton%2Cbaychester%2Cbedford-park%2Cbelmont%2Ccastle-hill%2Ccity-island%2Cco-op-city%2Ccountry-club%2Ceast-tremont%2Ceastchester%2Cedenwald%2Cedgewater-park%2Cfordham%2Chighbridge%2Chunts-point%2Ckingsbridge%2Claconia%2Clongwood%2Cmelrose%2Cmorris-heights%2Cmorris-park%2Cmorrisania%2Cmott-haven%2Cnorwood%2Cparkchester%2Cpelham-bay%2Cpelham-gardens%2Cpelham-parkway%2Cschuylerville%2Csoundview%2Cthroggs-neck%2Ctremont%2Cuniversity-heights%2Cvan-nest%2Cwakefield%2Cwilliamsbridge%2Cwoodlawn&keywordSearch=houses%2Ctownhouses'
soup_all = soup_attempts(url_all)

In [16]:
apts = soup_all.find_all('div', class_='ListingCard__ListingCardWrapper-k9s72e-7 bxPua')

In [17]:
len(apts)

50

In [6]:
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = f'/Users/itachi/Downloads/Chrome/chromedriver'
browser = webdriver.Chrome(executable_path=chromedriver)
browser.get(url_all)
time.sleep(1)
elem = browser.find_element_by_xpath("//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[49]")
print(elem.text, '\n')
browser.execute_script('arguments[0].scrollIntoView(true)', elem)

UPPER EAST SIDE
162 East 63rd Street
5 BD
5.5 BA
4,000 SF
$10,750,000 



In [5]:
elem2 = browser.find_element_by_xpath(f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{49*1}]")
print(elem2.text)
href = elem2.find_element_by_xpath("//a[@class='ListingCard__TopSectionLink-k9s72e-17 icXLMN']")
print(href.get_attribute('href'))
browser.execute_script('arguments[0].scrollIntoView(true)', elem2)

NEW LISTING
GREENPOINT
75 Beadel Street
6 BD
6 BA
3,300 SF
$2,400,000
https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565


In [4]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = f'/Users/itachi/Downloads/Chrome/chromedriver'

def get_apt_info(path, wait):
    try:
        element = wait.until(EC.presence_of_element_located((By.XPATH, path)))
        apt_info = element.text.replace('\n', '|')
        url_path = f"//a[@class='ListingCard__TopSectionLink-k9s72e-17 icXLMN']"
        url_element = wait.until(EC.presence_of_element_located((By.XPATH, url_path)))
        href = url_element.get_attribute('href')
        return apt_info, href 
    except:
        print('can not find apartment')
        return None, None

def get_apt_info_batches(start, end, wait, verbose):
    results_batch = []
    for i in range(start, end):
        sibling_path = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{i}]"
        apt_info, href = get_apt_info(sibling_path, wait)
        
        if verbose:
            print(apt_info, href)
        
        results_batch.append([apt_info, href])
    return results_batch

def scroll_down(destination, wait):
    try:
        elem_dest = wait.until(EC.presence_of_element_located((By.XPATH, destination)))
        browser.execute_script('arguments[0].scrollIntoView(true)', elem_dest)
    except:
        print('scrolling failed')
        
def get_total_apt_num(url):
    soup = soup_attempts(url)
    header = soup.find('h2')\
                 .get_text()
    total_apt_num = int(extract_num(header))
    return total_apt_num
        
def get_apt_essentials(chromedriver, url, verbose=False):
    browser = webdriver.Chrome(executable_path=chromedriver)
    browser.get(url)
    wait = WebDriverWait(browser, 20)
    roll_over = 50
    total_apt_num = get_total_apt_num(url)
    # exclude the first apartment which we already counted
    nbatches = (total_apt_num-1) // roll_over
    remain = (total_apt_num-1) % roll_over
    
    first_apt_path = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']"
    fst_apt_info, fst_href = get_apt_info(first_apt_path, wait)
    
    results = [[fst_apt_info, fst_href]]
    
    for nb in range(nbatches):
        start, end = nb*49, (nb+1)*49
        
        destination = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{end}]"
        scroll_down(destination, wait)
        
        results_batch = get_apt_info_batches(start, end, wait, verbose=verbose)
        results += results_batch

    return results
        

In [None]:
get_apt_essentials(chromedriver, url_all, verbose=True)

scrolling failed
can not find apartment
None None
BROOKLYN HEIGHTS|18 Remsen Street|7 BD|4 BA|$8,750,000 https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565
UPPER EAST SIDE|3 East 63rd Street|13 BD|14 BA|9,950 SF|$15,500,000 https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565
FEATURED|GREENPOINT|1097 Lorimer Street|4 BD|2.5 BA|2,448 SF|$2,650,000 https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565
REDUCED PRICE|SOHO/NOLITA|7 CENTRE MARKET PLACE|5 BD|4.5 BA|5,200 SF|$7,400,000 https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565
FORT GREENE|412 Adelphi Street|4.5 BD|3.5 BA|2,960 SF|$3,100,000 https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565
PARK SLOPE|431 8th Street|3 BD|2 BA|2,306 SF|$2,890,000 https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565
FEATURED|BAY R

can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
scrolling failed
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
None None
can not find apartment
Non

In [28]:
get_total_apt_num(url_all)

1763

In [21]:
def scroll_down(destination, wait):
    try:
        elem_dest = wait.until(EC.presence_of_element_located((By.XPATH, destination)))
        browser.execute_script('arguments[0].scrollIntoView(true)', elem_dest)
        browser.implicitly_wait(5)
    except:
        print('scrolling failed')

browser = webdriver.Chrome(executable_path=chromedriver)
browser.get(url_all)
wait = WebDriverWait(browser, 20)

In [22]:
for i in range(10):
    scroll_pg = 49*(i+1)
    destination = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{scroll_pg}]"
    scroll_down(destination, wait)
    print(f'scroll down {i}')

scroll down 0
scroll down 1
scroll down 2
scroll down 3
scroll down 4
scroll down 5
scroll down 6
scrolling failed
scroll down 7
scrolling failed
scroll down 8
scrolling failed
scroll down 9
