In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import re
import time
import os
from selenium import webdriver

In [2]:
def random_user_agent():
    try:
        ua = UserAgent()
        return ua.random
    except:
        default_ua = 'Mozilla/5.0 (Macintosh; Intel Mac O21S X 10_12_3) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/58.0.3029.110 Safari/537.36'
        return default_ua

def get_soup(url):
    
    headers = {'User-Agent': random_user_agent()}
    response = requests.get(url, headers=headers)
    results = response.content
    if not response.status_code == 404:
        soup = BeautifulSoup(results, 'lxml')
    return soup

def extract_num(text):
    """
    A helper function that extract any number from a text 

    Parameters
    ----------
    text : str
        a string of text that might contains numbers 

    Returns
    -------
    num : float
        the number extracted from the text 

    >>> _extract_num('$1000 per month')
    1000.0
    """
    try:
        # pattern to find any number (int or float)
        pattern = r'[-+]?\d*\.\d+|\d+'
        result = re.findall(pattern, text)[0]
        return float(result)
    except:
        return np.nan

def soup_attempts(url, total_attempts=5):

    soup = get_soup(url)

    if soup:
        return soup
    else:
        attempts = 0
        while attempts < total_attempts:
            time.sleep(3)
            soup = get_soup(url)
            if soup:
                return soup
        raise ValueError(f'FAILED to get soup for apt url {url}')

In [3]:
url = 'https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565'
soup = soup_attempts(url)

In [11]:
soup.find('figure', class_='getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN')

<figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/68020238/box/800x800"/></figure>

In [12]:
soup.find('div', class_='carousel-item')

<div class="carousel-item active"><div aria-label="slide image" aria-pressed="false" class="getCarouselSlides__SlideImageWrapper-cel6pe-1 kRBdIV" role="button" tabindex="0"><figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/68020238/box/800x800"/></figure></div><div class="carousel-caption d-none d-md-block"><h3></h3><p>caption</p></div></div>

In [13]:
soup.find('figure', class_='getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN')

<figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/68020238/box/800x800"/></figure>

In [14]:
soup.find('div', attrs={'data-name': 'col-md-8'})

<div class="sc-htpNat Col-sc-1wb67sp-0 MainListingInfo__LeftCol-sc-1fxwvn8-1 cKZOWW" data-name="col-md-8"><h1>30 Bartlett Street</h1><div class="MainListingInfo__UnitTypeAndStatusContainer-sc-1fxwvn8-8 djClTt">Commercial<!-- --> | <!-- -->for Sale</div><div class="MainListingInfo__NeighborhoodNameLink-sc-1fxwvn8-9 eNUIut"><a href="/nyc-real-estate/neighborhoods/williamsburg" id="neighborhood-name-link">Williamsburg</a></div><div class="MainListingInfo__CrossStreets-sc-1fxwvn8-10 fBSNno">Between Harrison Avenue and Throop Avenue</div><strong class="MainListingInfo__WebId-sc-1fxwvn8-4 haTtwu">WEB ID: <!-- -->5151565</strong></div>

In [3]:
url_all = 'https://www.corcoran.com/nyc-real-estate/for-sale/search?neighborhoods=battery-park-city%2Cbeekman%2Ccentral-park-south%2Cchelsea-hudson-yards%2Cchinatown%2Cclinton%2Ceast-harlem%2Ceast-village%2Cfinancial-district%2Cflatiron%2Cgramercy%2Cgreenwich-village%2Chamilton-heights%2Charlem%2Cinwood%2Clower-east-side%2Cmidtown-east%2Cmidtown-west%2Cmorningside-heights%2Cmurray-hill%2Croosevelt-island%2Csoho-nolita%2Csutton-area%2Ctribeca%2Cupper-east-side%2Cupper-west-side%2Cwashington-heights%2Cwest-village%2Ccarnegie-hill%2Ckips-bay%2Cno-mad%2Cbath-beach%2Cbensonhurst%2Cbay-ridge%2Cbedford-stuyvesant%2Cbergen-beach%2Cboerum-hill%2Cborough-park%2Cbrighton-beach%2Cbrooklyn-heights%2Cbrownsville%2Cbushwick%2Ccanarsie%2Ccarroll-gardens%2Cclinton-hill%2Ccobble-hill%2Cconey-island%2Ccrown-heights%2Ccypress-hill%2Cditmas-park%2Cdowntown-brooklyn%2Cdyker-heights%2Ceast-flatbush%2Ceast-new-york%2Cflatbush%2Cflatlands%2Cfort-greene%2Cgowanus%2Cgravesend%2Cgreenpoint%2Cgreenwood%2Ckensington%2Clefferts-gardens%2Cmanhattan-beach%2Cmapleton%2Cmarine-park%2Cmidwood%2Cmill-basin%2Cnew-lots%2Cpark-slope%2Cprospect-heights%2Cprospect-park-south%2Cred-hook%2Csea-gate%2Csheepshead-bay%2Cspring-creek%2Cstarrett-city%2Csunset-park%2Cdumbo-vinegar-hill%2Cweeksville%2Cwilliamsburg%2Cwindsor-terrace%2Cocean-parkway%2Cgerritsen-beach%2Cbrooklyn-navy-yard%2Ccolumbia-waterfront%2Castoria%2Cbelle-harbor%2Cforest-hills%2Cjackson-heights%2Ckew-gardens%2Clong-island-city%2Csunnyside%2Cwoodside%2Cridgewood%2Cflushing%2Cfresh-meadows%2Cjamaica%2Csouth-jamaica%2Cmaspeth%2Cglendale%2Cmiddle-village%2Cwoodhaven%2Celmhurst%2Ceast-elmhurst%2Ccorona%2Ccollege-point%2Cwhitestone%2Cqueens-village%2Cbellerose%2Chollis%2Cst-albans%2Ccambria-heights%2Cozone-park%2Csouth-ozone-park%2Choward-beach%2Crichmond-hills%2Cspringfield-gardens%2Claurelton%2Crockaway-beach%2Cbriarwood%2Cbroad-channel%2Cfloral-park%2Cglen-oaks%2Cjamaica-hills%2Ckew-gardens-hills%2Clittle-neck%2Cnew-hyde-park%2Crego-park%2Crochdale%2Crockaway%2Crosedale%2Criverdale%2Callerton%2Cbaychester%2Cbedford-park%2Cbelmont%2Ccastle-hill%2Ccity-island%2Cco-op-city%2Ccountry-club%2Ceast-tremont%2Ceastchester%2Cedenwald%2Cedgewater-park%2Cfordham%2Chighbridge%2Chunts-point%2Ckingsbridge%2Claconia%2Clongwood%2Cmelrose%2Cmorris-heights%2Cmorris-park%2Cmorrisania%2Cmott-haven%2Cnorwood%2Cparkchester%2Cpelham-bay%2Cpelham-gardens%2Cpelham-parkway%2Cschuylerville%2Csoundview%2Cthroggs-neck%2Ctremont%2Cuniversity-heights%2Cvan-nest%2Cwakefield%2Cwilliamsbridge%2Cwoodlawn&keywordSearch=houses%2Ctownhouses'
soup_all = soup_attempts(url_all)

In [16]:
apts = soup_all.find_all('div', class_='ListingCard__ListingCardWrapper-k9s72e-7 bxPua')

In [17]:
len(apts)

50

In [44]:
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = f'/Users/itachi/Downloads/Chrome/chromedriver'
browser = webdriver.Chrome(executable_path=chromedriver)
browser.get(url_all)
time.sleep(1)
elem = browser.find_element_by_xpath("//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[49]/a[@class='ListingCard__TopSectionLink-k9s72e-17 icXLMN']")
print(elem.text, '\n')

# elem_a = elem.find_element_by_xpath("//a[@class='ListingCard__TopSectionLink-k9s72e-17 icXLMN']")
browser.execute_script('arguments[0].scrollIntoView(true)', elem)

UPPER EAST SIDE
162 East 63rd Street
5 BD
5.5 BA
4,000 SF 



In [46]:
elem.get_attribute('href')

'https://www.corcoran.com/nyc-real-estate/for-sale/upper-east-side/162-east-63rd-street/5893737'

In [5]:
elem2 = browser.find_element_by_xpath(f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{49*1}]")
print(elem2.text)
href = elem2.find_element_by_xpath("//a[@class='ListingCard__TopSectionLink-k9s72e-17 icXLMN']")
print(href.get_attribute('href'))
browser.execute_script('arguments[0].scrollIntoView(true)', elem2)

NEW LISTING
GREENPOINT
75 Beadel Street
6 BD
6 BA
3,300 SF
$2,400,000
https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565


In [58]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = f'/Users/itachi/Downloads/Chrome/chromedriver'

def get_apt_info(apt_path, wait):
    try:
        url_path = "a[@class='ListingCard__TopSectionLink-k9s72e-17 icXLMN']"
        full_path = f'{apt_path}/{url_path}'
        element = wait.until(EC.presence_of_element_located((By.XPATH, full_path)))
#         apt_info = element.text.replace('\n', '|')
#         href = url_element.get_attribute('href')
        href = element.get_attribute('href')
        return href 
    except:
        print('can not find apartment')
        return None

def get_apt_info_batches(start, end, wait):
    results_batch = []
    for i in range(start, end):
        sibling_path = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{i}]"
        href = get_apt_info(sibling_path, wait)
        results_batch.append(href)
    return results_batch

def scroll_down(scroll_pg, wait):
    try:
        dest = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{scroll_pg}]"
        browser.implicitly_wait(5)
        elem_dest = wait.until(EC.presence_of_element_located((By.XPATH, dest)))
        browser.execute_script('arguments[0].scrollIntoView(true)', elem_dest)
        
        buffer_dest = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{scroll_pg+5}]"
        browser.implicitly_wait(5)
        elem_buffer = wait.until(EC.presence_of_element_located((By.XPATH, buffer_dest)))
        browser.execute_script('arguments[0].scrollIntoView(true)', elem_buffer)
    except:
        print(f'scrolling failed')
        
def get_total_apt_num(url):
    soup = soup_attempts(url)
    header = soup.find('h2')\
                 .get_text()
    total_apt_num = int(extract_num(header))
    return total_apt_num

def buffer_page(scroll_pg):
    if scroll_pg <= 10*49:
        scroll_buffer = scroll_pg+5
    elif scroll_pg <= 16*49:
        scroll_buffer = scroll_pg+10
    elif scroll_pg <= 21*49:
        scroll_buffer = scroll_pg+15
    elif scroll_pg <= 25*49:
        scroll_buffer = scroll_pg+20
    elif scroll_pg <= 31*49:
        scroll_buffer = scroll_pg+25
    else:
        scroll_buffer = scroll_pg+30
    
    return scroll_buffer
        
def scroll_down(scroll_pg, wait):
    try:
        dest = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{scroll_pg}]"
        browser.implicitly_wait(5)
        elem_dest = wait.until(EC.presence_of_element_located((By.XPATH, dest)))
        browser.execute_script('arguments[0].scrollIntoView(true)', elem_dest)
        time.sleep(5)
        scroll_buffer = buffer_page(scroll_pg)
        
        buffer_dest = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{scroll_buffer}]"
        browser.implicitly_wait(5)
        elem_buffer = wait.until(EC.presence_of_element_located((By.XPATH, buffer_dest)))
        browser.execute_script('arguments[0].scrollIntoView(true)', elem_buffer)
    except:
        print('scrolling failed') 

def keep_scrolling_down(total_apt_num, wait, verbose=False, test=False):
    nbatches = int((total_apt_num//49)+1)
    results = []
    
    if test:
        nbatches = 5
    
    for i in range(nbatches):
        scroll_pg = 49*(i+1)
        scroll_down(scroll_pg, wait)
        if verbose:
            print(f'page {i+1} scrolled')
        
        start, end = 49*i, 49*(i+1)
        if i == 0:
            start = 1
        results += get_apt_info_batches(start, end, wait)
        
        if verbose:
            print(f'results for page {i+1} obtained')
            
    return results

def get_apt_urls(url, wait, verbose=False, test=False):
    first_apt_path = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']"
    first_apt_url = get_apt_info(first_apt_path, wait)
    results = [first_apt_url]
    total_apt_num = get_total_apt_num(url)    
    results += keep_scrolling_down(total_apt_num, wait, verbose, test)
    results_final = list(filter(lambda x: x!=None, results))
    return results_final
    

In [23]:
def scroll_down_v2(scroll_pg, wait):
    try:
        dest = f"//div[@class='ListingCard__ListingCardWrapper-k9s72e-7 bxPua']/following-sibling::div[{scroll_pg}]"
        browser.implicitly_wait(5)
        elem_dest = wait.until(EC.presence_of_element_located((By.XPATH, dest)))
        browser.execute_script('arguments[0].scrollIntoView(true)', elem_dest)
    except:
        print(f'failed scrolling')

In [364]:
scroll_pg = 49*35
scroll_down_v2(scroll_pg, wait)

In [59]:
chromedriver = f'/Users/itachi/Downloads/Chrome/chromedriver'
browser = webdriver.Chrome(executable_path=chromedriver)
url_all = 'https://www.corcoran.com/nyc-real-estate/for-sale/search?neighborhoods=battery-park-city%2Cbeekman%2Ccentral-park-south%2Cchelsea-hudson-yards%2Cchinatown%2Cclinton%2Ceast-harlem%2Ceast-village%2Cfinancial-district%2Cflatiron%2Cgramercy%2Cgreenwich-village%2Chamilton-heights%2Charlem%2Cinwood%2Clower-east-side%2Cmidtown-east%2Cmidtown-west%2Cmorningside-heights%2Cmurray-hill%2Croosevelt-island%2Csoho-nolita%2Csutton-area%2Ctribeca%2Cupper-east-side%2Cupper-west-side%2Cwashington-heights%2Cwest-village%2Ccarnegie-hill%2Ckips-bay%2Cno-mad%2Cbath-beach%2Cbensonhurst%2Cbay-ridge%2Cbedford-stuyvesant%2Cbergen-beach%2Cboerum-hill%2Cborough-park%2Cbrighton-beach%2Cbrooklyn-heights%2Cbrownsville%2Cbushwick%2Ccanarsie%2Ccarroll-gardens%2Cclinton-hill%2Ccobble-hill%2Cconey-island%2Ccrown-heights%2Ccypress-hill%2Cditmas-park%2Cdowntown-brooklyn%2Cdyker-heights%2Ceast-flatbush%2Ceast-new-york%2Cflatbush%2Cflatlands%2Cfort-greene%2Cgowanus%2Cgravesend%2Cgreenpoint%2Cgreenwood%2Ckensington%2Clefferts-gardens%2Cmanhattan-beach%2Cmapleton%2Cmarine-park%2Cmidwood%2Cmill-basin%2Cnew-lots%2Cpark-slope%2Cprospect-heights%2Cprospect-park-south%2Cred-hook%2Csea-gate%2Csheepshead-bay%2Cspring-creek%2Cstarrett-city%2Csunset-park%2Cdumbo-vinegar-hill%2Cweeksville%2Cwilliamsburg%2Cwindsor-terrace%2Cocean-parkway%2Cgerritsen-beach%2Cbrooklyn-navy-yard%2Ccolumbia-waterfront%2Castoria%2Cbelle-harbor%2Cforest-hills%2Cjackson-heights%2Ckew-gardens%2Clong-island-city%2Csunnyside%2Cwoodside%2Cridgewood%2Cflushing%2Cfresh-meadows%2Cjamaica%2Csouth-jamaica%2Cmaspeth%2Cglendale%2Cmiddle-village%2Cwoodhaven%2Celmhurst%2Ceast-elmhurst%2Ccorona%2Ccollege-point%2Cwhitestone%2Cqueens-village%2Cbellerose%2Chollis%2Cst-albans%2Ccambria-heights%2Cozone-park%2Csouth-ozone-park%2Choward-beach%2Crichmond-hills%2Cspringfield-gardens%2Claurelton%2Crockaway-beach%2Cbriarwood%2Cbroad-channel%2Cfloral-park%2Cglen-oaks%2Cjamaica-hills%2Ckew-gardens-hills%2Clittle-neck%2Cnew-hyde-park%2Crego-park%2Crochdale%2Crockaway%2Crosedale%2Criverdale%2Callerton%2Cbaychester%2Cbedford-park%2Cbelmont%2Ccastle-hill%2Ccity-island%2Cco-op-city%2Ccountry-club%2Ceast-tremont%2Ceastchester%2Cedenwald%2Cedgewater-park%2Cfordham%2Chighbridge%2Chunts-point%2Ckingsbridge%2Claconia%2Clongwood%2Cmelrose%2Cmorris-heights%2Cmorris-park%2Cmorrisania%2Cmott-haven%2Cnorwood%2Cparkchester%2Cpelham-bay%2Cpelham-gardens%2Cpelham-parkway%2Cschuylerville%2Csoundview%2Cthroggs-neck%2Ctremont%2Cuniversity-heights%2Cvan-nest%2Cwakefield%2Cwilliamsbridge%2Cwoodlawn&keywordSearch=houses%2Ctownhouses'
browser.get(url_all)
wait = WebDriverWait(browser, 20)
results_apt = get_apt_urls(url_all, wait, verbose=True, test=True)

page 1 scrolled
results for page 1 obtained
page 2 scrolled
results for page 2 obtained
page 3 scrolled
results for page 3 obtained
page 4 scrolled
results for page 4 obtained
page 5 scrolled
results for page 5 obtained


In [60]:
sample1 = results_apt
sample1

['https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565',
 'https://www.corcoran.com/nyc-real-estate/for-sale/brooklyn-heights/18-remsen-street/5846984',
 'https://www.corcoran.com/nyc-real-estate/for-sale/upper-east-side/3-east-63rd-street/5689408',
 'https://www.corcoran.com/nyc-real-estate/for-sale/so-ho-nolita/7-centre-market-place/5859066',
 'https://www.corcoran.com/nyc-real-estate/for-sale/greenpoint/1097-lorimer-street/5777325',
 'https://www.corcoran.com/nyc-real-estate/for-sale/fort-greene/412-adelphi-street/5891369',
 'https://www.corcoran.com/nyc-real-estate/for-sale/park-slope/431-8th-street/5890310',
 'https://www.corcoran.com/nyc-real-estate/for-sale/bay-ridge/41-76th-street/5903532',
 'https://www.corcoran.com/nyc-real-estate/for-sale/bedford-stuyvesant/933-greene-avenue/5702129',
 'https://www.corcoran.com/nyc-real-estate/for-sale/east-flatbush/606-east-91st-street/5721881',
 'https://www.corcoran.com/nyc-real-estate/for-sale/park-sl

In [85]:
sample1_url = sample1[0]
sample1_url

'https://www.corcoran.com/nyc-real-estate/for-sale/williamsburg/30-bartlett-street/5151565'

In [86]:
soup_sample1 = soup_attempts(sample1_url)

In [70]:
soup_sample1.find('div', class_='MainListingInfo__UnitTypeAndStatusContainer-sc-1fxwvn8-9 hnYmAD').get_text()

'Townhouse | for Sale'

In [71]:
soup_sample1.find('strong', class_='MainListingInfo__WebId-sc-1fxwvn8-4 haTtwu')

<strong class="MainListingInfo__WebId-sc-1fxwvn8-4 haTtwu">WEB ID: <!-- -->5689408</strong>

In [80]:
soup_sample1.find('ul', class_='Essentials__EssentialsWrapper-sc-1jh003w-0 boWTpJ').find_all('li')[5].get_text()

'25 WIDTH'

In [67]:
soup_sample1.find_all('figure', class_='getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN')

[<figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/111071788/box/800x800"/></figure>,
 <figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/111071762/box/800x800"/></figure>,
 <figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/111071773/box/800x800"/></figure>,
 <figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__SlideImage-cel6pe-3 cNdyRy" src="https://mediarouting.vestahub.com/Media/111071768/box/800x800"/></figure>,
 <figure class="getCarouselSlides__SlideFigure-cel6pe-2 jpnhrN"><img alt="slide image" class="getCarouselSlides__Sli

In [54]:
list(filter(lambda x: x != None, [1,2,3,4,None]))

[1, 2, 3, 4]

In [117]:
def get_apt_address(soup_apt):
    try:
        address = soup_apt.find('div', attrs={'data-name': 'col-md-8'})\
                          .find('h1')\
                          .get_text()
        return address
    except:
        return None

def get_apt_listing_type(soup_apt):
    try:
        listing_type = soup_apt.find('div', class_='MainListingInfo__UnitTypeAndStatusContainer-sc-1fxwvn8-9 hnYmAD')\
                               .get_text()\
                               .split('|')[0]\
                               .strip()
        return listing_type
    except:
        return None

def get_apt_web_id(soup_apt):
    try:
        web_id = soup_apt.find('strong', class_='MainListingInfo__WebId-sc-1fxwvn8-4 haTtwu')\
                         .get_text()\
                         .split(':')[1]\
                         .strip()
        return web_id
    except:
        return None

def get_apt_essentials(soup_apt):
    beds, baths, floors, units, width, sf = None, None, None, None, None, None
    try:
        essentials_tags = soup_apt.find('ul', class_='Essentials__EssentialsWrapper-sc-1jh003w-0 boWTpJ')\
                                  .find_all('li')
        essentials = [etag.get_text().lower() for etag in essentials_tags]
        
        for item in essentials:
            if 'bath' in item:
                baths = extract_num(item)
            if 'bed' in item:
                beds = extract_num(item)
            if 'unit' in item:
                units = extract_num(item)
            if 'width' in item:
                width = extract_num(item)
            if 'sqft' in item:
                sf = extract_num(item)
            if 'floor' in item:
                floors = extract_num(item)
        
        return beds, baths, floors, units, width, sf
    except:
        return beds, baths, floors, units, width, sf

def get_apt_price(soup_apt):
    try:
        price_text = soup_apt.find('div', attrs={'data-name': 'col-md-4'})\
                             .get_text()\
                             .replace('$', '')\
                             .replace(',', '')
        price = extract_num(price_text)
        

In [93]:
get_apt_address(soup_sample1)

'30 Bartlett Street'

In [108]:
get_apt_listing_type(soup_sample1)

'Commercial'

In [109]:
get_apt_web_id(soup_sample1)

'5151565'

In [118]:
get_apt_essentials(soup_sample1)

(None, 0.5, 1.0, 1.0, 99.0, 5000.0)

In [119]:
soup_sample2 = soup_attempts(sample1[3])

In [120]:
get_apt_address(soup_sample2)

'7 CENTRE MARKET PLACE'

In [121]:
get_apt_listing_type(soup_sample2)

'Townhouse'

In [122]:
get_apt_web_id(soup_sample2)

'5859066'

In [123]:
get_apt_essentials(soup_sample2)

(5.0, 4.5, 5.0, 4.0, 25.0, 5200.0)

In [124]:
sample1[3]

'https://www.corcoran.com/nyc-real-estate/for-sale/so-ho-nolita/7-centre-market-place/5859066'