In [31]:
import time
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import re

# Start a Selenium-controlled Chrome session; every later cell drives this one browser
driver = webdriver.Chrome()

In [2]:
# Pull zip codes from the bay area from zipcodesonline.com
# and match every zip code to its corresponding city.
# The first page includes bay area zip codes excluding SF and SJ;
# the second page lists the San Francisco zip codes.

BAY_AREA_ZIP_URL = 'https://www.zipcodesonline.com/2020/08/bay-area-zip-codes-in-2020.html'
SF_ZIP_URL = 'https://www.zipcodesonline.com/2020/06/san-francisco-zip-code-in-2020.html'
ZIP_TABLE_CLASS = 'MsoNormalTable'


def _scrape_zip_table(url):
    """Load `url` in the shared `driver` and return its zip-code table as text.

    Returns a list of (zip_code, fourth_cell) string tuples, one per table row
    that has at least four <td> cells. The second cell is taken as the zip
    code; the fourth cell is the city name on the bay area page and the zip
    type on the San Francisco page.

    Raises RuntimeError if the page contains no table with ZIP_TABLE_CLASS.
    """
    driver.get(url)
    soup = bs(driver.page_source)

    table = soup.find('table', class_ = ZIP_TABLE_CLASS)
    if table is None:
        raise RuntimeError("No '" + ZIP_TABLE_CLASS + "' table found at " + url)

    parsed_rows = []
    for row in table.find_all('tr'):
        row_cells = row.find_all('td')
        # Rows without the four expected cells (e.g. header or spacer rows)
        # carry no zip code; skip them instead of failing with an IndexError
        if len(row_cells) < 4:
            continue
        parsed_rows.append((row_cells[1].text.strip(), row_cells[3].text.strip()))
    return parsed_rows


# Manually designate which cities we want to look at
cities = {"San Francisco City": [], "Alameda City": [],
          "Oakland": [], "Berkeley": [], "San Rafael": [],
          "Richmond": [], "Daly City": [], "San Mateo City": [],
          "Albany": [], "El Cerrito": [], "San Pablo": [],
          "Greenbrae": [], "Kentfield": [], "Belvedere Tiburon": [],
          "Corte Madera": [], "Larkspur": [], "Mill Valley": [],
          "Ross": [], "San Anselmo": [], "Sausalito": [],
          "Brisbane": [], "Burlingame": [], "Millbrae": [],
          "Pacifica": [], "San Bruno": [], "South San Francisco": []}

# The city was previously keyed under the misspelling "Milbrae"; accept that
# spelling from the page too so the city is collected either way
CITY_NAME_FIXES = {"Milbrae": "Millbrae"}

# Put all the zip codes into a list corresponding to its city
for zip_code, city_name in _scrape_zip_table(BAY_AREA_ZIP_URL):
    city_name = CITY_NAME_FIXES.get(city_name, city_name)
    # Skip repeats so the same location is never searched twice
    if city_name in cities and zip_code not in cities[city_name]:
        cities[city_name].append(zip_code)

# Wait 5-15 seconds before accessing next page
time.sleep(5 + 10*np.random.random())

# Add all standard San Francisco zip codes to the San Francisco City list
for zip_code, zip_type in _scrape_zip_table(SF_ZIP_URL):
    if zip_type == 'Standard Zip Code' and zip_code not in cities["San Francisco City"]:
        cities["San Francisco City"].append(zip_code)

In [3]:
# Build the search strings to query, one per (city, zip code) pair
# Each entry has the form City, CA Zip

location_querylist = [
    f"{city_name}, CA {code}"
    for city_name, codes in cities.items()
    for code in codes
]

In [4]:
## Pick a check-in and check-out date - this could be user input?
## Could be a set of dates in a month or a range spanning certain months
## Spring Break maybe? April 18-22
## Weekends/Holidays are best
## Maybe spend part of the holiday in SF?

## check_in = pd.to_datetime('Insert Date')
## check_out = pd.to_datetime('Insert Date')

In [5]:
# Create a function to scrape each individual page
def _pause(base, spread):
    """Sleep for a random duration between `base` and `base + spread` seconds."""
    time.sleep(base + spread*np.random.random())


def _refresh_after_long_wait(driver_name):
    """Wait 1-2 minutes, refresh the page, then give it 5-15 seconds to load."""
    _pause(60, 60)
    driver_name.refresh()
    _pause(5, 10)


def _parse_dollars(text):
    """Convert a dollar string such as "$1,234" into a float."""
    return float(text.strip("$").replace(",", ""))


def _parse_price(price_text):
    """Split a listing's price text into (price_per_night, discounted, original_value).

    When the text contains "originally", its last space-separated token is
    parsed as the original price; otherwise original_value is NaN. The first
    token is always parsed as the current price per night.
    """
    discounted = "originally" in price_text
    if discounted:
        original_value = _parse_dollars(price_text.split(" ")[-1])
    else:
        original_value = np.nan
    price_per_night = float(price_text.strip("$").strip().split(" ")[0].replace(",", ""))
    return price_per_night, discounted, original_value


def _wait_for_page_count(driver_name):
    """Return the number of result pages, retrying until the first page has loaded."""
    while True:
        try:
            count_text = driver_name.find_element(By.CLASS_NAME, "_78tyg5").text
            # Drop the trailing '+' and any thousands separator ("1,000+")
            # so a large count does not fail int() and retry forever
            listing_num = int(count_text.split(' ')[0].strip('+').replace(',', ''))
            if listing_num > 20:
                return int(driver_name.find_elements(By.CLASS_NAME, "_833p2h")[-1].text.strip())
            return 1
        # `except Exception` (not a bare except) keeps the retry loop
        # interruptible: KeyboardInterrupt is not swallowed
        except Exception:
            _refresh_after_long_wait(driver_name)


def _read_fees(driver_name, driver_listings, index):
    """Open listing `index`'s price breakdown and return (cleaning_fee, service_fee).

    A fee that is not listed in the breakdown is reported as 0.
    """
    driver_listings[index].find_element(By.CLASS_NAME, "_prflas2").click()
    try:
        _pause(1, 1)
        # Gather all the values from this breakdown which may include cleaning and service fees
        price_vals = dict(zip(
            [names.text for names in driver_name.find_elements(By.CLASS_NAME, "_18x3iiu")],
            [nums.text for nums in driver_name.find_elements(By.CLASS_NAME, "_1k4xcdh")]))
        cleaning_fee = _parse_dollars(price_vals["Cleaning fee"]) if "Cleaning fee" in price_vals else 0.
        service_fee = _parse_dollars(price_vals["Service fee"]) if "Service fee" in price_vals else 0.
    finally:
        # Click x to exit the breakdown overlay even when reading it failed,
        # so it is not left open over the next listing
        driver_name.find_element(By.CLASS_NAME, "_9grt04").click()
    return cleaning_fee, service_fee


def _scrape_listing(listing, driver_listings, index, zip_code, driver_name, tolerate_missing_fees):
    """Build one row of data (a dict) from a single search-result listing.

    `listing` is the BeautifulSoup element and `driver_listings[index]` the
    matching Selenium element (needed to click the price breakdown button).
    Raises if a required field cannot be read. A failed fee lookup raises
    unless `tolerate_missing_fees` is true, in which case both fees are 0.
    """
    title = listing.find(class_ = "_im5s6sq").text

    # The subtitle reads "<type> in <city>". The city is the one AirBnB
    # associates with the listing, which may differ from the city we searched
    # by zip code; it helps get rid of duplicates later on
    type_and_city = listing.find(class_ = "_1xzimiid").text.rsplit(' in ', 1)
    listing_type = type_and_city[0].lower()
    city = type_and_city[1]

    # The description is split into up to 2 parts: numerical descriptors
    # (bedrooms, bathrooms, ...) and, when present, the host's amenities
    desc = listing.find_all(class_ = "_3c0zz1")
    numerical_descriptors = " - ".join(t.text for t in desc[0].find_all(class_ = "_3hmsj"))
    if len(desc) == 1:
        amenities = ''
    else:
        amenities = " - ".join(t.text for t in desc[1].find_all(class_ = "_3hmsj"))

    # If no dates are listed, leave an empty string in their place
    try:
        dates = listing.find(class_ = "_17bkx6k").text
    except Exception:
        dates = ''

    # If rating or review count is missing, record NaN for the rating and 0 reviews
    try:
        rating = listing.find(class_ = "_10fy1f8").text
        num_reviews = listing.find(class_ = "_a7a5sx").text
        num_reviews = int(re.sub("[()]", "", num_reviews).strip().split(" review")[0])
    except Exception:
        rating = np.nan
        num_reviews = 0

    price_per_night, discounted, original_value = _parse_price(
        listing.find(class_ = "a8jt5op dir dir-ltr").text)

    try:
        cleaning_fee, service_fee = _read_fees(driver_name, driver_listings, index)
    except Exception:
        # The first failure triggers a page refresh in case of a loading error;
        # if the breakdown is still unavailable after that, put 0 for both fees
        if not tolerate_missing_fees:
            raise RuntimeError('Refreshing page in case of loading error.')
        cleaning_fee = 0.
        service_fee = 0.

    return {"title": title,
            "listing_type": listing_type,
            "city": city,
            "zip_code": zip_code,
            "numerical_descriptors": numerical_descriptors,
            "amenities": amenities,
            "dates": dates,
            "rating": rating,
            "num_reviews": num_reviews,
            "discounted": discounted,
            "original_value": original_value,
            "price_per_night": price_per_night,
            "cleaning_fee": cleaning_fee,
            "service_fee": service_fee}


def page_scrape(curr_data, zip_code, driver_name):
    """Scrape every result page of the search currently open in `driver_name`.

    Parameters:
        curr_data: list of row dicts collected so far; it is copied, not mutated.
        zip_code: zip code of the search, stored in every new row.
        driver_name: Selenium driver already showing the first results page.

    Returns:
        A new list: the rows of `curr_data` followed by one dict per scraped listing.

    A page that fails to load is refreshed after a 1-2 minute wait and retried
    until it works. A page on which a listing cannot be parsed is refreshed and
    retried up to 5 times; after that, whatever rows the last attempt produced
    are kept. Rows are only committed once per page, so retries never add the
    same listing twice. The element class names are hard-coded and presumably
    tied to the site's markup at the time of writing - verify before reuse.
    """
    num_pages = _wait_for_page_count(driver_name)

    # Copy our old list of data into a new one for output
    updated_data = curr_data[:]

    for page_index in range(num_pages):

        # [FOR DEBUGGING] Number of attempts on this page that hit a listing error
        err_count = 0

        # This while loop will make sure the current page loaded properly
        while True:
            # Rows of this attempt only; discarded if the page has to be retried
            page_rows = []
            err = False

            try:
                # Find every single listing on this page - max should be 20
                # And last page may have less than 20
                soup = bs(driver_name.page_source)
                listings = soup.find_all(class_ = "_8s3ctt")

                # The same listings through selenium, to click on buttons
                driver_listings = driver_name.find_elements(By.CLASS_NAME, "_8s3ctt")

                for index, listing in enumerate(listings):
                    try:
                        page_rows.append(_scrape_listing(
                            listing, driver_listings, index, zip_code, driver_name,
                            tolerate_missing_fees = err_count > 0))

                        # Pause for 1-2 seconds before moving onto the next listing
                        # Act natural...
                        _pause(1, 1)

                    # [FOR DEBUGGING] If unable to create a new row, report it and retry the page
                    except Exception:
                        err_count += 1
                        # Look the title up again here: it may not have been
                        # read at all before the failure
                        title_tag = listing.find(class_ = "_im5s6sq")
                        print(title_tag.text if title_tag is not None else None)
                        print("Error Found on Search", zip_code, "page", page_index)
                        err = True
                        break

            # The page itself did not load: wait 1-2 minutes, refresh, and try again till it works
            except Exception:
                _refresh_after_long_wait(driver_name)
                continue

            # If a listing failed, refresh and retry the page - but at most 5 times
            # so we are not stuck in an infinite loop and can at least keep some
            # listings from the page
            if err and err_count <= 5:
                _refresh_after_long_wait(driver_name)
                continue

            updated_data.extend(page_rows)
            break

        # Go to the next page - there is none to go to after the last one
        if page_index < num_pages - 1:
            driver_name.find_element(By.CLASS_NAME, "_1bfat5l").click()
            # Wait a few seconds for page to load
            _pause(5, 10)

    # Return the newly updated list of data
    return updated_data

In [6]:
# Create a list for our data: page_scrape returns it extended with one dict per listing
# Note that re-running this cell resets it to empty
data_output = []

In [26]:
# For each city, query AirBnB with the check_in and check_out time
# Scrape each post from each page
# Sleep between 5 and 15 seconds between queries and each page

# For now, we'll just do any weekend in December
# And no specification on number of guests to get a full list

while location_querylist:

    # Peek at the next location instead of popping it: it is only removed
    # once its scrape has finished (see the end of the loop)
    location = location_querylist[0]

    # Get the zip_code of our current location and put it in our data later
    # (int() would drop a leading zero, which none of these CA zip codes have)
    zip_code = int(location.split()[-1])

    # Every search starts over from the landing page
    driver.get('https://www.airbnb.com/')

    # Wait for the page to load before doing anything else!
    time.sleep(5 + 10*np.random.random())

    # Look up the search controls only after the wait, so they exist and the
    # references are not invalidated while the page is still rendering
    location_box = driver.find_element(By.CLASS_NAME, "_1xq16jy")
    check_in_box = driver.find_elements(By.CLASS_NAME, "_uh2dzp")[0]
    search_button = driver.find_element(By.CLASS_NAME, "_sxfp92z")

    # Put the location into the location search box
    location_box.send_keys(location)
    time.sleep(.5 + np.random.random())

    # Click the check-in box
    check_in_box.click()
    time.sleep(.5 + np.random.random())

    # Find the 'flexible dates' option and click it
    driver.find_element(By.CLASS_NAME, "_9qlt59").click()
    time.sleep(.5 + np.random.random())

    # Remove January from the search since it will automatically be highlighted
    driver.find_element(By.ID, "flexible_trip_dates-january").click()
    time.sleep(.5 + np.random.random())

    # Search!
    search_button.click()

    # Wait for page to fully load
    time.sleep(5 + 10*np.random.random())

    # Scrape the pages for this location
    data_output = page_scrape(data_output, zip_code, driver)

    # Remove the location we just completed scraping
    # This will allow us to return to this program if there is an error
    # Without needing to completely restart
    location_querylist.pop(0)

    # If we're not on the last location of the list, we'll wait before searching again
    # Otherwise, we can just stop... This is just to save us 5-15 seconds...
    if location_querylist:
        time.sleep(5 + 10*np.random.random())

In [27]:
# Turn the list of listings into a dataframe: one row per listing dict,
# with the dict keys as the columns
df = pd.DataFrame(data_output)

In [28]:
# Take a look at the last five rows to make sure everything is in order
df.tail()

Unnamed: 0,title,listing_type,city,zip_code,numerical_descriptors,amenities,dates,rating,num_reviews,discounted,original_value,price_per_night,cleaning_fee,service_fee
20316,"Near Serramonte Shopping Center, Golf, 3 Conte...",hotel room,South San Francisco,94083,12 guests - 3 bedrooms - 6 beds - 3 baths,Wifi - Free parking - Kitchen - Washer,Dec 17 – 19,,0,False,,564.0,0.0,159.0
20317,"3 Budget-Friendly Suites w/Kitchen, Laundry, F...",hotel room,South San Francisco,94083,12 guests - 3 bedrooms - 6 beds - 3 baths,Wifi - Free parking - Kitchen - Washer,Dec 24 – 26,,0,False,,645.0,0.0,182.0
20318,Cozy home near SFO in great neighborhood,private room,San Bruno,94083,2 guests - 1 bedroom - 1 bed - 1 shared bath,Wifi - Free parking - Kitchen,Dec 31 – Jan 2,3.88,8,False,,169.0,50.0,55.0
20319,"3 Spacious Units, Minutes to Convention Center...",hotel room,South San Francisco,94083,12 guests - 3 bedrooms - 6 beds - 3 baths,Wifi - Free parking - Kitchen - Washer,Dec 24 – 26,,0,False,,645.0,0.0,182.0
20320,85' Yacht docked at Pier 40 San Francisco,boat,Brisbane,94083,8 guests - 5 bedrooms - 5 beds - 5 baths,Free parking - Kitchen - Washer,Dec 31 – Jan 2,,0,False,,1950.0,0.0,551.0


In [40]:
# Export as a csv for cleaning; index = False keeps the row index out of the file
df.to_csv('listings.csv', index = False)