# Webscrape Individual AirBnB Listing Pages

In Part 1 of the webscraping, we scraped roughly 180,000 AirBnB listings in the western half of the United States. In the process of that webscraping, we got each listing's `listing_id` -- its unique ID in AirBnB's system.

Using this as a stepping stone, we can programmatically step from the search page into each listing page, and access a much richer/wider range of data than what exists on the search page.

This notebook is going to scrape individual listing pages to put together a larger picture of the AirBnB listings.

_An example of a listing page._

![Image of individual listing](https://lh5.googleusercontent.com/5Oxx59oyuTQE_oEHvRvs_kNykfOsv1e1xsrpJFRJ4cnN0ve4xEGpvhb4BpHrTNykBDM=w2400)


# Import packages

In [None]:
# For webscraping.
import requests
from bs4 import BeautifulSoup
import re
import random
import time
from requests_html import HTMLSession
import json

# For standard data manipulation.
import numpy as np
import pandas as pd
from datetime import datetime

# For progress tracking.
from tqdm import tqdm

# Makes it easier to see all the columns in wide dataframes!
pd.set_option('display.max_colwidth', None)

# Read in webscraped data from part 1

In [None]:
# Load the part-1 search-page scrape and discard the unnamed columns
# (presumably pandas indexes written out by earlier CSV saves).
stale_index_columns = ['Unnamed: 0', 'Unnamed: 0.8', 'Unnamed: 0.9']
df = pd.read_csv('scraped_listings BACKUP.csv').drop(columns=stale_index_columns)

# Preview the first few rows.
df.head()

# Set Headers

We want to make our requests to AirBnB look as human as possible to prevent getting blocked. VPNs can always help if they block an IP address, but we'd prefer not to get blocked in the first place.

In [None]:
# Browser-like request headers; used as the default `headers` argument of get_page().
# NOTE(review): the 'cookie' value embeds what looks like a captured browser
# session (it includes a `_csrf_token`) -- presumably it expires and has to be
# refreshed by hand; consider keeping it out of version control.
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': 'OptanonAlertBoxClosed=NR; OptanonAlertBoxClosed=NR; bev=1671467614_NDg3YWI1OWViODE0; country=US; everest_cookie=1671467934.o8UVAqq76D3QPRXxo5SV.2TXJ26qIr6wRp2aGPSqpNu0hmcaL7JhFzmDBWbnzmDw; _csrf_token=V4%24.airbnb.com%24Dwa4m1DnRds%24l_ThNsfxVtYzJnV3gf7vehcjPJusOG_QBcYxaP2lZOM%3D; flags=0; _gcl_au=1.1.1979572412.1671467936; _ga=GA1.1.923589191.1671467936; tzo=-420; frmfctr=wide; ak_bmsc=2BF62A04339B5A8040F6BE42D7397A3E~000000000000000000000000000000~YAAQjuXEF4UQhgeFAQAA27JjLRJCqxbQs87wSwEdHRYX0G6/TDKZTovu4o6yCZ81IqZ6MhWckkFsgOG1eRvCyGcwYcGwZT6Hj0lKPA7+LECJW/p7uLVBPkdj9vDMc+M3APPxtp3dA4N7Xiz9k2CsK1cScWB15o99WkNT8TblkPRJMP+//tibMWG2xoEjeEV14znNbpxu/a19eOVXAFS6pvZFEKc1PvKaN6qVTNMeaHczy8vtIlUIhOHm8n0/neIB5WjhQewZ2bsI4Jk5FAyuugtHiXFiRJU7poWmTC1z3vQV5BNZly4umCoBfxL+4/1Gwtvg+5jzWIgEsQIOlNvhj6IhTWNdgIiUubyhm6SwXUHdD2r80FCzSqqkgs/F95M/C2eTuiMmpV414Qo=; jitney_client_session_id=d94ad55f-160c-4b75-aa19-6b94073c92fc; jitney_client_session_created_at=1671505980; _user_attributes=%7B%22curr%22%3A%22USD%22%2C%22guest_exchange%22%3A1.0%2C%22device_profiling_session_id%22%3A%221671467934--953d4ec0c3e885ffcfcecdcb%22%2C%22giftcard_profiling_session_id%22%3A%221671503786--bedf36f5fd7bc1727463feba%22%2C%22reservation_profiling_session_id%22%3A%221671503786--104feb525ef2701da84b3359%22%7D; jitney_client_session_updated_at=1671505982; _ga_2P6Q8PGG16=GS1.1.1671505983.4.1.1671505983.0.0.0; _uetsid=a14c3a907fbb11ed80888db3ffb5b814; _uetvid=a14c54c07fbb11edbfa35d16156e96a6; previousTab=%7B%22id%22%3A%22c5e39694-2720-411d-bbb4-5880ad1455b6%22%2C%22url%22%3A%22https%3A%2F%2Fwww.airbnb.com%2Fs%2FUnited-States%2Fhomes%22%7D; bm_sv=67E8D97D583DC2BA73262B2A33B8BD6A~YAAQjuXEF3gehweFAQAAzlqFLRKkxd6iaP1xq0hHbfO3CsyIIQifNN1kg5O8tYdobGiFUFkAU0ek5Tg67B28nu2btfk+1yHhT6TE1jibTRrmUjxmCnS6gZ9Sclthu91yIEEePEd32A66YE2G6ofZjSQCTS8TsZU9O8pKFQ7Nid5vVQ6DlyEPLX5yEYMLK94A1kBNulzVg+KjaWf2UN8pSkr66hIUxNn5Qq3fQtl8kCxJeSb6MGW977OC++DGDRXKqg==~1; cfrmfctr=MOBILE; cbkp=2',
    'device-memory': '8',
    'dpr': '2',
    'ect': '4g',
    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'viewport-width': '895'
    }

# Create the helper functions required to run the scraper

Let's set up a few functions we'll use to breathe life into this webscraper.


### def `reset_dataframe`:
**Args:** 
* None.

**Returns:**
* Empty DataFrame with the shape and headers of the data we intend to scrape.

In [None]:
def reset_dataframe():
    """Build the empty results table that scraped listing rows get appended to.

    Returns:
        A zero-row DataFrame whose columns are the fields collected for each
        listing page, in output order.
    """
    column_names = (
        'listing_id',
        'n_guests',
        'n_bedrooms',
        'n_beds',
        'n_baths',
        'n_amenities',
        'amenities',
        'rating_cleanliness',
        'rating_communication',
        'rating_checkin',
        'rating_accuracy',
        'rating_location',
        'rating_value',
        'sleeping',
        'description',
    )

    # One empty list per column yields a frame with headers but no rows.
    return pd.DataFrame({name: [] for name in column_names})



### def `get_page`:
**Args:** 
* url: The URL of a listing page that will be scraped using the `requests` Python package.
* headers: The pre-set headers used to make the requests look more human.

**Returns:**
* Either a `BeautifulSoup` soup object of the webpage, or a None object.

In [None]:
def get_page(url, headers=headers, retries=3, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree, retrying on failure.

    Args:
        url: Address of the page to fetch.
        headers: Request headers to send; defaults to the module-level
            ``headers`` dict.
        retries: Total number of attempts before giving up. The default of 3
            matches the three hard-coded attempts this function used to make.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot hang the scrape indefinitely.

    Returns:
        A ``BeautifulSoup`` object for the response body, or ``None`` when
        every attempt raised.
    """
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            return BeautifulSoup(response.text, 'html.parser')
        except Exception:
            # Deliberately best-effort: any failure just triggers the next
            # attempt. `Exception` (rather than a bare except) lets Ctrl-C /
            # KeyboardInterrupt stop a long-running scrape.
            continue

    return None

### def `get_json`:
**Args:** 
* soup: a `BeautifulSoup` object.

**Returns:**
* A JSON-ified object with the section of the datastructure where we'll scrape data.

In [None]:
def get_json(soup):
    """Pull the listing's section data out of the page's last <script> tag.

    Args:
        soup: A parsed page exposing ``soup.html.find_all('script')``
            (normally a ``BeautifulSoup`` object), or ``None`` when the page
            could not be fetched.

    Returns:
        The value found at
        ``niobeMinimalClientData[0][1].data.presentation.stayProductDetailPage.sections.sections``
        in the embedded JSON, or ``None`` if the page is missing or does not
        have the expected structure.
    """
    # A failed download arrives here as None; nothing to parse.
    if soup is None:
        return None

    # NOTE(review): these offsets presumably strip the opening <script ...>
    # tag (84 characters) and the closing '</script>' (9 characters) as they
    # looked when the scraper was written -- confirm if AirBnB's markup changes.
    opening_tag_length = 84
    closing_tag_length = 9

    key_path = (
        'niobeMinimalClientData', 0, 1, 'data', 'presentation',
        'stayProductDetailPage', 'sections', 'sections',
    )

    try:
        script_markup = str(soup.html.find_all('script')[-1])
        node = json.loads(script_markup[opening_tag_length:-closing_tag_length])
        for key in key_path:
            node = node[key]
        return node
    except Exception:
        # Any malformed or blocked page is reported as "no data". Catching
        # Exception instead of using a bare except keeps KeyboardInterrupt
        # and SystemExit from being swallowed.
        return None

### def `fiddle_with_it`:

_Each AirBnB listing can have slightly different positions within a section of elements. This is an effective, albeit a little inefficient, way of fiddling with the JSON object if the data returned for a specific feature doesn't look like it should._

**Args:** 
* obj: the JSON-ified object.
* var: the variable being scraped.

**Returns:**
* The scraped value.

In [None]:
# Errors raised when a section does not have the structure an extractor expects.
_FIDDLE_LOOKUP_ERRORS = (KeyError, IndexError, TypeError, AttributeError)


def _fiddle_detail_item(position):
    """Return an extractor for the first word of the detail item at *position*."""
    def extract(section):
        return section['detailItems'][position]['title'].split(' ')[0]
    return extract


def _fiddle_rating(position):
    """Return an extractor for the localized rating stored at *position*."""
    def extract(section):
        return section['ratings'][position]['localizedRating']
    return extract


def _fiddle_amenities_button(section):
    """Return the "see all amenities" button title, rejecting unrelated buttons."""
    title = section['seeAllAmenitiesButton']['title']
    if 'ameni' not in title:
        # Signal "wrong section" so the caller keeps scanning.
        raise KeyError('button title does not mention amenities')
    return title


def _fiddle_amenity_counts(section):
    """Map each amenity group title to the number of amenities it lists."""
    return {
        group['title']: len(group['amenities'])
        for group in section['seeAllAmenitiesGroups']
    }


def _fiddle_sleeping(section):
    """Return (title, subtitle) pairs for every sleeping arrangement."""
    return [
        (arrangement['title'], arrangement['subtitle'])
        for arrangement in section['arrangementDetails']
    ]


def _fiddle_description(section):
    """Return the listing description's HTML text."""
    return section['htmlDescription']['htmlText']


# Maps each scraped variable to the function that reads it from one section.
_FIDDLE_EXTRACTORS = {
    'n_guests':             _fiddle_detail_item(0),
    'n_bedrooms':           _fiddle_detail_item(1),
    'n_beds':               _fiddle_detail_item(2),
    'n_baths':              _fiddle_detail_item(3),
    'n_amenities':          _fiddle_amenities_button,
    'amenities':            _fiddle_amenity_counts,
    'rating_cleanliness':   _fiddle_rating(0),
    'rating_accuracy':      _fiddle_rating(1),
    'rating_communication': _fiddle_rating(2),
    'rating_location':      _fiddle_rating(3),
    'rating_checkin':       _fiddle_rating(4),
    'rating_value':         _fiddle_rating(5),
    'sleeping':             _fiddle_sleeping,
    'description':          _fiddle_description,
}


def fiddle_with_it(obj, var):
    """Scan every section of a listing for the first one that yields *var*.

    Sections are tried in order; a section that lacks the expected keys (or
    whose amenities button title does not mention amenities) is skipped.

    Args:
        obj: The list of section entries returned by ``get_json``, or ``None``.
        var: Name of the variable being scraped, e.g. ``'n_guests'`` or
            ``'rating_value'`` (one of the scraped column names other than
            ``listing_id``).

    Returns:
        The scraped value from the first matching section, or ``None`` when
        *obj* is not a list, *var* is unknown, or no section matches.
    """
    extract = _FIDDLE_EXTRACTORS.get(var)
    if extract is None or not isinstance(obj, (list, tuple)):
        return None

    # Walk all sections rather than a fixed 25 so long pages are fully searched.
    for entry in obj:
        try:
            return extract(entry['section'])
        except _FIDDLE_LOOKUP_ERRORS:
            continue

    return None

### def `extract_listing_info`:
**Args:** 
* obj: a JSON data structure object that is parsed out of a `BeautifulSoup` soup object of a webpage.
* listing_id: the listing_id of the listing being scraped.
* df: a DataFrame with all of the listings scraped to date. For the first listing, it is the empty DataFrame generated by `reset_dataframe()`. For the second listing and onwards, it is a DataFrame of all the prior data scraped.

**Returns:**
* a DataFrame of all the listings scraped up to that point.

In [None]:
# Errors raised when a section is absent or not shaped as expected.
_LISTING_LOOKUP_ERRORS = (KeyError, IndexError, TypeError, AttributeError)


def _listing_detail_item(position):
    """Return a reader for the first word of the detail item at *position*."""
    return lambda section: section['detailItems'][position]['title'].split(' ')[0]


def _listing_rating(position):
    """Return a reader for the localized rating stored at *position*."""
    return lambda section: section['ratings'][position]['localizedRating']


def _listing_amenity_counts(section):
    """Map each amenity group title to the number of amenities it lists."""
    return {
        group['title']: len(group['amenities'])
        for group in section['seeAllAmenitiesGroups']
    }


def _listing_sleeping(section):
    """Return (title, subtitle) pairs for every sleeping arrangement."""
    return [
        (arrangement['title'], arrangement['subtitle'])
        for arrangement in section['arrangementDetails']
    ]


# (column, section index tried first, reader applied to that section).
# Rows are in output-column order.
_LISTING_FIELDS = (
    ('n_guests',             11, _listing_detail_item(0)),
    ('n_bedrooms',           11, _listing_detail_item(1)),
    ('n_beds',               11, _listing_detail_item(2)),
    ('n_baths',              11, _listing_detail_item(3)),
    ('n_amenities',          16, lambda section: section['seeAllAmenitiesButton']['title']),
    ('amenities',            16, _listing_amenity_counts),
    ('rating_cleanliness',    1, _listing_rating(0)),
    ('rating_communication',  1, _listing_rating(2)),
    ('rating_checkin',        1, _listing_rating(4)),
    ('rating_accuracy',       1, _listing_rating(1)),
    ('rating_location',       1, _listing_rating(3)),
    ('rating_value',          1, _listing_rating(5)),
    ('sleeping',             15, _listing_sleeping),
    ('description',          14, lambda section: section['htmlDescription']['htmlText']),
)


def extract_listing_info(obj, listing_id, df):
    """Scrape one listing's fields and append them to *df* as a new row.

    Each field is first read from the section index where it is looked for by
    default; if that lookup fails, ``fiddle_with_it`` searches the other
    sections. Fields that cannot be found end up as ``None``, so a page that
    failed to load (``obj is None``) still produces a row holding only the
    ``listing_id``.

    Args:
        obj: The list of section entries parsed from the listing page, or
            ``None`` when the page could not be parsed.
        listing_id: The listing_id of the listing being scraped.
        df: DataFrame of all rows scraped so far (the empty frame from
            ``reset_dataframe()`` for the first listing).

    Returns:
        A new DataFrame containing the rows of *df* followed by this listing.
    """
    record = {'listing_id': [listing_id]}

    for column, usual_index, read in _LISTING_FIELDS:
        try:
            value = read(obj[usual_index]['section'])
        except _LISTING_LOOKUP_ERRORS:
            # The section was not where expected; fall back to scanning.
            value = fiddle_with_it(obj, column)
        record[column] = [value]

    # Return dataframe, with new row of data appended.
    return pd.concat([df, pd.DataFrame(record)], ignore_index=True)

# Generate a list of tuples that will be used to iterate through listings.

In [None]:
# Pair every listing id with its URL so the scraper can walk them together.
ids_and_urls = list(zip(df['listing_id'], df['listing_url']))

In [None]:
# Show how many (listing_id, url) pairs are queued.
print(len(ids_and_urls))

# Start scraping the webpages.

In [None]:
# Rows scraped during this run accumulate here.
pages_dataframe = reset_dataframe()

# Rows saved by an earlier, interrupted job; merged into every file written below.
extract1 = pd.read_csv('extract pages 1.csv')

# Number of listings processed so far in this run.
url_count = 0

# NOTE(review): the slice offset presumably skips listings handled by earlier runs -- confirm.
for id_, url in tqdm(ids_and_urls[59500+93780:]):
    soup = get_page(url)
    obj = get_json(soup)
    pages_dataframe = extract_listing_info(obj, id_, pages_dataframe)
    url_count += 1

    # Wait a random 4-7 seconds between requests.
    time.sleep(random.randint(4,7))

    # Every 100 listings, write a checkpoint csv (the file plugged into Tableau for monitoring).
    if not url_count % 100:
        checkpoint = pd.concat([pages_dataframe, extract1], ignore_index=True)
        checkpoint.to_csv('scraped_pages.csv')

# Final write: this run's rows followed by the earlier extract.
pages_dataframe = pd.concat([pages_dataframe, extract1], ignore_index=True)
pages_dataframe.to_csv('scraped_pages.csv')

In [None]:
# Truncate displayed cell text at 200 characters, then preview the first 50 rows.
pd.set_option('display.max_colwidth', 200)
pages_dataframe.head(50)

# Fiddle and fix the data.

Scraping this one got a little tricky. It required getting creative with VPNs. Because of this, I had to regularly stop the main job, drop nulls, and start again. Nulls are a sign that AirBnB blocked the scraper, and that we need to switch to a different IP address.

In [None]:
# Load the listing-page rows saved so far.
frame = pd.read_csv('scraped_pages extract 1.csv')

In [None]:
# Discard the saved index column, then keep only the first row for each listing_id.
frame = frame.drop(columns=['Unnamed: 0']).drop_duplicates(subset=['listing_id'])

In [None]:
# Print column dtypes and non-null counts before dropping unscraped rows.
frame.info()

In [None]:
# A null n_guests is our proxy for a listing that didn't get scraped; remove those rows.
frame = frame.dropna(subset=['n_guests'])

In [None]:
# Print the same summary again to see what is left after dropping the nulls.
frame.info()

In [None]:
# Preview the first few cleaned rows.
frame.head()

In [None]:
# Save the de-nulled data, overwriting the extract it was read from.
# The index is written too, so it comes back as an 'Unnamed: 0' column on the next read.
frame.to_csv('scraped_pages extract 1.csv')

In [None]:
# Reload the cleaned extract, discarding the index column written by to_csv.
frame = pd.read_csv('scraped_pages extract 1.csv').drop(columns=['Unnamed: 0'])

## Get the listing ids that aren't yet scraped.

In [None]:
# Reload the part-1 search-page scrape and discard the unnamed columns
# (presumably pandas indexes written out by earlier CSV saves).
stale_index_columns = ['Unnamed: 0', 'Unnamed: 0.8', 'Unnamed: 0.9']
df = pd.read_csv('scraped_listings BACKUP.csv').drop(columns=stale_index_columns)

# Preview the first few rows.
df.head()

In [None]:
# Compare how many listings part 1 found with how many listing pages are scraped so far.
print(len(df['listing_id']))
print(len(frame['listing_id']))

In [None]:
# Ids of every listing found in part 1, as strings.
search_page_ids = df['listing_id'].astype('str').tolist()
# Ids of the listing pages scraped so far in part 2, cast to int first and then to strings.
listing_page_ids = frame['listing_id'].astype('int').astype('str').tolist()

In [None]:
# Get IDs left to be scraped.
# Testing membership against the list rescans it for every id; a set gives
# constant-time lookups and produces the same result in the same order.
already_scraped_ids = set(listing_page_ids)
ids_left_to_scrape = [id_ for id_ in tqdm(search_page_ids) if id_ not in already_scraped_ids]
print(len(ids_left_to_scrape))

In [None]:
# Build each listing-page URL by appending the listing id to the rooms path.
listing_url_prefix = 'https://www.airbnb.com/rooms/'
urls_left_to_scrape = [listing_url_prefix + remaining_id for remaining_id in ids_left_to_scrape]
# Show the first URL as a sanity check.
urls_left_to_scrape[:1]

In [None]:
# Pair each remaining id with its URL for the scrape loop.
ids_and_urls = list(zip(ids_left_to_scrape, urls_left_to_scrape))

In [None]:
# Experiment: shuffle the (id, url) pairs in place so listings are visited in random order.
random.shuffle(ids_and_urls)

# Restart scraper with fixed details.

In [None]:
# Rows scraped during this run accumulate here.
pages_dataframe = reset_dataframe()

# Cleaned rows from earlier runs (minus the saved index column); merged into every file written below.
extract1 = pd.read_csv('scraped_pages extract 1.csv').drop(columns=['Unnamed: 0'])

# Number of listings processed so far in this run.
url_count = 0

for id_, url in tqdm(ids_and_urls):
    soup = get_page(url)
    obj = get_json(soup)
    pages_dataframe = extract_listing_info(obj, id_, pages_dataframe)
    url_count += 1

    # Wait a random 3-6 seconds between requests.
    time.sleep(random.randint(3,6))

    # Every 100 listings, write a checkpoint csv (the file plugged into Tableau for monitoring).
    if not url_count % 100:
        checkpoint = pd.concat([pages_dataframe, extract1], ignore_index=True)
        checkpoint.to_csv('scraped_pages.csv')

# Final write: this run's rows followed by the earlier extract.
pages_dataframe = pd.concat([pages_dataframe, extract1], ignore_index=True)
pages_dataframe.to_csv('scraped_pages.csv')