In [1]:
# For scraping and wrangling data.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
#from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
#from itertools import chain
#import re

# For building human like randomness into the webscraper.
import time
import random

# For tracking progress.
from tqdm import tqdm

In [2]:
# Load the cleaned Zillow listings we want to check, drop the 'Unnamed: 0' column,
# and shuffle the rows (frac=1 keeps every row; only the order changes).
listings = (
    pd.read_csv('data/backup/BACKUP cleaned_zillow_data.csv')
    .drop(columns=['Unnamed: 0'])
    .sample(frac=1)
)

In [3]:
# Report how many listings were loaded.
print(f'# of listings: {len(listings)}')

# of listings: 128960


In [4]:
# Load the sales data saved by earlier runs. When sales_data.csv does not exist yet
# (first run, fresh checkout) start from an empty frame with the columns that
# extract_sales_data produces, so the cells below still run from a clean kernel.
try:
    existing_data = pd.read_csv('sales_data.csv')
except FileNotFoundError:
    existing_data = pd.DataFrame(
        columns=['zillowId', 'homeStatus', 'price', 'zestimate', 'dateSoldString']
    )


In [5]:
# Keep only the listings whose zillowId is not already present in `existing_data`.
listings = listings.loc[~listings['zillowId'].isin(existing_data['zillowId'])]

In [6]:
# Report how many listings are left after removing the ones already scraped.
print(f'# of listings (filtered): {len(listings)}')

# of listings (filtered): 127494


In [7]:
# Headers for a more human looking request when web scraping.
#
# No 'cookie' header is set on purpose: a cookie string copied out of a browser session
# is a credential (session ids, CSRF tokens) and must not be committed to the notebook
# or replayed against another site. If a cookie is ever required, load it at runtime
# (for example from an environment variable) instead of pasting it here.
#
# NOTE(review): 'accept-encoding' advertises 'br'; requests can only decode Brotli
# responses when a brotli package is installed -- confirm it is available, otherwise
# drop 'br' from the list.
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'device-memory': '8',
    'dpr': '2',
    'ect': '4g',
    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'viewport-width': '895'
    }

In [8]:
"""
get_soup

Our basic building block: requesting the HTML of a page living at a given URL,
and then turning that into a BeautifulSoup object.

    Args:
      url: the url of the page to scrape.
      headers: the headers for the request (helps the request look less suspicious).
    
    Returns:
      bs4 BeautifulSoup text representation of the page (or None if it fails 5 times in a row).
"""

def get_soup(url, headers=headers, timeout=30, max_attempts=5):
    """
    Request the page at `url` and parse its HTML into a BeautifulSoup object.

    Every attempt opens a fresh requests.Session, applies `headers` to it and issues
    a GET. The HTTP status code is not checked, so whatever body the server returns
    is parsed and handed back. If an attempt raises (connection error, timeout, ...),
    the error is printed and the function pauses a random 5-11 seconds before the
    next attempt.

        Args:
          url: the url of the page to scrape.
          headers: the headers for the request (helps the request look less suspicious).
          timeout: value passed to `session.get(timeout=...)`, in seconds. Without it
                   requests waits indefinitely on a stalled connection.
          max_attempts: how many times to try before giving up.

        Returns:
          bs4 BeautifulSoup object for the page, or None if every attempt raised.
    """

    for _ in range(max_attempts):
        try:
            # The context manager closes the session (and its connection pool) once the
            # page has been parsed, so repeated calls do not pile up open connections.
            with requests.Session() as session:
                session.headers.update(headers)
                response = session.get(url, timeout=timeout)
                return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print('get_soup:', e)
            time.sleep(random.randint(5, 11))  # Wait a few seconds before trying again.

    # Every attempt failed; the caller decides what to do with None.
    return None

In [9]:
"""
fetch_script_data

Helps us quickly isolate the section of the soup object where we 
will want to scrape data.

    Args:
      A bs4 BeautifulSoup text representation of a listing page.
    
    Returns:
      A specific subsection of the webpage where a lot of valuable metadata
      lives. This is a nifty helper function for `scrape_listing_page`.
"""

def fetch_script_data(soup):
    """
    Pull the structured property record out of a parsed listing page.

    The page embeds its data as JSON inside the <script id="__NEXT_DATA__"> tag. Inside
    that payload, props -> pageProps -> componentProps -> gdpClientCache is itself a
    JSON-encoded string, so it is decoded a second time.

        Args:
          soup: object exposing `find(name, attrs)`, such as the BeautifulSoup
                representation of a listing page.

        Returns:
          The value stored under 'property' in the first entry of the decoded cache.
    """
    next_data_tag = soup.find('script', {'id': '__NEXT_DATA__'})
    page_payload = json.loads(next_data_tag.string)
    client_cache = json.loads(
        page_payload['props']['pageProps']['componentProps']['gdpClientCache']
    )

    # The cache key changes from listing to listing, so rather than rebuilding the
    # key we simply take the value stored under whichever key comes first.
    first_entry = next(iter(client_cache.values()))

    return first_entry['property']

In [10]:
"""
extract_sales_data

If the listing has been sold, determine when it was sold, what the sell price was, and what the new 
zestimate is.

    Args:
        a JSON data structure `script_data`, output of the function `fetch_script_data`.
    
    Returns:
        A one-row DataFrame with the zillowId, homeStatus, price, zestimate, and date sold.
        If the listing is not yet sold, the price, zestimate, and date sold are left empty.
"""

def extract_sales_data(script_data):
    """
    Build a one-row DataFrame describing the sale state of a listing.

    For a listing whose 'homeStatus' is 'SOLD' or 'RECENTLY_SOLD' the price, zestimate
    and dateSoldString are copied from `script_data`; a key that is absent yields None
    in that column. For any other status those three columns are None.

        Args:
            script_data: dict-like property data, output of `fetch_script_data`. Must
                         contain 'homeStatus'; 'zpid' is read with `.get`, so a missing
                         id yields None rather than an error.

        Returns:
            A DataFrame with a single row (index 0) and the columns zillowId,
            homeStatus, price, zestimate, dateSoldString.

        Raises:
            KeyError: if `script_data` has no 'homeStatus' key.
    """
    home_status = script_data['homeStatus']
    is_sold = home_status in ('SOLD', 'RECENTLY_SOLD')

    def sold_field(key):
        # Sale details are only recorded once the listing has actually sold.
        return script_data.get(key) if is_sold else None

    record = {
        'zillowId': script_data.get('zpid'),
        'homeStatus': home_status,
        'price': sold_field('price'),
        'zestimate': sold_field('zestimate'),
        'dateSoldString': sold_field('dateSoldString'),
    }

    # All values are scalars, so an explicit index is needed to get a one-row frame.
    return pd.DataFrame(record, index=[0])

In [11]:
"""
save_data_to_csv

A helper function to save data as it is processed.

    Args:
      data: The sales data.
      existing_data: A DataFrame of previously saved data to append `data` to (optional).
      filename: The name of the file that should be created to store the data.
      
    Returns:
      The full contents of the CSV after saving, read back as a DataFrame.
"""

def save_data_to_csv(data, existing_data=None, filename='sales_data.csv'):
    """
    Write `data` to `filename`, appended to any existing data, and return the file's contents.

        Args:
          data: DataFrame with the newly scraped sales data.
          existing_data: what `data` should be appended to.
              * a DataFrame -- `data` is concatenated after it;
              * True        -- "there is existing data": it is loaded from `filename`
                               (if the file is missing or empty, only `data` is written);
              * anything else (default None) -- only `data` is written.
          filename: path of the CSV file to (over)write.

        Returns:
          The contents of `filename` after writing, read back with pandas. Whatever is
          in the CSV is the source of truth that later calls should append to.
    """

    # A bare True says previous results exist but were not handed over; recover them
    # from disk so they are appended to instead of being overwritten.
    if existing_data is True:
        try:
            existing_data = pd.read_csv(filename, index_col=False)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            existing_data = None

    if isinstance(existing_data, pd.DataFrame):
        data = pd.concat([existing_data, data])

    data.to_csv(filename, index=False)

    return pd.read_csv(filename, index_col=False)

In [12]:
"""
build_url

The cleaned data no longer has the url. We'll quickly rebuild that from the zpid to scrape the relevant data.

    Args:
        zpid: the Zillow ID of the listing.
        
    Returns:
        A url of the listing's page that we can use to scrape the data.
"""

def build_url(zpid):
    """
    Rebuild the listing page URL from a Zillow ID.

        Args:
            zpid: the Zillow ID of the listing.

        Returns:
            The url of the listing's page, with `zpid` formatted into the path.
    """
    listing_url = f'https://www.zillow.com/homedetails/{zpid}_zpid/'
    return listing_url

In [13]:
"""
scrape_listing_sales_data

A function to check all listings in the dataset to see if they have sold, and scrape the appropriate data.

    Args:
        chunkSize: how many records to process through before saving a file.
        df: the listings dataframe to reference.
        existing_data: a DataFrame of previously scraped data to append new results to (useful
                       if the job is interrupted).
    
    Returns:
        A DataFrame of the sales data, which is automatically saved in the local directory.
"""

def scrape_listing_sales_data(chunkSize, df=listings, existing_data=None):
    """
    Visit every listing in `df`, extract its sales data and save the results in chunks.

    For each row the listing URL is built from 'zillowId', the page is fetched and
    parsed, and the resulting one-row DataFrame is collected. After every `chunkSize`
    processed rows (counted by position, not by index label) the collected rows are
    handed to `save_data_to_csv`; leftovers are saved once the loop finishes. A row
    that fails at any step is reported and skipped.

        Args:
            chunkSize: how many rows to process between saves; must be at least 1.
            df: the listings DataFrame to iterate over; needs a 'zillowId' column.
            existing_data: previously scraped data, passed through to
                           `save_data_to_csv` on the first save. After a save it is
                           replaced by that function's return value.

        Returns:
            The value returned by the last `save_data_to_csv` call, or `existing_data`
            unchanged if nothing was ever saved.

        Raises:
            ValueError: if `chunkSize` is smaller than 1.
    """
    if chunkSize < 1:
        raise ValueError('chunkSize must be at least 1.')

    sales_data_list = []

    progress = tqdm(df.iterrows(), total=len(df), desc='Checking to see which listings have sold...')

    # `position` counts processed rows. The index label from iterrows() is ignored
    # because it is arbitrary once the frame has been shuffled or filtered, so it
    # cannot be used to decide when a chunk is complete.
    for position, (_, row) in enumerate(progress, start=1):

        # Reset per row so the error report never shows a previous row's URL.
        url = None

        # Attempt to scrape data for one listing.
        try:
            url = build_url(zpid=int(row['zillowId']))      # Construct webscraping URL.
            soup = get_soup(url)                            # Get bs4 object.
            script_data = fetch_script_data(soup)           # Extract structured listing data.
            sales_data = extract_sales_data(script_data)    # Extract relevant sales data.
            sales_data_list.append(sales_data)              # Append the results to the list.
            time.sleep(random.randint(5,15))                # Slow down so we don't get blocked by Zillow.

        # If there is an exception, print it and move on.
        except Exception as e:
            print(e)
            if url is not None:
                print(url)
            else:
                # The failure happened before a URL could be built.
                print(f"(no url built; zillowId={row.get('zillowId')!r})")
            time.sleep(random.randint(10,30))

        # Checkpoint: write out everything collected since the previous save.
        if position % chunkSize == 0:
            if sales_data_list:
                data = pd.concat(sales_data_list)
                existing_data = save_data_to_csv(data=data, existing_data=existing_data)

            # Clear out the list since we've already written the data from this chunk.
            sales_data_list.clear()

    # Finish up writing data for any rows after the last full chunk (ex: if there are
    # 120 listings with a chunkSize of 50, this captures the last 20).
    if sales_data_list:
        data = pd.concat(sales_data_list)
        existing_data = save_data_to_csv(data=data, existing_data=existing_data)

    return existing_data

In [None]:
# Execute the job! Hand over the previously scraped rows (`existing_data`, loaded from
# sales_data.csv above) so the new results are appended to them.
results = scrape_listing_sales_data(chunkSize=100, df=listings, existing_data=existing_data)

Checking to see which listings have sold...:   0%|         | 1/127494 [00:10<358:12:51, 10.11s/it]

In [None]:
# Preview the first five rows of the scraped sales data.
results.head(n=5)