In [4]:
import requests
import json
import pandas as pd

# collected HTTP responses (filled in by the request loop further down)
response_list = []

# headers sent with every request (mobile Chrome on Android user agent)
headers = {
    "authority": "www.zillow.com",
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,it-IT;q=0.8,it;q=0.7",
    "dnt": "1",
    "sec-ch-ua": '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": '"Android"',
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36",
}

# search target used to build the request URL
city = "seattle-wa"

# IMPORTANT: a single search must stay under 800 listings, so the search is
# split into (min, max) monthly-rent bands that each stay below that limit.
rent_intervals = [(0, 1699), (1700, 2199), (2200, 3199), (3200, 9000)]

# number of result pages requested for every rent band
max_pages = 20
current_page = 1

# (page number, search parameters) pairs, one per request to be sent
request_info_list = []

# one batch of pages per rent band
for min_rent, max_rent in rent_intervals:
    # pages are numbered from 1 up to and including max_pages
    for page_number in range(1, max_pages + 1):
        # listing-type toggles, rebuilt for every request so that no two
        # parameter sets share nested objects
        filter_state = {}
        for flag, enabled in (
            ("fsba", False),
            ("fsbo", False),
            ("nc", False),
            ("fore", False),
            ("cmsn", False),
            ("auc", False),
            ("fr", True),
            ("ah", True),
            ("mf", False),
            ("land", False),
            ("manu", False),
        ):
            filter_state[flag] = {"value": enabled}
        # monthly rent band
        filter_state["mp"] = {"max": max_rent, "min": min_rent}

        params = {
            "mapBounds": {
                "west": -122.465159,
                "east": -122.224433,
                "south": 47.491912,
                "north": 47.734145,
            },
            "regionSelection": [{"regionId": 16037, "regionType": 6}],
            "filterState": filter_state,
            "pagination": {"currentPage": page_number},
        }

        # remember which page these parameters belong to
        request_info_list.append((page_number, params))

In [5]:
# Fetch one search-results page for every (page number, search state) pair
# prepared in request_info_list.
for current_page, params in request_info_list:
    # The search state has to travel as ONE query parameter, `searchQueryState`,
    # whose value is the state serialized as JSON. Handing the nested dict
    # directly to `params=` makes requests iterate the inner dicts and send only
    # their keys (e.g. `mapBounds=west&mapBounds=east`), which drops the map
    # bounds, the rent filter and the pagination and leaves `searchQueryState`
    # empty. Serializing explicitly keeps the whole state intact.
    response = requests.get(
        f"https://www.zillow.com/{city}/rentals/{current_page}_p/",
        headers=headers,
        params={"searchQueryState": json.dumps(params)},
        # without a timeout a stalled connection would hang the cell forever
        timeout=30,
    )

    # keep the raw response; parsing happens in a later cell
    response_list.append(response)

In [6]:
from bs4 import BeautifulSoup


def parse_responses(response_list: list) -> list:
    """Turn each HTTP response into a BeautifulSoup tree.

    The text body of every response is parsed with the "html.parser" backend.
    The returned list has the same length and order as ``response_list``.
    """
    return [
        BeautifulSoup(http_response.text, "html.parser")
        for http_response in response_list
    ]


# Parse every fetched response; soup_list[i] is the parsed form of response_list[i].
soup_list = parse_responses(response_list)


In [10]:
# Debug aid: print the first parsed page in indented form to inspect its markup.
# Raises IndexError when soup_list is empty.
print(soup_list[0].prettify())


<!DOCTYPE html>
<html class="no-js zsg-theme-modernized null" itemscope="" itemtype="http://schema.org/Organization" lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#" xmlns:product="http://ogp.me/ns/product#">
 <head>
  <link href="https://fonts.googleapis.com/css?family=Open+Sans:400,600,700&amp;display=swap" rel="stylesheet"/>
  <style type="text/css">
   @font-face{font-display:swap;font-family:Ivar Headline;font-style:normal;font-weight:600;src:url(//www.zillowstatic.com/static-zsg/LATEST/static-zsg/zsg/z-fonts/ivar/IvarHeadline-SemiBold-extended.woff2) format(&quot;woff2&quot;),url(//www.zillowstatic.com/static-zsg/LATEST/static-zsg/zsg/z-fonts/ivar/IvarHeadline-SemiBold-extended.woff) format(&quot;woff&quot;);unicode-range:u+0000-001f,u+0080-200f,u+2020-faff,u+fb10-ffff}@font-face{font-display:swap;font-family:Ivar Headline;font-style:normal;font-weight:600;src:url(//www.zillowstatic.com/static-zsg/LATEST/stat

In [9]:
def parse_search_pages(soup_list: list) -> list:
    """Extract the listing results embedded in each parsed search page.

    Each page is expected to carry a ``<script>`` tag (identified by
    ``data-zrr-shared-data-key="mobileSearchPageStore"``) whose text is a JSON
    document. The listings live under ``cat1 -> searchResults -> listResults``
    in that document.

    Args:
        soup_list: Parsed pages; each item must offer a BeautifulSoup-style
            ``find(name, attrs)`` method.

    Returns:
        A list with one entry per input page, in input order. Each entry is
        the list of listing objects found on that page, or an empty list when
        the script tag is missing, its JSON cannot be decoded, or the expected
        keys are absent.
    """
    apartment_info = []

    def get_API_request_tag(soup: object) -> object:
        """Return the JSON script tag holding the search data, or None if absent."""
        return soup.find(
            "script",
            {
                "data-zrr-shared-data-key": "mobileSearchPageStore",
                "type": "application/json",
            },
        )

    def parse_API_request_tag(API_request_tag: object) -> dict:
        """Decode the script tag's JSON payload into a dictionary.

        Returns an empty dictionary when the tag is missing, the payload is
        not valid JSON, or the decoded document is not a JSON object.
        """
        if API_request_tag is None:
            return {}
        # The payload may be wrapped in an HTML comment; strip the markers
        # before decoding.
        json_API_request_tag = API_request_tag.text.replace(
            "<!--", ""
        ).replace("-->", "")
        try:
            API_request_dict = json.loads(json_API_request_tag)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError subclass
            return {}
        if not isinstance(API_request_dict, dict):
            return {}
        return API_request_dict

    def get_listing_results(API_request_dict: dict) -> list:
        """Return the ``listResults`` entries from the decoded page data.

        Follows ``cat1 -> searchResults -> listResults`` and returns an empty
        list as soon as any level is missing or has an unexpected type.
        """
        node = API_request_dict
        for key in ("cat1", "searchResults", "listResults"):
            if not isinstance(node, dict):
                return []
            node = node.get(key)
        if not isinstance(node, list):
            return []
        # copy so callers never alias the decoded document
        return list(node)

    # collect the listing results of every page, keeping one entry per page
    for soup in soup_list:
        API_request_tag = get_API_request_tag(soup)
        API_request_dict = parse_API_request_tag(API_request_tag)
        listing_results = get_listing_results(API_request_dict)
        apartment_info.append(listing_results)
    return apartment_info


# Extract the listing results from every parsed page (one entry per page).
apartment_info = parse_search_pages(soup_list)
# Serialize the per-page results to a JSON string so they can be written to disk.
apartment_info_json = json.dumps(apartment_info)

In [14]:
import sys

# Show the interpreter's module search path as it is before the append below.
print(sys.path)
# Add the project's data directory to the module search path.
# NOTE(review): this is an absolute, machine-specific path, and nothing visible
# in this notebook imports from it — confirm the append is still needed.
sys.path.append(
    "C:/python-projects/zillow_webscraper/issues/zillow-webscraper/data"
)

['c:\\python-projects\\zillow_webscraper\\issues\\zillow-webscraper\\notebooks', 'C:\\Users\\work\\AppData\\Local\\Programs\\Python\\Python310\\python310.zip', 'C:\\Users\\work\\AppData\\Local\\Programs\\Python\\Python310\\DLLs', 'C:\\Users\\work\\AppData\\Local\\Programs\\Python\\Python310\\lib', 'C:\\Users\\work\\AppData\\Local\\Programs\\Python\\Python310', 'c:\\pythonvenv\\web_scraping', '', 'c:\\pythonvenv\\web_scraping\\lib\\site-packages', 'c:\\pythonvenv\\web_scraping\\lib\\site-packages\\win32', 'c:\\pythonvenv\\web_scraping\\lib\\site-packages\\win32\\lib', 'c:\\pythonvenv\\web_scraping\\lib\\site-packages\\Pythonwin']


In [16]:
# Save the serialized search results as raw data; "w" mode replaces any
# previous contents of the file.
raw_output_path = "C:/python-projects/zillow_webscraper/issues/zillow-webscraper/data/raw_v2/raw_search_info_2"
with open(raw_output_path, mode="w") as raw_output_file:
    raw_output_file.write(apartment_info_json)