Load the URL list.

In [1]:
import json

# Read the saved list of building URL extensions from disk as raw JSON text.
with open('C:/Projects/Housing_Price_Prediction/data_processing/bld_url_exts.json', 'r') as url_file:
    raw_url_ext_list = url_file.read()

# FIXME: the list contains repeated URLs (probably caused by the problems in
# the rent interval). Until that is fixed, only the first 540 entries are kept.
url_ext_list = json.loads(raw_url_ext_list)[:540]

Gets the building key from each URL extension and stores it, together with the URL extension, in a new `request_params_list`.

In [2]:
# Build the (url_ext, building_key) pairs used for the requests made later on.
#
# The building key is read from the second-to-last '/'-separated segment of
# each URL extension (i.e. the last segment when the URL ends with '/'):
#   * segment without '-'  -> the whole segment is the building key
#   * segment with '-'     -> the part after the final '-' is the building key,
#                             unless that part contains '_'
#
# URL extensions for which no key can be derived are collected in
# `skipped_url_exts`. Previously such a URL was paired with the key left over
# from the previous iteration (or raised NameError on the first iteration).
request_params_list = []
skipped_url_exts = []
for url_ext in url_ext_list:
    url_segments = url_ext.split('/')
    if len(url_segments) < 2:
        # there is no second-to-last segment to read a key from
        skipped_url_exts.append(url_ext)
        continue

    # get the last url extension
    last_ext = url_segments[-2]

    # reset on every iteration so a key can never leak in from an earlier url
    building_key = None

    # if it's not already the building key, get the building key from the last url extension
    split_last_ext_list = last_ext.split('-')
    if len(split_last_ext_list) == 1:
        building_key = split_last_ext_list[0]
    elif "_" not in split_last_ext_list[-1]:
        building_key = split_last_ext_list[-1]

    if building_key is None:
        skipped_url_exts.append(url_ext)
        continue

    # append the url and the building key to the param list
    request_params_list.append((url_ext, building_key))

### Makes an HTTP request for each building's page to extract information from Zillow's backend server, and stores each successful response in `response_list`

In [3]:
import json
import time

import requests
from fake_useragent import UserAgent

# 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36'
# make a fake_useragent object
ua = UserAgent()

# number of requests that raised a network error or returned a non-200 status
failed_request_count = 0
# pause (in seconds) between consecutive requests; despite its name this is
# the value passed to time.sleep, not the HTTP timeout
timeout = 0.1
# maximum time (in seconds) to wait for a single request; without an explicit
# timeout requests.post can block indefinitely on a stalled connection
request_timeout = 30
# make a request for each listing and store the response in the response_list
response_list = []


# NOTE(review): only the first five entries are requested -- this looks like a
# limit left over from debugging; confirm before a full run.
for (url_ext, building_key) in request_params_list[0:5]:

    # the `headers` parameter includes information about your browser's current status.  important for preventing captcha prompts from zillow
    headers = {
        'authority': 'www.zillow.com',
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'client-id': 'vertical-living',
        'content-type': 'text/plain',
        'dnt': '1',
        'origin': 'https://www.zillow.com',
        'referer': 'https://www.zillow.com' + url_ext,
        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': ua.random,
    }

    # Serialise the query with json.dumps so the building key is always
    # escaped correctly; the compact separators reproduce the exact layout of
    # the previously hand-built string.
    payload = {
        'operationName': 'BuildingQuery',
        'variables': {
            'buildingKey': building_key,
            'cache': False,
            'latitude': None,
            'longitude': None,
            'lotId': None,
            'update': True,
        },
        'extensions': {
            'persistedQuery': {
                'version': 1,
                'sha256Hash': 'a6b5cd248233c9b3da074a0c6daacf6e6388a9b3a7de3347043b2e1b41d095b6',
            },
        },
    }
    data = json.dumps(payload, separators=(',', ':'))

    try:
        response = requests.post(
            'https://www.zillow.com/graphql/',
            headers=headers,
            data=data,
            timeout=request_timeout,
        )
    except requests.RequestException:
        # a connection error or timeout counts as a failed request and must
        # not abort the loop (the responses collected so far would be lost)
        failed_request_count += 1
    else:
        # append the response if the response went through
        if response.status_code == 200:
            response_list.append(response)
        else:
            failed_request_count += 1

    # wait before the next request
    time.sleep(timeout)

### Iterates through each response in `response_list` and performs a few steps:
1. Parses it into a Python dict
2. Extracts the `building` key from the dict
3. Appends it to the `bld_info_list`
4. Removes all the useless keys from each dict in the `bld_info_list`

The object from the response represents a Python dict (or more precisely a dict of dicts), and the relevant building information is stored under its `building` key.  This block iterates through each response and extracts that `building` dictionary to parse later.


In [10]:
import json

bld_info_list = []
# responses whose body could not be parsed as JSON
json_parse_error_count = 0
# responses that parsed as JSON but carried no usable `data` -> `building` dict
missing_building_count = 0
for response in response_list:

    # 1. parse the response into a python dict, if it fails to parse the response, increase the error count
    try:
        raw_bld_info_dict = json.loads(response.text)
    except json.JSONDecodeError:
        json_parse_error_count += 1
        continue

    # 2. extract the building key
    # Every level is checked before it is used: a body without `data` or
    # `building` (or with either set to null) is counted and skipped instead of
    # re-appending the previous response's building or raising an exception.
    data_section = raw_bld_info_dict.get('data') if isinstance(raw_bld_info_dict, dict) else None
    bld_info_dict = data_section.get('building') if isinstance(data_section, dict) else None
    if not isinstance(bld_info_dict, dict):
        missing_building_count += 1
        continue

    # 3. append the listing information to the listing information list
    bld_info_list.append(bld_info_dict)

# 4. removes the useless keys
useless_keys = ['regionIds', 'adTargets', 'streetAddress', '__typename', 'breadcrumbs', 'streetViewTileImageUrlLocationModuleLatLong', 'streetViewTileImageUrlLocationModuleAddress', 'streetViewMetadataUrlMediaWallLatLong', 'streetViewMetadataUrlMediaWallAddress', 'mapTileGoogleMapUrlLocationModule', 'mapTileGoogleMapUrlFullWidthMax', 'streetViewTileImageUrlHalfWidthLatLong', 'streetViewTileImageUrlHalfWidthAddress', 'isWaitlisted', 'isInstantTourEnabled', 'isInstantTourCancellable', 'bestGuessTimezone', 'rentalInstantTour', 'amenitiesVRModels', 'galleryPhotos', 'galleryAmenityPhotos', 'fullAddress', 'bdpUrl', 'zpid', 'buildingPhoneNumber', 'county', 'ungroupedUnits', 'nearbyCities', 'nearbyNeighborhoods', 'country', 'nearbyZipcodes', 'nearbyBuildingLinks', 'comps', 'ppcLink', 'reviewsInfo', 'housingConnector', 'localProtections', 'buildingRentalPremiumPackagesInfo', 'homeInsights', 'bestMatchedUnit', 'photos', 'amenityPhotos', 'staticMap', 'staticMapSatellite', 'streetViewLatLong', 'streetViewAddress', 'thirdPartyVirtualTours', 'currency', 'specialOffers', 'listingMetadata', 'vaLoanStatus', 'engrain', 'homeTypes', 'providerInfoList', 'unitsVRModels', 'city', 'state', 'contactInfo']

for bld_info_dict in bld_info_list:
    for key in useless_keys:
        # pop with a default removes the key when present and is a no-op otherwise
        bld_info_dict.pop(key, None)

# dump the contents into a json file
with open('C:/Projects/Housing_Price_Prediction/data_processing/raw_bld_info.json', 'w') as f:
    json.dump(bld_info_list, f)