In [36]:
# Extracted from https://www.zillow.com/research/data/

In [9]:
import sys
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install pyarrow
# !{sys.executable} -m pip install fastparquet

Collecting pyarrow
  Using cached https://files.pythonhosted.org/packages/0b/82/7f70296eb5167bc3bcee96a1460315af109affd7fef43e750c8ee4aac17c/pyarrow-12.0.1-cp37-cp37m-win_amd64.whl
Installing collected packages: pyarrow
Successfully installed pyarrow-12.0.1


You are using pip version 19.0.3, however version 24.0 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Collecting fastparquet

You are using pip version 19.0.3, however version 24.0 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.



  Downloading https://files.pythonhosted.org/packages/21/13/caf9b6d339c0eac11575a3c8f3524ea429417e205b7df1555276928cafa8/fastparquet-0.8.1-cp37-cp37m-win_amd64.whl (611kB)
Collecting cramjam>=2.3.0 (from fastparquet)
  Downloading https://files.pythonhosted.org/packages/31/6c/d9ca9f9ddf13258c2c4edcc244cd3efb0ffe87355cbf8250218911b362f3/cramjam-2.8.3-cp37-none-win_amd64.whl (1.6MB)
Collecting fsspec (from fastparquet)
  Downloading https://files.pythonhosted.org/packages/bd/64/f0d369ede0ca54fdd520bdee5086dbaf0af81dac53a2ce847bd1ec6e0bf1/fsspec-2023.1.0-py3-none-any.whl (143kB)
Installing collected packages: cramjam, fsspec, fastparquet
Successfully installed cramjam-2.8.3 fastparquet-0.8.1 fsspec-2023.1.0


In [7]:
import pandas as pd
import os

## Extract ZHVI from Zillow

Zillow Home Value Index (ZHVI): A measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range. Available as a smoothed, seasonally adjusted measure and as a raw measure.

Zillow publishes top-tier ZHVI (typical value for homes within the 65th to 95th percentile range for a given region) and bottom-tier ZHVI (typical value for homes within the 5th to 35th percentile range for a given region).

Zillow also publishes ZHVI for all single-family residences (\$, typical value for all single-family homes in a given region), for condo/co-ops (\$), for all homes with 1, 2, 3, 4 and 5+ bedrooms (\$), and the ZHVI per square foot (\$, typical value of all homes per square foot, calculated by taking the estimated home value for each home in a given region and dividing it by the home’s square footage).

In [33]:
def extract_zillow_data(bedrooms=4, force=False, cache=True):
    """Load Zillow ZIP-level ZHVI data for one bedroom count in long format.

    The raw table is read from a local parquet copy under ``../data/raw`` when
    one exists and may be reused; otherwise the CSV is downloaded from
    Zillow's public research files.

    Parameters
    ----------
    bedrooms : int, default 4
        Bedroom count; interpolated into the download URL and the cache
        file name (``zillow_zhvi_{bedrooms}br``).
    force : bool, default False
        When True, ignore any cached raw file and download the CSV again.
    cache : bool, default True
        When True, reuse/write the raw parquet cache and also write the
        melted frame to ``../data/processed``. When False, nothing is read
        from or written to disk.

    Returns
    -------
    pandas.DataFrame
        The melted frame with columns ``zip_code``, ``state_name``, ``state``,
        ``city``, ``metro``, ``county_name``, ``date``, ``zhvi`` and
        ``bedrooms`` (one row per input row and value column).
    """
    cache_filename = f'zillow_zhvi_{bedrooms}br'
    raw_path = f'../data/raw/{cache_filename}.parquet'
    processed_path = f'../data/processed/{cache_filename}.parquet'

    # Short-circuit `and` so the filesystem is only touched when caching is
    # enabled and a refresh was not requested. (Previously `force` was reset
    # to False inside the function, so force=True never re-downloaded.)
    if cache and not force and os.path.isfile(raw_path):
        zillow = pd.read_parquet(raw_path)
    else:
        print('UPDATING ZILLOW DATA')
        zillow_url = f'https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_bdrmcnt_{bedrooms}_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv?t=1676324507'
        # NOTE(review): RegionName is parsed with pandas' default dtype
        # inference; the notebook preview shows ZIP 08701 as 8701, so leading
        # zeros appear to be lost. dtype={'RegionName': str} would keep them
        # but changes the returned dtype -- confirm with downstream users.
        zillow = pd.read_csv(zillow_url)
        if cache:
            # Create the target directory on first run instead of failing.
            os.makedirs(os.path.dirname(raw_path), exist_ok=True)
            zillow.to_parquet(raw_path, index=False)

    drop_cols = ['RegionID', 'SizeRank', 'RegionType']

    # Wide -> long: every column that is not an identifier becomes a
    # (date, zhvi) pair.
    zillow_melted = zillow.drop(drop_cols, axis=1).melt(
        id_vars=['RegionName', 'StateName', 'State', 'City', 'Metro', 'CountyName'],
        var_name='date',
        value_name='zhvi'
    ).rename(columns={'RegionName': 'zip_code', 'StateName': 'state_name', 'CountyName': 'county_name'})

    zillow_melted.columns = [col.lower() for col in zillow_melted.columns]
    if cache:
        os.makedirs(os.path.dirname(processed_path), exist_ok=True)
        # NOTE(review): the processed file is written before the 'bedrooms'
        # column is added, so the file on disk does not contain it (the
        # bedroom count is only encoded in the file name). Kept as-is to
        # avoid changing the on-disk schema.
        zillow_melted.to_parquet(processed_path, index=False)
    zillow_melted['bedrooms'] = bedrooms
    return zillow_melted

In [34]:
# Build one long-format ZHVI frame per bedroom count (1 through 5), in order,
# so that datas[i] holds the frame for i + 1 bedrooms.
datas = []
for br in (1, 2, 3, 4, 5):
    print(f'extracting/cacheing zillow data for {br} bedroom units')
    zhvi_for_br = extract_zillow_data(br)
    datas += [zhvi_for_br]

extracting/cacheing zillow data for 1 bedroom units
extracting/cacheing zillow data for 2 bedroom units
extracting/cacheing zillow data for 3 bedroom units
extracting/cacheing zillow data for 4 bedroom units
extracting/cacheing zillow data for 5 bedroom units


In [35]:
# Preview the first extracted frame (the one requested with bedrooms=1).
datas[0]

Unnamed: 0,zip_code,state_name,state,city,metro,county_name,date,zhvi,bedrooms
0,8701,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,2000-01-31,46609.017382,1
1,11368,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,2000-01-31,,1
2,77084,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,2000-01-31,,1
3,11385,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,2000-01-31,,1
4,90011,CA,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,2000-01-31,71377.658988,1
...,...,...,...,...,...,...,...,...,...
1337833,80860,CO,CO,Cripple Creek,"Colorado Springs, CO",Teller County,2024-05-31,154948.612889,1
1337834,98940,WA,WA,Ronald,"Ellensburg, WA",Kittitas County,2024-05-31,478408.805793,1
1337835,33854,FL,FL,Lake Wales,"Lakeland-Winter Haven, FL",Polk County,2024-05-31,94216.462232,1
1337836,10004,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",New York County,2024-05-31,872415.467846,1
