In [1]:
# Extracted from https://www.zillow.com/research/data/

In [2]:
import sys
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install pyarrow
# !{sys.executable} -m pip install fastparquet

In [3]:
import pandas as pd
import os

## Extract ZHVI from Zillow

Zillow Home Value Index (ZHVI): A measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range. Available as a smoothed, seasonally adjusted measure and as a raw measure.

Zillow publishes top-tier ZHVI (typical value for homes within the 65th to 95th percentile range for a given region) and bottom-tier ZHVI (typical value for homes within the 5th to 35th percentile range for a given region).

Zillow also publishes ZHVI for all single-family residences ($, typical value for all single-family homes in a given region), for condo/coops ($), for all homes with 1, 2, 3, 4 and 5+ bedrooms ($), and the ZHVI per square foot ($, typical value of all homes per square foot calculated by taking the estimated home value for each home in a given region and dividing it by the home’s square footage).

In [4]:
def extract_zillow_data(bedrooms=4, force=False, cache=True):
    """Return Zillow's ZIP-level ZHVI table for one bedroom count in long format.

    The wide monthly table is read from the local raw parquet cache when
    allowed and present; otherwise it is downloaded from Zillow's public CSV
    endpoint. The table is then melted so that every row holds a single
    (ZIP code, month) observation.

    Parameters
    ----------
    bedrooms : int, default 4
        Bedroom count. Interpolated into the download URL and the cache file
        name, and written to the ``bedrooms`` column of the result.
    force : bool, default False
        When True, ignore an existing raw cache file and download again.
    cache : bool, default True
        When True, the raw cache may be read, and both the raw download and
        the melted table are written under ``../data/``. When False no parquet
        file is read or written.

    Returns
    -------
    pandas.DataFrame
        Columns ``zip_code, state_name, state, city, metro, county_name,
        date, zhvi, bedrooms``; ``date`` holds the original month column
        headers and ``zhvi`` the corresponding cell values.
    """
    cache_filename = f'zillow_zhvi_{bedrooms}br'
    raw_path = f'../data/raw/{cache_filename}.parquet'
    processed_path = f'../data/processed/{cache_filename}.parquet'

    # Short-circuit boolean logic: the filesystem is only probed when the cache
    # is enabled and the caller has not asked for a forced refresh.
    if cache and not force and os.path.isfile(raw_path):
        zillow = pd.read_parquet(raw_path)
    else:
        print('UPDATING ZILLOW DATA')
        zillow_url = f'https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_bdrmcnt_{bedrooms}_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv?t=1676324507'
        # NOTE(review): RegionName is parsed with an inferred dtype, so ZIP codes
        # that start with zero come out as plain integers (e.g. 8701) — confirm
        # whether downstream consumers need zero-padded strings.
        zillow = pd.read_csv(zillow_url)
        if cache:
            zillow.to_parquet(raw_path, index=False)

    drop_cols = ['RegionID', 'SizeRank', 'RegionType']
    id_cols = ['RegionName', 'StateName', 'State', 'City', 'Metro', 'CountyName']

    # Every column that is neither dropped nor an id column is treated as a
    # month and unpivoted into (date, zhvi) pairs.
    zillow_melted = (
        zillow
        .drop(drop_cols, axis=1)
        .melt(id_vars=id_cols, var_name='date', value_name='zhvi')
        .rename(columns={'RegionName': 'zip_code', 'StateName': 'state_name', 'CountyName': 'county_name'})
        .rename(columns=str.lower)
    )

    if cache:
        zillow_melted.to_parquet(processed_path, index=False)

    # Added after the processed file is written, so that file does not carry
    # this column; its file name encodes the bedroom count instead.
    zillow_melted['bedrooms'] = bedrooms
    return zillow_melted

In [5]:
def _fetch_bedroom_frame(br):
    """Announce the extraction for `br` bedrooms and return its long-format frame."""
    print(f'extracting/cacheing zillow data for {br} bedroom units')
    return extract_zillow_data(br)


# One frame per bedroom count, 1 through 5, kept in ascending order.
datas = [_fetch_bedroom_frame(br) for br in range(1, 6)]

extracting/cacheing zillow data for 1 bedroom units
extracting/cacheing zillow data for 2 bedroom units
extracting/cacheing zillow data for 3 bedroom units
extracting/cacheing zillow data for 4 bedroom units
extracting/cacheing zillow data for 5 bedroom units


In [6]:
# Stack the per-bedroom frames row-wise into one long table. Each frame keeps
# its own row labels, so labels repeat across bedroom counts.
df = pd.concat(datas, axis=0, ignore_index=False)
df

Unnamed: 0,zip_code,state_name,state,city,metro,county_name,date,zhvi,bedrooms
0,8701,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,2000-01-31,4.660902e+04,1
1,11368,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,2000-01-31,,1
2,77084,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,2000-01-31,,1
3,11385,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,2000-01-31,,1
4,90011,CA,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,2000-01-31,7.137766e+04,1
...,...,...,...,...,...,...,...,...,...
2498113,96141,CA,CA,Homewood,"Sacramento-Roseville-Folsom, CA",Placer County,2024-05-31,2.286039e+06,5
2498114,33924,FL,FL,,"Cape Coral-Fort Myers, FL",Lee County,2024-05-31,5.064301e+06,5
2498115,32461,FL,FL,Panama City Beach,"Crestview-Fort Walton Beach-Destin, FL",Walton County,2024-05-31,4.117191e+06,5
2498116,27972,NC,NC,Salvo,"Kill Devil Hills, NC",Dare County,2024-05-31,1.116508e+06,5


In [14]:
# Persist the combined all-bedroom table as parquet, leaving the row index out of the file.
df.to_parquet('../data/processed/zillow_all_data.parquet', index=False)

In [10]:
# Collapse to one row per (zip_code, bedrooms), taking the last entry of every other column
# in the frame's current row order.
# NOTE(review): GroupBy.last skips missing values column by column by default, so `zhvi`
# can come from an earlier row than the one supplying `date` when the newest zhvi is
# missing — confirm this is the intended "current snapshot" semantics.
df_zip_br = df.groupby(['zip_code','bedrooms']).last()
df_zip_br

In [13]:
# Export the snapshot as CSV; the (zip_code, bedrooms) group keys form the index and are
# written as the leading columns.
df_zip_br.to_csv('../data/processed/zillow_current_snapshot.csv')

In [9]:
# Number of distinct ZIP codes that have a row for the 2024-05-31 month.
# dropna=False keeps a missing ZIP code countable as its own value.
df.loc[df['date'] == '2024-05-31', 'zip_code'].nunique(dropna=False)

20707

In [9]:
# Number of distinct (city, state) pairs in the first frame of `datas`.
datas[0].drop_duplicates(subset=['city', 'state']).shape[0]

2712