**Predicting US Housing Prices at the Zip Code Level Using Google's Population Dynamics Foundation Model and Zillow Data**

## Useful Resources

- [Google's Population Dynamics Foundation Model (PDFM)](https://github.com/google-research/population-dynamics)
- Request access to PDFM embeddings [here](https://github.com/google-research/population-dynamics?tab=readme-ov-file#getting-access-to-the-embeddings)
- Zillow data can be accessed [here](https://www.zillow.com/research/data/)


[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb)

In [None]:
# Uncomment the next line to install leafmap and scikit-learn if they are not
# already available in this kernel's environment (for example on Colab).
# %pip install leafmap scikit-learn

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file

In [None]:
# Remote copy of the Zillow Home Value Index (ZHVI) table, keyed by ZIP code.
zhvi_url = (
    "https://github.com/opengeos/datasets/releases/download/us/"
    "zillow_home_value_index_by_zipcode.csv"
)
# Local path where the CSV is stored after the first download.
zhvi_file = "data/zillow_home_value_index_by_zipcode.csv"

In [None]:
# Fetch the ZHVI table only once; later runs reuse the local copy.
zhvi_already_cached = os.path.exists(zhvi_file)
if not zhvi_already_cached:
    download_file(zhvi_url, zhvi_file)

In [15]:
# Read ZIP codes as strings so pandas never parses them as integers.
zhvi_df = pd.read_csv(zhvi_file, dtype={'RegionName': 'string'})
# Left-pad every code to five digits before building the "zip/<code>" key.
# If the CSV stores a code without its leading zero (e.g. "8701" for 08701),
# the unpadded key would not match a five-digit place ID and the row would be
# silently dropped by a later inner join. Codes that are already five digits
# long are left unchanged by zfill.
zip_codes = zhvi_df['RegionName'].str.zfill(5)
zhvi_df.index = zip_codes.apply(lambda code: f'zip/{code}')
zhvi_df.head()

Unnamed: 0_level_0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31,...,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30,2024-07-31,2024-08-31,2024-09-30,2024-10-31
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
zip/77494,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,209044.032747,...,486299.822703,488450.034543,491246.367234,493753.57531,495060.153069,495365.132391,494987.724791,495393.290898,496081.013222,497199.835159
zip/08701,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,138154.018163,...,588475.418739,591048.706991,596177.342835,604530.120559,611916.238322,617370.077059,622270.010994,627535.387875,633736.078245,639283.596573
zip/77449,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,102286.114622,...,275170.813362,275862.593787,276945.296053,277892.635056,278419.521895,278380.305608,278149.011828,277969.708313,277784.000058,277372.922029
zip/11368,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,148936.874942,...,450240.714587,448627.19888,451705.600698,456243.079221,460893.462389,461600.365198,461088.306901,460451.755953,461417.194067,461167.663426
zip/77084,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,101363.959236,...,269625.368017,270145.782158,271136.681166,272118.147175,272615.183491,272450.11478,272160.688023,271890.755361,271640.385722,271138.57461


In [None]:
embeddings_file_path = 'data/zcta_embeddings.csv'
# This notebook does not download the PDFM embeddings, so a missing file is
# the most likely failure on a fresh run. Fail early with instructions rather
# than a bare read error; the exception type is the same one read_csv raises.
if not os.path.exists(embeddings_file_path):
    raise FileNotFoundError(
        f"PDFM embeddings not found at '{embeddings_file_path}'. Request access at "
        "https://github.com/google-research/population-dynamics?tab=readme-ov-file#getting-access-to-the-embeddings "
        "and save the ZIP-code (ZCTA) embeddings CSV to that path."
    )
# Index the embeddings by their 'place' column so they can be joined on it.
zipcode_embeddings = pd.read_csv(embeddings_file_path).set_index('place')
zipcode_embeddings.head()

Unnamed: 0_level_0,state,county,city,population,latitude,longitude,feature0,feature1,feature2,feature3,...,feature320,feature321,feature322,feature323,feature324,feature325,feature326,feature327,feature328,feature329
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
zip/97910,OR,Malheur County,Jordan Valley,609,42.749076,-117.511459,-0.138227,1.120377,0.0729,0.297442,...,-0.158891,-0.168708,1.231994,-0.155765,3.043214,-0.169749,0.177463,-0.001661,-0.00101,4.495589
zip/89412,NV,Washoe County,Gerlach,98,41.102934,-119.695361,-0.141379,1.422782,0.234269,0.159156,...,-0.157417,-0.043606,2.788701,-0.062547,3.700745,-0.169827,-0.13799,-0.024385,-0.000295,3.399393
zip/88030,NM,Luna County,Deming,24139,32.191634,-107.729431,-0.046666,1.414424,0.146803,1.113256,...,-0.000654,0.437475,4.229295,0.229199,2.098469,1.150497,0.716122,-0.116499,-0.051163,3.866543
zip/82633,WY,Converse County,Douglas,9478,43.02227,-105.41025,-0.090293,1.26628,0.447868,0.781861,...,-0.033771,0.579775,2.688665,0.175669,0.990921,1.644879,0.222517,-0.047864,-4.2e-05,7.453567
zip/59538,MT,Phillips County,Malta,2936,48.112019,-107.84552,-0.092886,1.256203,-0.050897,0.321954,...,-0.169915,-0.088829,0.338914,-0.102962,-0.156583,1.493696,2.259007,-0.161916,-0.001087,0.972243


In [6]:
# Keep only the rows whose index value appears in both tables.
data = zhvi_df.join(zipcode_embeddings, how='inner')
# An inner join on mismatched keys silently yields zero rows; stop here with a
# clear message instead of failing later inside model training.
assert not data.empty, (
    "No ZIP codes matched between the ZHVI table and the embeddings; "
    "check that both indexes use the same 'zip/<code>' format."
)
data.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31,...,feature320,feature321,feature322,feature323,feature324,feature325,feature326,feature327,feature328,feature329
zip/77494,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,209044.032747,...,-1.06e-06,-0.000573,-0.0738464,3.741291,6.835684,7.062005,2.521905,1.494393,-0.0,0.663955
zip/08701,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,138154.018163,...,-2.5e-07,-0.009584,-0.00411716,-0.005641,-0.005549,-0.00048563,-2.3e-05,-0.0,-1e-08,-0.151206
zip/77449,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,102286.114622,...,-0.01749664,-0.00399,-0.1646668,4.616399,6.331781,6.382573,2.943955,1.990187,-4e-06,1.199221
zip/11368,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,148936.874942,...,-0.09927426,2.189546,-2.5e-07,6.165315,-0.082513,-1.3e-07,3.876507,-0.03121,-0.00044382,3.084769
zip/77084,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,101363.959236,...,-0.00660355,-0.015875,-0.1620866,4.832126,6.007081,6.24335,1.670585,2.623777,-7e-08,0.925346


In [7]:
# Embedding columns are named feature0 ... feature329 (330 in total).
num_embedding_dims = 330
embedding_features = [f'feature{dim}' for dim in range(num_embedding_dims)]
# Column holding the home value used as the regression target.
label = "2024-10-31"

In [8]:
# Keep only the rows that have a value in the target column.
has_label = data[label].notna()
data = data[has_label]

In [9]:
# Narrow the table to the embedding columns plus the target column.
data = data[[*embedding_features, label]]
X, y = data[embedding_features], data[label]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a plain linear regression on the training split (fit returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and pair each prediction with its actual value.
y_pred = model.predict(X_test)
evaluation_df = pd.DataFrame(dict(y=y_test, y_pred=y_pred))

# Compute the evaluation metrics and print them.
metrics = evaluate_model(evaluation_df)
print(metrics)

{'r2': 0.7848728329326631, 'r': 0.8859619923453613, 'rmse': 148377.85878495293, 'mae': 77043.23202355292, 'mape': 0.26190704308579915}


In [10]:
# Plot actual vs. predicted values with the same 0 to 3,000,000 limits on both axes.
axis_limits = (0, 3_000_000)
plot_actual_vs_predicted(evaluation_df, xlim=axis_limits, ylim=axis_limits)

In [11]:
# Number of neighbours used for each prediction.
k = 5
# Fit a k-nearest-neighbours regressor on the same training split
# (fit returns the estimator).
model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)

# Predict the held-out rows and pair each prediction with its actual value.
y_pred = model.predict(X_test)
evaluation_df = pd.DataFrame(dict(y=y_test, y_pred=y_pred))

# Compute the evaluation metrics and print them.
metrics = evaluate_model(evaluation_df)
print(metrics)

{'r2': 0.7903375176369055, 'r': 0.8940145961357102, 'rmse': 146481.18079760857, 'mae': 65322.90745624193, 'mape': 0.20133938284734368}


In [12]:
# Plot actual vs. predicted values with the same 0 to 3,000,000 limits on both axes.
axis_limits = (0, 3_000_000)
plot_actual_vs_predicted(evaluation_df, xlim=axis_limits, ylim=axis_limits)