# Permit and Price Cleaning, authored by Noah Tamminga (ntamm@umich.edu).

For a final preparation step, we take the permit data and price data, apply some final cleaning steps, and create indexes so that the two datasets properly align with each other for analysis.

In [1]:
import numpy as np
import pandas as pd

In [2]:
#Load the imputed permit records (gzip-compressed parquet) and preview the first rows

permits = pd.read_parquet(path='data/imputed_permits_final.parquet.gzip')
permits.head(5)

measure,survey_date,region_code,division_code,county_name,num_units,bldgs,units,value,fips
0,200001,3,6,Autauga County,1_unit,13,13,690525,1001
1,200001,3,6,Autauga County,2_unit,0,0,0,1001
2,200001,3,6,Autauga County,3_4_unit,0,0,0,1001
3,200001,3,6,Autauga County,5_plus_unit,0,0,0,1001
4,200001,3,6,Lee County,1_unit,28,28,3392260,1081


To clean the permit file, we need to convert the current date record to a datetime value to align with the price dataset. Furthermore, since we won't be using the num_units field in this analysis, we need to group by a subset of columns to get a new aggregate for our subset of interest (all records down to the county-month level). Finally, we rename some columns and apply the index for joining.

In [3]:
#Convert survey_date from yyyymm to a timestamp for the join to the price dataset.
#Also force fips to a zero-padded 5-character string: the price dataset builds its
#FIPS key the same way, and the join only matches when both sides use the same
#representation (this is a no-op if fips is already stored as a padded string).
final_permits = permits.assign(
    survey_date=pd.to_datetime(permits['survey_date'], format='%Y%m'),
    fips=permits['fips'].astype(str).str.zfill(5),
)

#Group permits by all columns except num_units and sum buildings, units, and value,
#giving one row per county and month
final_permits = (
    final_permits
    .groupby(['survey_date', 'fips', 'county_name', 'region_code', 'division_code'])
    .agg({'bldgs': 'sum', 'units': 'sum', 'value': 'sum'})
    .reset_index()
)

#Rename survey date and fips so the key names match the price dataset,
#then index on (DATE, FIPS) for quick joining
final_permits = (
    final_permits
    .rename(columns={'survey_date': 'DATE', 'fips': 'FIPS'})
    .set_index(['DATE', 'FIPS'])
)

final_permits

Unnamed: 0_level_0,measure,county_name,region_code,division_code,bldgs,units,value
DATE,FIPS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,01001,Autauga County,3,6,13,13,690525
2000-01-01,01081,Lee County,3,6,32,63,5558536
2000-01-01,01113,Russell County,3,6,3,4,343000
2000-01-01,01125,Tuscaloosa County,3,6,56,60,5353849
2000-01-01,02013,Aleutians East Borough,4,9,0,0,0
...,...,...,...,...,...,...,...
2025-03-01,56037,Sweetwater County ...,4,8,3,3,1346359
2025-03-01,56039,Teton County ...,4,8,7,11,10894087
2025-03-01,56041,Uinta County ...,4,8,6,6,1033377
2025-03-01,56043,Washakie County ...,4,8,0,0,0


In [4]:
#Inspect the index range, column dtypes, and non-null counts of the cleaned permit data
final_permits.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 329614 entries, (Timestamp('2000-01-01 00:00:00'), '01001') to (Timestamp('2025-03-01 00:00:00'), '56045')
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   county_name    329614 non-null  object
 1   region_code    329614 non-null  int64 
 2   division_code  329614 non-null  int64 
 3   bldgs          329614 non-null  int64 
 4   units          329614 non-null  int64 
 5   value          329614 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 16.4+ MB


In [5]:
#Export of the cleaned permits is disabled; uncomment the call below to write the parquet file
# final_permits.to_parquet('data/permits_final.parquet.gzip',
#               compression='gzip')

In [6]:
#Load the county-level home price aggregates used for the correlation and autocorr analysis

price = pd.read_parquet(path='data/home_price_agg_by_county.parquet.gzip')
price.head(5)

Unnamed: 0,FIPS_CODE,REGION,PROPERTY_TYPE,PERIOD_BEGIN,INVENTORY,HOMES_SOLD,MEDIAN_SALE_PRICE_interpolated,MEDIAN_SALE_PRICE_interpolated_inflation_adj_2012-01,MEDIAN_LIST_PRICE_interpolated,MEDIAN_LIST_PRICE_interpolated_inflation_adj_2012-01,...,MEDIAN_SALE_PRICE_no_outliers,MEDIAN_SALE_PRICE_no_outliers_inflation_adj_2012-01,MEDIAN_LIST_PRICE_no_outliers,MEDIAN_LIST_PRICE_no_outliers_inflation_adj_2012-01,CPI,original_record,MEDIAN_SALE_PRICE_is_interpolated,MEDIAN_LIST_PRICE_is_interpolated,MEDIAN_SALE_PRICE_is_outlier,MEDIAN_LIST_PRICE_is_outlier
229742,28137,"Tate County, MS",All Residential,2023-11-01,53,8,284000,384024,264900,358197,...,284000.0,384024.0,264900.0,358197.0,308.087,True,False,False,False,False
473092,54045,"Logan County, WV",All Residential,2023-01-01,46,9,72500,95606,70000,92309,...,72500.0,95606.0,70000.0,92309.0,300.456,True,False,False,False,False
315697,38053,"McKenzie County, ND",All Residential,2013-06-01,23,0,250000,255051,156600,159764,...,,,,,232.445,False,True,True,False,False
119154,18131,"Pulaski County, IN",All Residential,2021-07-01,33,17,145000,173080,174900,208770,...,145000.0,173080.0,174900.0,208770.0,271.965,True,False,False,False,False
382279,47025,"Claiborne County, TN",All Residential,2015-04-01,184,19,118500,122858,164900,170965,...,118500.0,122858.0,164900.0,170965.0,236.222,True,False,False,False,False


Similar to the permit dataset, we need to perform some slight modifications to the price dataset. These modifications include setting our datetime value, cleaning up some column names, selecting our subset of fields we want to analyze, and creating our DATE plus FIPS index.

In [7]:
#Build the price table as one chain of new frames. The earlier version assigned the
#FIPS column onto a column-subset slice, which triggers SettingWithCopyWarning in
#pandas versions without copy-on-write; .assign always works on a fresh copy.
price_for_permits = (
    price
    #Parse the period start into a timestamp to use as the DATE key
    .assign(DATE=pd.to_datetime(price['PERIOD_BEGIN']))
    #Shorten the verbose interpolated price column names
    .rename(columns={
        'FIPS_CODE': 'FIPS',
        'MEDIAN_SALE_PRICE_interpolated': 'SALE_PRICE',
        'MEDIAN_SALE_PRICE_interpolated_inflation_adj_2012-01': 'SALE_PRICE_ADJ',
        'MEDIAN_LIST_PRICE_interpolated': 'LIST_PRICE',
        'MEDIAN_LIST_PRICE_interpolated_inflation_adj_2012-01': 'LIST_PRICE_ADJ'
    })
    #Only need subset of price: fips, date, inventory, homes sold, and the interpolated fields
    .loc[:, ['FIPS', 'DATE', 'INVENTORY', 'HOMES_SOLD', 'SALE_PRICE', 'SALE_PRICE_ADJ', 'LIST_PRICE', 'LIST_PRICE_ADJ']]
    #Left-pad fips with zeros up to the 5 digit code so it is a fixed-width string key
    .assign(FIPS=lambda frame: frame['FIPS'].astype(str).str.zfill(5))
    #Index on date and fips for joining
    .set_index(['DATE', 'FIPS'])
)

In [8]:
#Inspect the index range, column dtypes, and non-null counts of the cleaned price data
price_for_permits.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10000 entries, (Timestamp('2023-11-01 00:00:00'), '28137') to (Timestamp('2020-01-01 00:00:00'), '19019')
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   INVENTORY       9887 non-null   Int64
 1   HOMES_SOLD      10000 non-null  Int64
 2   SALE_PRICE      10000 non-null  Int64
 3   SALE_PRICE_ADJ  10000 non-null  Int64
 4   LIST_PRICE      9919 non-null   Int64
 5   LIST_PRICE_ADJ  9919 non-null   Int64
dtypes: Int64(6)
memory usage: 659.6+ KB


In [9]:
#Flag every row that is missing a value in at least one column, then pull those rows out
has_missing = price_for_permits.isna().any(axis='columns')
na_values = price_for_permits.loc[has_missing]
na_values

Unnamed: 0_level_0,Unnamed: 1_level_0,INVENTORY,HOMES_SOLD,SALE_PRICE,SALE_PRICE_ADJ,LIST_PRICE,LIST_PRICE_ADJ
DATE,FIPS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-01,22063,,90,142350,147002,159900,165126
2022-04-01,46007,1,0,93507,118435,,
2016-12-01,48261,1,0,438000,466442,,
2024-02-01,46121,,0,180000,245714,,
2022-09-01,28103,1,0,84940,110507,,
...,...,...,...,...,...,...,...
2019-06-01,46075,,0,103186,115582,,
2020-09-01,29013,,20,126000,143782,166250,189713
2018-11-01,46137,,0,111619,123745,,
2014-04-01,20107,,10,75000,77839,81000,84067


In [10]:
#Export of the cleaned prices is disabled; uncomment the call below to write the parquet file
# price_for_permits.to_parquet('data/price_final.parquet.gzip',
#               compression='gzip')