In [2]:
import glob

# List every file in the zip_tract crosswalk directory whose name ends in "2020.xlsx":
print(glob.glob("../chosen_data/crosswalk/zip_tract/*2020.xlsx"))

['../chosen_data/crosswalk/zip_tract\\zip_tract_032020.xlsx', '../chosen_data/crosswalk/zip_tract\\zip_tract_062020.xlsx', '../chosen_data/crosswalk/zip_tract\\zip_tract_092020.xlsx', '../chosen_data/crosswalk/zip_tract\\zip_tract_122020.xlsx']


In [54]:
import glob
import pandas as pd
import re
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# TODO: make this modular
# - Place in a class
# - Allow for different file schemes (local, S3?)
# - Maybe load into DuckDB/Spark/Databricks and then process the start and end date from the file path?

## Reads from disk in the expected format <type>_<end_month><year>.xlsx
def load_hud_crosswalk_from_local(local_path, year, dtype=None):
    """Load every HUD crosswalk workbook for one year from a local directory.

    Files are expected to be named ``<type>_<end_month><year>.xlsx`` (for
    example ``zip_tract_032020.xlsx``). Each file is treated as covering the
    three calendar months that end with ``<end_month>``; that window is written
    to the ``start_date`` and ``end_date`` columns as ``YYYY-MM-DD`` strings.

    Args:
        local_path: Directory that contains the ``.xlsx`` files.
        year: Four-digit year (int or str) used both to select the files and
            to build the dates.
        dtype: Optional dtype specification forwarded to ``pandas.read_excel``.
            With the default (``None``) pandas infers the types, which turns
            identifier columns such as ZIP into integers and drops leading
            zeros; pass e.g. ``{"ZIP": str}`` to keep them as text.

    Returns:
        One DataFrame holding the rows of every matching file (files are read
        in sorted path order) with a fresh 0..n-1 index and the two added
        ``start_date`` / ``end_date`` columns.

    Raises:
        ValueError: If no file matches ``*<year>.xlsx``, if a matching file
            name has no two-digit month directly in front of the year, or if
            those two digits are not a valid month.
    """
    # glob returns paths in arbitrary order; sort so the row order is reproducible.
    files_to_load = sorted(glob.glob(f"{local_path}/*{year}.xlsx"))
    if not files_to_load:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate".
        raise ValueError(
            f"No crosswalk files matching '*{year}.xlsx' found in {local_path!r}"
        )

    # Two digits immediately followed by a four-digit year and the extension.
    month_pattern = re.compile(r".*(\d{2})(?=\d{4}\.xlsx$)")

    dfs = []
    for file in files_to_load:
        # Pull the month out of the expected file format
        match = month_pattern.search(file)
        if not match:
            raise ValueError(f"No month found in the file path: {file}")

        # The file name carries the LAST month of the window it covers.
        month = match.group(1)
        end_month_start = datetime.strptime(f"{year}-{month}-01", "%Y-%m-%d")
        start_date = _shift_month_start(end_month_start, -2)
        # Last day of the end month = first day of the following month minus one day.
        end_date = _shift_month_start(end_month_start, 1) - timedelta(days=1)

        # Load the XLSX and stamp every row with the window it is valid for.
        df = pd.read_excel(file, dtype=dtype)
        df["start_date"] = start_date.strftime("%Y-%m-%d")
        df["end_date"] = end_date.strftime("%Y-%m-%d")
        dfs.append(df)

    return pd.concat(dfs, axis=0, ignore_index=True)


def _shift_month_start(first_of_month, months):
    """Return the first day of the month `months` months away (may be negative)."""
    # Work on a zero-based running month count so year roll-over is handled by divmod.
    month_index = first_of_month.year * 12 + (first_of_month.month - 1) + months
    new_year, new_month0 = divmod(month_index, 12)
    return first_of_month.replace(year=new_year, month=new_month0 + 1, day=1)

In [55]:
from IPython.display import display, HTML

# Load the 2020 and 2022 HUD crosswalks of each type and stack the years
# into one frame per crosswalk type.
loaded_zip_tract_df = pd.concat(
    [
        load_hud_crosswalk_from_local("../chosen_data/crosswalk/zip_tract/", crosswalk_year)
        for crosswalk_year in (2020, 2022)
    ],
    axis=0,
    ignore_index=True,
)
loaded_cbsa_zip_df = pd.concat(
    [
        load_hud_crosswalk_from_local("../chosen_data/crosswalk/cbsa_zip/", crosswalk_year)
        for crosswalk_year in (2020, 2022)
    ],
    axis=0,
    ignore_index=True,
)

# Show both combined frames.
display(loaded_zip_tract_df)
display(loaded_cbsa_zip_df)


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,ZIP,TRACT,RES_RATIO,BUS_RATIO,OTH_RATIO,TOT_RATIO,start_date,end_date,USPS_ZIP_PREF_CITY,USPS_ZIP_PREF_STATE
0,501,36103158607,0.000000,1.000000,0.000000,1.000000,2020-01-01,2020-03-31,,
1,601,72001956800,0.014816,0.012563,0.044025,0.015412,2020-01-01,2020-03-31,,
2,601,72113071700,0.161060,0.206030,0.132075,0.163171,2020-01-01,2020-03-31,,
3,601,72001956600,0.172913,0.354271,0.358491,0.189069,2020-01-01,2020-03-31,,
4,601,72001956700,0.651211,0.427136,0.465409,0.632348,2020-01-01,2020-03-31,,
...,...,...,...,...,...,...,...,...,...,...
1377508,99163,53075000500,0.019933,0.001311,0.006263,0.018026,2022-10-01,2022-12-31,PULLMAN,WA
1377509,99163,53075000200,0.205311,0.115334,0.284621,0.207864,2022-10-01,2022-12-31,PULLMAN,WA
1377510,99163,53075001000,0.001415,0.000000,0.000000,0.001239,2022-10-01,2022-12-31,PULLMAN,WA
1377511,78635,48171950100,0.016393,0.000000,0.000000,0.015385,2022-10-01,2022-12-31,HYE,TX


Unnamed: 0,CBSA,ZIP,RES_RATIO,BUS_RATIO,OTH_RATIO,TOT_RATIO,start_date,end_date,USPS_ZIP_PREF_CITY,USPS_ZIP_PREF_STATE
0,10100,57445,0.044115,0.016156,0.037037,0.041101,2020-01-01,2020-03-31,,
1,10100,57456,0.000046,0.000000,0.000000,0.000039,2020-01-01,2020-03-31,,
2,10100,57441,0.014690,0.003401,0.000639,0.012785,2020-01-01,2020-03-31,,
3,10100,57474,0.005792,0.003401,0.001277,0.005295,2020-01-01,2020-03-31,,
4,10100,57401,0.751576,0.788265,0.933589,0.766139,2020-01-01,2020-03-31,,
...,...,...,...,...,...,...,...,...,...,...
379438,99999,30821,0.000056,0.000016,0.000000,0.000052,2022-10-01,2022-12-31,NORWOOD,GA
379439,99999,31036,0.000437,0.000586,0.000162,0.000440,2022-10-01,2022-12-31,HAWKINSVILLE,GA
379440,99999,49706,0.000288,0.000078,0.000141,0.000271,2022-10-01,2022-12-31,ALANSON,MI
379441,99999,49745,0.000066,0.000010,0.000000,0.000060,2022-10-01,2022-12-31,HESSEL,MI


In [62]:
import glob
import pandas as pd
import re
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# TODO: Can we find a Zillow crosswalk that supports multi-year data?
# How much would that break this loader? Move forward for now and evaluate/document the potential required changes.

# Reads the single Zillow crosswalk from disk
def load_zillow_crosswalk_from_local(local_path):
    """Read the Zillow county crosswalk CSV at ``local_path`` into a DataFrame.

    The file is parsed by ``pandas.read_csv`` with its default options and the
    resulting DataFrame is returned unchanged.
    """
    return pd.read_csv(local_path)

In [63]:

# Read the Zillow county crosswalk CSV and show the resulting frame.
df_zillow_county_crosswalk = load_zillow_crosswalk_from_local(
    local_path="../chosen_data/zillow/CountyCrossWalk_Zillow.csv"
)
display(df_zillow_county_crosswalk)

Unnamed: 0,CountyName,StateName,StateFIPS,CountyFIPS,MetroName_Zillow,CBSAName,CountyRegionID_Zillow,MetroRegionID_Zillow,FIPS,CBSACode
0,Pike,Pennsylvania,42,103,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",280,394913.0,42103,35620.0
1,Bronx,New York,36,5,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",401,394913.0,36005,35620.0
2,Essex,New Jersey,34,13,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",504,394913.0,34013,35620.0
3,Kings,New York,36,47,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",581,394913.0,36047,35620.0
4,Ocean,New Jersey,34,29,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",659,394913.0,34029,35620.0
...,...,...,...,...,...,...,...,...,...,...
3139,Allen,Kansas,20,1,,,368,,20001,
3140,Anson,North Carolina,37,7,,,374,,37007,
3141,Atoka,Oklahoma,40,5,,,375,,40005,
3142,Avery,North Carolina,37,11,,,376,,37011,


In [84]:
import duckdb

# Open a connection to the DuckDB database file '../test_database.db'
# (one directory above the notebook's working directory).
con = duckdb.connect("../test_database.db")

In [85]:
# Materialize the three loaded DataFrames as DuckDB tables.
# CREATE OR REPLACE rebuilds each table from scratch on every run, so:
# TODO: these have to become MERGE statements or partition delete/appends to support multiple data loads.
# TODO: silver tables should be more specific about which columns they select (currently SELECT *).
# NOTE(review): the names in the FROM clauses are not database tables; presumably DuckDB
# resolves them to the pandas DataFrames of the same name in the notebook namespace — confirm.

# HUD ZIP -> tract crosswalk (from loaded_zip_tract_df)
con.sql("""
CREATE OR REPLACE TABLE crosswalk_zip_tract AS
    SELECT * FROM loaded_zip_tract_df;
           """)

# HUD CBSA -> ZIP crosswalk (from loaded_cbsa_zip_df)
con.sql("""
CREATE OR REPLACE TABLE crosswalk_cbsa_zip AS
    SELECT * FROM loaded_cbsa_zip_df;
           """)

# Zillow county crosswalk (from df_zillow_county_crosswalk)
con.sql("""
CREATE OR REPLACE TABLE crosswalk_zillow AS
    SELECT * FROM df_zillow_county_crosswalk;
           """)

In [86]:

# List the tables now present in the database.
# NOTE(review): the output also lists `svi`, which no cell visible here creates —
# presumably it already existed in the database file; confirm.
con.sql("show tables")

┌─────────────────────┐
│        name         │
│       varchar       │
├─────────────────────┤
│ crosswalk_cbsa_zip  │
│ crosswalk_zillow    │
│ crosswalk_zip_tract │
│ svi                 │
└─────────────────────┘

In [87]:

# Preview the first 10 rows of crosswalk_cbsa_zip to check the stored columns and types.
con.sql("SELECT * FROM crosswalk_cbsa_zip LIMIT 10")

┌───────┬───────┬───────────────────────┬────────────────────────┬───────────────────────┬───────────────────────┬────────────┬────────────┬────────────────────┬─────────────────────┐
│ CBSA  │  ZIP  │       RES_RATIO       │       BUS_RATIO        │       OTH_RATIO       │       TOT_RATIO       │ start_date │  end_date  │ USPS_ZIP_PREF_CITY │ USPS_ZIP_PREF_STATE │
│ int64 │ int64 │        double         │         double         │        double         │        double         │  varchar   │  varchar   │      varchar       │       varchar       │
├───────┼───────┼───────────────────────┼────────────────────────┼───────────────────────┼───────────────────────┼────────────┼────────────┼────────────────────┼─────────────────────┤
│ 10100 │ 57445 │   0.04411492122335496 │   0.016156462585034014 │  0.037037037037037035 │  0.041101262844144636 │ 2020-01-01 │ 2020-03-31 │ NULL               │ NULL                │
│ 10100 │ 57456 │ 4.633920296570899e-05 │                    0.0 │              

In [88]:

# Preview the first 10 rows of crosswalk_zillow to check the stored columns and types.
con.sql("SELECT * FROM crosswalk_zillow LIMIT 10")

┌────────────┬──────────────┬───────────┬────────────┬──────────────────┬───────────────────────────────────────┬───────────────────────┬──────────────────────┬───────┬──────────┐
│ CountyName │  StateName   │ StateFIPS │ CountyFIPS │ MetroName_Zillow │               CBSAName                │ CountyRegionID_Zillow │ MetroRegionID_Zillow │ FIPS  │ CBSACode │
│  varchar   │   varchar    │   int64   │   int64    │     varchar      │                varchar                │         int64         │        double        │ int64 │  double  │
├────────────┼──────────────┼───────────┼────────────┼──────────────────┼───────────────────────────────────────┼───────────────────────┼──────────────────────┼───────┼──────────┤
│ Pike       │ Pennsylvania │        42 │        103 │ New York, NY     │ New York-Newark-Jersey City, NY-NJ-PA │                   280 │             394913.0 │ 42103 │  35620.0 │
│ Bronx      │ New York     │        36 │          5 │ New York, NY     │ New York-Newark-Jersey Cit

In [89]:

# Preview the first 10 rows of crosswalk_zip_tract to check the stored columns and types.
con.sql("SELECT * FROM crosswalk_zip_tract LIMIT 10")

┌───────┬─────────────┬──────────────────────┬───────────────────────┬──────────────────────┬───────────────────────┬────────────┬────────────┬────────────────────┬─────────────────────┐
│  ZIP  │    TRACT    │      RES_RATIO       │       BUS_RATIO       │      OTH_RATIO       │       TOT_RATIO       │ start_date │  end_date  │ USPS_ZIP_PREF_CITY │ USPS_ZIP_PREF_STATE │
│ int64 │    int64    │        double        │        double         │        double        │        double         │  varchar   │  varchar   │      varchar       │       varchar       │
├───────┼─────────────┼──────────────────────┼───────────────────────┼──────────────────────┼───────────────────────┼────────────┼────────────┼────────────────────┼─────────────────────┤
│   501 │ 36103158607 │                  0.0 │                   1.0 │                  0.0 │                   1.0 │ 2020-01-01 │ 2020-03-31 │ NULL               │ NULL                │
│   601 │ 72001956800 │  0.01481610597873453 │   0.01256281407035

In [90]:
# Commit any open transaction, then close the connection opened earlier in the notebook.
con.commit()
con.close()

: 