# Database Connection

In [1]:
import duckdb

# Connect to the DuckDB database file '../database_jupyter.db'.
# Every later cell reads and writes the database through `con`.
con = duckdb.connect("../database_jupyter.db")

# Create Tables
Create the expected tables if they do not already exist.

TODO: add code for table migration/translation when new SVI datasets are downloaded.

We do not pre-create Zillow's bronze tables: they are always overwritten in full and then need to be pivoted.

In [9]:
# Peek at the tables in the separate '../test_database.db' file (not the
# database behind `con`). The connection is closed straight away so this
# scratch lookup does not leave a second database handle open for the rest
# of the session.
cont = duckdb.connect("../test_database.db")
try:
    cont.sql("SHOW TABLES").show()
finally:
    cont.close()

┌────────────────────────┐
│          name          │
│        varchar         │
├────────────────────────┤
│ crosswalk_cbsa_zip     │
│ crosswalk_zillow       │
│ crosswalk_zillow_tract │
│ crosswalk_zip_tract    │
│ svi                    │
│ zillow_mean_to_pending │
└────────────────────────┘

In [109]:
# Bronze table definitions.
# CREATE TABLE IF NOT EXISTS (instead of CREATE OR REPLACE) makes this cell
# safe to re-run: existing tables and the rows loaded into them are kept, and
# a fresh database still gets every table the load cells below expect.
# NOTE: IF NOT EXISTS does not alter an existing table, so a schema change
# still needs an explicit migration (or a manual DROP TABLE).
create_table_hud_cbsa_zip = "CREATE TABLE IF NOT EXISTS crosswalk_hud_cbsa_zip (CBSA BIGINT, ZIP BIGINT, RES_RATIO DOUBLE, BUS_RATIO DOUBLE, OTH_RATIO DOUBLE, TOT_RATIO DOUBLE, start_date DATE, end_date DATE, load_quarter VARCHAR)"
create_table_hud_zip_tract = "CREATE TABLE IF NOT EXISTS crosswalk_hud_zip_tract (ZIP BIGINT, TRACT BIGINT, RES_RATIO DOUBLE, BUS_RATIO DOUBLE, OTH_RATIO DOUBLE, TOT_RATIO DOUBLE, start_date DATE, end_date DATE, load_quarter VARCHAR)"
create_table_zillow_zillow_cbsa = "CREATE TABLE IF NOT EXISTS crosswalk_zillow_zillow_cbsa (CountyName varchar, StateName varchar, StateFIPS INT64, CountyFIPS INT64, MetroName_Zillow varchar, CBSAName varchar, CountyRegionID_Zillow INT64, MetroRegionID_Zillow DOUBLE, FIPS INT64, CBSACode DOUBLE)"
create_table_svi = "CREATE TABLE IF NOT EXISTS svi (STATE VARCHAR, ST_ABBR VARCHAR, STCNTY BIGINT, COUNTY VARCHAR, FIPS BIGINT, LOCATION VARCHAR, RPL_THEME1 DOUBLE, RPL_THEME2 DOUBLE, RPL_THEME3 DOUBLE, RPL_THEME4 DOUBLE, RPL_THEMES DOUBLE, year BIGINT)"
con.sql(create_table_hud_cbsa_zip)
con.sql(create_table_hud_zip_tract)
con.sql(create_table_zillow_zillow_cbsa)
con.sql(create_table_svi)
con.sql("SHOW TABLES")

┌──────────────────────────────┐
│             name             │
│           varchar            │
├──────────────────────────────┤
│ crosswalk_hud_cbsa_zip       │
│ crosswalk_hud_zip_tract      │
│ crosswalk_zillow_zillow_cbsa │
│ svi                          │
└──────────────────────────────┘

# Bronze

### Cross Walk Tables

In [88]:
import glob
import pandas as pd
import re
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# To Create Modular
# Place in class
# Allow for different file schemas (Local, S3?)
# Maybe load into duckdb/spark/databricks and then process the start and end date from the file path?

## Reads from disk in the expected format <type>_<end_month><year>.xlsx
def _hud_quarter_bounds(year, month):
    """Return ``(start, end)`` datetimes for the three-month window ending in ``month``.

    ``start`` is the first day of the month two months before ``month`` and
    ``end`` is the last day of ``month``. Raises ``ValueError`` for a month
    outside 1-12.
    """
    year_num, month_num = int(year), int(month)
    if not 1 <= month_num <= 12:
        raise ValueError(f"Invalid month {month!r} for year {year}")

    # Count months from year 0 so stepping across a year boundary (for
    # example the month after December) needs no special case.
    month_index = year_num * 12 + (month_num - 1)
    first_index = month_index - 2
    after_index = month_index + 1

    start = datetime(first_index // 12, first_index % 12 + 1, 1)
    # Last day of the month == first day of the following month minus one day.
    end = datetime(after_index // 12, after_index % 12 + 1, 1) - timedelta(days=1)
    return start, end


def load_hud_crosswalk_from_local(local_path, year):
    """Load every HUD crosswalk workbook for ``year`` found under ``local_path``.

    Files are expected to be named ``<cola>_<colb>_<MM><YYYY>.xlsx``. The two
    name parts are upper-cased and used as the key columns to read from the
    workbook, and ``MM`` is the last month of the period the file covers.

    Parameters
    ----------
    local_path : str
        Directory that holds the ``.xlsx`` files.
    year : int or str
        Year to load; only files whose name ends in ``<year>.xlsx`` are read.

    Returns
    -------
    pandas.DataFrame
        The rows of all matching files (in sorted file-name order) with the
        two key columns, ``RES_RATIO``, ``BUS_RATIO``, ``OTH_RATIO``,
        ``TOT_RATIO`` and the added ``start_date`` / ``end_date``
        (``YYYY-MM-DD`` strings) and ``load_quarter`` (``MMYYYY``) columns.

    Raises
    ------
    ValueError
        If no file matches, or a file name does not follow the expected format.
    """
    # Sort so the row order of the result does not depend on directory order.
    files_to_load = sorted(glob.glob(f"{local_path}/*{year}.xlsx"))
    if not files_to_load:
        # Fail with a clear message instead of pandas' "No objects to concatenate".
        raise ValueError(f"No crosswalk files found for year {year} in {local_path}!")

    pattern = r"(\w+)_(\w+)_(\d{2})(\d{4})\.xlsx"
    ratio_cols = ["RES_RATIO", "BUS_RATIO", "OTH_RATIO", "TOT_RATIO"]

    dfs = []
    for file in files_to_load:
        match = re.search(pattern, file)
        if not match:
            raise ValueError(f"No month found in the file path: {file}")

        # The file name carries both the key columns and the period's last month.
        cola, colb, month = match.group(1), match.group(2), match.group(3)
        start_date, end_date = _hud_quarter_bounds(year, month)

        df = pd.read_excel(file)[[cola.upper(), colb.upper(), *ratio_cols]].assign(
            start_date=start_date.strftime("%Y-%m-%d"),
            end_date=end_date.strftime("%Y-%m-%d"),
            load_quarter=f"{month}{year}",
        )
        dfs.append(df)

    return pd.concat(dfs, axis=0, ignore_index=True)

Load years separately to test partition overwrite

In [89]:
from IPython.display import display, HTML

# Year of HUD crosswalk files to load in this pass (despite the name, this is
# passed as the `year` argument, not a date).
date_to_load = 2020
# Read both crosswalk families for that year. The resulting DataFrames are
# referenced by variable name inside the SQL of the following cells.
loaded_zip_tract_df = load_hud_crosswalk_from_local("../chosen_data/crosswalk/zip_tract/", date_to_load)
loaded_cbsa_zip_df = load_hud_crosswalk_from_local("../chosen_data/crosswalk/cbsa_zip/", date_to_load)

Delete and then insert into the tables, since loading a year overwrites that entire year's data.

In [90]:
    con.sql("DELETE FROM crosswalk_hud_cbsa_zip WHERE load_quarter IN (SELECT DISTINCT load_quarter FROM loaded_cbsa_zip_df)")
con.sql("DELETE FROM crosswalk_hud_zip_tract WHERE load_quarter IN (SELECT DISTINCT load_quarter FROM loaded_zip_tract_df)")

In [91]:
# Append the freshly loaded crosswalk rows to their bronze tables.
# NOTE(review): `SELECT *` presumably maps DataFrame columns to table columns
# by position, so the loader's column order must match the table definition
# - confirm.
con.sql("""
INSERT INTO crosswalk_hud_zip_tract FROM
    (SELECT * FROM loaded_zip_tract_df);
           """)

con.sql("""
INSERT INTO crosswalk_hud_cbsa_zip FROM
    (SELECT * FROM loaded_cbsa_zip_df);
           """)

In [92]:
# Check which load quarters are now present in the CBSA-ZIP crosswalk table.
con.sql("SELECT DISTINCT load_quarter FROM crosswalk_hud_cbsa_zip")

┌──────────────┐
│ load_quarter │
│   varchar    │
├──────────────┤
│ 062020       │
│ 092020       │
│ 032020       │
│ 122020       │
└──────────────┘

In [93]:
from IPython.display import display, HTML

# Second pass: load a different year. This rebinds the same DataFrame names
# that the delete/insert cells below reference.
date_to_load = 2022
loaded_zip_tract_df = load_hud_crosswalk_from_local("../chosen_data/crosswalk/zip_tract/", date_to_load)
loaded_cbsa_zip_df = load_hud_crosswalk_from_local("../chosen_data/crosswalk/cbsa_zip/", date_to_load)

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


In [94]:
# Remove the quarters that are about to be reloaded; quarters that are not in
# the loaded DataFrames are left in place.
con.sql("DELETE FROM crosswalk_hud_cbsa_zip WHERE load_quarter IN (SELECT DISTINCT load_quarter FROM loaded_cbsa_zip_df)")
con.sql("DELETE FROM crosswalk_hud_zip_tract WHERE load_quarter IN (SELECT DISTINCT load_quarter FROM loaded_zip_tract_df)")

In [95]:
# Append the newly loaded year to both crosswalk bronze tables.
con.sql("""
INSERT INTO crosswalk_hud_zip_tract FROM
    (SELECT * FROM loaded_zip_tract_df);
           """)

con.sql("""
INSERT INTO crosswalk_hud_cbsa_zip FROM
    (SELECT * FROM loaded_cbsa_zip_df);
           """)

In [96]:
# Both loaded years should now be listed side by side.
con.sql("SELECT DISTINCT load_quarter FROM crosswalk_hud_cbsa_zip")

┌──────────────┐
│ load_quarter │
│   varchar    │
├──────────────┤
│ 122020       │
│ 032022       │
│ 092022       │
│ 122022       │
│ 092020       │
│ 032020       │
│ 062022       │
│ 062020       │
└──────────────┘

### Zillow Cross Walk

In [97]:
import glob
import pandas as pd
import re
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Todo: Can we find zillow crosswalk that supports multi year data?
# How much will this break it? Move forward and evaluate/document the required potential changes? 

# Reads the single Zillow crosswalk file from disk
def load_zillow_crosswalk_from_local(local_path):
    """Read the Zillow county crosswalk CSV at ``local_path`` into a DataFrame.

    The file is parsed with pandas' default CSV settings and returned as-is.
    """
    return pd.read_csv(local_path)

In [98]:

# Read the county crosswalk CSV; the SQL in the next cells refers to this
# DataFrame by its variable name.
df_zillow_county_crosswalk = load_zillow_crosswalk_from_local("../chosen_data/zillow/CountyCrossWalk_Zillow.csv")

In [99]:
# Full refresh of the Zillow county crosswalk table.
# The delete and the insert run inside one transaction so a failing insert
# (for example a column mismatch in a new file) rolls back the delete instead
# of leaving the table empty.
con.begin()
try:
    con.sql("DELETE FROM crosswalk_zillow_zillow_cbsa")

    con.sql("""
INSERT INTO crosswalk_zillow_zillow_cbsa FROM
    (SELECT * FROM df_zillow_county_crosswalk);
           """)
    con.commit()
except Exception:
    con.rollback()
    raise

In [100]:
# Preview the loaded DataFrame. This queries the in-memory DataFrame by
# variable name, not the crosswalk_zillow_zillow_cbsa table.
con.sql("SELECT * FROM df_zillow_county_crosswalk")

┌────────────┬────────────────┬───────────┬────────────┬──────────────────┬───────────────────────────────────────┬───────────────────────┬──────────────────────┬───────┬──────────┐
│ CountyName │   StateName    │ StateFIPS │ CountyFIPS │ MetroName_Zillow │               CBSAName                │ CountyRegionID_Zillow │ MetroRegionID_Zillow │ FIPS  │ CBSACode │
│  varchar   │    varchar     │   int64   │   int64    │     varchar      │                varchar                │         int64         │        double        │ int64 │  double  │
├────────────┼────────────────┼───────────┼────────────┼──────────────────┼───────────────────────────────────────┼───────────────────────┼──────────────────────┼───────┼──────────┤
│ Pike       │ Pennsylvania   │        42 │        103 │ New York, NY     │ New York-Newark-Jersey City, NY-NJ-PA │                   280 │             394913.0 │ 42103 │  35620.0 │
│ Bronx      │ New York       │        36 │          5 │ New York, NY     │ New York-Newar

### SVI

In [106]:
import glob
import pandas as pd
import re

# Reads from disk in the expected format SVI_<year>_US.csv
# How do we record and pass in transformations from old to new? And check what the most up to date version in the database is?
# Where do we store those transformations. How do we pass in the most recent data in the database so we know which transformations are needed between versions?
# Loading in 2022 to start then plan for the required configuration between versions. 
def load_svi_from_local(local_path, year):
    """Load the SVI extract for ``year`` from ``<local_path>/SVI_<year>_US.csv``.

    Only the identifying columns and the five ``RPL_THEME*`` columns are kept,
    and a ``year`` column holding the requested year is appended.

    Raises ``ValueError`` when no file, or more than one file, matches.
    """
    wanted_columns = [
        "STATE", "ST_ABBR", "STCNTY", "COUNTY", "FIPS", "LOCATION",
        "RPL_THEME1", "RPL_THEME2", "RPL_THEME3", "RPL_THEME4", "RPL_THEMES",
    ]

    # Locate the SVI file for the requested year.
    matches = glob.glob(f"{local_path}/SVI_{year}_US.csv")
    match_count = len(matches)
    if match_count == 0:
        raise ValueError(f"No file found for year {year} in {local_path}!")
    if match_count > 1:
        raise ValueError(f"Multiple files found for {year} in {local_path}!\n{matches}")

    (svi_path,) = matches
    return pd.read_csv(svi_path)[wanted_columns].assign(year=year)

In [108]:
# Load the 2022 SVI extract; the delete/insert cell below refers to `df_svi`
# by name.
df_svi = load_svi_from_local("../chosen_data/svi/", 2022)

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │  null   │   key   │ default │  extra  │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ STATE       │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ ST_ABBR     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ STCNTY      │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ COUNTY      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ FIPS        │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ LOCATION    │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ RPL_THEME1  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ RPL_THEME2  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ RPL_THEME3  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ RPL_THEME4  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ RPL_THEMES  │ DOUB

In [111]:

# Replace the rows of every year present in `df_svi`.
# Delete and insert share one transaction so a failing insert rolls back the
# delete instead of leaving that year missing from the table.
con.begin()
try:
    con.sql("DELETE FROM svi WHERE year IN (SELECT DISTINCT year FROM df_svi)")

    con.sql("""
INSERT INTO svi FROM
    (SELECT * FROM df_svi);
           """)
    con.commit()
except Exception:
    con.rollback()
    raise

In [113]:
# Load a second SVI year. This rebinds `df_svi`, which the next cell reads.
df_svi = load_svi_from_local("../chosen_data/svi/", 2020)

In [114]:

# Replace the rows of every year present in `df_svi`.
# Delete and insert share one transaction so a failing insert rolls back the
# delete instead of leaving that year missing from the table.
con.begin()
try:
    con.sql("DELETE FROM svi WHERE year IN (SELECT DISTINCT year FROM df_svi)")

    con.sql("""
INSERT INTO svi FROM
    (SELECT * FROM df_svi);
           """)
    con.commit()
except Exception:
    con.rollback()
    raise

### Zillow Datasets

In [116]:
import glob
import pandas as pd
import re

# Zillow file will always be an overwrite due to how the data appears in the columns
def load_zillow_from_local(local_path, zillow_file):
    """Load one Zillow CSV export from ``<local_path>/<zillow_file>``.

    ``zillow_file`` is handed to ``glob``, so it may contain wildcards, but it
    must resolve to exactly one file.

    Returns the file's contents as a DataFrame, read with pandas' default CSV
    settings.

    Raises ``ValueError`` when no file, or more than one file, matches.
    """
    # Resolve the requested file name (or pattern) inside the directory.
    files_to_load = glob.glob(f"{local_path}/{zillow_file}")

    if not files_to_load:
        # The lookup is by file name, not by year, so the message says so.
        raise ValueError(f"No file found for {zillow_file} in {local_path}!")
    if len(files_to_load) > 1:
        # Pick the most recent one here probably?
        raise ValueError(f"Multiple files found for {zillow_file} in {local_path}!\n{files_to_load}")

    return pd.read_csv(files_to_load[0])

In [117]:
# Load the metro-level "mean days to pending" export into a DataFrame.
df_zillow_mean_to_pending = load_zillow_from_local("../chosen_data/zillow/", "Metro_mean_doz_pending_uc_sfrcondo_month.csv")

In [118]:
# (Re)build the table directly from the DataFrame. CREATE OR REPLACE means
# each run fully overwrites the previous contents.
con.sql("""
CREATE OR REPLACE TABLE zillow_mean_to_pending AS
    SELECT * FROM df_zillow_mean_to_pending;
           """)

In [120]:
# Confirm the new table shows up in the database.
con.sql("SHOW TABLES")

┌──────────────────────────────┐
│             name             │
│           varchar            │
├──────────────────────────────┤
│ crosswalk_hud_cbsa_zip       │
│ crosswalk_hud_zip_tract      │
│ crosswalk_zillow_zillow_cbsa │
│ svi                          │
│ zillow_mean_to_pending       │
└──────────────────────────────┘

In [127]:
# Load the ZIP-level ZHVI export into a DataFrame.
df_zillow_zhvi_sfr_zip = load_zillow_from_local("../chosen_data/zillow/", "Zip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv")

In [134]:
# (Re)build the bronze ZHVI table as a straight copy of the DataFrame. It is
# reshaped into the silver table further down.
con.sql("""
CREATE OR REPLACE TABLE zillow_zhvi_sfr_zip_bronze AS
    SELECT * FROM df_zillow_zhvi_sfr_zip;
           """)

In [135]:
# Confirm the bronze ZHVI table now exists.
con.sql("SHOW TABLES")

┌──────────────────────────────┐
│             name             │
│           varchar            │
├──────────────────────────────┤
│ crosswalk_hud_cbsa_zip       │
│ crosswalk_hud_zip_tract      │
│ crosswalk_zillow_zillow_cbsa │
│ svi                          │
│ zillow_mean_to_pending       │
│ zillow_zhvi_sfr_zip_bronze   │
└──────────────────────────────┘

# Silver Tables

* crosswalk zillowid to tract
* Zillow pivoted
* Zillow with SVI


In [175]:
# Build one row per (zip, start_date, end_date) holding the list of tracts
# mapped to that ZIP in that period, plus the length of that list.
# The inner GROUP BY only de-duplicates (ZIP, TRACT, start_date, end_date)
# combinations before the tracts are collected into a list.
con.sql("""
CREATE OR REPLACE TABLE tracts_per_zip_silver AS
(SELECT 
    ZIP as zip, 
    start_date, 
    end_date, 
    len(list(TRACT)) as len_tracts, 
    list(TRACT) as tracts 
FROM (
    SELECT 
        ZIP, 
        TRACT, 
        start_date, 
        end_date 
    FROM crosswalk_hud_zip_tract 
    GROUP BY 
        ZIP, 
        TRACT, 
        start_date, 
        end_date
    ) 
GROUP BY
    ZIP, 
    start_date, 
    end_date 
ORDER BY zip, start_date DESC)
""")

In [176]:
# Preview the tract lists per ZIP and period.
con.sql("SELECT  * FROM tracts_per_zip_silver")

┌───────┬────────────┬────────────┬────────────┬──────────────────────────────────────────────────────┐
│  zip  │ start_date │  end_date  │ len_tracts │                        tracts                        │
│ int64 │    date    │    date    │   int64    │                       int64[]                        │
├───────┼────────────┼────────────┼────────────┼──────────────────────────────────────────────────────┤
│   501 │ 2022-10-01 │ 2022-12-31 │          1 │ [36103158607]                                        │
│   501 │ 2022-07-01 │ 2022-09-30 │          1 │ [36103158607]                                        │
│   501 │ 2022-04-01 │ 2022-06-30 │          1 │ [36103158607]                                        │
│   501 │ 2022-01-01 │ 2022-03-31 │          1 │ [36103158607]                                        │
│   501 │ 2020-10-01 │ 2020-12-31 │          1 │ [36103158607]                                        │
│   501 │ 2020-07-01 │ 2020-09-30 │          1 │ [36103158607]  

In [136]:
# Reshape the wide bronze table (one column per month) into long form: every
# column except the listed region attributes becomes a (date, zhvi) row.
# The `date` column holds the former column names; downstream queries cast it
# with `date::DATE`.
con.sql("""
CREATE OR REPLACE TABLE zillow_zhvi_sfr_zip_silver AS
(UNPIVOT zillow_zhvi_sfr_zip_bronze
ON COLUMNS(* EXCLUDE (RegionID, SizeRank, RegionName, RegionType, StateName, State, City, Metro, CountyName))
INTO
NAME date
VALUE zhvi)""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [138]:
# Confirm the silver ZHVI table now exists.
con.sql("SHOW TABLES")

┌──────────────────────────────┐
│             name             │
│           varchar            │
├──────────────────────────────┤
│ crosswalk_hud_cbsa_zip       │
│ crosswalk_hud_zip_tract      │
│ crosswalk_zillow_zillow_cbsa │
│ svi                          │
│ zillow_mean_to_pending       │
│ zillow_zhvi_sfr_zip_bronze   │
│ zillow_zhvi_sfr_zip_silver   │
└──────────────────────────────┘

In [170]:
# Preview the long-form ZHVI table.
con.sql("SELECT * FROM zillow_zhvi_sfr_zip_silver")

┌──────────┬──────────┬────────────┬────────────┬───────────┬─────────┬────────────┬──────────────────────────────────────┬────────────────────┬────────────┬────────────────────┐
│ RegionID │ SizeRank │ RegionName │ RegionType │ StateName │  State  │    City    │                Metro                 │     CountyName     │    date    │        zhvi        │
│  int64   │  int64   │   int64    │  varchar   │  varchar  │ varchar │  varchar   │               varchar                │      varchar       │  varchar   │       double       │
├──────────┼──────────┼────────────┼────────────┼───────────┼─────────┼────────────┼──────────────────────────────────────┼────────────────────┼────────────┼────────────────────┤
│    91982 │        1 │      77494 │ zip        │ TX        │ TX      │ Katy       │ Houston-The Woodlands-Sugar Land, TX │ Fort Bend County   │ 2000-01-31 │ 209467.46053724916 │
│    91982 │        1 │      77494 │ zip        │ TX        │ TX      │ Katy       │ Houston-The Woodland

In [173]:
# Spot-check a single ZIP code (RegionName holds the ZIP as an integer).
con.sql("""
SELECT * FROM zillow_zhvi_sfr_zip_silver WHERE RegionName = 21771
""")

┌──────────┬──────────┬────────────┬────────────┬───────────┬─────────┬────────────┬───────────────────────────────┬────────────────┬────────────┬────────────────────┐
│ RegionID │ SizeRank │ RegionName │ RegionType │ StateName │  State  │    City    │             Metro             │   CountyName   │    date    │        zhvi        │
│  int64   │  int64   │   int64    │  varchar   │  varchar  │ varchar │  varchar   │            varchar            │    varchar     │  varchar   │       double       │
├──────────┼──────────┼────────────┼────────────┼───────────┼─────────┼────────────┼───────────────────────────────┼────────────────┼────────────┼────────────────────┤
│    67030 │     3358 │      21771 │ zip        │ MD        │ MD      │ Mount Airy │ Baltimore-Columbia-Towson, MD │ Carroll County │ 2000-01-31 │  205133.8666031565 │
│    67030 │     3358 │      21771 │ zip        │ MD        │ MD      │ Mount Airy │ Baltimore-Columbia-Towson, MD │ Carroll County │ 2000-02-29 │ 205289.007956

In [193]:
# Attach to every monthly ZHVI row the sorted list of tracts that were mapped
# to its ZIP during the crosswalk period containing that month.
# The LEFT JOIN followed by `WHERE tracts is not null` keeps only rows that
# found a crosswalk period, so months outside the loaded periods are dropped.
con.sql("""
CREATE OR REPLACE TABLE zillow_zhvi_sfr_zip_with_tracts_silver AS (
    SELECT 
        RegionName, 
        zip, 
        date, 
        zhvi, 
        list_sort(tracts) as tracts
    FROM (
        SELECT 
            * 
        FROM zillow_zhvi_sfr_zip_silver z 
        LEFT JOIN tracts_per_zip_silver cw 
        ON 
            z.RegionName = cw.zip AND 
            z.date::DATE >= cw.start_date AND 
            z.date::DATE <= cw.end_date
    ) 
WHERE tracts is not null)
""")

In [194]:
# Spot-check the tract lists attached to a single ZIP code.
con.sql("SELECT * FROM zillow_zhvi_sfr_zip_with_tracts_silver WHERE RegionName = 21771")

┌────────────┬───────┬────────────┬────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ RegionName │  zip  │    date    │        zhvi        │                                                                                  tracts                                                                                   │
│   int64    │ int64 │  varchar   │       double       │                                                                                  int64[]                                                                                  │
├────────────┼───────┼────────────┼────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│      21771 │ 21771 │ 2020-01-31 │ 444625.84303602297 │ [24013509002, 24013513001, 

In [195]:
# Preview the SVI table before joining it to the ZHVI rows.
con.sql("SELECT * FROM svi")

┌────────────┬─────────┬────────┬───────────────────┬────────────┬───────────────────────────────────────────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬───────┐
│   STATE    │ ST_ABBR │ STCNTY │      COUNTY       │    FIPS    │                     LOCATION                      │ RPL_THEME1 │ RPL_THEME2 │ RPL_THEME3 │ RPL_THEME4 │ RPL_THEMES │ year  │
│  varchar   │ varchar │ int64  │      varchar      │   int64    │                      varchar                      │   double   │   double   │   double   │   double   │   double   │ int64 │
├────────────┼─────────┼────────┼───────────────────┼────────────┼───────────────────────────────────────────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼───────┤
│ Alabama    │ AL      │   1001 │ Autauga County    │ 1001020100 │ Census Tract 201; Autauga County; Alabama         │      0.488 │     0.6149 │      0.381 │     0.1478 │     0.3635 │  2022 │
│ Alabama    │ AL      │   1001 │ Autaug

In [203]:
# Exploratory query: for one ZIP, collect the SVI theme ranks of all of its
# tracts into lists, one row per (RegionName, zip, date, tracts, zhvi).
# SVI rows are matched on the calendar year of the ZHVI date and on the tract
# FIPS being a member of the row's `tracts` list.
# The `zip = 21771` filter is applied outside the aggregation, after the join.
# NOTE(review): no output was captured for this cell and the execution counts
# restart right after it - possibly the kernel died here; confirm before
# relying on this query.
con.sql("""
SELECT * FROM (
    SELECT 
        RegionName,
        zip,
        date,
        tracts,
        zhvi,
        list(RPL_THEME1),
        list(RPL_THEME2),
        list(RPL_THEME3),
        list(RPL_THEME4),
        list(RPL_THEMES)
    FROM (SELECT * FROM zillow_zhvi_sfr_zip_with_tracts_silver z LEFT JOIN svi s ON s.year = year(z.date::DATE) AND s.FIPS IN z.tracts)
    GROUP BY (
        RegionName,
        zip,
        date,
        tracts,
        zhvi
    )
) 
WHERE zip = 21771
""")

: 

: 

In [8]:
# Preview the SVI table again (execution counts show this ran in a new session).
con.sql("SELECT * FROM svi")

┌────────────┬─────────┬────────┬───────────────────┬────────────┬───────────────────────────────────────────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬───────┐
│   STATE    │ ST_ABBR │ STCNTY │      COUNTY       │    FIPS    │                     LOCATION                      │ RPL_THEME1 │ RPL_THEME2 │ RPL_THEME3 │ RPL_THEME4 │ RPL_THEMES │ year  │
│  varchar   │ varchar │ int64  │      varchar      │   int64    │                      varchar                      │   double   │   double   │   double   │   double   │   double   │ int64 │
├────────────┼─────────┼────────┼───────────────────┼────────────┼───────────────────────────────────────────────────┼────────────┼────────────┼────────────┼────────────┼────────────┼───────┤
│ Alabama    │ AL      │   1001 │ Autauga County    │ 1001020100 │ Census Tract 201; Autauga County; Alabama         │      0.488 │     0.6149 │      0.381 │     0.1478 │     0.3635 │  2022 │
│ Alabama    │ AL      │   1001 │ Autaug

In [12]:
# Aggregate tract-level SVI percentile ranks to one row per ZIP and month.
# Each ZHVI row is joined to the SVI rows of the same calendar year whose FIPS
# is in the row's tract list, then the mean and median of every theme are
# taken over those tracts. The LEFT JOIN keeps ZHVI rows without any SVI
# match; their aggregates come out as NULL.
# Fix: median_rpl_theme3 is now computed from RPL_THEME3 (it previously
# repeated RPL_THEME4, so the column duplicated median_rpl_theme4).
con.sql("""
CREATE OR REPLACE TABLE zillow_zhvi_sfr_zip_with_svi_silver AS (
    SELECT
        RegionName,
        zip,
        date,
        zhvi,
        mean(RPL_THEME1) AS mean_rpl_theme1,
        median(RPL_THEME1) AS median_rpl_theme1,
        mean(RPL_THEME2) AS mean_rpl_theme2,
        median(RPL_THEME2) AS median_rpl_theme2,
        mean(RPL_THEME3) AS mean_rpl_theme3,
        median(RPL_THEME3) AS median_rpl_theme3,
        mean(RPL_THEME4) AS mean_rpl_theme4,
        median(RPL_THEME4) AS median_rpl_theme4,
        mean(RPL_THEMES) AS mean_rpl_themes,
        median(RPL_THEMES) AS median_rpl_themes
    FROM (
        SELECT *
        FROM zillow_zhvi_sfr_zip_with_tracts_silver z
        LEFT JOIN svi s
            ON s.year = year(z.date::DATE) AND s.FIPS IN z.tracts
    )
    GROUP BY
        RegionName,
        zip,
        date,
        zhvi
)
"""
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [15]:
# Show the first rows of the final ZIP/month table with its SVI aggregates,
# ordered by ZIP and date (display capped at 100 rows).
con.sql("SELECT * FROM zillow_zhvi_sfr_zip_with_svi_silver ORDER BY zip, date").show(max_rows=100)

┌────────────┬───────┬────────────┬────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬─────────────────────┬─────────────────────┐
│ RegionName │  zip  │    date    │        zhvi        │   mean_rpl_theme1   │  median_rpl_theme1  │   mean_rpl_theme2   │  median_rpl_theme2  │   mean_rpl_theme3   │  median_rpl_theme3   │   mean_rpl_theme4    │  median_rpl_theme4   │   mean_rpl_themes   │  median_rpl_themes  │
│   int64    │ int64 │  varchar   │       double       │       double        │       double        │       double        │       double        │       double        │        double        │        double        │        double        │       double        │       double        │
├────────────┼───────┼────────────┼────────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼────────────────

In [16]:
# Finish the session: commit, then close the connection to the database file.
# NOTE(review): the cells shown here never open an explicit transaction, so
# this commit may be redundant - confirm.
con.commit()
con.close()