# Data download

https://www.neighborhoodatlas.medicine.wisc.edu/

In [1]:
# Load duckdb, which lets us efficiently load large files
import duckdb

# Load pandas, which lets us manipulate dataframes
import pandas as pd

# Load polars, which enables loading of parquet files
import polars as pl

# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

# Set configrations on jupysql to directly output data to Pandas and to simplify the output that is printed to the notebook.
%config SqlMagic.autopandas = True

%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect jupysql to DuckDB using a SQLAlchemy-style connection string. Either connect to an in memory DuckDB, or a file backed db.
%sql duckdb:///:memory:

## Minimal duckdb query to standardize the ADI file 

Prompt for Claude:

```
%%sql
SELECT *
FROM read_csv('https://data.cityofnewyork.us/api/views/erm2-nwe9/rows.csv?accessType=DOWNLOAD',
    header=True,
    delim=',',
    quote='"',
    columns={'Unique Key': 'BIGINT',
    'Created Date': 'VARCHAR',
    'Closed Date': 'VARCHAR',
    'Agency': 'VARCHAR',
    'Agency Name': 'VARCHAR',
    'Complaint Type': 'VARCHAR',
    'Descriptor': 'VARCHAR',
    'Location Type': 'VARCHAR',
    'Incident Zip': 'VARCHAR',
    'Incident Address': 'VARCHAR',
    'Street Name': 'VARCHAR',
    'Cross Street 1': 'VARCHAR',
    'Cross Street 2': 'VARCHAR',
    'Intersection Street 1': 'VARCHAR',
    'Intersection Street 2': 'VARCHAR',
    'Address Type': 'VARCHAR',
    'City': 'VARCHAR',
    'Landmark': 'VARCHAR',
    'Facility Type': 'VARCHAR',
    'Status': 'VARCHAR',
    'Due Date': 'VARCHAR',
    'Resolution Description': 'VARCHAR',
    'Resolution Action Updated Date': 'VARCHAR',
    'Community Board': 'VARCHAR',
    'BBL': 'VARCHAR',
    'Borough': 'VARCHAR',
    'X Coordinate (State Plane)': 'VARCHAR',
    'Y Coordinate (State Plane)': 'VARCHAR',
    'Open Data Channel Type': 'VARCHAR',
    'Park Facility Name': 'VARCHAR',
    'Park Borough': 'VARCHAR',
    'Vehicle Type': 'VARCHAR',
    'Taxi Company Borough': 'VARCHAR',
    'Taxi Pick Up Location': 'VARCHAR',
    'Bridge Highway Name': 'VARCHAR',
    'Bridge Highway Direction': 'VARCHAR',
    'Road Ramp': 'VARCHAR',
    'Bridge Highway Segment': 'VARCHAR',
    'Latitude': 'DOUBLE',
    'Longitude': 'DOUBLE',
    'Location': 'VARCHAR'}) 
LIMIT 10;

Please use the above query example and rewrite it for the file at `/Users/me/Downloads/adi-download/US_2021_ADI_Census\ Block\ Group_v4.csv` that has the following header:

```"","GISJOIN","ADI_NATRANK","ADI_STATERNK","FIPS"
"1","G01000100201001","74","5","010010201001"
"2","G01000100201002","74","5","010010201002"
"3","G01000100202001","84","7","010010202001"
"4","G01000100202002","84","7","010010202002"
"5","G01000100203001","79","6","010010203001"
"6","G01000100203002","79","6","010010203002"
"7","G01000100204001","64","3","010010204001"
"8","G01000100204002","64","3","010010204002"
"9","G01000100204003","64","3","010010204003"
```
```

In [2]:
%%sql
SELECT *
FROM read_csv('/Users/brenstockdale/Downloads/Data Downloads/adi-download-usa/US_2021_ADI_Census Block Group_v4.csv', 
  header=True, 
  delim=',',
  quote='"',
  skip=2,
  columns={'INDEX': 'INT',
           'GISJOIN': 'VARCHAR',
           'ADI_NATRANK': 'VARCHAR',  
           'ADI_STATERNK': 'VARCHAR',
           'FIPS': 'VARCHAR'})
LIMIT 10;

Unnamed: 0,INDEX,GISJOIN,ADI_NATRANK,ADI_STATERNK,FIPS
0,3,G01000100202001,84,7,10010202001
1,4,G01000100202002,84,7,10010202002
2,5,G01000100203001,79,6,10010203001
3,6,G01000100203002,79,6,10010203002
4,7,G01000100204001,64,3,10010204001
5,8,G01000100204002,64,3,10010204002
6,9,G01000100204003,64,3,10010204003
7,10,G01000100204004,64,3,10010204004
8,11,G01000100205011,65,4,10010205011
9,12,G01000100205012,65,4,10010205012


In [3]:
%%sql
CREATE TABLE test (
    name            VARCHAR,
    ind             INT,
);

Unnamed: 0,Count


In [4]:
%%sql
INSERT INTO test VALUES ('first', 1);

Unnamed: 0,Count
0,1


In [5]:
%%sql
INSERT INTO test VALUES ('second', NULL);

Unnamed: 0,Count
0,1


In [6]:
%%sql
DESCRIBE test;

Unnamed: 0,column_name,column_type,null,key,default,extra
0,name,VARCHAR,YES,,,
1,ind,INTEGER,YES,,,


In [7]:
%%sql 
SELECT * FROM test;

Unnamed: 0,name,ind
0,first,1.0
1,second,


# Testing visualizations using a parquet file

In [8]:
import vegafusion as vf
import polars as pl
import altair as alt
#from vega_datasets import data
alt.data_transformers.disable_max_rows()
alt.renderers.enable('html')

# Configure DuckDB connection
vf.runtime.set_connection("duckdb")

# Enable Mime Renderer
vf.enable(row_limit=100000000)

vegafusion.enable(mimetype='html', row_limit=100000000, embed_options=None)

In [9]:
area_deprivation_index = pl.read_parquet('/Users/brenstockdale/Downloads/Data Downloads/poverty.parquet')

In [10]:
area_deprivation_index[50:70]

TBLID,GEOID,GEONAME,PROFLN,ESTIMATE,MG_ERROR
str,str,str,i64,f64,str
"""R1701""","""0400000US39""","""Ohio""",55,13.4,"""+/-0.3"""
"""R1701""","""0400000US40""","""Oklahoma""",55,15.6,"""+/-0.4"""
"""R1701""","""0400000US41""","""Oregon""",55,12.2,"""+/-0.4"""
"""R1701""","""0400000US42""","""Pennsylvania""",55,12.1,"""+/-0.2"""
"""R1701""","""0400000US44""","""Rhode Island""",55,11.4,"""+/-0.8"""
"""R1701""","""0400000US45""","""South Carolina…",55,14.6,"""+/-0.5"""
"""R1701""","""0400000US46""","""South Dakota""",55,12.3,"""+/-0.8"""
"""R1701""","""0400000US47""","""Tennessee""",55,13.6,"""+/-0.4"""
"""R1701""","""0400000US48""","""Texas""",55,14.2,"""+/-0.3"""
"""R1701""","""0400000US49""","""Utah""",55,8.6,"""+/-0.5"""


## Load FIPS / GEOID from census data

In [11]:
%%sql
CREATE TABLE census_blocks AS SELECT * FROM 'https://public.datathinking.org/census.gov%2Fcensus_block_groups_latitude_longitude.parquet'

Unnamed: 0,Count
0,242384


In [67]:
%%sql

SELECT * FROM census_blocks LIMIT 20;

Unnamed: 0,GEOID,latitude,longitude
0,60590422062,33.451279,-117.649586
1,60376025051,33.90737,-118.341669
2,60590421092,33.442898,-117.598564
3,360810113002,40.776412,-73.908686
4,360290059004,42.937805,-78.901426
5,360050442003,40.904587,-73.8454
6,60855033123,37.35352,-121.798139
7,360290024002,42.894579,-78.806804
8,60590018014,33.860834,-117.979217
9,360610247005,40.839305,-73.944775


In [12]:
%%sql
CREATE TABLE poverty_percent AS SELECT * FROM '/Users/brenstockdale/Downloads/Data Downloads/poverty.parquet'

Unnamed: 0,Count
0,7285


In [13]:
poverty_df = pd.read_parquet('/Users/brenstockdale/Downloads/Data Downloads/poverty.parquet')

In [14]:
poverty_df.head()

Unnamed: 0,TBLID,GEOID,GEONAME,PROFLN,ESTIMATE,MG_ERROR
0,R1701,0100000US,United States,55,12.8,+/-0.1
1,R1701,0100001US,United States -- Urban,55,13.2,+/-0.1
2,R1701,0100043US,United States -- Rural,55,10.9,+/-0.1
3,R1701,0100089US,United States -- American Indian Reservation a...,55,23.4,+/-0.8
4,R1701,0100091US,United States -- Oklahoma Tribal Statistical Area,55,15.8,+/-0.5


In [44]:
#poverty_df = poverty_df.rename(columns={"'TBLID'": "TBLID"})
#poverty_df = poverty_df.rename(columns={"'GEOID'": "GEOID"})
#poverty_df = poverty_df.rename(columns={"'GEONAME'": "GEONAME"})
#poverty_df = poverty_df.rename(columns={"'PROFLN'": "PROFLN"})
#poverty_df = poverty_df.rename(columns={"'ESTIMATE'": "ESTIMATE"})
#poverty_df = poverty_df.rename(columns={"'MG_ERROR'": "MG_ERROR"})

In [51]:
poverty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7285 entries, 0 to 7284
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TBLID     7285 non-null   object 
 1   GEOID     7285 non-null   object 
 2   GEONAME   7285 non-null   object 
 3   PROFLN    7285 non-null   int64  
 4   ESTIMATE  7285 non-null   float64
 5   MG_ERROR  7285 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 341.6+ KB


In [None]:
%%sql
UPDATE poverty_percent 
SET GEONAME = REPLACE(GEOID, 'US', '')
WHERE GEONAME LIKE '%US'



In [None]:
%%sql
CREATE TABLE poverty_county AS
SELECT * FROM poverty_percent WHERE (INSTR(GEONAME, 'County') > 0) OR (INSTR(GEONAME, 'county') > 0)

Unnamed: 0,Count
0,2689


In [48]:
%%sql
SELECT * FROM poverty_county LIMIT 1000;

Unnamed: 0,TBLID,GEOID,GEONAME,PROFLN,ESTIMATE,MG_ERROR
0,R1701,0500000US01003,"Baldwin County, Alabama",55,10.8,+/-2.2
1,R1701,0500000US01015,"Calhoun County, Alabama",55,19.8,+/-3.5
2,R1701,0500000US01043,"Cullman County, Alabama",55,13.0,+/-3.7
3,R1701,0500000US01049,"DeKalb County, Alabama",55,21.3,+/-4.6
4,R1701,0500000US01051,"Elmore County, Alabama",55,11.5,+/-3.2
...,...,...,...,...,...,...
802,R1701,330M600US356,"Macon-Bibb County--Warner Robins, GA CSA",55,17.0,+/-2.0
803,R1701,400C100US03763,"Athens-Clarke County, GA Urbanized Area (2010)",55,20.2,+/-3.0
804,R1701,400C100US04222,"Augusta-Richmond County, GA--SC Urbanized Area...",55,16.2,+/-2.0
805,R1701,400C100US43669,"Kailua (Honolulu County)--Kaneohe, HI Urbanize...",55,6.5,+/-2.3


In [47]:
%%sql

DELETE FROM poverty_county WHERE length(GEOID) > 14;

Unnamed: 0,Count
0,0


In [45]:
%%sql

SELECT * FROM poverty_percent LIMIT 1000;

Unnamed: 0,TBLID,GEOID,GEONAME,PROFLN,ESTIMATE,MG_ERROR
0,R1701,0500000US01003,"Baldwin County, Alabama",55,10.8,+/-2.2
1,R1701,0500000US01015,"Calhoun County, Alabama",55,19.8,+/-3.5
2,R1701,0500000US01043,"Cullman County, Alabama",55,13.0,+/-3.7
3,R1701,0500000US01049,"DeKalb County, Alabama",55,21.3,+/-4.6
4,R1701,0500000US01051,"Elmore County, Alabama",55,11.5,+/-3.2
...,...,...,...,...,...,...
995,R1701,310M600US22060,"Faribault-Northfield, MN Micro Area",55,9.1,+/-2.9
996,R1701,310M600US22100,"Farmington, MO Micro Area",55,18.6,+/-5.0
997,R1701,310M600US22140,"Farmington, NM Metro Area",55,26.7,+/-3.6
998,R1701,310M600US22180,"Fayetteville, NC Metro Area",55,14.8,+/-1.1


In [None]:
%%sql

DELETE FROM poverty_county WHERE (INSTR(GEONAME, 'County') > 0) OR (INSTR(GEONAME, 'county') > 0)

In [55]:
%%sql

SELECT * FROM census_blocks LIMIT 10;

Unnamed: 0,GEOID,longitude,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,AFFGEOID,NAME,NAMELSAD,LSAD,ALAND,latitude
0,320030016112,-115.069852,32,3,1611,2,1500000US320030016112,2,Block Group 2,BG,663342,36.148129
1,320050011001,-119.732187,32,5,1100,1,1500000US320050011001,1,Block Group 1,BG,2022004,38.883508
2,320030019011,-115.128058,32,3,1901,1,1500000US320030019011,1,Block Group 1,BG,209570,36.141856
3,320310035102,-119.737654,32,31,3510,2,1500000US320310035102,2,Block Group 2,BG,594717,39.588675
4,320030002042,-115.1666,32,3,204,2,1500000US320030002042,2,Block Group 2,BG,1144494,36.150667
5,320310035173,-119.67395,32,31,3517,3,1500000US320310035173,3,Block Group 3,BG,625323,39.61295
6,320310032073,-119.750902,32,31,3207,3,1500000US320310032073,3,Block Group 3,BG,25566087,39.305116
7,350619703062,-106.668706,35,61,970306,2,1500000US350619703062,2,Block Group 2,BG,8828015,34.778454
8,350280005004,-106.290577,35,28,500,4,1500000US350280005004,4,Block Group 4,BG,53076356,35.80287
9,360179702022,-75.728929,36,17,970202,2,1500000US360179702022,2,Block Group 2,BG,114826135,42.655981


In [65]:
%%sql

SELECT * FROM census_blocks WHERE AFFGEOID LIKE '%500000US01003%'

Unnamed: 0,GEOID,longitude,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,AFFGEOID,NAME,NAMELSAD,LSAD,ALAND,latitude
0,010030114182,-87.640555,01,003,011418,2,1500000US010030114182,2,Block Group 2,BG,13427709,30.263125
1,010030107102,-87.872939,01,003,010710,2,1500000US010030107102,2,Block Group 2,BG,7337867,30.620325
2,010030107091,-87.868958,01,003,010709,1,1500000US010030107091,1,Block Group 1,BG,18856416,30.574653
3,010030107092,-87.843970,01,003,010709,2,1500000US010030107092,2,Block Group 2,BG,13813181,30.618773
4,010030114111,-87.695312,01,003,011411,1,1500000US010030114111,1,Block Group 1,BG,6444709,30.342240
...,...,...,...,...,...,...,...,...,...,...,...,...
112,010030114133,-87.652532,01,003,011413,3,1500000US010030114133,3,Block Group 3,BG,8748072,30.307268
113,010030105003,-87.766044,01,003,010500,3,1500000US010030105003,3,Block Group 3,BG,8840986,30.908131
114,010030103003,-87.881134,01,003,010300,3,1500000US010030103003,3,Block Group 3,BG,169059763,30.887916
115,010030111021,-87.842254,01,003,011102,1,1500000US010030111021,1,Block Group 1,BG,25397195,30.462849


In [77]:
%%sql
SELECT * FROM census_blocks WHERE length(AFFGEOID) = 21;

Unnamed: 0,GEOID,longitude,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,AFFGEOID,NAME,NAMELSAD,LSAD,ALAND,latitude
0,320030016112,-115.069852,32,003,001611,2,1500000US320030016112,2,Block Group 2,BG,663342,36.148129
1,320050011001,-119.732187,32,005,001100,1,1500000US320050011001,1,Block Group 1,BG,2022004,38.883508
2,320030019011,-115.128058,32,003,001901,1,1500000US320030019011,1,Block Group 1,BG,209570,36.141856
3,320310035102,-119.737654,32,031,003510,2,1500000US320310035102,2,Block Group 2,BG,594717,39.588675
4,320030002042,-115.166600,32,003,000204,2,1500000US320030002042,2,Block Group 2,BG,1144494,36.150667
...,...,...,...,...,...,...,...,...,...,...,...,...
242379,410379602002,-120.345493,41,037,960200,2,1500000US410379602002,2,Block Group 2,BG,29143862,42.225239
242380,391517112023,-81.592984,39,151,711202,3,1500000US391517112023,3,Block Group 3,BG,9098005,40.864320
242381,420430248012,-76.850705,42,043,024801,2,1500000US420430248012,2,Block Group 2,BG,42952455,40.503264
242382,290950129062,-94.501158,29,095,012906,2,1500000US290950129062,2,Block Group 2,BG,976159,38.948447


All entries in the AFFGEOID column are 21 characters long, so to match with the geoids in the 'poverty_county' table, I can
remove the first character of each (since they do not match up on the tables), and remove the last 7 of the AFFGEOID for now

Altered AFFGEOID table will be created as census_alt

Altered poverty table will be saved as poverty_alt

In [78]:
%%sql

CREATE TABLE census_alt_test AS
SELECT
    *,
    SUBSTRING(AFFGEOID, 2, LENGTH(AFFGEOID) - 8) AS modified_AFFGEOID
FROM
    census_blocks;

Unnamed: 0,Count
0,242384


In [80]:
%%sql
SELECT * FROM census_alt_test LIMIT 10;

Unnamed: 0,GEOID,longitude,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,AFFGEOID,NAME,NAMELSAD,LSAD,ALAND,latitude,modified_AFFGEOID
0,320030016112,-115.069852,32,3,1611,2,1500000US320030016112,2,Block Group 2,BG,663342,36.148129,500000US32003
1,320050011001,-119.732187,32,5,1100,1,1500000US320050011001,1,Block Group 1,BG,2022004,38.883508,500000US32005
2,320030019011,-115.128058,32,3,1901,1,1500000US320030019011,1,Block Group 1,BG,209570,36.141856,500000US32003
3,320310035102,-119.737654,32,31,3510,2,1500000US320310035102,2,Block Group 2,BG,594717,39.588675,500000US32031
4,320030002042,-115.1666,32,3,204,2,1500000US320030002042,2,Block Group 2,BG,1144494,36.150667,500000US32003
5,320310035173,-119.67395,32,31,3517,3,1500000US320310035173,3,Block Group 3,BG,625323,39.61295,500000US32031
6,320310032073,-119.750902,32,31,3207,3,1500000US320310032073,3,Block Group 3,BG,25566087,39.305116,500000US32031
7,350619703062,-106.668706,35,61,970306,2,1500000US350619703062,2,Block Group 2,BG,8828015,34.778454,500000US35061
8,350280005004,-106.290577,35,28,500,4,1500000US350280005004,4,Block Group 4,BG,53076356,35.80287,500000US35028
9,360179702022,-75.728929,36,17,970202,2,1500000US360179702022,2,Block Group 2,BG,114826135,42.655981,500000US36017


In [81]:
%%sql
CREATE TABLE census_alt AS
SELECT * FROM census_alt_test;

Unnamed: 0,Count
0,242384


Was able to succesfully make the matching geoid in the column 'modified_AFFGEOID' in table census_alt


In [82]:
%%sql
UPDATE poverty_county
SET GEOID = SUBSTRING(GEOID, 2);

Unnamed: 0,Count
0,807


In [85]:
%%sql
SELECT * FROM poverty_county LIMIT 1000;

Unnamed: 0,TBLID,GEOID,GEONAME,PROFLN,ESTIMATE,MG_ERROR
0,R1701,500000US01003,"Baldwin County, Alabama",55,10.8,+/-2.2
1,R1701,500000US01015,"Calhoun County, Alabama",55,19.8,+/-3.5
2,R1701,500000US01043,"Cullman County, Alabama",55,13.0,+/-3.7
3,R1701,500000US01049,"DeKalb County, Alabama",55,21.3,+/-4.6
4,R1701,500000US01051,"Elmore County, Alabama",55,11.5,+/-3.2
...,...,...,...,...,...,...
802,R1701,30M600US356,"Macon-Bibb County--Warner Robins, GA CSA",55,17.0,+/-2.0
803,R1701,00C100US03763,"Athens-Clarke County, GA Urbanized Area (2010)",55,20.2,+/-3.0
804,R1701,00C100US04222,"Augusta-Richmond County, GA--SC Urbanized Area...",55,16.2,+/-2.0
805,R1701,00C100US43669,"Kailua (Honolulu County)--Kaneohe, HI Urbanize...",55,6.5,+/-2.3


In [86]:
%%sql 
CREATE TABLE poverty_latlong_alt AS
SELECT
    poverty_county.GEONAME,
    poverty_county.GEOID,
    poverty_county.ESTIMATE,
    poverty_county.MG_ERROR,
    census_alt.longitude,
    census_alt.latitude
FROM
    poverty_county
LEFT JOIN
    census_alt
ON
    poverty_county.GEOID = census_alt."modified_AFFGEOID";


Unnamed: 0,Count
0,190485


In [92]:
%%sql 
SELECT * FROM poverty_latlong_alt LIMIT 200000;

Unnamed: 0,GEONAME,GEOID,ESTIMATE,MG_ERROR,longitude,latitude
0,"Clark County, Nevada",500000US32003,15.1,+/-0.8,-115.069852,36.148129
1,"Clark County, Nevada",500000US32003,15.1,+/-0.8,-115.128058,36.141856
2,"Washoe County, Nevada",500000US32031,11.2,+/-1.4,-119.737654,39.588675
3,"Clark County, Nevada",500000US32003,15.1,+/-0.8,-115.166600,36.150667
4,"Washoe County, Nevada",500000US32031,11.2,+/-1.4,-119.673950,39.612950
...,...,...,...,...,...,...
190480,Louisville/Jefferson County--Elizabethtown--Ba...,30M600US350,12.2,+/-0.9,,
190481,"Athens-Clarke County, GA Urbanized Area (2010)",00C100US03763,20.2,+/-3.0,,
190482,"Augusta-Richmond County, GA--SC Urbanized Area...",00C100US04222,16.2,+/-2.0,,
190483,"Louisville/Jefferson County, KY-IN Metro Area",10M600US31140,12.1,+/-0.9,,


In [95]:
%%sql
SELECT * FROM poverty_latlong_alt WHERE latitude IS NULL LIMIT 1000;

Unnamed: 0,GEONAME,GEOID,ESTIMATE,MG_ERROR,longitude,latitude
0,"Athens-Clarke County, GA Metro Area",10M600US12020,16.9,+/-2.0,,
1,"Louisville/Jefferson County, KY--IN Urbanized ...",00C100US51755,13.3,+/-1.1,,
2,"Macon-Bibb County--Warner Robins, GA CSA",30M600US356,17.0,+/-2.0,,
3,"Macon-Bibb County, GA Metro Area",10M600US31420,20.8,+/-2.8,,
4,"Augusta-Richmond County, GA-SC Metro Area",10M600US12260,15.8,+/-1.6,,
5,"Atlanta--Athens-Clarke County--Sandy Springs, ...",30M600US122,12.0,+/-0.5,,
6,Louisville/Jefferson County--Elizabethtown--Ba...,30M600US350,12.2,+/-0.9,,
7,"Athens-Clarke County, GA Urbanized Area (2010)",00C100US03763,20.2,+/-3.0,,
8,"Augusta-Richmond County, GA--SC Urbanized Area...",00C100US04222,16.2,+/-2.0,,
9,"Louisville/Jefferson County, KY-IN Metro Area",10M600US31140,12.1,+/-0.9,,


In [96]:
%%sql

SELECT GEOID, COUNT(GEOID) AS count
FROM poverty_latlong_alt
GROUP BY GEOID
HAVING COUNT(GEOID) > 1;

Unnamed: 0,GEOID,count
0,500000US36081,1796
1,500000US36103,1058
2,500000US25025,681
3,500000US36059,1134
4,500000US51153,285
...,...,...
791,500000US04001,56
792,500000US48037,80
793,500000US40027,237
794,500000US35043,88


The above cell shows that there are a lot of duplicate GEOIDs, which makes sense as there are ~190,000 rows

In [99]:
%%sql

SELECT * FROM poverty_latlong_alt WHERE GEOID LIKE '500000US36081';

Unnamed: 0,GEONAME,GEOID,ESTIMATE,MG_ERROR,longitude,latitude
0,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.887012,40.720180
1,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.891042,40.561945
2,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.906678,40.707294
3,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.798223,40.757069
4,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.752770,40.703526
...,...,...,...,...,...,...
1791,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.931156,40.756162
1792,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.791235,40.693887
1793,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.927796,40.737111
1794,"Queens County, New York",500000US36081,13.5,+/-0.8,-73.820111,40.736575


In [107]:
%%sql
SELECT GEONAME FROM poverty_latlong_alt GROUP BY GEONAME;

Unnamed: 0,GEONAME
0,"Queens County, New York"
1,"Suffolk County, New York"
2,"Suffolk County, Massachusetts"
3,"Nassau County, New York"
4,"Prince William County, Virginia"
...,...
802,"Stearns County, Minnesota"
803,"Davidson County, North Carolina"
804,"Gallatin County, Montana"
805,Louisville/Jefferson County--Elizabethtown--Ba...


This GEOID was duplicated over 1750 times. All the estimate values are exactly the same, so this means we will not be losing data for the
visualization of the counties if I remove the duplicates, as we only need one.

In [102]:
%%sql
CREATE TABLE temp_table AS
SELECT MIN(id) AS min_id, GEOID
FROM poverty_latlong_alt
GROUP BY GEOID;

Unnamed: 0,TBLID,GEOID,GEONAME,PROFLN,ESTIMATE,MG_ERROR
0,R1701,500000US01003,"Baldwin County, Alabama",55,10.8,+/-2.2
1,R1701,500000US01015,"Calhoun County, Alabama",55,19.8,+/-3.5
2,R1701,500000US01043,"Cullman County, Alabama",55,13.0,+/-3.7
3,R1701,500000US01049,"DeKalb County, Alabama",55,21.3,+/-4.6
4,R1701,500000US01051,"Elmore County, Alabama",55,11.5,+/-3.2
...,...,...,...,...,...,...
802,R1701,30M600US356,"Macon-Bibb County--Warner Robins, GA CSA",55,17.0,+/-2.0
803,R1701,00C100US03763,"Athens-Clarke County, GA Urbanized Area (2010)",55,20.2,+/-3.0
804,R1701,00C100US04222,"Augusta-Richmond County, GA--SC Urbanized Area...",55,16.2,+/-2.0
805,R1701,00C100US43669,"Kailua (Honolulu County)--Kaneohe, HI Urbanize...",55,6.5,+/-2.3


In [108]:
%%sql
COPY (
  SELECT * FROM poverty_latlong_alt
) TO '/Users/brenstockdale/Downloads/Data Downloads/adi-download-usa/poverty_latlong.parquet' (COMPRESSION ZSTD);

Unnamed: 0,Count
0,190485


In [109]:
poverty_latlong = pl.read_parquet('/Users/brenstockdale/Downloads/Data Downloads/adi-download-usa/poverty_latlong.parquet')

In [110]:
poverty_latlong.head()

GEONAME,GEOID,ESTIMATE,MG_ERROR,longitude,latitude
str,str,f64,str,f64,f64
"""Clark County, …","""500000US32003""",15.1,"""+/-0.8""",-115.069852,36.148129
"""Clark County, …","""500000US32003""",15.1,"""+/-0.8""",-115.128058,36.141856
"""Washoe County,…","""500000US32031""",11.2,"""+/-1.4""",-119.737654,39.588675
"""Clark County, …","""500000US32003""",15.1,"""+/-0.8""",-115.1666,36.150667
"""Washoe County,…","""500000US32031""",11.2,"""+/-1.4""",-119.67395,39.61295


In [111]:
alt.Chart(poverty_latlong.sample(10000)).mark_circle(size=3).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    # size='ADI_STATERNK:Q',
    # color='count()',
    tooltip=['GEOID:N', 'ADI_NATRANK:Q', 'ADI_STATERNK:Q']
).project(
    type='albersUsa'
)