In [112]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import rowgenerators as rg
import matplotlib.pyplot as plt
from IPython.display import display 

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()


In [113]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg

In [114]:
cc = pkg.reference('country_codes').dataframe()
cc = cc[['ISO3166-1-Alpha-3', 'ISO3166-1-Alpha-2','ISO3166-1-numeric', 'CLDR display name']]
cc.columns = ['country_3', 'country_2', 'numeric', 'country_name']
cc['numeric'] = cc.numeric.astype('Int64')


cc.head()

Unnamed: 0,country_3,country_2,numeric,country_name
0,TWN,TW,158,Taiwan
1,AFG,AF,4,Afghanistan
2,ALB,AL,8,Albania
3,DZA,DZ,12,Algeria
4,ASM,AS,16,American Samoa


In [121]:
countries = pd.DataFrame({
    'ref': "ISO3166:"+cc.numeric.astype(str).str.zfill(4),
    'geoid': None, 
    'name': cc.country_name,
    'name2': cc.country_3, 
    'category': 'country'
})
    
countries.head()
    

Unnamed: 0,ref,geoid,name,name2,category
0,ISO3166:0158,,Taiwan,TWN,country
1,ISO3166:0004,,Afghanistan,AFG,country
2,ISO3166:0008,,Albania,ALB,country
3,ISO3166:0012,,Algeria,DZA,country
4,ISO3166:0016,,American Samoa,ASM,country


In [116]:
states = pkg.reference('states').dataframe()
cbsa = pkg.reference('metros').dataframe()
counties = pkg.reference('counties').dataframe()

In [53]:
from tqdm.auto import tqdm

def foreach_state(url):
    
    minor_outlying_areas = ['VI', 'MP', 'GU', 'AS'] # Virgin Islands, Guam, etc. 
    
    frames = []
    errors = []
    for idx, r in tqdm(list(states.iterrows())):
        
        if r.STUSPS in minor_outlying_areas:
            continue
        
        try:
            frames.append(rg.dataframe(url.format(r.STUSPS)))
        except Exception as e:
            errors.append((e, r.STUSPS))
            
    return pd.concat(frames), errors

places, place_errors = foreach_state(pkg.reference('places').value)
     
print(len(place_errors))
places.head()

  0%|          | 0/56 [00:00<?, ?it/s]

0


Unnamed: 0,STATEFP,PLACEFP,PLACENS,GEOID,NAME,NAMELSAD,LSAD,CLASSFP,PCICBSA,PCINECTA,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,54,52060,2390615,5452060,Martinsburg,Martinsburg city,25,C1,Y,N,G4110,A,17173918,42877,39.4577694,-77.9782123,"POLYGON ((-78.02431 39.44383, -78.02271 39.447..."
1,54,8092,2390733,5408092,Blacksville,Blacksville town,43,C1,N,N,G4110,A,775505,24048,39.7156423,-80.214711,"POLYGON ((-80.22155 39.72121, -80.22151 39.721..."
2,54,76516,2391427,5476516,Star City,Star City town,43,C1,N,N,G4110,A,1265921,261961,39.6591556,-79.9866248,"POLYGON ((-79.99425 39.65447, -79.99423 39.654..."
3,54,40828,2391244,5440828,Jane Lew,Jane Lew town,43,C1,N,N,G4110,A,618089,18635,39.1107834,-80.4080091,"POLYGON ((-80.41503 39.11067, -80.41414 39.111..."
4,54,35284,2390232,5435284,Harpers Ferry,Harpers Ferry town,43,C1,N,N,G4110,A,1386068,229271,39.3252427,-77.7414418,"POLYGON ((-77.75475 39.32983, -77.75474 39.329..."


In [80]:
from geoid.tiger import State, Cbsa, County, Place

In [122]:
from geoid.tiger import TigerGeoid

frames = []
for e, gc, cat in ( (states, State, 'state'), (cbsa, Cbsa, 'metro'), (counties, County, 'county'), (places, Place,'place')):
    
    
    name2 = e['NAMELSAD'] if 'NAMELSAD' in e.columns else e['NAME']
    geoid =  str(gc.parse(e.iloc[0].GEOID).as_acs())
    
    frames.append(pd.DataFrame(dict(ref='geoid:'+geoid, geoid=geoid, name=e.NAME, name2=name2, category=cat)))
    
names = pd.concat(frames+[countries])
names.sample(20)
    

Unnamed: 0,ref,geoid,name,name2,category
408,geoid:16000US5452060,16000US5452060,Lamar,Lamar village,place
321,geoid:16000US5452060,16000US5452060,Mesita,Mesita CDP,place
238,geoid:16000US5452060,16000US5452060,Arthurdale,Arthurdale CDP,place
759,geoid:16000US5452060,16000US5452060,Pelican Marsh,Pelican Marsh CDP,place
812,geoid:05000US31039,05000US31039,Norman,Norman County,county
2081,geoid:05000US31039,05000US31039,Lowndes,Lowndes County,county
214,geoid:16000US5452060,16000US5452060,Tony,Tony village,place
2310,geoid:05000US31039,05000US31039,Yadkin,Yadkin County,county
385,geoid:16000US5452060,16000US5452060,Langley,Langley CDP,place
42,geoid:16000US5452060,16000US5452060,St. John,St. John city,place
