In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

from publicdata.census.files.metafiles import TableLookup
import shelve 

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for any figures.
sns.set_context('notebook')
# Metapack's Jupyter initialisation hook; what it configures is not visible
# from this notebook -- TODO confirm against the metapack docs.
mp.jupyter.init()


In [2]:
# Open the package this notebook works on. The commented-out line is the
# alternative opener kept by the author; presumably it opens the built
# package rather than the source package -- TODO confirm.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

# Set up importing from this package. This has to run before the pylib
# imports below (presumably it adds the package directory to sys.path).
pkg.set_sys_path()
import pylib # Import the code for the package
# NOTE(review): the star import hides where helper names come from. Later
# cells call run_path_cleaning, add_rest_str and make_restricted_description,
# which are presumably defined in this module -- confirm, and consider
# importing them by name.
from pylib.census_path_processing import * 

# Bare expression: display the package's notebook representation.
pkg

In [3]:
# Parameters passed to TableLookup.
# NOTE(review): presumably the 2020 ACS 5-year release -- confirm against
# TableLookup's signature, which is not visible here.
year = 2020
release = 5

# Lookup object; its tables_df and columns_df attributes are read in the next cell.
tl = TableLookup(year, release)

In [4]:
# Table-level metadata: keep the title variants, universe, subject and the
# demographic dimensions, exposing 'race' under the name 'raceeth'.
tdf = (
    tl.tables_df
    .loc[:, [
        'table_id',
        'title', 'bare_title', 'stripped_title',
        'universe', 'subject',
        'race', 'age', 'sex',
    ]]
    .copy()
    .rename(columns={'race': 'raceeth'})
)

# Column-level metadata: keep the identifiers and demographic dimensions,
# exposing 'uid' as 'column_id' and 'filtered_pathname' as 'path'.
cdf = (
    tl.columns_df
    .loc[:, [
        'uid', 'table_id', 'name',
        'sex', 'raceeth', 'age', 'poverty_status',
        'filtered_pathname',
    ]]
    .rename(columns={'filtered_pathname': 'path', 'uid': 'column_id'})
)

tdf.head()

Unnamed: 0,table_id,title,bare_title,stripped_title,universe,subject,raceeth,age,sex
0,B01001,Sex By Age,Sex By Age,Population,Total population,Age-Sex,all,all,all
1,B01001A,Sex By Age (White Alone),Sex By Age,Population,People who are White alone,Age-Sex,white,all,all
2,B01001B,Sex By Age (Black Or African American Alone),Sex By Age,Population,Black or African American alone,Age-Sex,black,all,all
3,B01001C,Sex By Age (American Indian And Alaska Native ...,Sex By Age,Population,People who are American Indian and Alaska Nati...,Age-Sex,aian,all,all
4,B01001D,Sex By Age (Asian Alone),Sex By Age,Population,People who are Asian alone,Age-Sex,asian,all,all


In [5]:
# Preview the first rows of the trimmed column metadata.
cdf.head()

Unnamed: 0,column_id,table_id,name,sex,raceeth,age,poverty_status,path
0,B01001_001,B01001,Total,all,all,all,all,
1,B01001_002,B01001,Male,male,all,all,all,
2,B01001_003,B01001,Under 5 years,male,all,000-005,all,
3,B01001_004,B01001,5 to 9 years,male,all,005-009,all,
4,B01001_005,B01001,10 to 14 years,male,all,010-014,all,


In [6]:
#
# Clean the pathnames
#

# The cleaning pass is switched off by the literal guard below; change it to
# True to run it. The loop calls run_path_cleaning() until it raises
# StopIteration, and gives up once more than three calls in a row have failed.
if False:
    consecutive_err = 0
    while True:
        try:
            run_path_cleaning()
        except StopIteration:
            print("Done")
            break
        except Exception as err:
            # Best effort: report the failure and try again, unless the
            # run of back-to-back failures has grown too long.
            consecutive_err += 1
            print(err)
            if consecutive_err > 3:
                print("Too many errors")
                break
        else:
            # A successful call ends the current run of failures.
            consecutive_err = 0


# Convert the shelf database into a DataFrame so that it can be saved in the
# package and used as a cache by future builds.
# The saved dataset is not loaded anywhere yet.
with shelve.open('census_paths') as db:
    paths_df = pd.DataFrame(data=db.values())

paths_df.head()


Unnamed: 0,unique_id,path,name,description
0,B05007,/latin america/central america/mexico/entered ...,mexico_2010_later,People from Mexico who entered the U.S. after ...
1,B25129,/renter occupied/moved in 1990 to 1999,renter_moved_1990_1999,Renter occupied who moved in 1990 to 1999
2,B27011,/in labor force/employed/with health insurance...,employed_with_health_insurance,People who are employed and have health insura...
3,B26103F,/group quarters population/institutionalized g...,institutionalized_group_quarters_population,People who are living in institutionalized gro...
4,B26208,/group quarters population/noninstitutionalize...,group_quarters_population_noninstitutionalized...,People who live in noninstitutionalized group ...


In [7]:

# Read the cleaned path records from the shelf. Only the read needs the
# shelf, so the handle is released before the metadata frame is built
# instead of being held open for the whole transform.
with shelve.open('census_paths') as db:
    print(len(db))
    pdf = pd.DataFrame(db.values()).rename(
        columns={
            "unique_id": "table_id",
            "path": "filtered_path",
            "name": "path_name",
        }
    )

# Build the per-column metadata frame from the table and column frames.
# Judging by the displayed result, it carries the table fields, the column
# fields and a 'restriction_str' column -- confirm against add_rest_str.
mdf = add_rest_str(tdf, cdf)

# NOTE(review): pdf is not joined in yet; the author's intended merge is kept
# below for reference. Until it is enabled, pdf is unused by this cell.
#mdf = mdf.merge(pdf, on=["table_id", "filtered_path"], how="left").copy()

# Description of each column's restriction, computed row by row.
mdf["rest_description"] = mdf.apply(make_restricted_description, axis=1)

# Full column description: "<stripped table title> for <restriction>".
mdf["col_desc"] = mdf.stripped_title + " for " + mdf.rest_description

# NOTE(review): cdf already exposes 'column_id', so this rename is presumably
# a no-op kept as a safeguard -- confirm.
metadata_df = mdf.rename(columns={'uid':'column_id'})

# NOTE(review): in the displayed output, B01001_001 (Total, age 'all') is
# described as "adults" and ages 10 to 14 as "male children" while ages
# 0 to 5 are "males" -- verify make_restricted_description.
metadata_df.head()

2153


Unnamed: 0,table_id,title,bare_title,stripped_title,universe,subject,table_raceeth,table_age,table_sex,column_id,name,sex,raceeth,age,poverty_status,path,restriction_str,rest_description,col_desc
0,B01001,Sex By Age,Sex By Age,Population,Total population,Age-Sex,all,all,all,B01001_001,Total,all,all,all,all,,adults,adults,Population for adults
1,B01001,Sex By Age,Sex By Age,Population,Total population,Age-Sex,all,all,all,B01001_002,Male,male,all,all,all,,men,men,Population for men
2,B01001,Sex By Age,Sex By Age,Population,Total population,Age-Sex,all,all,all,B01001_003,Under 5 years,male,all,000-005,all,,"males, ages 0 to 5","males, ages 0 to 5","Population for males, ages 0 to 5"
3,B01001,Sex By Age,Sex By Age,Population,Total population,Age-Sex,all,all,all,B01001_004,5 to 9 years,male,all,005-009,all,,"males, ages 5 to 9","males, ages 5 to 9","Population for males, ages 5 to 9"
4,B01001,Sex By Age,Sex By Age,Population,Total population,Age-Sex,all,all,all,B01001_005,10 to 14 years,male,all,010-014,all,,"male children, ages 10 to 14","male children, ages 10 to 14","Population for male children, ages 10 to 14"


In [8]:
# Transposed preview of the table metadata: one row per field, one column per table.
tdf.head().T

Unnamed: 0,0,1,2,3,4
table_id,B01001,B01001A,B01001B,B01001C,B01001D
title,Sex By Age,Sex By Age (White Alone),Sex By Age (Black Or African American Alone),Sex By Age (American Indian And Alaska Native ...,Sex By Age (Asian Alone)
bare_title,Sex By Age,Sex By Age,Sex By Age,Sex By Age,Sex By Age
stripped_title,Population,Population,Population,Population,Population
universe,Total population,People who are White alone,Black or African American alone,People who are American Indian and Alaska Nati...,People who are Asian alone
subject,Age-Sex,Age-Sex,Age-Sex,Age-Sex,Age-Sex
raceeth,all,white,black,aian,asian
age,all,all,all,all,all
sex,all,all,all,all,all


In [9]:
# Transposed preview of the column metadata: one row per field, one column per record.
cdf.head().T

Unnamed: 0,0,1,2,3,4
column_id,B01001_001,B01001_002,B01001_003,B01001_004,B01001_005
table_id,B01001,B01001,B01001,B01001,B01001
name,Total,Male,Under 5 years,5 to 9 years,10 to 14 years
sex,all,male,male,male,male
raceeth,all,all,all,all,all
age,all,all,000-005,005-009,010-014
poverty_status,all,all,all,all,all
path,,,,,
