# Census data download

Run this notebook to generate data files:
* `census_geo.*`
* `census_long.csv`

## Imports and setup

In [1]:
import cenpy
import geopandas
import pandas as pd

In [2]:
# This is the main product we get the data from.
# The survey year is pinned instead of relying on cenpy's default, so that a
# re-run keeps producing the 2019 5-year tables shown in the recorded output.
# The label parsing further down splits on ':!!', which matches how the labels
# of this vintage are formatted.
ACS_YEAR = 2019
acs = cenpy.products.ACS(ACS_YEAR)
acs

Connection to American Community Survey: 5-Year Estimates: Detailed Tables 5-Year(ID: https://api.census.gov/data/id/ACSDT5Y2019)
With MapServer: Census ACS 2019 WMS

In [3]:
# Variables belonging to group B15001 are the ones used in the rest of this
# notebook; change the group code here to explore other tables.
all_vars = acs.variables
census_vars = all_vars[all_vars['group'].eq('B15001')]
census_vars.head()

Unnamed: 0,label,concept,predicateType,group,limit,predicateOnly,hasGeoCollectionSupport,attributes,required
B15001_001E,Estimate!!Total:,SEX BY AGE BY EDUCATIONAL ATTAINMENT FOR THE P...,int,B15001,0,,,"B15001_001EA,B15001_001M,B15001_001MA",
B15001_002E,Estimate!!Total:!!Male:,SEX BY AGE BY EDUCATIONAL ATTAINMENT FOR THE P...,int,B15001,0,,,"B15001_002EA,B15001_002M,B15001_002MA",
B15001_003E,Estimate!!Total:!!Male:!!18 to 24 years:,SEX BY AGE BY EDUCATIONAL ATTAINMENT FOR THE P...,int,B15001,0,,,"B15001_003EA,B15001_003M,B15001_003MA",
B15001_004E,Estimate!!Total:!!Male:!!18 to 24 years:!!Less...,SEX BY AGE BY EDUCATIONAL ATTAINMENT FOR THE P...,int,B15001,0,,,"B15001_004EA,B15001_004M,B15001_004MA",
B15001_005E,Estimate!!Total:!!Male:!!18 to 24 years:!!9th ...,SEX BY AGE BY EDUCATIONAL ATTAINMENT FOR THE P...,int,B15001,0,,,"B15001_005EA,B15001_005M,B15001_005MA",


## Download California data

In [4]:
# Pull California at county level, requesting the B15001 variables chosen
# during the exploration above.
table_pattern = 'B15001_*'
census_df = acs.from_state('CA', variables=[table_pattern], level='county')
census_df.head(3)

  return self._from_name(state, variables, level, "States", **kwargs)


Unnamed: 0,GEOID,geometry,B15001_001E,B15001_002E,B15001_003E,B15001_004E,B15001_005E,B15001_006E,B15001_007E,B15001_008E,...,B15001_075E,B15001_076E,B15001_077E,B15001_078E,B15001_079E,B15001_080E,B15001_081E,B15001_082E,B15001_083E,NAME
0,6075,"MULTIPOLYGON (((-13649137.130 4553355.950, -13...",757415.0,386289.0,30164.0,203.0,1547.0,6170.0,11448.0,1157.0,...,20643.0,74161.0,15387.0,5708.0,13153.0,10295.0,3785.0,14763.0,11070.0,"San Francisco County, California"
1,6081,"POLYGON ((-13646334.440 4521323.060, -13646298...",607919.0,297536.0,29151.0,766.0,2244.0,8948.0,10812.0,1360.0,...,20555.0,68128.0,7093.0,3622.0,13966.0,13126.0,5451.0,15381.0,9489.0,"San Mateo County, California"
2,6001,"POLYGON ((-13612245.300 4538149.390, -13612345...",1312727.0,638603.0,69988.0,820.0,6170.0,20382.0,29477.0,2269.0,...,38154.0,125967.0,14577.0,9122.0,29319.0,22503.0,7612.0,24097.0,18737.0,"Alameda County, California"


## Save geodata out separately to save space

In [5]:
# Save the county geo info out separately as it is larger.
# One row per GEOID, keeping only the identifier, the shape and the county name.
# The explicit copy makes this an independent frame, so the GEOID conversion
# below does not write into a selection derived from census_df.
census_df_geo = (
    census_df
    .drop_duplicates('GEOID')[['GEOID', 'geometry', 'NAME']]
    .copy()
)

# Convert GEOID to int for smoother parsing later
census_df_geo['GEOID'] = census_df_geo['GEOID'].astype(int)
census_df_geo.head()

Unnamed: 0,GEOID,geometry,NAME
0,6075,"MULTIPOLYGON (((-13649137.130 4553355.950, -13...","San Francisco County, California"
1,6081,"POLYGON ((-13646334.440 4521323.060, -13646298...","San Mateo County, California"
2,6001,"POLYGON ((-13612245.300 4538149.390, -13612345...","Alameda County, California"
3,6039,"POLYGON ((-13370181.620 4462431.440, -13370113...","Madera County, California"
4,6107,"POLYGON ((-13225170.620 4271811.660, -13225307...","Tulare County, California"


In [6]:
# Write the county geometries to disk, leaving the row index out of the file.
geo_out_path = "census_geo.shp"
census_df_geo.to_file(geo_out_path, index=False)

### Check reading geodata back in

In [7]:
# Read the saved file back and confirm no rows were lost on the way to disk.
census_df_geo2 = geopandas.read_file("census_geo.shp")
assert len(census_df_geo2) == len(census_df_geo), \
    "row count changed in the census_geo.shp round trip"
census_df_geo2.head()

Unnamed: 0,GEOID,NAME,geometry
0,6075,"San Francisco County, California","MULTIPOLYGON (((-13649137.130 4553355.950, -13..."
1,6081,"San Mateo County, California","POLYGON ((-13646334.440 4521323.060, -13646298..."
2,6001,"Alameda County, California","POLYGON ((-13612245.300 4538149.390, -13612345..."
3,6039,"Madera County, California","POLYGON ((-13370181.620 4462431.440, -13370113..."
4,6107,"Tulare County, California","POLYGON ((-13225170.620 4271811.660, -13225307..."


## Convert population data to long format and save
Skip the totals and subtotals for simplicity, since they can easily be re-derived from the detailed rows.

In [8]:
# First step of the wide -> long reshaping: break every variable label into
# its ':!!'-separated pieces and count how many pieces each label yields.
label_col = census_vars['label']
census_var_splits = label_col.str.split(':!!')
n_splits = census_var_splits.map(len)
n_splits.value_counts()

4    70
3    10
2     2
1     1
Name: label, dtype: int64

In [9]:
# Keep only the labels that split into the maximum number of pieces, expand
# the pieces into columns, discard the leading piece and name the rest.
is_deepest = n_splits == n_splits.max()
full_splits_df = (
    census_vars['label'][is_deepest]
    .str.split(':!!', expand=True)
    .drop(columns=[0])
    .rename(columns={1: 'sex', 2: 'age', 3: 'education'})
)
full_splits_df.head(3)

Unnamed: 0,sex,age,education
B15001_004E,Male,18 to 24 years,Less than 9th grade
B15001_005E,Male,18 to 24 years,"9th to 12th grade, no diploma"
B15001_006E,Male,18 to 24 years,High school graduate (includes equivalency)


In [10]:
# Reshape to one row per (GEOID, variable), attach the sex/age/education
# columns by matching the variable code against full_splits_df's index, then
# drop the variable code itself and store the estimates as integers.
census_df_long = (
    census_df
    .drop(columns=['geometry', 'NAME'])
    .melt(id_vars=['GEOID'], value_name='estimate')
    .merge(full_splits_df, left_on='variable', right_index=True)
    .drop(columns='variable')
    .reset_index(drop=True)
    .astype({'estimate': 'int'})
)
census_df_long.head(3)

Unnamed: 0,GEOID,estimate,sex,age,education
0,6075,203,Male,18 to 24 years,Less than 9th grade
1,6081,766,Male,18 to 24 years,Less than 9th grade
2,6001,820,Male,18 to 24 years,Less than 9th grade


In [11]:
# Persist the long-format table, leaving the row index out of the file.
long_out_path = 'census_long.csv'
census_df_long.to_csv(long_out_path, index=False)

### Check reading data back in

In [12]:
# Read the saved CSV back and confirm it has the same number of rows and
# columns as the frame that was written. Only the shape is compared here,
# because column dtypes may differ once values are parsed back from text.
census_df_long2 = pd.read_csv("census_long.csv")
assert census_df_long2.shape == census_df_long.shape, \
    "shape changed in the census_long.csv round trip"
census_df_long2.head()

Unnamed: 0,GEOID,estimate,sex,age,education
0,6075,203,Male,18 to 24 years,Less than 9th grade
1,6081,766,Male,18 to 24 years,Less than 9th grade
2,6001,820,Male,18 to 24 years,Less than 9th grade
3,6039,237,Male,18 to 24 years,Less than 9th grade
4,6107,491,Male,18 to 24 years,Less than 9th grade
