In [1]:
# import pandas as a dependency
import pandas as pd

In [2]:
# import csv into a dataframe
geo_df = pd.read_csv("census_block.csv", encoding='latin 1')

In [3]:
# show me what you got
geo_df.head()

Unnamed: 0,GIDBG,State,State_name,County,County_name,Tract,Block_Group
0,10010200000.0,1,Alabama,1,Autauga County,20100,1
1,10010200000.0,1,Alabama,1,Autauga County,20100,2
2,10010200000.0,1,Alabama,1,Autauga County,20200,1
3,10010200000.0,1,Alabama,1,Autauga County,20200,2
4,10010200000.0,1,Alabama,1,Autauga County,20300,1


In [4]:
# add preceding zeros to the columns in order to create a 12 digits census block group code

geo_df['State'] = geo_df['State'].apply(lambda x: '{0:0>2}'.format(x))
geo_df['County'] = geo_df['County'].apply(lambda x: '{0:0>3}'.format(x))
geo_df['Tract'] = geo_df['Tract'].apply(lambda x: '{0:0>6}'.format(x))

In [5]:
# create census block group
geo_df['Census_Block_Group'] = geo_df['State'].astype(str) + geo_df['County'].astype(str) + geo_df['Tract'].astype(str) + geo_df['Block_Group'].astype(str)

In [6]:
# show me what you got
geo_df.head()

Unnamed: 0,GIDBG,State,State_name,County,County_name,Tract,Block_Group,Census_Block_Group
0,10010200000.0,1,Alabama,1,Autauga County,20100,1,10010201001
1,10010200000.0,1,Alabama,1,Autauga County,20100,2,10010201002
2,10010200000.0,1,Alabama,1,Autauga County,20200,1,10010202001
3,10010200000.0,1,Alabama,1,Autauga County,20200,2,10010202002
4,10010200000.0,1,Alabama,1,Autauga County,20300,1,10010203001


In [7]:
# create 5 digit FIPS code

geo_df['FIPS'] = geo_df['State'].astype(str) + geo_df['County'].astype(str)

In [8]:
geo_df.head()

Unnamed: 0,GIDBG,State,State_name,County,County_name,Tract,Block_Group,Census_Block_Group,FIPS
0,10010200000.0,1,Alabama,1,Autauga County,20100,1,10010201001,1001
1,10010200000.0,1,Alabama,1,Autauga County,20100,2,10010201002,1001
2,10010200000.0,1,Alabama,1,Autauga County,20200,1,10010202001,1001
3,10010200000.0,1,Alabama,1,Autauga County,20200,2,10010202002,1001
4,10010200000.0,1,Alabama,1,Autauga County,20300,1,10010203001,1001


In [9]:
# create 11 digit census tract

geo_df['Census_Tract'] = geo_df['State'].astype(str) + geo_df['County'].astype(str) + geo_df['Tract'].astype(str)


In [10]:
# show me what you got
geo_df.head()

Unnamed: 0,GIDBG,State,State_name,County,County_name,Tract,Block_Group,Census_Block_Group,FIPS,Census_Tract
0,10010200000.0,1,Alabama,1,Autauga County,20100,1,10010201001,1001,1001020100
1,10010200000.0,1,Alabama,1,Autauga County,20100,2,10010201002,1001,1001020100
2,10010200000.0,1,Alabama,1,Autauga County,20200,1,10010202001,1001,1001020200
3,10010200000.0,1,Alabama,1,Autauga County,20200,2,10010202002,1001,1001020200
4,10010200000.0,1,Alabama,1,Autauga County,20300,1,10010203001,1001,1001020300


In [11]:
# create columns with no preceding zeros (some data sources omit the preceding zero with FIPS, CBG, and Census Tract)
geo_df['No_Zero_FIPS'] = [x.lstrip("0") for x in geo_df['FIPS']]
geo_df['No_Zero_CBG'] = [x.lstrip("0") for x in geo_df['Census_Block_Group']]
geo_df['No_Zero_Census_Tract'] = [x.lstrip("0") for x in geo_df['Census_Tract']]

In [12]:
# replace Puerto Rico Commonwewealth with Puerto Rico for a later merge

geo_df.replace({'Puerto Rico Commonwealth': 'Puerto Rico'})

Unnamed: 0,GIDBG,State,State_name,County,County_name,Tract,Block_Group,Census_Block_Group,FIPS,Census_Tract,No_Zero_FIPS,No_Zero_CBG,No_Zero_Census_Tract
0,1.001020e+10,01,Alabama,001,Autauga County,020100,1,010010201001,01001,01001020100,1001,10010201001,1001020100
1,1.001020e+10,01,Alabama,001,Autauga County,020100,2,010010201002,01001,01001020100,1001,10010201002,1001020100
2,1.001020e+10,01,Alabama,001,Autauga County,020200,1,010010202001,01001,01001020200,1001,10010202001,1001020200
3,1.001020e+10,01,Alabama,001,Autauga County,020200,2,010010202002,01001,01001020200,1001,10010202002,1001020200
4,1.001020e+10,01,Alabama,001,Autauga County,020300,1,010010203001,01001,01001020300,1001,10010203001,1001020300
5,1.001020e+10,01,Alabama,001,Autauga County,020300,2,010010203002,01001,01001020300,1001,10010203002,1001020300
6,1.001020e+10,01,Alabama,001,Autauga County,020400,1,010010204001,01001,01001020400,1001,10010204001,1001020400
7,1.001020e+10,01,Alabama,001,Autauga County,020400,2,010010204002,01001,01001020400,1001,10010204002,1001020400
8,1.001020e+10,01,Alabama,001,Autauga County,020400,3,010010204003,01001,01001020400,1001,10010204003,1001020400
9,1.001020e+10,01,Alabama,001,Autauga County,020400,4,010010204004,01001,01001020400,1001,10010204004,1001020400


In [13]:
# clean up and re-order columns

geo_df = geo_df[['State', 'State_name', 'County', 'County_name',
                  'FIPS', 'No_Zero_FIPS', 'Tract', 'Census_Tract',
                  'No_Zero_Census_Tract', 'Census_Block_Group', 'No_Zero_CBG',
                  'Block_Group']]

In [14]:
# URL to scrape to add state abbreviations
url = 'https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations'

In [15]:
# scrape the URL
tables = pd.read_html(url)

In [16]:
#locate the correct table and set it to a dataframe
df_state = tables[0]

In [17]:
# remove the first rows because of unnecessary data
df_state = df_state.drop(df_state.index[0:12])
df_state.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
12,Alabama,State,US-AL,AL,1,AL,AL,Ala.,Ala.,,,,,,
13,Alaska,State,US-AK,AK,2,AK,AK,Alaska,Alaska,Alas.,,,,,
14,Arizona,State,US-AZ,AZ,4,AZ,AZ,Ariz.,Ariz.,Az.,,,,,
15,Arkansas,State,US-AR,AR,5,AR,AR,Ark.,Ark.,,,,,,
16,California,State,US-CA,CA,6,CA,CF,Calif.,Calif.,"Ca., Cal.",,,,,


In [18]:
# rename numbered columns to match the table columns from the wikipedia
df_state.rename(columns={ df_state.columns[0]: "State",
                        df_state.columns[2]: "ISO",
                        df_state.columns[3]: "ANSI",
                        df_state.columns[5]: "USPS",
                        df_state.columns[6]: "USCG",
                        df_state.columns[7]: "GPO",
                        df_state.columns[8]: "AP",}, inplace=True)

In [19]:
df_state.head(5)

Unnamed: 0,State,1,ISO,ANSI,4,USPS,USCG,GPO,AP,9,10,11,12,13,14
12,Alabama,State,US-AL,AL,1,AL,AL,Ala.,Ala.,,,,,,
13,Alaska,State,US-AK,AK,2,AK,AK,Alaska,Alaska,Alas.,,,,,
14,Arizona,State,US-AZ,AZ,4,AZ,AZ,Ariz.,Ariz.,Az.,,,,,
15,Arkansas,State,US-AR,AR,5,AR,AR,Ark.,Ark.,,,,,,
16,California,State,US-CA,CA,6,CA,CF,Calif.,Calif.,"Ca., Cal.",,,,,


In [20]:
# mass remove columns from dataframe with unnecessary or null data
cols = [1,4,9,10,11,12,13,14]
df_state.drop(df_state.columns[cols],axis=1,inplace=True)

In [21]:
# reset the index
df_state = df_state.reset_index(drop=True)

In [22]:
# remove rows for non-states and reset the index so that puerto rico doesn't look out of place
df_state = df_state.drop(df_state.index[55:85])
df_state = df_state.drop(df_state.index[51:54])
df_state = df_state.reset_index(drop=True)

In [23]:
df_state.head()

Unnamed: 0,State,ISO,ANSI,USPS,USCG,GPO,AP
0,Alabama,US-AL,AL,AL,AL,Ala.,Ala.
1,Alaska,US-AK,AK,AK,AK,Alaska,Alaska
2,Arizona,US-AZ,AZ,AZ,AZ,Ariz.,Ariz.
3,Arkansas,US-AR,AR,AR,AR,Ark.,Ark.
4,California,US-CA,CA,CA,CF,Calif.,Calif.


In [24]:
# merge geo_df with state_df

merge_df = geo_df.merge(df_state, left_on='State_name', right_on='State')

In [25]:
merge_df.head()

Unnamed: 0,State_x,State_name,County,County_name,FIPS,No_Zero_FIPS,Tract,Census_Tract,No_Zero_Census_Tract,Census_Block_Group,No_Zero_CBG,Block_Group,State_y,ISO,ANSI,USPS,USCG,GPO,AP
0,1,Alabama,1,Autauga County,1001,1001,20100,1001020100,1001020100,10010201001,10010201001,1,Alabama,US-AL,AL,AL,AL,Ala.,Ala.
1,1,Alabama,1,Autauga County,1001,1001,20100,1001020100,1001020100,10010201002,10010201002,2,Alabama,US-AL,AL,AL,AL,Ala.,Ala.
2,1,Alabama,1,Autauga County,1001,1001,20200,1001020200,1001020200,10010202001,10010202001,1,Alabama,US-AL,AL,AL,AL,Ala.,Ala.
3,1,Alabama,1,Autauga County,1001,1001,20200,1001020200,1001020200,10010202002,10010202002,2,Alabama,US-AL,AL,AL,AL,Ala.,Ala.
4,1,Alabama,1,Autauga County,1001,1001,20300,1001020300,1001020300,10010203001,10010203001,1,Alabama,US-AL,AL,AL,AL,Ala.,Ala.


In [26]:
# rename original state_x as state and dropping the duplicate state_y column

merge_df = merge_df.rename(columns={'State_x': 'State'})
merge_df = merge_df.drop(['State_y'], axis=1)

In [27]:
merge_df.head()

Unnamed: 0,State,State_name,County,County_name,FIPS,No_Zero_FIPS,Tract,Census_Tract,No_Zero_Census_Tract,Census_Block_Group,No_Zero_CBG,Block_Group,ISO,ANSI,USPS,USCG,GPO,AP
0,1,Alabama,1,Autauga County,1001,1001,20100,1001020100,1001020100,10010201001,10010201001,1,US-AL,AL,AL,AL,Ala.,Ala.
1,1,Alabama,1,Autauga County,1001,1001,20100,1001020100,1001020100,10010201002,10010201002,2,US-AL,AL,AL,AL,Ala.,Ala.
2,1,Alabama,1,Autauga County,1001,1001,20200,1001020200,1001020200,10010202001,10010202001,1,US-AL,AL,AL,AL,Ala.,Ala.
3,1,Alabama,1,Autauga County,1001,1001,20200,1001020200,1001020200,10010202002,10010202002,2,US-AL,AL,AL,AL,Ala.,Ala.
4,1,Alabama,1,Autauga County,1001,1001,20300,1001020300,1001020300,10010203001,10010203001,1,US-AL,AL,AL,AL,Ala.,Ala.
