In [4]:
# TRY TO GET ZIP CODES FROM THE CENSUS SITE AND CAPTURE ALL THE DATA YOU CAN TO BUILD A GEOCODE DATA SET

In [None]:
# explanation about how to build entity codes
# https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html

In [1]:
a = '''
Area Type                     Number of Digits    Example GEOID
State                         2                   48
County                        2+3=5               48201
County Subdivision            2+3+5=10            4820192975
Places                        2+5=7               4835000
Census Tract                  2+3+6=11            48201223100
Block Group                   2+3+6+1=12          4.82012E+11
Block                         2+3+6+4=15          4.82012E+14
Congressional District        2+2=4               902
State Legislative District    2+3=5               9033
ZCTA                          5                   20746
'''

In [2]:
import pandas as pd
from io import StringIO
import urllib.request, json 
import csv

In [3]:
# FUNCTIONS

In [149]:
def remove_qualifiers(place_name):
    s = place_name
    if("township" not in s and "Township" not in s and "Ashippun" not in s):
        s = s.replace("ship", " ")
    else:
        s = place_name.replace(' township', '').replace(' Township', '')
    s = place_name.replace(' town', '').replace(' Town', '')  
    s = s.replace(' city', '')
    s = s.replace(' village', '')
    s = s.replace(' CDP', '')
    s = s.replace("DeFuniak", "De Funiak")
    s = s.replace("AFB", "Air Force Base")
    s = s.replace("Balance of ", "")
    s = s.replace("St.", "Saint")
    s = s.replace("Ste.", "Sainte")
    s = s.replace("City of the ", "")
    s = s.replace("City of ", "")
    s = s.replace("Township of ", "")
    s = s.replace(" consolidated government", "")
    s = s.replace(" (balance)", "")
    s = s.replace(" municipality", "")
    s = s.replace(" unified government", "")
    s = s.replace(" borough", "")
    s = s.replace(" charter", "")
    return s.strip()

In [144]:
def get_type(place_name):
    if " town" in place_name or " Town" in place_name:
        return "town"
    elif " city" in place_name:
        return "city"
    elif " county" in place_name or " County" in place_name:
        return "county"
    elif " village" in place_name:
        return "village"
    elif " municipality" in place_name:
        return "municipality"
    elif " unified government" in place_name:
        return "unified government"
    elif " borough" in place_name:
        return "borough"
    elif " charter" in place_name:
        return "charter"
    elif " CDP" in place_name:
        return "cdp"

In [6]:
# STATES

In [52]:
state_response = urllib.request.urlopen(f"https://cbb.census.gov/arcgis/rest/services/Census_EMS/Census/MapServer/11/query?f=json&resultOffset=0&resultRecordCount=100&where=1=1&outFields=NAME,STATE&returnGeometry=false")
state_json_string = state_response.read().decode('utf-8')
state_json_obj = json.loads(state_json_string)
state_json_features = state_json_obj['features']

state_df = pd.DataFrame(columns=['STATE_NAME', 'FIPS_CODE'])
for sf in state_json_features:
    new_row = {
       'STATE_NAME':sf["attributes"]["NAME"],
       'FIPS_CODE':sf["attributes"]["STATE"]
    }   
    state_df = pd.concat([state_df, pd.DataFrame(new_row, index=[0])], ignore_index=True)

print(state_df)

                                      STATE_NAME FIPS_CODE
0                                  West Virginia        54
1                                        Florida        12
2                                       Illinois        17
3                                      Minnesota        27
4                                       Maryland        24
5                                   Rhode Island        44
6                                          Idaho        16
7                                  New Hampshire        33
8                                 North Carolina        37
9                                        Vermont        50
10                                   Connecticut        09
11                                      Delaware        10
12                                    New Mexico        35
13                                    California        06
14                                    New Jersey        34
15                                     Wisconsin        

In [8]:
# STATE ABBREVIATIONS

In [53]:
abbrev_response = urllib.request.urlopen(f"https://www2.census.gov/geo/docs/reference/state.txt")
abbrev_text = abbrev_response.read().decode('utf-8').replace('|', ',').strip()

with open('geo_state_abbrev.csv', 'w') as f:
    f.write(abbrev_text)

abbrev_df = pd.read_csv("geo_state_abbrev.csv")
print(abbrev_df)

    STATE STUSAB                   STATE_NAME  STATENS
0       1     AL                      Alabama  1779775
1       2     AK                       Alaska  1785533
2       4     AZ                      Arizona  1779777
3       5     AR                     Arkansas    68085
4       6     CA                   California  1779778
5       8     CO                     Colorado  1779779
6       9     CT                  Connecticut  1779780
7      10     DE                     Delaware  1779781
8      11     DC         District of Columbia  1702382
9      12     FL                      Florida   294478
10     13     GA                      Georgia  1705317
11     15     HI                       Hawaii  1779782
12     16     ID                        Idaho  1779783
13     17     IL                     Illinois  1779784
14     18     IN                      Indiana   448508
15     19     IA                         Iowa  1779785
16     20     KS                       Kansas   481813
17     21 

In [113]:
merge_df = state_df.merge(abbrev_df, on="STATE_NAME").sort_values('FIPS_CODE')
merge_df[["STATE_NAME", "FIPS_CODE", "STUSAB"]]

Unnamed: 0,STATE_NAME,FIPS_CODE,STUSAB
22,Alabama,1,AL
38,Alaska,2,AK
53,Arizona,4,AZ
41,Arkansas,5,AR
13,California,6,CA
26,Colorado,8,CO
10,Connecticut,9,CT
11,Delaware,10,DE
51,District of Columbia,11,DC
1,Florida,12,FL


In [None]:
# LOAD FEDCODES

In [131]:
fedcodes_df = pd.read_csv("fedcodes_data.csv")

In [None]:
# BUILD GEO DATA

In [134]:
# usable state codes
state_codes = [
    "01","02","04","05","06","08","09","10","11","12","13","15","16","17","18","19","20",
    "21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37",
    "38","39","40","41","42","44","45","46","47","48","49","50","51","53","54","55","56"
]

In [150]:
get_geometry = "true"
geo_df = pd.DataFrame()
    
for index, row in merge_df.iterrows():
    fips_code = row["FIPS_CODE"]
    if (fips_code not in state_codes):
        continue
        
    response = urllib.request.urlopen(f"https://cbb.census.gov/arcgis/rest/services/Census_EMS/Census/MapServer/4/query?f=json&resultOffset=0&resultRecordCount=1000&where=STATE='{fips_code}'&orderByFields=STATE,PLACE&outFields=GEOID,NAME,PLACE,STATE&returnGeometry={get_geometry}&spatialRel=esriSpatialRelIntersects")
    json_string = response.read().decode('utf-8')
    json_obj = json.loads(json_string)
    json_features = json_obj['features']
    
    for f in json_features:
        
        place_name = f["attributes"]["NAME"]
        place_type = get_type(place_name)
        if(place_type == "cdp"):
            continue
            
        geo_id = f["attributes"]["GEOID"]
        place_id = f["attributes"]["PLACE"]
        place_name_trimmed = remove_qualifiers(place_name)
        state_name = row["STATE_NAME"]
        state_abbrev = row["STUSAB"]
        
        cond_1 = fedcodes_df.feature_name == place_name
        cond_2 = fedcodes_df.feature_name == place_name_trimmed
        cond_3 = fedcodes_df.census_code == str(int(place_id))
        cond_4 = fedcodes_df.state_name == state_name
        match = fedcodes_df[(cond_1 | cond_2 | cond_3) & (cond_4)]
        
        # try to filter by place id
        if(match.shape[0] != 1):
            temp = match[match.census_code == place_id]
            if(temp.shape[0] > 0):
                match = temp
            
        # try to filter by place type
        if(match.shape[0] != 1):
            class_type = "Civil" if place_type == "county" else "Populated Place"
            temp = match[match.feature_class == class_type]
            if(temp.shape[0] > 0):
                match = temp
            
        if(match.shape[0] < 1):
            print(f"{match.shape[0]} - {geo_id} - {place_id} - {str(int(place_id))} - {place_name} - {place_name_trimmed} - {place_type} - {state_name} - {fips_code}")
        
        if(match.shape[0] > 0):
            county_code = match.iloc[0]['county_numeric']
        else:
            county_code = 0
        
        new_row = {
            'GEOID':geo_id,
            'PLACE_NAME':place_name_trimmed,
            'PLACE_ID':place_id,
            'FIPS_CODE':fips_code,
            'TYPE':get_type(place_name),
            'STATE_NAME':state_name,
            'STATE_ABBREVIATION':state_abbrev,
            'COUNTY_CODE':county_code
        }
        geo_df = pd.concat([geo_df, pd.DataFrame(new_row, index=[0])], ignore_index=True)

#print(geo_df.COUNTY_CODE)
geo_df.to_csv("geo_data.csv")

0 - 0205200 - 05200 - 5200 - Barrow city - Barrow - city - Alaska - 02
0 - 0602112 - 02112 - 2112 - Angels city - Angels - city - California - 06
0 - 0906820 - 06820 - 6820 - Bozrah town - Bozrah - town - Connecticut - 09
0 - 1304204 - 04204 - 4204 - Augusta-Richmond County consolidated government (balance) - Augusta-Richmond County - county - Georgia - 13
0 - 1608830 - 08830 - 8830 - Boise City city - Boise City - city - Idaho - 16
0 - 2005337 - 05337 - 5337 - Bel Aire city - Bel Aire - city - Kansas - 20
0 - 2201885 - 01885 - 1885 - Amite City town - Amite City - town - Louisiana - 22
0 - 2306260 - 06260 - 6260 - Bowdoin town - Bowdoin - town - Maine - 23
0 - 2507350 - 07350 - 7350 - Boxborough town - Boxborough - town - Massachusetts - 25
0 - 2546575 - 46575 - 46575 - North Attleborough town - North Attleborough - town - Massachusetts - 25
0 - 2600200 - 00200 - 200 - Acme township - Acmeship - town - Michigan - 26
0 - 2600240 - 00240 - 240 - Ada township - Adaship - town - Michigan 

0 - 3406640 - 06640 - 6640 - Boonton township - Boontonship - town - New Jersey - 34
0 - 3406700 - 06700 - 6700 - Bordentown township - Bordentownship - town - New Jersey - 34
0 - 3407180 - 07180 - 7180 - Branchburg township - Branchburgship - town - New Jersey - 34
0 - 3407420 - 07420 - 7420 - Brick township - Brickship - town - New Jersey - 34
0 - 3407720 - 07720 - 7720 - Bridgewater township - Bridgewatership - town - New Jersey - 34
0 - 3408710 - 08710 - 8710 - Buena Vista township - Buena Vistaship - town - New Jersey - 34
0 - 3408950 - 08950 - 8950 - Burlington township - Burlingtonship - town - New Jersey - 34
0 - 3409160 - 09160 - 9160 - Byram township - Byramship - town - New Jersey - 34
0 - 3601176 - 01176 - 1176 - Alexandria town - Alexandria - town - New York - 36
0 - 3602440 - 02440 - 2440 - Arcadia town (balance) - Arcadia - town - New York - 36
0 - 3603166 - 03166 - 3166 - Aurelius town - Aurelius - town - New York - 36
0 - 3603221 - 03221 - 3221 - Au Sable town - Au Sab