In [1]:
import pandas as pd
import numpy as np
import geocoder
import requests
import time
from geopy.geocoders import Nominatim
import json 
from sqlalchemy.sql import text
import boto3

from File_Utilities import FileTools
import DB_Utilities
# Shared database helper instance. The cells below call get_row_count,
# insert_location_lookup_cache, insert_df and get_df on it.
# NOTE: this rebinds the name `DBTools` from the class to an instance of it.
DBTools = DB_Utilities.DBTools()  # instantiate the class
# Presumably the base directory FileTools reads from / writes to - confirm in File_Utilities.
FileTools.MYDIR = "./data/"

In [2]:
# Geocoding back ends: the AWS Location client and the geopy Nominatim geocoder.
client = boto3.client('location')
geolocator = Nominatim(user_agent="UVaPatent22Capstone_DataScience")

# Placeholder result returned whenever a lookup is skipped or fails: every field is "".
null_response = dict.fromkeys(
    (
        'original_state',
        'original_city',
        'county_GEOID',
        'county_fip',
        'county_BASENAME',
        'state_fip',
        'state_BASENAME',
        'state_STUSAB',
        'city_BASENAME',
        'city_NAME',
        'status',
    ),
    "",
)


def get_coordinates_aws(city, state):
    """Geocode ``"{city}, {state}"`` with the AWS Location Service place index.

    This costs money: every call is a billable AWS request.

    Args:
        city: City name to search for.
        state: State name or abbreviation to search for.

    Returns:
        A 4-tuple ``(success, x, y, response)``.

        * On success ``x`` is ``Point[0]`` and ``y`` is ``Point[1]`` of the first
          result, both converted to ``str``, and ``response`` is the raw AWS
          response. AWS documents ``Point`` as ``[longitude, latitude]``, so
          ``x`` is the longitude and ``y`` the latitude.
        * On any failure (service error, no results, unexpected payload) the
          tuple is ``(False, "", "", {})``.
    """
    try:
        result = client.search_place_index_for_text(
            FilterCountries=['USA'],
            IndexName='City_State_lookup',
            Text=f'{city}, {state}',
            MaxResults=3,
        )
        # Only the best (first) match is used; an empty result list raises
        # IndexError and falls through to the failure return below.
        point = result['Results'][0]['Place']['Geometry']['Point']
        return True, str(point[0]), str(point[1]), result
    except Exception as e:
        # Best effort: report the problem and let the caller skip this location.
        # `Exception` (not a bare except) so Ctrl-C still interrupts a long run.
        print(e)
        return False, "", "", {}


def get_coordinates_Nominatim(city, state):
    """Geocode ``"{city} {state}"`` with the free Nominatim service.

    Nominatim requests that we only make 1 request per second, otherwise we
    might get blocked, so this function always sleeps for one second after the
    request, whether it succeeded or not.

    Args:
        city: City name to search for.
        state: State name or abbreviation to search for.

    Returns:
        A 4-tuple ``(success, x, y, candidates)``.

        * On success ``x`` is the best match's longitude and ``y`` its latitude,
          both as ``str``; ``candidates`` is the list of raw Nominatim records
          (``Location.raw``) for every match returned.
        * On failure or when nothing matched: ``(False, "", "", {})``.
    """
    try:
        # exactly_one=False makes geopy return a *list* of Location objects
        # (or None when nothing matched), so the best match is the first item.
        candidates = geolocator.geocode(f'{city} {state}', exactly_one=False, limit=3, addressdetails=True)
        if not candidates:
            return False, "", "", {}
        best = candidates[0]
        # NOTE(review): the raw dicts are returned instead of Location objects on
        # the assumption that the cache layer serializes this value - confirm.
        return True, str(best.longitude), str(best.latitude), [candidate.raw for candidate in candidates]
    except Exception as e:
        print(e)
        return False, "", "", {}
    finally:
        # Throttle on every path, including errors and early returns.
        time.sleep(1)

def get_county_information_from_census(lat, long, city, state):
    """Look up county/state/subdivision details for a coordinate via the Census geocoder.

    Args:
        lat: Sent to the Census API as ``x``. Despite the name, the API's ``x``
            is the longitude (see the example URLs in this notebook, e.g.
            ``x=-70.1441014&y=44.671539``).
        long: Sent to the Census API as ``y`` (the latitude).
        city: Original city name; echoed back as ``original_city``.
        state: Original state value; echoed back as ``original_state`` and
            compared with the returned ``STUSAB`` to compute ``status``.

    Returns:
        On success, a dict with ``original_state``, ``original_city``,
        ``county_GEOID``, ``county_fip``, ``county_BASENAME``, ``state_fip``,
        ``state_BASENAME``, ``state_STUSAB``, ``city_BASENAME``, ``city_NAME``,
        ``form_url`` (browser-viewable version of the request) and ``status``
        (``True`` when the returned state abbreviation equals ``state``).
        On any failure (network error, non-200 response, missing fields) a
        fresh copy of ``null_response``.
    """
    form_url = f'https://geocoding.geo.census.gov/geocoder/geographies/coordinates?x={lat}&y={long}&benchmark=4&vintage=4'
    url = f'{form_url}&format=json'

    try:
        # A timeout keeps one stalled request from hanging a long batch run.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            raise ValueError(f'Census geocoder returned HTTP {response.status_code}')
        geographies = response.json()['result']['geographies']

        # Only the first entry of each geography list is used.
        county = geographies['Counties'][0]
        state_record = geographies['States'][0]
        subdivision = geographies['County Subdivisions'][0]

        census_object = {
            'original_state': state,
            'original_city': city,
            'county_GEOID': county['GEOID'],
            'county_fip': county['COUNTY'],
            'county_BASENAME': county['BASENAME'],
            'state_fip': state_record['STATE'],
            'state_BASENAME': state_record['BASENAME'],
            'state_STUSAB': state_record['STUSAB'],
            'city_BASENAME': subdivision['BASENAME'],
            'city_NAME': subdivision['NAME'],
            'form_url': form_url,
            'status': state_record['STUSAB'] == state,
        }
    except Exception as e:
        # Best effort: a failed lookup yields the placeholder instead of aborting.
        # Return a copy so callers can never mutate the shared module-level dict.
        print(f'Census lookup failed for {city}, {state}: {e}')
        return dict(null_response)

    city_BASENAME = census_object['city_BASENAME']
    state_BASENAME = census_object['state_BASENAME']
    county_BASENAME = census_object['county_BASENAME']
    county_GEOID = census_object['county_GEOID']
    print(f'{city_BASENAME} , {state_BASENAME} , {county_BASENAME} , {county_GEOID}')

    return census_object



def geocode_lat_long(lat, long, city="", state=""):
    """Resolve an already-known coordinate to county data and cache the result.

    Skips the lookup when a row for ``city``/``state`` is already present in
    the ``aws_lookup_cache`` table.

    Args:
        lat: Forwarded to ``get_county_information_from_census`` as its first
            coordinate, which that function sends to the Census API as ``x``.
        long: Forwarded as the second coordinate (sent as ``y``).
        city: City name used as part of the cache key.
        state: State value used as part of the cache key.

    Returns:
        The Census lookup result for a new location, or a copy of
        ``null_response`` when the location is already cached.
    """
    # Double any single quotes so names such as "O'Fallon" do not break the
    # hand-built SQL filter.
    safe_city = str(city).replace("'", "''")
    safe_state = str(state).replace("'", "''")
    cache_filter = f"city = '{safe_city}' and state = '{safe_state}'"

    if DBTools.get_row_count("aws_lookup_cache", cache_filter) != 0:
        print(f"{city}, {state} already exists in the cache")
        return dict(null_response)

    print(f"Geocoding {lat}, {long}")
    census_lookup_result = get_county_information_from_census(lat, long, city, state)
    # No geocoder was called here, so an empty geocode response is stored.
    DBTools.insert_location_lookup_cache(city=city,
                                         state=state,
                                         geocode_response={},
                                         census_lookup_result=census_lookup_result,
                                         lat=lat,
                                         long=long)
    return census_lookup_result


def geocode_city_state(city, state, geocoder_service='aws'):
    """Geocode a city/state pair, resolve it to county data, and cache the result.

    Skips all work when a row for ``city``/``state`` is already present in the
    ``aws_lookup_cache`` table.

    Args:
        city: City name to geocode.
        state: State value to geocode.
        geocoder_service: ``'aws'`` uses ``get_coordinates_aws``; any other
            value uses ``get_coordinates_Nominatim``.

    Returns:
        The Census lookup result for a newly geocoded location, or a copy of
        ``null_response`` when the location is already cached or the geocoder
        could not find it. Failed geocodes are not written to the cache.
    """
    # Double any single quotes so names such as "O'Fallon" do not break the
    # hand-built SQL filter.
    safe_city = str(city).replace("'", "''")
    safe_state = str(state).replace("'", "''")
    cache_filter = f"city = '{safe_city}' and state = '{safe_state}'"

    if DBTools.get_row_count("aws_lookup_cache", cache_filter) != 0:
        print(f"{city}, {state} already exists in the cache")
        return dict(null_response)

    print(f"Geocoding {city}, {state}")
    if geocoder_service == 'aws':
        geocode_success, lat, long, geocode_response = get_coordinates_aws(city, state)
    else:
        geocode_success, lat, long, geocode_response = get_coordinates_Nominatim(city, state)

    if not geocode_success:
        return dict(null_response)

    census_lookup_result = get_county_information_from_census(lat, long, city, state)
    DBTools.insert_location_lookup_cache(city=city,
                                         state=state,
                                         geocode_response=geocode_response,
                                         census_lookup_result=census_lookup_result,
                                         lat=lat,
                                         long=long)
    return census_lookup_result

        
    # To see the results in an html form.
    # https://geocoding.geo.census.gov/geocoder/geographies/coordinates?x=-70.1441014&y=44.671539&benchmark=4&vintage=4
    # https://geocoding.geo.census.gov/geocoder/geographies/coordinates?x=-89.7134&y=45.6038&benchmark=4&vintage=4
    	

# census_lookup_result = geocode_city_state('Parkridge','NJ', geocoder_service='aws')
# Using the Nominatim geocoder: it doesn't do as good a job of fuzzy matching. There might be a way to get it to match better, and it is free.
# census_lookup_result = geocode_city_state('Parkridge','NJ', geocoder_service='Nominatim')

In [None]:
# Smoke test for the cache insert (could be promoted to a unit test).
# With `test` set to True, four placeholder rows are passed to
# DBTools.insert_location_lookup_cache.
test = False
if test:
    null_response = dict.fromkeys(
        ['original_state', 'original_city', 'county_GEOID', 'county_fip', 'county_BASENAME',
         'state_fip', 'state_BASENAME', 'state_STUSAB', 'city_BASENAME', 'city_NAME', 'status'],
        "",
    )
    test_list = [
        {
            'city': city_name,
            'state': state_name,
            "geocode_response": {"something": "something"},
            "census_lookup_result": null_response,
        }
        for city_name, state_name in (
            ('city', 'state'),
            ('city1', 'state1'),
            ('city2', 'state2'),
            ('city3', 'state3'),
        )
    ]

    for i in test_list:
        city, state = i['city'], i['state']
        geocode_response, census_lookup_result = i['geocode_response'], i['census_lookup_result']
        DBTools.insert_location_lookup_cache(
            city=city,
            state=state,
            geocode_response=geocode_response,
            census_lookup_result=census_lookup_result,
            lat="123.456",
            long="456.789",
        )


In [None]:
# All the locations that we need from our data work
pregrant_locations_file = r"./data/pregrant/pregrant_locations.csv"
_pregrant_locations_df = pd.read_csv(pregrant_locations_file)

# All the locations
location_file = r"./data/pregrant/location.tsv"
_location_df = pd.read_csv(location_file, sep='\t')


def _zero_pad_fips(codes, width):
    """Return ``codes`` as zero-padded strings of ``width`` characters.

    Missing codes become "". A trailing ".0" (left behind when pandas reads the
    codes as floats) is stripped before padding.
    """
    filled = codes.fillna("")
    # Raw string + explicit regex=True: str.replace is no longer a regex
    # replace by default in pandas 2.x; the pattern is anchored so only a
    # trailing ".0" is removed.
    as_text = filled.astype(str).str.replace(r"\.0$", "", regex=True).str.zfill(width)
    # Keep originally-missing codes blank instead of padding them to "00...".
    return as_text.where(filled != "", "")


# FIPS codes lose their leading zeros when read as numbers; restore them as
# strings (2 characters for state, 5 for county).
_location_df['state_fips'] = _zero_pad_fips(_location_df['state_fips'], 2)
_location_df['county_fips'] = _zero_pad_fips(_location_df['county_fips'], 5)
DBTools.insert_df(_location_df, "pregrant_location_unfiltered")

# _location_df.query("county_fips != ''").sort_values(by=['county_fips'], ascending=True)

In [None]:
# Narrow the work list: join every pregrant location to its location record, then
# keep only US rows whose county_fips is blank while city and state are not blank.
# NOTE(review): NaN city/state values also satisfy `!= ''` - confirm that is intended.
reduced_location_df = _pregrant_locations_df.merge(
    _location_df,
    how='left',
    left_on=['location_id'],
    right_on=['id'],
)
print(reduced_location_df.shape)

reduced_location_df = reduced_location_df.query(
    "country == 'US' & county_fips == '' & city != '' & state != '' ",
    engine="python",
)
print(reduced_location_df.shape)

In [None]:
# Spot-check a single location record by its id.
_location_df.loc[_location_df["id"] == 'baa6fcdc-cb8e-11eb-9615-121df0c29c1e']
# _location_df.head()

In [None]:
# The full pregrant/location join is needed later to join the geocoded results back.
# Guarded by `reinsert` so a normal run does not call insert_df again.
reinsert = False
if reinsert:
    pregrant_location_all_df = _pregrant_locations_df.merge(
        _location_df,
        how='left',
        left_on=['location_id'],
        right_on=['id'],
    )
    DBTools.insert_df(pregrant_location_all_df, "pregrant_location_all")

In [None]:
# Actually run the geocoder over every remaining location. geocode_city_state
# persists each result itself, so the return values are not kept here; the final
# results are read back from the database as a dataframe afterwards.
rerun = False
if rerun:
    for row in reduced_location_df.itertuples(index=False):
        geocode_city_state(row.city, row.state)

In [None]:
# Read the crosswalk back from the database as a dataframe.
# Presumably the arguments are a table name and a SQL filter (here: rows whose
# GEOID is not blank) - confirm against DB_Utilities.
location_crosswalk_df = DBTools.get_df("pregrant_location_crosswalk", "GEOID != ''")
location_crosswalk_df.shape


In [None]:
# save the results to a file

# A dummy row with non-numeric values is added so the GEOIDs are read back as
# strings and not numbers. It won't match on anything else.
# The row is added only once, so re-running this cell does not keep appending
# duplicates to the dataframe and the CSV.
has_dummy_row = (
    'GEOID' in location_crosswalk_df.columns
    and location_crosswalk_df['GEOID'].eq("_").any()
)
if not has_dummy_row:
    location_crosswalk_df_dummy = location_crosswalk_df.head(1).copy()
    location_crosswalk_df_dummy['id'] = "_"
    location_crosswalk_df_dummy['GEOID'] = "_"
    location_crosswalk_df = pd.concat([location_crosswalk_df, location_crosswalk_df_dummy])


location_crosswalk_df.to_csv("./data/pregrant/location_crosswalk.csv", index=False)

In [None]:
# Load the crosswalk CSV back into a dataframe to check what was saved.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids mixed-type column inference.
reload = pd.read_csv("./data/pregrant/location_crosswalk.csv", low_memory=False)
reload.shape