In [None]:
"""
Python script for batch geocoding of addresses using the Google Geocoding API.
This script allows for massive lists of addresses to be geocoded for free by pausing when the 
geocoder hits the free rate limit set by Google (2500 per day).  If you have an API key for paid
geocoding from Google, set it in the API key section.

Addresses for geocoding can be specified in a list of strings "addresses". In this script, addresses
come from a csv file and are read from the column named by "address_column_name". After every 500
geocoded addresses, a temporary file with results is recorded in case of script failure / loss of
connection later.

Addresses and data are held in memory, so this script may need to be adjusted to process files line
by line if you are processing millions of entries.
"""

import pandas as pd
import requests
import logging
import time

# Console handler: emits every record from DEBUG upwards.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Script-wide logger; DEBUG level so the per-address messages are shown.
logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)
logger.addHandler(ch)

#------------------ CONFIGURATION -------------------------------

# Set your Google API key here.
# Even if using the free 2500 queries a day, it's worth getting an API key since the rate limit is 50 / second.
# With API_KEY = None, you will run into a 2 second delay every 10 requests or so.
# With a "Google Maps Geocoding API" key from https://console.developers.google.com/apis/,
# the daily limit will be 2500, but at a much faster rate.
# Example: API_KEY = 'AI.....................................'
# NOTE(review): a real-looking key is hard-coded below. Keys committed to source should be
# treated as leaked: revoke it and load the replacement from outside the file (for example
# an environment variable) before sharing this script.
API_KEY = 'AIzaSyArm9gVOzsmXXsUZNmCxULgF6em1lAnNHY'
# Backoff time: how many MINUTES to wait before retrying Google once the API limit is hit
# (the processing loop sleeps for BACKOFF_TIME * 60 seconds).
BACKOFF_TIME = 30
# Set your output file name here. Periodic backups are written next to it with a "_bak" suffix.
output_filename = 'geotesting/Filtered_Geo04282020.csv'
# Set your input file here (a csv file read with pandas).
input_filename = 'geotesting/FilteredDatabase04282020.csv'
# Specify the column name in your input data that contains addresses here
address_column_name = "Affiliation"
# Return Full Google Results? If True, full JSON results from Google are included in output
RETURN_FULL_RESULTS = False

#------------------ DATA LOADING --------------------------------

# Read the input csv into a Pandas DataFrame.
data = pd.read_csv(input_filename, encoding='ISO-8859-1')

# Fail early, naming the column that was actually configured, if the input
# file does not contain it.
if address_column_name not in data.columns:
    raise ValueError("Missing {} column in input data".format(address_column_name))

# Form a list of addresses for geocoding:
# one entry per input row, in file order.
addresses = data[address_column_name].tolist()

#------------------	FUNCTION DEFINITIONS ------------------------

def get_google_results(address, api_key=API_KEY, return_full_response=False, timeout=30):
    """
    Get geocode results from Google Maps Geocoding API.

    Note that in the case of multiple Google geocode results, this function returns details of the FIRST result.

    @param address: String address as accurate as possible. For Example "18 Grafton Street, Dublin, Ireland"
    @param api_key: String API key if present from google.
                    If supplied, it is sent as the "key" query parameter. If None, no key is sent.
    @param return_full_response: Boolean to indicate if you'd like to return the full response from google. This
                    is useful if you'd like additional location details for storage or parsing later.
    @param timeout: Seconds to wait for Google before requests gives up and raises a timeout error.
    @return: dict with the keys "formatted_address", "latitude", "longitude", "accuracy",
                    "google_place_id", "type", "postcode", "input_string", "number_of_results" and
                    "status" (plus "response" when return_full_response is set). The location
                    fields are None when Google returned no results.
    """
    # Let requests build the query string: this percent-encodes characters such as
    # "&", "#" or non-ASCII letters that would otherwise truncate or corrupt the address.
    params = {"address": address}
    if api_key is not None:
        params["key"] = api_key

    # Ping google for the results; the timeout stops one stalled connection from
    # hanging the whole batch.
    response = requests.get(
        "https://maps.googleapis.com/maps/api/geocode/json",
        params=params,
        timeout=timeout,
    )
    # Results will be in JSON format - convert to dict using requests functionality
    payload = response.json()
    # A payload without a "results" key is treated the same as an empty result list.
    matches = payload.get('results') or []

    # if there's no results or an error, return empty results.
    if not matches:
        output = {
            "formatted_address": None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "google_place_id": None,
            "type": None,
            "postcode": None
        }
    else:
        answer = matches[0]
        # Fall back to empty containers so a partially filled answer yields None / ""
        # fields instead of raising.
        geometry = answer.get('geometry') or {}
        location = geometry.get('location') or {}
        components = answer.get('address_components') or []
        output = {
            "formatted_address": answer.get('formatted_address'),
            "latitude": location.get('lat'),
            "longitude": location.get('lng'),
            "accuracy": geometry.get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types') or []),
            "postcode": ",".join([x.get('long_name', '') for x in components
                                  if 'postal_code' in (x.get('types') or [])])
        }

    # Append some other details:
    output['input_string'] = address
    output['number_of_results'] = len(matches)
    output['status'] = payload.get('status')
    if return_full_response:
        output['response'] = payload

    return output

#------------------ PROCESSING LOOP -----------------------------

# Smoke test before the batch starts: geocode a well-known place to confirm that
# the API key is accepted and that there is internet access.
test_result = get_google_results("London, England", API_KEY, RETURN_FULL_RESULTS)
geocoder_ok = test_result['status'] == 'OK' and test_result['formatted_address'] == 'London, UK'
if not geocoder_ok:
    logger.warning("There was an error when testing the Google Geocoder.")
    raise ConnectionError('Problem with test results from Google Geocode - check your API key and internet connection.')

# Create a list to hold results
results = []
# Number of addresses we are finished with (geocoded or skipped). Progress and
# backups key on this rather than len(results), so they neither fire before any
# work is done nor repeat when an address is skipped.
processed = 0
# Go through each address in turn
for address in addresses:
    # Keep trying this address until it is either geocoded or skipped.
    while True:
        # Geocode the address with google
        try:
            geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception as e:
            logger.exception(e)
            logger.error("Major error with {}".format(address))
            logger.error("Skipping!")
            # Leave the retry loop right away: there is no result for this address,
            # so nothing below may look at geocode_result.
            break

        # If we're over the API limit, backoff for a while and try again later.
        if geocode_result['status'] == 'OVER_QUERY_LIMIT':
            logger.info("Hit Query Limit! Backing off for a bit.")
            time.sleep(BACKOFF_TIME * 60)  # BACKOFF_TIME is in minutes
            continue

        # If we're ok with API use, save the results
        # Note that the results might be empty / non-ok - log this
        if geocode_result['status'] != 'OK':
            logger.warning("Error geocoding {}: {}".format(address, geocode_result['status']))
        logger.debug("Geocoded: {}: {}".format(address, geocode_result['status']))
        results.append(geocode_result)
        break

    processed += 1

    # Print status every 100 addresses
    if processed % 100 == 0:
        logger.info("Completed {} of {} affiliations".format(processed, len(addresses)))

    # Every 500 addresses, save progress to file (in case of a failure so you have something!)
    if processed % 500 == 0:
        pd.DataFrame(results).to_csv("{}_bak".format(output_filename), encoding='utf8')

# All done
logger.info("Finished geocoding all affiliations")
# Write the full results to csv using the pandas library.
pd.DataFrame(results).to_csv(output_filename, encoding='utf8')

Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Department of Pharmacy Services; Mercy Health Saint Mary's; Grand Rapids MI USA: OK
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Centro de Estudos Sociais Amilcar Cabral 

Geocoded: Department of Public Policy, University of North Carolina at Chapel Hill, Chapel Hill, NC 27599, USA.: OK
Geocoded: Section of Dermatology, Department of Clinical Medicine and SurgeryUniversity of Naples Federico II Naples Italy: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Department of Medicine, Tallaght University Hospital & Trinity College Dublin, Ireland: OK
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_

Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Laboratory of Molecular Immunopharmacology and Drug Discovery, Department of ImmunologyTufts University School of Medicine Boston MA: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Infectious Diseases Unit, Department of Medical and Surgical Sciences, Policlinico di SantOrsola, Bologna, Italy: INVALID_REQUEST
Geocoded: Infectious Diseases Unit, Department of Medical and Surgical Sciences, Policlinico di SantOrsola, Bologna, Italy: INVALID_REQUEST
Geocoded: Dermatology Unit Department of Experimental, Diagnostic and Specialty Medicine University of Bologna: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No af

Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Department of Cardiology, General Hospital of Northern Theater Command, 83 Wenhua Road, 110840 Shenyang, China: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: H

Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Department of OtolaryngologyHead and Neck SurgeryUniversity of Toronto Toronto Ontario Canada: INVALID_REQUEST
Geocoded: Department of OtolaryngologyHead and Neck SurgeryUniversity of Toronto Toronto Ontario Canada: INVALID_REQUEST
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Jeffrey Sachs Center on Sustainable Development, Sunway University, Bandar Sunway, Malaysia: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affi

Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: School of Government, Beijing Normal University, Beijing 100875, China.: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Karl Landsteiner University of Health Sc

Geocoded: Division of Hospital Medicine, Department of Medicine, MedStar Washington Hospital Center, Washington, District of Columbia;: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: School of Public Health, University of Alberta, Edmonton, Canada;: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Author Not Found: ZERO_RESULTS
Geocoded: Author Not Found: ZERO_RESULTS
Geocoded: Aberdeen Cardiovascular &amp; Diabetes Centre School of Medicine Medical Sciences and Nutrition Oxford UK: OK
Geocoded: Department of Chemical EngineeringNortheastern University Boston Massachusetts USA: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESU

Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Department of Neurosurgery The Medical University of South Carolina Charleston, South Carolina: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Institute of Cardiovascular Sciences, College of Medical and Dental Sciences, University of Birmingham, Birmingham, UK: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: The First Affiliated Hospital of Zhejiang Chinese Medical University, Hangzhou, Zhejiang, China: OK
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 20

Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Department of Culture and Society, Division Ageing and Social Change, , Linköping University, Norrköping, Sweden;: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: CHUV, Lausanne, Switzerland: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affili

Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: University of Washington, Division of Pulmonary & Critical Care Medicine, Seattle, Washington, United States;: OK
Geocoded: Boston University School of Public Health: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Division of Nephrology, Department of Geriatrics and Palliative Medicine, Icahn School of Medicine at Mount Sinai, New York, New York, USA.: OK
Geocoded: Ultrasound, Fifth Affiliated Hospital of Sun Yat-sen University, Zhuhai, China: OK
Error geocoding Department of Respiratory Medicine Wuhan Childrens Hospital Tongji Medical College Huazhong University of Science and Technology Wuhan Hubei China: INVALID_REQUEST
Geocoded: Department of Respiratory Medicine Wuhan Childrens Hospital Tongji Medical College Huazhong U

Geocoded: No affiliation found: ZERO_RESULTS
Completed 800 of 10203 affiliations
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Department of Respiratory Medicine, Sir Run Run Shaw Hospital, Institute of Translational Medicine, Zhejiang University School of Medicine, Hangzhou, Zhejiang, China: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Author Not Found: ZERO_RESULTS
Geocoded: Author Not Found: ZERO_RESULTS
Geocoded: Fifth Year Medical Student University College London UKLondon: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS

Geocoded: Brigham and Womens Hospital, Boston, MA, USA: INVALID_REQUEST
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO

Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: Clinical Research Unit, Italian Institute of Telemedicine, Varese, Italy.: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Geocoded: Pediatrics, University of Connecticut School of Medicine, Farmington, CT, USA: OK
Geocoded: Boston Children's Hospital and Harvard Medical School, Boston, Massachusetts (R.M., L.K.L., E.W.F.): OK
Geocoded: NYU Grossman School of Medicine, Department of Population Health, New York, NY, USA: OK
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Err

Geocoded: Dermatology Department of Medical and Surgical Sciences Università Cattolica del Sacro Cuore Rome Italy: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: From Massachusetts General Hospital and Harvard Medical School, Boston (R.T.G.); the Department of Medicine, Division of Allergy and Infectious Diseases, University of Washington School of Medicine, Seattle (J.B.L.); and the Department of Medicine, Division of Infectious Diseases, Emory University School of Medicine, and Grady Health System, Atlanta (C.R.).: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: From the Department of Medicine, University of California, San Francisco.: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found

Geocoded: Response not 200: ZERO_RESULTS
Geocoded: Universität StuttgartInstitut für Organische Chemie Pfaffenwaldring 55 70569 Stuttgart GERMANY: OK
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Error geocoding No affiliation found: ZERO_RESULTS
Geocoded: No affiliation found: ZERO_RESULTS
Geocoded: 1NeurosurgeryOne, Denver, Colorado;: OK
Geocoded: 1NeurosurgeryOne, Denver, Colorado;: OK
Geocoded: Univ. Lille, Inserm U1285, CHU Lille, Pôle de Réanimation, CNRS, UMR 8576 - UGSF -Unité de Glycobiologie Structurale et Fonctionnelle, F-59000, Lille, France: OK
Error geocoding Response not 200: ZERO_RESULTS
Geocoded: Response not 200: ZERO_RESULTS
Geocoded: 