In [None]:
! pip install wheels/*
! pip install pycountry
! pip install country_list

In [55]:
import datetime
from pprint import pprint
from typing import Optional

import numpy as np
import pandas as pd

from auxiliary_functions.api_calls_classes import *
from auxiliary_functions.parse_api_calls import *
from auxiliary_functions.useful_functions import *

# Run date formatted as DD-MM-YYYY; it is embedded in the name of the results file.
version = datetime.datetime.today().strftime('%d-%m-%Y')

## Setting the parameters for the entire process

In [56]:
# ISO-2 code of the country being processed; it is passed as `country_iso2` to the
# processing function and is part of the input and output file names.
country_code = 'US'
# Parquet file with the parsed search logs. The file name starts with the country
# code, so it is derived from `country_code` instead of being typed twice.
read_path = f'/workspace/main_folder/main_folder/parsed_data/{country_code}_version10-11-2022.parquet'

## Execute...

In [57]:
# Destination CSV for the geocoded search logs; the name carries the country code
# and the run date (`version`).
save_path = (
    '/workspace/main_folder/main_folder/results/'
    f'{country_code}_search_logs_with_coordinates_{version}.csv'
)

### Defining the API call process

In [58]:
def apply_tt_api(x: pd.Series, api_instance, limit: int = 5, sleep: int = 1, debug: bool = False) -> dict:
    """Send one geocoding request for a single DataFrame row.

    The row's ``searched_query`` and ``country`` values are packed, together with
    ``limit`` and ``sleep``, into a parameter dictionary that is handed to
    ``api_instance.call_api`` with ``call_type='geocode'``.

    :param x: Row to geocode. It must provide the keys ``'searched_query'`` and ``'country'``.
    :type x: pd.Series
    :param api_instance: Object exposing ``call_api(params, call_type=...)``. In this notebook it is a ``TomtomApi()`` instance.
    :type api_instance: TomtomApi
    :param limit: Value forwarded under the ``'limit'`` key (max number of responses requested), defaults to 5.
    :type limit: int, optional
    :param sleep: Value forwarded under the ``'sleep'`` key, defaults to 1. Presumably the pause in seconds between calls; the waiting itself is not done here.
    :type sleep: int, optional
    :param debug: When True, print the searched query before making the call. Defaults to False.
    :type debug: bool, optional
    :return: Whatever ``api_instance.call_api`` returns for this query.
    :rtype: dict
    """
    # Read both required fields up front so a missing key fails before anything is printed or sent.
    searched_query, country_value = x['searched_query'], x['country']

    if debug:
        print(f'collected searched_query: {searched_query}')

    request_params = {
        'address': searched_query,
        'sleep': sleep,
        'limit': limit,
        'country': country_value,
    }
    return api_instance.call_api(request_params, call_type='geocode')

In [59]:
def full_function_make_api_calls(
    path_to_read: str, path_to_save: str, country_iso2: str, limit: int = 5, sleep: int = 1, threshold: float = 0.85, test_size: Optional[int] = None
) -> pd.DataFrame:
    """Read the parsed search logs, geocode every row through the TomTom API and return the parsed responses.

    Steps: read the parquet file, optionally take a reproducible sample, drop the rows whose
    ``method_name`` is ``'search 2 nearbySearch'``, call ``apply_tt_api`` + ``api_calls_parser``
    once per remaining row, keep only the rows whose ``confidence`` reaches ``threshold`` and
    optionally write the result to CSV. The row count is printed before and after the
    ``method_name`` filter.

    :param path_to_read: Path of the parquet file with the search logs. It must contain at least the columns ``method_name``, ``searched_query`` and ``country``.
    :type path_to_read: str
    :param path_to_save: Path of the CSV file to write. Pass ``None`` or ``''`` to skip saving (there is no default; the argument is required).
    :type path_to_save: str or None
    :param country_iso2: Country in ISO-2 format. NOTE(review): not used by this function; the country sent to the API is taken from each row's ``country`` column.
    :type country_iso2: str
    :param limit: Max number of responses requested from the TT API, defaults to 5
    :type limit: int, optional
    :param sleep: Value forwarded to the API call as ``sleep``, defaults to 1
    :type sleep: int, optional
    :param threshold: Minimum ``confidence`` a parsed response needs to be kept, defaults to 0.85. It is also forwarded to ``api_calls_parser``. Pass ``None`` to keep every row.
    :type threshold: float or None, optional
    :param test_size: Number of rows to sample from the file before processing, for testing purposes only. ``None`` or ``0`` processes the whole file. Values larger than the number of rows are clamped to the number of rows. Defaults to None.
    :type test_size: int or None
    :return: One row per parsed response that passed the threshold filter. The columns are whatever ``api_calls_parser`` produces; an empty DataFrame is returned when no row is left to geocode.
    :rtype: pd.DataFrame
    """
    api = TomtomApi()

    df = pd.read_parquet(path_to_read).reset_index(drop=True)
    if test_size is not None and test_size != 0:
        # Clamp the sample size: DataFrame.sample raises ValueError when asked for more
        # rows than exist (without replacement). The fixed seed keeps test runs reproducible.
        # The index is deliberately not reset after sampling, because the row label
        # (x.name) is forwarded to api_calls_parser.
        df = df.sample(min(test_size, len(df)), random_state=101)

    print(df.shape[0])
    df = df[df['method_name'] != 'search 2 nearbySearch']
    print(df.shape[0])

    def geocode_row(x: pd.Series):
        """Call the API for one row and parse its response."""
        return api_calls_parser(apply_tt_api(x, api, limit=limit, sleep=sleep), x.name, x['searched_query'], threshold)

    if df.empty:
        # With no rows, DataFrame.apply(axis=1) returns an empty DataFrame (which has
        # no .tolist()), so short-circuit to an empty result instead of crashing.
        final_df = pd.DataFrame()
    else:
        result = df.apply(geocode_row, axis=1)
        final_df = pd.DataFrame(result.tolist())

    # An empty frame has no 'confidence' column to filter on.
    if threshold is not None and not final_df.empty:
        final_df = final_df[final_df['confidence'] >= threshold]

    if path_to_save not in ['', None]:
        final_df.to_csv(path_to_save, sep=',', index=False)

    return final_df

In [60]:
# Demo run on a 20-row sample with one API response per query.
# For a full run, leave test_size at None.
results = full_function_make_api_calls(
    path_to_read=read_path,
    path_to_save=save_path,
    country_iso2=country_code,
    limit=1,
    sleep=1,
    threshold=0.85,
    test_size=20,
)

20
20


## Visualizing an example

Below is an example of what your results will look like after the entire process. Once you have run this final step, you can export the CSV file to your preferred folder for analysis. The path shown below is the one we used to create this notebook; if you remove the file or change its location, this cell will no longer work.

In [61]:
# Reload the CSV that was just written to check what was saved (comma is the default separator).
pd.read_csv(save_path)

Unnamed: 0,original_formatted_query,confidence,lat,lon,query_type,id,bounding_box,searched_query,match
0,300 mary alice dr bsmt 30680 winder ga us,0.926876,33.9937,-83.70317,Point Address,18447,,300 MARY ALICE DR BSMT 30680 WINDER GA US,1
1,08103 us,1.0,39.93418,-75.11283,Geography,17768,"{'topLeftPoint': {'lat': 39.94581, 'lon': -75....",08103 US,1
2,938 sw gatlin blvd 2fsw port st lucie blvd por...,0.949018,27.25869,-80.3763,Address Range,45879,,938 SW GATLIN BLVD%2FSW PORT ST LUCIE BLVD POR...,1
3,82240 us,1.0,42.06687,-104.18436,Geography,16972,"{'topLeftPoint': {'lat': 42.61208, 'lon': -104...",82240 US,1
4,100 oakridge blk port st lucie fl us,0.877732,27.26466,-80.34215,Address Range,12165,,100 OAKRIDGE BLK PORT ST LUCIE FL US,1
5,53081 us,1.0,43.73396,-87.73043,Geography,37847,"{'topLeftPoint': {'lat': 43.77045, 'lon': -87....",53081 US,1
6,roy wall blvd rockledge fl us,0.999998,28.30281,-80.73306,Street,7396,,ROY WALL BLVD ROCKLEDGE FL US,1
7,05737 us,1.0,43.71055,-72.94473,Geography,30794,"{'topLeftPoint': {'lat': 43.75566, 'lon': -72....",05737 US,1
8,414 banana cay dr i south daytona fl us,0.92694,29.16096,-80.99843,Point Address,8793,,414 BANANA CAY DR I SOUTH DAYTONA FL US,1
9,45242 us,1.0,39.24816,-84.36319,Geography,48319,"{'topLeftPoint': {'lat': 39.28238, 'lon': -84....",45242 US,1
