In [None]:
import datetime
from pprint import pprint
from typing import Optional

import auxiliary_functions

In [None]:
%run /Users/michel.davidovich@tomtom.com/STAN-112/auxiliary_notebooks/0.0-useful_functions

In [None]:
%run /Users/michel.davidovich@tomtom.com/STAN-112/auxiliary_notebooks/0.1-API_calls_imports

In [None]:
%run /Users/michel.davidovich@tomtom.com/STAN-207/auxiliary_functions/0.0_preprocess_ADX_search_logs_insights_data

In [None]:
%run /Users/michel.davidovich@tomtom.com/STAN-207/2.0-parse_API_calls

# Defining the API call process

In [None]:
def apply_tt_api(x: pd.Series, api_instance, limit: int = 5, sleep: int = 1, debug: bool = False) -> dict:
    """Geocode the searched query of one DataFrame row through the given API client.

    The row's ``searched_query`` and ``country`` values are sent, together with ``limit`` and
    ``sleep``, to ``api_instance.call_api`` as a ``'geocode'`` call.

    :param x: Row of the DataFrame on which you want to perform the API call. It must provide the
        ``searched_query`` and ``country`` entries.
    :type x: pd.Series
    :param api_instance: Instantiation of the API class you want to run the request on; it must expose
        ``call_api(params, call_type=...)``. Generally, api_instance = TomtomApi().
    :type api_instance: __main__.TomtomApi()
    :param limit: Max number of responses you want from the TT API, forwarded as ``limit``. Defaults to 5.
    :type limit: int, optional
    :param sleep: Number of seconds of pause between two API calls, forwarded as ``sleep``. Defaults to 1 second.
    :type sleep: int, optional
    :param debug: When True, print the searched query before the call is made. Defaults to False.
    :type debug: bool, optional
    :return: Whatever ``api_instance.call_api`` returns for the geocode request.
    :rtype: dict
    """
    # Both fields are read up front so a missing column fails before anything is printed or sent.
    searched_query, country = x['searched_query'], x['country']

    if debug:
        print(f'collected searched_query: {searched_query}')

    request_params = {
        'address': searched_query,
        'sleep': sleep,
        'limit': limit,
        'country': country,
    }
    return api_instance.call_api(request_params, call_type='geocode')

In [None]:
def full_function_make_api_calls(
    path_to_read: str, path_to_save: Optional[str], country_iso2: str, limit: int = 5, sleep: float = 1, threshold: float = 0.85, test_size: Optional[int] = None
) -> pd.DataFrame:
    """Read search logs from a parquet file, geocode every row through the TomTom API and parse the responses.

    Rows whose ``method_name`` is ``'search 2 nearbySearch'`` are skipped. Every remaining row is sent
    through ``apply_tt_api`` and the response is handed to ``api_calls_parser`` together with the row's
    index label, its ``searched_query`` and ``threshold``. The parsed results can optionally be saved
    to another parquet file.

    :param path_to_read: Path where you want to read the search logs from.
    :type path_to_read: str
    :param path_to_save: Path where you want to save the parsed results. Pass ``None`` or ``''`` to skip
        saving. The parameter has no default and must always be supplied.
    :type path_to_save: str or None
    :param country_iso2: Country in ISO-2 format. Currently not read by this function: the country sent
        with each request comes from the row's ``country`` column (see ``apply_tt_api``).
    :type country_iso2: str
    :param limit: Max number of responses you want from the TT API, defaults to 5
    :type limit: int, optional
    :param sleep: Time to wait between queries to the API server, defaults to 1
    :type sleep: float, optional
    :param threshold: The minimum level of confidence that is considered a match from the API response,
        defaults to 0.85. It is forwarded unchanged to ``api_calls_parser``.
    :type threshold: float, optional
    :param test_size: Size of the sample you want to take from the read dataframe (fixed
        ``random_state=101``). ``None`` or ``0`` means every row is used. The sample is drawn before the
        ``method_name`` filter, so fewer than ``test_size`` rows may end up being geocoded. This
        parameter should only be used for testing purposes! Defaults to None.
    :type test_size: int or None
    :return: A dataframe with one row per geocoded query, built from the values returned by
        ``api_calls_parser``. It is empty when no row survives the filter.
    :rtype: pd.DataFrame
    """
    api = TomtomApi()

    df = pd.read_parquet(path_to_read).reset_index(drop=True)
    if test_size:  # None and 0 both mean "use every row"
        df = df.sample(test_size, random_state=101)

    df = df[df['method_name'] != 'search 2 nearbySearch']

    # Explicit row loop instead of DataFrame.apply(axis=1).tolist(): on an empty frame, apply() may
    # probe the callable with a NaN-filled placeholder row (which would fire a real API request) and
    # can hand back a DataFrame, which has no .tolist(). The loop makes exactly one call per row
    # and none at all when there is nothing to process.
    records = [
        api_calls_parser(apply_tt_api(row, api, limit=limit, sleep=sleep), row.name, row['searched_query'], threshold)
        for _, row in df.iterrows()
    ]

    final_df = pd.DataFrame(records)

    if path_to_save not in ['', None]:
        final_df.to_parquet(path_to_save)

    return final_df

In [None]:
# Run configuration: both values are interpolated into the parquet file names built in the paths cell.
country_iso2 = 'PT'  # country code in ISO-2 format
interest_date = '19-09-2022'  # date tag of the input files, written day-month-year

In [None]:
# Input search logs for the configured date and country.
non_comp_path = f'/dbfs/FileStore/STAN-207/non_complete_responses_{interest_date}_{country_iso2}.parquet'
comp_path     = f'/dbfs/FileStore/STAN-207/complete_responses_{interest_date}_{country_iso2}.parquet'

# Date of this run (day-month-year), embedded in every output file name.
today = datetime.datetime.today().strftime('%d-%m-%Y')

# Output locations for the parsed API responses.
# `non_comp_save_path` is the target of the unsplit non-complete run; it was referenced by the
# notebook but never assigned, which raised NameError on a fresh kernel. It follows the same naming
# scheme as `comp_save_path`. The `_v1` / `_v2` paths receive the two halves of the split run.
non_comp_save_path  = f'/dbfs/FileStore/STAN-207/parsed_responses/incomplete_{today}_{country_iso2}.parquet'
non_comp_save_path1 = f'/dbfs/FileStore/STAN-207/parsed_responses/incomplete_{today}_{country_iso2}_v1.parquet'
non_comp_save_path2 = f'/dbfs/FileStore/STAN-207/parsed_responses/incomplete_{today}_{country_iso2}_v2.parquet'
comp_save_path      = f'/dbfs/FileStore/STAN-207/parsed_responses/complete_{today}_{country_iso2}.parquet'

In [None]:
# Size check on the complete-responses input file: DataFrame.shape is (rows, columns).
pd.read_parquet(comp_path).shape

In [None]:
# Display the run-date tag that is embedded in the output file names.
today

In [None]:
# Complete-responses run, currently disabled; this commented-out call is the only place that assigns `result_comp`.
#result_comp     = full_function_make_api_calls(comp_path, comp_save_path, country_iso2, 1, 1.5, 0.85)

# Geocode the non-complete search logs: one response per query, sleep of 1.5 between calls,
# match threshold of 0.85.
result_non_comp = full_function_make_api_calls(
    path_to_read=non_comp_path,
    path_to_save=non_comp_save_path,
    country_iso2=country_iso2,
    limit=1,
    sleep=1.5,
    threshold=0.85,
)

In [None]:
def full_function_make_api_calls_split(
    path_to_read: str, path_to_save: Optional[str], country_iso2: str, limit: int = 5, sleep: float = 1, top_split: bool = True, threshold: float = 0.85, test_size: Optional[int] = None, seed: int = 101
) -> pd.DataFrame:
    """Geocode one half of the search logs stored in a parquet file and parse the API responses.

    The rows are shuffled with ``seed`` and cut at ``len(df) // 2``; ``top_split`` selects which side of
    the cut is processed. Calling the function once with ``top_split=True`` and once with
    ``top_split=False`` (same ``seed``, same input) therefore covers every row exactly once. Rows whose
    ``method_name`` is ``'search 2 nearbySearch'`` are then skipped, and each remaining row is sent
    through ``apply_tt_api`` and parsed by ``api_calls_parser``. The parsed results can optionally be
    saved to another parquet file.

    :param path_to_read: Path where you want to read the search logs from.
    :type path_to_read: str
    :param path_to_save: Path where you want to save the parsed results. Pass ``None`` or ``''`` to skip
        saving. The parameter has no default and must always be supplied.
    :type path_to_save: str or None
    :param country_iso2: Country in ISO-2 format. Currently not read by this function: the country sent
        with each request comes from the row's ``country`` column (see ``apply_tt_api``).
    :type country_iso2: str
    :param limit: Max number of responses you want from the TT API, defaults to 5
    :type limit: int, optional
    :param sleep: Time to wait between queries to the API server, defaults to 1
    :type sleep: float, optional
    :param top_split: ``True`` processes the shuffled rows before the split point, ``False`` the rows from
        the split point onwards. Defaults to True.
    :type top_split: bool, optional
    :param threshold: The minimum level of confidence that is considered a match from the API response,
        defaults to 0.85. It is forwarded unchanged to ``api_calls_parser``.
    :type threshold: float, optional
    :param test_size: Size of the sample you want to take from the read dataframe (fixed
        ``random_state=101``) before it is split. ``None`` or ``0`` means every row is used. This
        parameter should only be used for testing purposes! Defaults to None.
    :type test_size: int or None
    :param seed: Random state of the shuffle that precedes the split. Both halves must be produced with
        the same value to be complementary. Defaults to 101.
    :type seed: int, optional
    :return: A dataframe with one row per geocoded query, built from the values returned by
        ``api_calls_parser``. It is empty when no row survives the split and the filter.
    :rtype: pd.DataFrame
    """
    api = TomtomApi()

    df = pd.read_parquet(path_to_read).reset_index(drop=True)
    if test_size:  # None and 0 both mean "use every row"
        df = df.sample(test_size, random_state=101)

    # Shuffle once, then keep the requested side of the midpoint. The split point is computed before
    # the method_name filter, so the two halves partition the unfiltered rows.
    split_point = df.shape[0] // 2
    shuffled = df.sample(frac=1, random_state=seed)
    df = shuffled.iloc[:split_point] if top_split else shuffled.iloc[split_point:]

    df = df[df['method_name'] != 'search 2 nearbySearch']

    # Explicit row loop instead of DataFrame.apply(axis=1).tolist(): on an empty frame, apply() may
    # probe the callable with a NaN-filled placeholder row (which would fire a real API request) and
    # can hand back a DataFrame, which has no .tolist(). The loop makes exactly one call per row
    # and none at all when there is nothing to process.
    records = [
        api_calls_parser(apply_tt_api(row, api, limit=limit, sleep=sleep), row.name, row['searched_query'], threshold)
        for _, row in df.iterrows()
    ]

    final_df = pd.DataFrame(records)

    if path_to_save not in ['', None]:
        final_df.to_parquet(path_to_save)

    return final_df

In [None]:
# First half of the non-complete search logs (rows before the split point), saved to the `_v1` file.
result_non_comp1 = full_function_make_api_calls_split(
    path_to_read=non_comp_path,
    path_to_save=non_comp_save_path1,
    country_iso2=country_iso2,
    limit=1,
    sleep=1.5,
    top_split=True,
    threshold=0.85,
)

In [None]:
# Second half of the non-complete search logs, saved to the `_v2` file.
# top_split must be False here: with True this run would geocode the same rows as the `_v1` run
# and the rows after the split point would never be processed.
result_non_comp2 = full_function_make_api_calls_split(non_comp_path, non_comp_save_path2, country_iso2, 1, 1.5, top_split=False, threshold = 0.85)

In [None]:
# `result_comp` is only assigned by the complete-responses run, which is commented out earlier in
# this notebook. Look it up defensively so a fresh Restart & Run All displays nothing here instead
# of raising NameError; when the run is enabled the dataframe is displayed as before.
globals().get('result_comp')