In [3]:
from auxiliary_functions import generating_ADX_sample, preprocess_ADX_search_logs_insights_data
import re
import pandas as pd
import datetime

# Today's date formatted as DD-MM-YYYY, computed once when this cell runs.
version = f"{datetime.datetime.today():%d-%m-%Y}"

# Setting up the initial parameters:

Below are the only parameters you should need to tweak: the country code and the path the functions read the data from. The path must point to a file in the "/data" folder, which is where the responses taken from ADX should be stored.

Once this is done, you should only need to press SHIFT + ENTER until the end of the notebook. This notebook reads from the file selected, which should be in the parsed_data folder (where you have parsed the data from the search logs table in ADX).

This function will be making API calls to the TT API, so be mindful of the amount of data that you select as an input, as you may overwhelm the limits of the API. If you believe your process will take too long because of the volume of data, you should change the "sleep" parameter. The higher the value, the smaller the volume of requests you will be making to the API.

In [4]:
# Country code of the sample to process.
country_code = "PT"

# Input file to read; it should be located in the /data folder.
read_path = "/workspace/main_folder/data/complete_responses_26-10-2022_PT.parquet"

## Just execute...

In [5]:
# Advanced: the output location can be changed here; when in doubt, leave it untouched.
# The file name is derived from the country code and the run date, so no manual edit is needed.
save_path = (
    '/workspace/main_folder/results/'
    f'{country_code}_version{version}.csv'
)

In [6]:
def extract_format_from_path(path: str) -> str:
    """Extract the file format (extension) from the end of a path.

    The format is the run of word characters that follows the last dot, and it
    must be the very end of the path. A dot that only appears in a directory
    name (for example ``/data.v2/file``) is therefore not mistaken for a format.

    :param path: Path where the data is stored, ending in ``.<format>``.
    :type path: str
    :return: The format as a string (for example 'csv' or 'parquet'), or ``None``
        (after printing a notice) when the path does not end in an extension.
    :rtype: str
    """
    # Raw string avoids invalid-escape warnings; fullmatch forces the extension
    # to sit at the end of the path instead of anywhere after some dot.
    match = re.fullmatch(r'.+\.(\w+)', path)

    if match is None:
        print('No path extracted')
        return None

    return match.group(1)
    
def parse_libpostal_and_structured_geocode(path: str, save_path: str, sep: str = ';', header: bool = True) -> pd.DataFrame:
    """Load the sampled search logs, parse them and persist the parsed result.

    The file at ``path`` is read according to its extension, passed through
    ``preprocess_ADX_search_logs_insights_data.parse_libpostal`` and then
    ``preprocess_ADX_search_logs_insights_data.get_s2sG_results``, and the
    resulting DataFrame is written to ``save_path`` before being returned.

    :param path: Specifies the path from which to read the data (csv or parquet).
    :type path: str
    :param save_path: Specifies the path on which you want to save the data. A path
        ending in ``.csv`` is written as csv (using ``sep``); any other path is
        written as parquet.
    :type save_path: str
    :param sep: Separator used when reading or writing the csv format, defaults to ';'
    :type sep: str or None, optional
    :param header: Whether the first row of the input csv holds the column names,
        defaults to True. Non-boolean values are forwarded to ``pd.read_csv`` unchanged.
    :type header: bool, optional
    :raises TypeError: If the format of ``path`` is neither csv nor parquet.
    :return: A DataFrame with the parsed relevant responses
    :rtype: pd.DataFrame
    """
    file_format = extract_format_from_path(path)

    if file_format == 'csv':
        # pd.read_csv rejects booleans for ``header``: translate True into
        # "row 0 holds the column names" and False into "no header row".
        if isinstance(header, bool):
            header_row = 0 if header else None
        else:
            header_row = header
        df = pd.read_csv(path, sep=sep, header=header_row)
    elif file_format == 'parquet':
        df = pd.read_parquet(path)
    else:
        raise TypeError('The format is not allowed or the file path is incorrect, check the file is csv or parquet, or that the file is stored in the location you passed')

    df = preprocess_ADX_search_logs_insights_data.parse_libpostal(df)
    df = preprocess_ADX_search_logs_insights_data.get_s2sG_results(df)

    # Write in the format announced by the save path, so that a ".csv" file
    # really contains csv data instead of parquet bytes.
    if extract_format_from_path(save_path) == 'csv':
        # NOTE(review): the index is not written, to avoid an "Unnamed: 0" column
        # when the file is read back - confirm the index carries no information.
        df.to_csv(save_path, sep=sep, index=False)
    else:
        df.to_parquet(save_path)

    return df

In [7]:
# Run the whole read -> parse -> save pipeline with the parameters chosen above,
# keeping the parsed frame under a name so it can be inspected afterwards.
parsed_responses = parse_libpostal_and_structured_geocode(
    read_path,
    save_path,
    sep=';',
    header=True,
)
parsed_responses

Unnamed: 0,request_uri,searched_query,populated_fields,countryName,who_searched,request_country,method_name,lib_postal_result,parsed_request_quertystring,developer_email,...,lp_po_box,lp_suburb,lp_city_district,lp_city,lp_island,lp_state_district,lp_state,lp_country_region,lp_country,lp_world_region
0,/search/Grafana.json?mapcodes=Local&limit=1&id...,Grafana,1|,PRT,NOT_SET,Netherlands,search 2 search,"""{""""house"""":""""grafana""""}""","""{""""limit"""":""""1"""",""""idxSet"""":""""POI%2CPAD%2CAdd...",as-minesweeper@groups.tomtom.com,...,,,,,,,,,,
1,/search/Grafana.json?mapcodes=Local&limit=1&id...,Grafana,1|,PRT,NOT_SET,Netherlands,search 2 search,"""{""""house"""":""""grafana""""}""","""{""""idxSet"""":""""POI%2CPAD%2CAddr%2CStr%2CXStr%2...",as-minesweeper@groups.tomtom.com,...,,,,,,,,,,
2,"/search/PRT,Azores.json?language=en-GB&idxSet=...","PRT,Azores",5|,PRT,NOT_SET,Ireland,search 2 search,"""{""""road"""":""""prt azores""""}""","""{""""language"""":""""en-GB"""",""""idxSet"""":[""""Geo"""",""...",mydrivedevelopment@groups.tomtom.com,...,,,,,,,,,,
3,/search/PRT.json?language=en-GB&idxSet=Geo&idx...,PRT,19|,PORTUGAL,NOT_SET,Ireland,search 2 search,"""{""""country"""":""""prt""""}""","""{""""language"""":""""en-GB"""",""""idxSet"""":[""""Geo"""",""...",mydrivedevelopment@groups.tomtom.com,...,,,,,,,,,prt,
4,"/search/PRT,Continental%20Portugal.json?langua...","PRT,Continental Portugal",5|14|19|,PRT,NOT_SET,Ireland,search 2 search,"""{""""road"""":""""prt"""",""""city"""":""""continental"""",""""...","""{""""language"""":""""en-GB"""",""""idxSet"""":[""""Geo"""",""...",mydrivedevelopment@groups.tomtom.com,...,,,,continental,,,,,portugal,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118911,/geocode/ESTRADA%20DA%20PORTELA%20%2087%20%202...,"ESTRADA DA PORTELA 87 2 DRT, CARNAXIDE",4|5|14|,PRT,NOT_SET,Netherlands,search 2 geocode,"""{""""road"""":""""estrada da portela"""",""""house_numb...","""{""""limit"""":""""1"""",""""countrySet"""":""""PRT""""}""",biz-ops@carto.com,...,,,,carnaxide,,,,,,
118912,/geocode/5459-341.json?countrySet=PRT&limit=1,5459-341,4|,PRT,NOT_SET,United States,search 2 geocode,"{""house_number"":""5459-341""}","{""limit"":""1"",""countrySet"":""PRT""}",luke@postcodeanywhere.co.uk,...,,,,,,,,,,
118913,"/geocode/Rua%20Praia%20Moinho%20de%20Baixo,%20...","Rua Praia Moinho de Baixo, 2070-074, Aldeia Meco",5|11|14|,PRT,NOT_SET,Ireland,search 2 geocode,"{""road"":""rua praia moinho de baixo"",""postcode""...","{""limit"":""100"",""countrySet"":""PT"",""lat"":""38.476...",nikhil.kuriakose@trivago.com,...,,,,aldeia meco,,,,,,
118914,/geocode/Urbaniza%C3%A7%C3%A3o%20dos%20Salgado...,"Urbanização dos Salgados, Vale Rabelho, AP9, 8...",5|11|14|,PRT,NOT_SET,Ireland,search 2 geocode,"{""road"":""urbanização dos salgados"",""city"":""val...","{""limit"":""100"",""countrySet"":""PT"",""lat"":""37.090...",nikhil.kuriakose@trivago.com,...,,,,vale rabelho ap9,,,,,,
