In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import pyspark.sql.types as T
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SparkSession

import jellyfish
from elasticsearch import Elasticsearch
import json
from datetime import datetime
import time



In [2]:
import pandas as pd
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [3]:
start = time.time()

In [4]:
spark = SparkSession.builder \
    .appName("CIDACSRL") \
    .master("spark://barravento:7077") \
    .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.1.3") \
    .config("spark.es.nodes", "barravento") \
    .config("spark.es.port", "9200") \
    .config("spark.es.nodes.wan.only", "false") \
    .config("spark.es.resource", "dbb2") \
    .getOrCreate()

sc = spark.sparkContext

# ES functions

#### auxiliary functions

In [5]:
def get_match_cols_and_values(vars_col, query_type, add_id_col):
    """
    query_type must be 'exact' for building exact queries or 'general' for any else query and comparison.
    """
    config_ = config_bc.value
    # getting names of indexed columns
    indexed_id_column = config_['datasets_info']['indexed_dataset']['id_column_name']
    
    indexed_cols = config_['datasets_info']['indexed_dataset']['columns']
    
#     if query_type == 'general':
#         indexed_cols = [x for x in indexed_cols if x != indexed_id_column]
        
    # notice that we are linking indexed keys with tolink values
    # the keys will be used to set which field will be fetched on es
    # the values will be used as search content
    tolink_cols_dict = dict(zip(indexed_cols, vars_col))
    
    if add_id_col == False:
        tolink_cols_dict.pop(indexed_id_column, None)
    
    if query_type == 'general':
        return tolink_cols_dict
    elif query_type == 'exact':
        # finding which are the columns used on exact match step
        indexed_exact_match_vars = [indexed_id_column] + [config_['comparisons'][x]['indexed_col'] for x in config_['comparisons'] if config_['comparisons'][x]['must_match'] == 'true']
        non_exact_match_cols = list(set(indexed_cols) - set(indexed_exact_match_vars))
        # deleting those columns of non-exact match
        [tolink_cols_dict.pop(x, None) for x in non_exact_match_cols]
        
        return tolink_cols_dict
    else: 
        print("Please use 'general' or 'exact' as query_type input")
        return None, None

#### indexing

In [6]:
def index_dataframe(dataframe, es_index_name):
    # creating new index
    dataframe.write.format("org.elasticsearch.spark.sql") \
                 .option("es.resource", es_index_name).mode('overwrite').save()

#### query building

In [7]:
def build_exact_queries(vars_col): 
    """
    Let us suppose the following values:
    vars_col = ['ROBESPIERRE PITA', '1987-05-05', '1', 'Mari Santos']
    indexed_cols = ['name', 'birthdate', 'sex', 'mothers_name']
    query_size = 10
    
    and only the first two attributes are assigned to exact match.
    So, the resulting query column would be: 
    '{ "size": "50", "query": 
                    { "bool": { "must": [ 
                                {"match": {"name":"ROBESPIERRE PITA"}},
                                {"match": {"birthdate":"19870505"}}] } } }'
    Requirements: 
    - All values on vars_col must be converted into string
    - All the hyphens symbols must be taken from date type used to search (e.g. 1987-05-05 must be converted to 19870505)
    - The config json must be available as a broadcast through sc.broadcast() function.
    - The names of indexed columns must be correctly filled. 
    """
    config_ = config_bc.value
    query_size = config_['query_size']
    
    tolink_cols_dict = get_match_cols_and_values(vars_col, 'exact', False)
    
    # -------------------------------------------- #
    #   starting the building of query string      #
    # -------------------------------------------- #
    # setting the preffix and suffix of query core
    prefix_ = """{"match": {"""
    suffix_ = """}}"""
    
    # filling the query core with all indexed columns and values from vars_col
    strings = []
    for col in list(tolink_cols_dict.keys()):
        string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + str(suffix_)
        strings.append(string)
    
    # building the query core. 
    # Should be like: {"match": {"name":"ROBESPIERRE PITA"}}, {"birthdate": {"name":"1987-05-05"}}
    line = ','.join(strings)
    
    # Finally the final query string
    complete_query = """{ "bool": { "must": [ %s ] } }""" % (line)
    # CHANGELOG: to accomplish with new syntax from ES 8.x, we need to change the complete query from
    # complete_query = """{ "size": "%s", "query": { "bool": { "must": [ %s ] } } }""" % (query_size,line)
    # to
    # complete_query = """{ "bool": { "must": [ %s ] } } }""" % (line)
    # read more in: https://github.com/elastic/elasticsearch-py/issues/1698
    #               https://www.elastic.co/guide/en/elasticsearch/client/python-api/8.1/examples.html
    
    return complete_query
udf_build_exact_queries = F.udf(build_exact_queries, StringType()) 

def build_non_exact_queries(vars_col): 
    """
    Let us suppose the following values:
    vars_col = ['ROBESPIERRE PITA', '1987-05-05', '1', 'Mari Santos']
    indexed_cols = ['name', 'birthdate', 'sex', 'mothers_name']
    query_size = 10
    
    and only the first two attributes are assigned to exact match.
    So, the resulting query column would be: 
    '{"bool": { 
                 "should": [ 
                     {'match': {'nome_a': {'query': 'ROBESPIERRE PITA', 'fuzziness':'AUTO', 'operator':'or', 'boost':'3.0'}}},
                     {"match": {"birthdate":"19870505"}} ] } } }
                     {"term": {"sexo_a":"1"}} ] } } }'
    Requirements: 
    - All values on vars_col must be converted into string
    - All the hyphens symbols must be taken from date type used to search (e.g. 1987-05-05 must be converted to 19870505)
    - The config json must be available as a broadcast through sc.broadcast() function.
    - The names of indexed columns must be correctly filled. 
    """
    config_ = config_bc.value
    query_size = config_['query_size']
    
    tolink_cols_dict = get_match_cols_and_values(vars_col, 'exact', False)
    
    # -------------------------------------------- #
    #   starting the building of query string      #
    # -------------------------------------------- #
    
    # filling the query core with all indexed columns and values from vars_col
    comparisons = [config['comparisons'][x] for x in config['comparisons']]
    strings = []
    for col in list(tolink_cols_dict.keys()):
        query_col_instructions = [x for x in comparisons if x['indexed_col'] == col][0]
        query_type = str(query_col_instructions['query_type'])
        prefix_ = """{"%s": {""" % query_type
        suffix_ = """}}"""

        if query_col_instructions['should_match'] == 'true':
            if query_col_instructions['is_fuzzy'] == 'true':
                boost = str(query_col_instructions['boost'])
                string = str(prefix_) + "\"" + str(col) + "\"" + ":" + " { \"query\" : \"" +  str(tolink_cols_dict[col]) + "\"" + ", \"fuzziness\":\"AUTO\", \"operator\":\"or\", \"boost\":\"" + boost + "\" }" + str(suffix_)
                
            if query_col_instructions['is_fuzzy'] == 'false':
                string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + str(suffix_)
        strings.append(string)
    
    # building the query core. 
    # is_fuzzy = 'true' should be like: {"match": {"name":"ROBESPIERRE PITA", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}}, {"term": {"dt_nasc_a":"20070816"}}
    line = ','.join(strings)
    
    # Finally the final query string
    complete_query = """{ "bool": { "should": [ %s ] } }""" % (line)
    # CHANGELOG: to accomplish with new syntax from ES 8.x, we need to change the complete query from
    # complete_query = """{ "size": "%s", "query": { "bool": { "should": [ %s ] } } }""" % (query_size,line)
    # to
    # complete_query = """{ "bool": { "should": [ %s ] } } }""" % (query_size,line)
    # read more in: https://github.com/elastic/elasticsearch-py/issues/1698
    #               https://www.elastic.co/guide/en/elasticsearch/client/python-api/8.1/examples.html
    
    return complete_query
udf_build_non_exact_queries = F.udf(build_non_exact_queries, StringType())

#### finding matches

In [8]:
def find_elasticsearch_exact_best_candidate(vars_col, exact_queries_col):
    """
    Let us suppose a column with the following query:
    
    '{ "bool": { "must": [ 
                    {"match": {"name":"ROBESPIERRE PITA"}},
                    {"match": {"birthdate":"19870505"}}] } }'
    
    so, this function must return a dict with N results like: 
        {'_index': 'test', '_type': '_doc', '_id': 'aaabbbccc', '_score': 43.9280841,
        '_source': {'name': 'ROBESPIERRE PITA', 'birthdate': '19870505', 'other_col': 'other_value'}},
    
    being N the query_size value set on config, you can see this number on the 'size' field of the query.
    
    This result can now be used to compute the proper similarity and pick the 
    best candidate for each record
    """
    from elasticsearch import Elasticsearch
    from ast import literal_eval

    config_ = config_bc.value
    
    es_connect_string = config_['es_connect_string']
    es_index_name = config_['es_index_name']
    query_size = config_['query_size']
    
    es = Elasticsearch(es_connect_string)
    
    exact_queries_col = literal_eval(exact_queries_col)
    # CHANGELOG: To accomplish with the syntax of ES 8.x we need to add the lines
    # from ast import literal_eval
    # exact_queries_col = literal_eval(exact_queries_col)
    # String typed query contents does not meet the new requisits, rainsing the following error (even when they are properly written): 
    # # elasticsearch.BadRequestError: BadRequestError(400, 'parsing_exception', 'Unknown key for a VALUE_STRING in [query].')
    
    candidates = es.search(index=es_index_name, query=exact_queries_col, size=query_size)['hits']['hits']
    # CHANGELOG: To accomplish with the syntax of ES 8.x we need to change the line from:
    # candidates = es.search(index=es_index_name, body=exact_queries_col)['hits']['hits']
    # to 
    # candidates = es.search(index=es_index_name, query=exact_queries_col, size=query_size)['hits']['hits']
    # This could avoid the following errors: 
    # # ValueError: Couldn't merge 'body' with other parameters as it wasn't a mapping. Instead of using 'body' use individual API parameters
    # # elasticsearch.BadRequestError: BadRequestError(400, 'parsing_exception', 'Unknown key for a VALUE_STRING in [query].')
    
    if len(candidates) == 0:
        best_score_id, best_score_value, scores = 'null', 'null', 'null'
        return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(best_score_id, best_score_value, scores)
    else:
        cols_and_values = get_match_cols_and_values(vars_col, 'general', True)
        best_score_id, best_score_value, scores = find_best_candidates(cols_and_values, candidates)

        if float(best_score_value) >= float(config_['cutoff_exact_match']):
            return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(best_score_id, best_score_value, scores)
        else: 
            best_score_id, best_score_value, scores = 'null', 'null', 'null'
            return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(best_score_id, best_score_value, scores)

schema = StructType([StructField("best_candidate_exact", StringType(), False), 
                     StructField("sim_best_candidate_exact", StringType(), False), 
                     StructField("similarity_exact_candidates", StringType(), False)])
udf_find_elasticsearch_exact_best_candidate = F.udf(find_elasticsearch_exact_best_candidate, schema)


def find_elasticsearch_non_exact_best_candidate(vars_col, non_exact_queries_col):
    """
    Let us suppose a column with the following query:
    
    '{ "bool": { 
                 "should": [ 
                     {"match": {"nome_a":"ROBESPIERRE PITA", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}},
                     {"match": {"birthdate":"19870505"}} ] } }'
    
    so, this function must return a dict with N results like: 
        {'_index': 'test', '_type': '_doc', '_id': 'aaabbbccc', '_score': 43.9280841,
        '_source': {'name': 'ROBESPIERRE PITA', 'birthdate': '19870505', 'other_col': 'other_value'}},
    
    being N the query_size value set on config, you can see this number on the 'size' field of the query.
    
    This result can now be used to compute the proper similarity and pick the 
    best candidate for each record
    """
    from elasticsearch import Elasticsearch
    from ast import literal_eval
    
    config_ = config_bc.value
    
    es_connect_string = config_['es_connect_string']
    es_index_name = config_['es_index_name']
    query_size = config_['query_size']
    
    es = Elasticsearch(es_connect_string)
    
    non_exact_queries_col = literal_eval(non_exact_queries_col)
    # CHANGELOG: To accomplish with the syntax of ES 8.x we need to add the lines
    # from ast import literal_eval
    # exact_queries_col = literal_eval(exact_queries_col)
    # String typed query contents does not meet the new requisits, rainsing the following error (even when they are properly written): 
    # # elasticsearch.BadRequestError: BadRequestError(400, 'parsing_exception', 'Unknown key for a VALUE_STRING in [query].')
    
    
    candidates = es.search(index=es_index_name, query=non_exact_queries_col, size=query_size)['hits']['hits']
    # CHANGELOG: To accomplish with the syntax of ES 8.x we need to change the line from:
    # candidates = es.search(index=es_index_name, body=non_exact_queries_col)['hits']['hits']
    # to 
    # candidates = es.search(index=es_index_name, query=non_exact_queries_col, size=query_size)['hits']['hits']
    # This could avoid the following errors: 
    # # ValueError: Couldn't merge 'body' with other parameters as it wasn't a mapping. Instead of using 'body' use individual API parameters
    # # elasticsearch.BadRequestError: BadRequestError(400, 'parsing_exception', 'Unknown key for a VALUE_STRING in [query].')
    
    
    if len(candidates) == 0:
        best_score_id, best_score_value, scores = 'null', 'null', 'null'
        return T.Row('best_candidate_non_exact', 'sim_best_candidate_non_exact', 'similarity_non_exact_candidates')(best_score_id, best_score_value, scores)
    else:
        cols_and_values = get_match_cols_and_values(vars_col, 'general', True)
        best_score_id, best_score_value, scores = find_best_candidates(cols_and_values, candidates)
        return T.Row('best_candidate_non_exact', 'sim_best_candidate_non_exact', 'similarity_non_exact_candidates')(best_score_id, best_score_value, scores)
        
schema = StructType([StructField("best_candidate_non_exact", StringType(), False), 
                     StructField("sim_best_candidate_non_exact", StringType(), False), 
                     StructField("similarity_non_exact_candidates", StringType(), False)])
udf_find_elasticsearch_non_exact_best_candidate = F.udf(find_elasticsearch_non_exact_best_candidate, schema)



def find_best_candidates(cols_and_values, candidates):
    
    config_ = config_bc.value
    indexed_id_col = config_['datasets_info']['indexed_dataset']['id_column_name']
    id_value = cols_and_values[indexed_id_col]
    scores = {}
    
    for candidate in candidates:
        candidate_id = candidate['_source'][indexed_id_col]
        sim_candidate = []

        for col_and_value in list(cols_and_values.keys()):
            if col_and_value != indexed_id_col:
                comparison_info = [config_['comparisons'][x] for x in config_['comparisons'] if config_['comparisons'][x]['indexed_col'] == col_and_value][0]
                n_comparisons = len(config_['comparisons'].keys())

                sim_for_pair_of_cols = similarity_hub(n_comparisons, comparison_info, cols_and_values[col_and_value], candidate['_source'][col_and_value])

                sim_candidate.append(sim_for_pair_of_cols)

        score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
        score = (sum(sim_candidate))/score_max
    
        scores[candidate_id] = score
    
    if len(scores) > 0:
        best_score_id = max(scores, key=scores.get)
        best_score_value = scores[best_score_id]
    else: 
        best_score_id = 'null'
        best_score_value = '0.0'
        scores = '{}'
    return best_score_id, best_score_value, scores
    
    
def similarity_hub(n_comparisons, comparison_info, col_and_value, candidate):
    """
    Currently the CIDACS-RL uses overlap for categorical data, jaro_winkler for names and hamming for dates.
    """
    import jellyfish
    
    # getting relevant information for this pair of values
    config_ = config_bc.value
#     score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
    similarity = 0.0
    weight = float(comparison_info['weight'])
    penalty = float(comparison_info['penalty'])
    
    # first, test if some value are missing
    if (candidate == config_['null_value']) or (col_and_value == config_['null_value'])\
        or (candidate == "") or (col_and_value == "") or (candidate == None) or (col_and_value == None):
        similarity = similarity - penalty
    else: 
        sim_type = comparison_info['similarity']
        if (sim_type == 'overlap') and (col_and_value == candidate):
            similarity += (1.0) * weight
            return similarity
        elif (sim_type == 'overlap') and (col_and_value != candidate):
            similarity += 0.0
            return similarity
        elif sim_type == 'jaro_winkler':
            similarity += jellyfish.jaro_winkler(col_and_value, candidate) * weight
        elif sim_type == 'hamming':
            max_size = max(len(col_and_value), len(candidate))
            similarity += 1.0 - float(jellyfish.hamming_distance(col_and_value, candidate)/max_size) * weight
        else: 
            print('Please inform valid similarities for cidacs-rl')
        
        similarity = similarity
    return similarity    

#### main functions

In [9]:
def cidacs_rl_exact_phase(tolink_dataset):
    """
    This function take a dataframe to link with an indexed dataframe on elasticsearch.
    It consists in three main steps: 
        1) The first step consists in create an array column from a set of columns used on integration
        
        withColumn('vars', F.array(tolink_cols)) input: 
        +-----------+--------------------+------+
        |id_cidacs_b|                nome|  sexo|
        +-----------+--------------------+------+
        |          0|    ROBESPIERRE PITA|     1|
        +-----------+--------------------+------+
        
        withColumn('vars', F.array(tolink_cols)) output: 
        +-----------+--------------------+------+--------------------------+
        |id_cidacs_b|                nome|  sexo|                      vars|
        +-----------+--------------------+------+--------------------------+
        |        614|    ROBESPIERRE PITA|     1|  [0, ROBESPIERRE PITA, 1]|
        +-----------+--------------------+------+--------------------------+
        
        2) The second step will take the new array col as input and build exact queries:
        
        udf_build_exact_queries(F.col('vars')) output:
        
        { "size": "50",
            "query": { "bool": 
            { "must": [ 
                {"match": {"nome_a":"ROBESPIERRE PITA"}},
                {"match": {"sexo_a":"1"}} ] } } }
        
        +-----------+-----------------+------+--------------------------+----------------+
        |id_cidacs_b|             nome|  sexo|                      vars|     exact_query|
        +-----------+-----------------+------+--------------------------+----------------+
        |        614|    ROBESPIERR...|     1|  [0, ROBESPIERRE PITA, 1]| { "size": "5...|
        +-----------+-----------------+------+--------------------------+----------------+
        
        3) Finally, a udf should generate 3 new columns with the best candidate id, the similarity with 
           this best candidate, and the set of candidates scores. 
        
        +--------------------+------------------------+---------------------------+
        |best_candidate_exact|sim_best_candidate_exact|similarity_exact_candidates|
        +--------------------+------------------------+---------------------------+
        |                 614|                       1|        {614: 1, 34: 0.8...|
        +--------------------+------------------------+---------------------------+
        
    At last, this function should return the tolink_dataset with all these columns
    """ 
    start = time.time()
    # ------------------------------------ #
    # getting relevant values from config
    # ------------------------------------ #
    
    # collecting config json from broadcasted variable
    config_ = config_bc.value
    
    tolink_id_column = config_['datasets_info']['tolink_dataset']['id_column_name']
    
    tolink_columns = config_['datasets_info']['tolink_dataset']['columns']
    
    temp_dir = config['temp_dir']
    
    paralelism = int(config_['datasets_info']['indexed_dataset']['default_paralelism'])

    prefix_sl = "StorageLevel."
    storage_level = config['datasets_info']['indexed_dataset']['storage_level']
    
    write_checkpoint = config_['write_checkpoint']
    
    # ------------------------------------ #
    # preparing exact search
    # ------------------------------------ #
    # selecting columns
    tolink_dataset = tolink_dataset.select(tolink_columns)
    # building array of variable values
    tolink_dataset = tolink_dataset.withColumn('vars', F.array(tolink_columns))
    # building exact queries
    tolink_dataset = tolink_dataset.withColumn('exact_queries', udf_build_exact_queries(F.col('vars')))
    # finding the best candidate for each tolink record
    tolink_dataset = tolink_dataset.withColumn('result_exact_search', F.explode(F.array(udf_find_elasticsearch_exact_best_candidate(F.col('vars'), F.col('exact_queries')))))
    
    if write_checkpoint == 'true':
        # writing temporary data from this point helps to reset the DAG and improve performance
        tolink_dataset.write.parquet(temp_dir+'result_exact_search.parquet', mode='overwrite')
        tolink_dataset = spark.read.parquet(temp_dir+'result_exact_search.parquet').repartition(paralelism).persist(eval(prefix_sl+storage_level))
    
    # exploding array columns from the last function into 4 atomic cols
    tolink_dataset = tolink_dataset.withColumn('best_candidate_exact', tolink_dataset.result_exact_search['best_candidate_exact'])
    tolink_dataset = tolink_dataset.withColumn('sim_best_candidate_exact', tolink_dataset.result_exact_search['sim_best_candidate_exact'])
    tolink_dataset = tolink_dataset.withColumn('similarity_exact_candidates', tolink_dataset.result_exact_search['similarity_exact_candidates'])
    
    tolink_dataset = tolink_dataset.withColumn('sim_best_candidate_exact', F.col('sim_best_candidate_exact').cast('float'))
    
    # dropping array columns
    cols_to_drop = ['result_exact_search']
    tolink_dataset = tolink_dataset.drop(*cols_to_drop)
    
    print("\t[CIDACS-RL] time for exact phase: {} secs".format(time.time()-start))
    return tolink_dataset



def cidacs_rl_non_exact_phase(tolink_dataset):
    """
    This function take a dataframe from exact match phase and submit it to a non exact search.
    cidacs_rl_non_exact_phase(tolink_dataset) input: 
    
    +--------------------------+--------------------+------------------------+---------------------------+
    |                      vars|best_candidate_exact|sim_best_candidate_exact|similarity_exact_candidates|
    +--------------------------+--------------------+------------------------+---------------------------+
    |       [2, SAMILA SENA, 2]|                null|                    null|                       null|
    +--------------------------+--------------------+------------------------+---------------------------+
        
    cidacs_rl_non_exact_phase(tolink_dataset) output: 
        
        +------------------------+----------------------------+-------------------------------+
        |best_candidate_non_exact|sim_best_candidate_non_exact|similarity_exact_non_candidates|
        +------------------------+----------------------------+-------------------------------+
        |                       7|                        0.94|            {7: 0.94, 3: 0.9...|
        +------------------------+----------------------------+-------------------------------+
    
    At last, this function should return the tolink_dataset with all these columns
    """
    # ------------------------------------ #
    # getting relevant values from config
    # ------------------------------------ #
    
    # collecting config json from broadcasted variable
    start = time.time()
    config_ = config_bc.value
    
    tolink_id_column = config_['datasets_info']['tolink_dataset']['id_column_name']
    
    tolink_columns = config_['datasets_info']['tolink_dataset']['columns']
    
    temp_dir = config_['temp_dir']
    
    is_debug = config_['debug']
    
    paralelism = int(config_['datasets_info']['indexed_dataset']['default_paralelism'])

    prefix_sl = "StorageLevel."
    storage_level = config['datasets_info']['indexed_dataset']['storage_level']
    
    write_checkpoint = config_['write_checkpoint']
    # ------------------------------------ #
    # preparing non exact search
    # ------------------------------------ #
    
    # building linked_from column. Non-null values on sim_best_candidate_exact must be filled 
    # as 'exact_match', otherwise as 'non_exact_match'.    
    filter_isnull = F.col('sim_best_candidate_exact').isNull()
    tolink_dataset = tolink_dataset.withColumn('linked_from', F.when(filter_isnull, 'non_exact_match').otherwise('exact_match'))
    
    # preparing filters for debug and non-debug executions
    filter_exact = F.col('linked_from') == 'exact_match'
    filter_non_exact = F.col('linked_from') == 'non_exact_match'
    
    if is_debug == 'false': 
        # declaring a filtered version of input dataset
        tolink_dataset_ = tolink_dataset.filter(filter_non_exact)
        # declaring the remainder dataframe
        tolink_dataset = tolink_dataset.filter(filter_exact)
        
        # creating, for remainder dataframe, the cols created in this function to ensure union
        tolink_dataset = tolink_dataset.withColumn('best_candidate_non_exact', F.lit(None))
        tolink_dataset = tolink_dataset.withColumn('sim_best_candidate_non_exact', F.lit(None))
        tolink_dataset = tolink_dataset.withColumn('similarity_non_exact_candidates', F.lit(None))
        tolink_dataset = tolink_dataset.withColumn('non_exact_queries', F.lit(None))
    else: 
        # inside dataframe receives the input integrally
        tolink_dataset_ = tolink_dataset
    
    tolink_dataset_ = tolink_dataset_.withColumn('non_exact_queries', udf_build_non_exact_queries(F.col('vars')))

    tolink_dataset_ = tolink_dataset_.withColumn('result_non_exact_search', F.explode(F.array(udf_find_elasticsearch_non_exact_best_candidate(F.col('vars'), F.col('non_exact_queries')))))
    
    if write_checkpoint == 'true':
        # writing temporary data from this point helps to reset the DAG and improve performance
        tolink_dataset_.write.parquet(temp_dir+'result_non_exact_search.parquet', mode='overwrite')
        tolink_dataset_ = spark.read.parquet(temp_dir+'result_non_exact_search.parquet').repartition(paralelism).persist(eval(prefix_sl+storage_level))

    tolink_dataset_ = tolink_dataset_.withColumn('best_candidate_non_exact', tolink_dataset_.result_non_exact_search['best_candidate_non_exact'])
    tolink_dataset_ = tolink_dataset_.withColumn('sim_best_candidate_non_exact', tolink_dataset_.result_non_exact_search['sim_best_candidate_non_exact'])
    tolink_dataset_ = tolink_dataset_.withColumn('similarity_non_exact_candidates', tolink_dataset_.result_non_exact_search['similarity_non_exact_candidates'])
    
    tolink_dataset_ = tolink_dataset_.withColumn('sim_best_candidate_non_exact', F.col('sim_best_candidate_non_exact').cast('float'))
    
    cols_to_drop = ['result_non_exact_search']
    tolink_dataset_ = tolink_dataset_.drop(*cols_to_drop)
    
    if is_debug == 'false':
        tolink_dataset_ = tolink_dataset_.union(tolink_dataset)
    print("\t[CIDACS-RL] time for non-exact phase: {} secs".format(time.time()-start))
    return tolink_dataset_



def cidacsrl():
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("[CIDACS-RL] starting at {}".format(dt_string))
    start = time.time()

    config_ = config_bc.value
    
    # getting the auxiliary variables
    data_ext = config_['datasets_info']['indexed_dataset']['extension']
    data_path = config_['datasets_info']['indexed_dataset']['path']
    index_df_response = config_['index_data']
    index_name = config_['es_index_name']

    if index_df_response == 'yes':
        start_ = time.time()
        # getting the auxiliary variables
        data_ext = config_['datasets_info']['indexed_dataset']['extension']
        data_path = config_['datasets_info']['indexed_dataset']['path']
        
        paralelism = int(config_['datasets_info']['indexed_dataset']['default_paralelism'])
        
        prefix_sl = "StorageLevel."
        storage_level = config['datasets_info']['indexed_dataset']['storage_level']
        
        # test the extension of the dataset to properly read it
        if data_ext == 'csv':
            indexed_dataset = spark.read.csv(data_path, header=True).repartition(paralelism).persist(eval(prefix_sl+storage_level))
        elif data_ext == 'parquet':
            indexed_dataset = spark.read.parquet(data_path).repartition(paralelism).persist(eval(prefix_sl+storage_level))
        else:
            print("Please make sure the extension for this dataset is set as 'csv' or 'parquet'")
    
        # # indexing, at last
        index_dataframe(indexed_dataset, index_name)
        print("[CIDACS-RL] indexing on, it took {} secs".format(time.time()-start_))
    
    # getting the auxiliary variables
    data_ext = config_['datasets_info']['tolink_dataset']['extension']
    data_path = config_['datasets_info']['tolink_dataset']['path']
    
    paralelism = int(config_['datasets_info']['tolink_dataset']['default_paralelism'])
    
    prefix_sl = "StorageLevel."
    storage_level = config['datasets_info']['tolink_dataset']['storage_level']

    # test the extension of the dataset to properly read it
    if data_ext == 'csv':
        tolink_dataset = spark.read.csv(data_path, header=True).repartition(paralelism).persist(eval(prefix_sl+storage_level))
    elif data_ext == 'parquet':
        tolink_dataset = spark.read.parquet(data_path).repartition(paralelism).persist(eval(prefix_sl+storage_level))
    else:
        print("Please make sure the extension for this dataset is set as 'csv' or 'parquet'")
    
    tolink_dataset = cidacs_rl_exact_phase(tolink_dataset)
    
    tolink_dataset = cidacs_rl_non_exact_phase(tolink_dataset)
    
    tolink_dataset = tolink_dataset.withColumn('final_cidacs_rl_score', 
                                               F.when(F.col('linked_from') == 'exact_match', F.col('sim_best_candidate_exact'))\
                                                .otherwise(F.col('sim_best_candidate_non_exact')))
    
    tolink_dataset = tolink_dataset.withColumn('final_cidacs_rl_id', 
                                               F.when(F.col('linked_from') == 'exact_match', F.col('best_candidate_exact'))\
                                                .otherwise(F.col('best_candidate_non_exact')))
    
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    print("[CIDACS-RL] finished at {}".format(dt_string))
    print("[CIDACS-RL] total time elapsed: {} secs".format(time.time()-start))
    return tolink_dataset

# Running CIDACS-RL

In [10]:
config_file = 'config.json'
f = open(config_file)
config = json.load(f)

# broadcasting config
config_bc = sc.broadcast(config)

In [11]:
config

{'index_data': 'no',
 'es_index_name': 'fd-cidacs-rl',
 'es_connect_string': 'http://localhost:9200',
 'query_size': 50,
 'cutoff_exact_match': '0.95',
 'null_value': '99',
 'temp_dir': '../temp_dataframe/',
 'debug': 'false',
 'write_checkpoint': 'false',
 'datasets_info': {'indexed_dataset': {'path': '../data/sinthetic-dataset-A.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_a', 'nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a'],
   'id_column_name': 'id_cidacs_a',
   'storage_level': 'MEMORY_ONLY',
   'default_paralelism': '16'},
  'tolink_dataset': {'path': '../data/sinthetic-datasets-b-1000.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_b', 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b'],
   'id_column_name': 'id_cidacs_b',
   'storage_level': 'MEMORY_ONLY',
   'default_paralelism': '16'},
  'result_dataset': {'path': '../result/'}},
 'comparisons': {'name': {'indexed_col': 'nome_a',
   'tolink_col': 'nome_b',
   'must_match': 'true',
   'should_match': 

In [13]:
#df_1 = spark.read.parquet('../data/sinthetic-dataset-A.parquet')

In [14]:
linked_data = cidacsrl()

[CIDACS-RL] starting at 12/08/2025 19:36:51


Py4JJavaError: An error occurred while calling o54.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, 172.18.0.3, executor 1): org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:302)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:376)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:444)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1(ParquetFileFormat.scala:490)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1$adapted(ParquetFileFormat.scala:484)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.$anonfun$mergeSchemasInParallel$2(SchemaMergeUtils.scala:75)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.io.FileNotFoundException: File file:/root/data/sinthetic-datasets-b-1000.parquet/part-00000-1c7b70cc-9ae0-4b61-8dff-fb2f23c2279b-c000.snappy.parquet does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:666)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:987)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:656)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:454)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:146)
	at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:347)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:899)
	at org.apache.parquet.hadoop.util.HadoopInputFile.newStream(HadoopInputFile.java:65)
	at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:498)
	at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:476)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$readParquetFootersInParallel$1(ParquetFileFormat.scala:451)
	at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:373)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426)
	at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
	at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
	at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
	at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
	at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2164)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.mergeSchemasInParallel(SchemaMergeUtils.scala:69)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:494)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetUtils$.inferSchema(ParquetUtils.scala:107)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:163)
	at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:198)
	at scala.Option.orElse(Option.scala:447)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:195)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:408)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286)
	at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:755)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:302)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:376)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:444)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1(ParquetFileFormat.scala:490)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1$adapted(ParquetFileFormat.scala:484)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.$anonfun$mergeSchemasInParallel$2(SchemaMergeUtils.scala:75)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.io.FileNotFoundException: File file:/root/data/sinthetic-datasets-b-1000.parquet/part-00000-1c7b70cc-9ae0-4b61-8dff-fb2f23c2279b-c000.snappy.parquet does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:666)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:987)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:656)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:454)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:146)
	at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:347)
	at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:899)
	at org.apache.parquet.hadoop.util.HadoopInputFile.newStream(HadoopInputFile.java:65)
	at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:498)
	at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:476)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$readParquetFootersInParallel$1(ParquetFileFormat.scala:451)
	at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:373)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426)
	at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
	at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
	at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
	at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
	at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)


In [12]:
linked_data.limit(3).toPandas()

                                                                                

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries,best_candidate_exact,sim_best_candidate_exact,similarity_exact_candidates,linked_from,non_exact_queries,best_candidate_non_exact,sim_best_candidate_non_exact,similarity_non_exact_candidates,final_cidacs_rl_score,final_cidacs_rl_id
0,7314,IRLANA CORREIA,IRLANA CORREIA,20090417,1,"[7314, IRLANA CORREIA, IRLANA CORREIA, 2009041...","{ ""bool"": { ""must"": [ {""match"": {""nome_a"":""IRL...",,,,non_exact_match,"{ ""bool"": { ""should"": [ {""match"": {""nome_a"": {...",722901,0.861511,"{106761=0.7999843615607164, 425580=0.850456091...",0.861511,722901
1,4326,LIDIA BELLONIA BARBOSA,LIDIA BELLONIA BARBOSA,20080822,1,"[4326, LIDIA BELLONIA BARBOSA, LIDIA BELLONIA ...","{ ""bool"": { ""must"": [ {""match"": {""nome_a"":""LID...",,,,non_exact_match,"{ ""bool"": { ""should"": [ {""match"": {""nome_a"": {...",338244,0.865734,"{516678=0.8331529581529581, 886362=0.772524350...",0.865734,338244
2,18266,RAPHAEL MIQUEIAS NASCIMENTO,CAROLINA SILVA,20110117,2,"[18266, RAPHAEL MIQUEIAS NASCIMENTO, CAROLINA ...","{ ""bool"": { ""must"": [ {""match"": {""nome_a"":""RAP...",,,,non_exact_match,"{ ""bool"": { ""should"": [ {""match"": {""nome_a"": {...",18266,0.747199,"{918320=0.6190809657196211, 860252=0.631185428...",0.747199,18266


In [13]:
# [CIDACS-RL] starting at 26/01/2022 20:56:21
# 	[CIDACS-RL] time for exact phase: 38.057870864868164 secs
# 	[CIDACS-RL] time for non-exact phase: 45.42489194869995 secs
# [CIDACS-RL] finished at 26/01/2022 20:57:49
# [CIDACS-RL] total time elapsed: 88.37369227409363 secs

In [16]:
linked_data.limit(3).toPandas()

                                                                                

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries,best_candidate_exact,sim_best_candidate_exact,similarity_exact_candidates,linked_from,non_exact_queries,best_candidate_non_exact,sim_best_candidate_non_exact,similarity_non_exact_candidates,final_cidacs_rl_score,final_cidacs_rl_id
0,7314,IRLANA CORREIA,IRLANA CORREIA,20090417,1,"[7314, IRLANA CORREIA, IRLANA CORREIA, 2009041...","{ ""bool"": { ""must"": [ {""match"": {""nome_a"":""IRL...",,,,non_exact_match,"{ ""bool"": { ""should"": [ {""match"": {""nome_a"": {...",722901,0.861511,"{106761=0.7999843615607164, 425580=0.850456091...",0.861511,722901
1,4326,LIDIA BELLONIA BARBOSA,LIDIA BELLONIA BARBOSA,20080822,1,"[4326, LIDIA BELLONIA BARBOSA, LIDIA BELLONIA ...","{ ""bool"": { ""must"": [ {""match"": {""nome_a"":""LID...",,,,non_exact_match,"{ ""bool"": { ""should"": [ {""match"": {""nome_a"": {...",338244,0.865734,"{516678=0.8331529581529581, 886362=0.772524350...",0.865734,338244
2,18266,RAPHAEL MIQUEIAS NASCIMENTO,CAROLINA SILVA,20110117,2,"[18266, RAPHAEL MIQUEIAS NASCIMENTO, CAROLINA ...","{ ""bool"": { ""must"": [ {""match"": {""nome_a"":""RAP...",,,,non_exact_match,"{ ""bool"": { ""should"": [ {""match"": {""nome_a"": {...",18266,0.747199,"{918320=0.6190809657196211, 860252=0.631185428...",0.747199,18266


In [18]:
print("Tempo total de execução: {} secs".format(time.time() - start))

Tempo total de execução: 95.93091297149658 secs


In [19]:
# Tempo total de execução: 105.01147866249084 secs

In [17]:
linked_data.select('linked_from').groupBy('linked_from').count().show()

                                                                                

+---------------+-----+
|    linked_from|count|
+---------------+-----+
|non_exact_match|  458|
|    exact_match|  542|
+---------------+-----+

