In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import pyspark.sql.types as T

import jellyfish
from elasticsearch import Elasticsearch
import json

In [2]:
import pandas as pd
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [3]:
f = open('config.txt')
config = json.load(f)
config_bc = sc.broadcast(config)
config

{'index_data': 'yes',
 'es_index_name': 'fd-cidacs-rl',
 'es_connect_string': 'http://localhost:9200',
 'query_size': 50,
 'cutoff_exact_match': '0.95',
 'null_value': '99',
 'temp_dir': '../0_global_data/fd-cidacs-rl/temp_dataframe/',
 'datasets_info': {'indexed_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-dataset-A.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_a', 'nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a'],
   'id_column_name': 'id_cidacs_a'},
  'tolink_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-1000.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_b', 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b'],
   'id_column_name': 'id_cidacs_b'},
  'result_dataset': {'path': '../0_global_data/result/'}},
 'comparisons': {'name': {'indexed_col': 'nome_a',
   'tolink_col': 'nome_b',
   'must_match': 'true',
   'should_match': 'true',
   'is_fuzzy': 'true',
   'boost': '3.0',
   'query_type': 'match'

# ES functions

#### auxiliary functions

In [4]:
def get_match_cols_and_values(vars_col, query_type, add_id_col):
    """
    query_type must be 'exact' for building exact queries or 'general' for any else query and comparison.
    """
    config_ = config_bc.value
    # getting names of indexed columns
    indexed_id_column = config_['datasets_info']['indexed_dataset']['id_column_name']
    
    indexed_cols = config_['datasets_info']['indexed_dataset']['columns']
    
#     if query_type == 'general':
#         indexed_cols = [x for x in indexed_cols if x != indexed_id_column]
        
    # notice that we are linking indexed keys with tolink values
    # the keys will be used to set which field will be fetched on es
    # the values will be used as search content
    tolink_cols_dict = dict(zip(indexed_cols, vars_col))
    
    if add_id_col == False:
        tolink_cols_dict.pop(indexed_id_column, None)
    
    if query_type == 'general':
        return tolink_cols_dict
    elif query_type == 'exact':
        # finding which are the columns used on exact match step
        indexed_exact_match_vars = [indexed_id_column] + [config_['comparisons'][x]['indexed_col'] for x in config_['comparisons'] if config_['comparisons'][x]['must_match'] == 'true']
        non_exact_match_cols = list(set(indexed_cols) - set(indexed_exact_match_vars))
        # deleting those columns of non-exact match
        [tolink_cols_dict.pop(x, None) for x in non_exact_match_cols]
        
        return tolink_cols_dict
    else: 
        print("Please use 'general' or 'exact' as query_type input")
        return None, None

#### indexing

In [5]:
def index_dataframe(dataframe, es_index_name):
    # creating new index
    dataframe.write.format("org.elasticsearch.spark.sql") \
                 .option("es.resource", es_index_name).mode('overwrite').save()

#### query building

In [6]:
def build_exact_queries(vars_col): 
    """
    Let us suppose the following values:
    vars_col = ['ROBESPIERRE PITA', '1987-05-05', '1', 'Mari Santos']
    indexed_cols = ['name', 'birthdate', 'sex', 'mothers_name']
    query_size = 10
    
    and only the first two attributes are assigned to exact match.
    So, the resulting query column would be: 
    '{ "size": "50", "query": 
                    { "bool": { "must": [ 
                                {"match": {"name":"ROBESPIERRE PITA"}},
                                {"match": {"birthdate":"19870505"}}] } } }'
    Requirements: 
    - All values on vars_col must be converted into string
    - All the hyphens symbols must be taken from date type used to search (e.g. 1987-05-05 must be converted to 19870505)
    - The config json must be available as a broadcast through sc.broadcast() function.
    - The names of indexed columns must be correctly filled. 
    """
    config_ = config_bc.value
    query_size = config_['query_size']
    
    tolink_cols_dict = get_match_cols_and_values(vars_col, 'exact', False)
    
    # -------------------------------------------- #
    #   starting the building of query string      #
    # -------------------------------------------- #
    # setting the preffix and suffix of query core
    prefix_ = """{"match": {"""
    suffix_ = """}}"""
    
    # filling the query core with all indexed columns and values from vars_col
    strings = []
    for col in list(tolink_cols_dict.keys()):
        string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + str(suffix_)
        strings.append(string)
    
    # building the query core. 
    # Should be like: {"match": {"name":"ROBESPIERRE PITA"}}, {"birthdate": {"name":"1987-05-05"}}
    line = ','.join(strings)
    
    # Finally the final query string
    complete_query = """{ "size": "%s", "query": { "bool": { "must": [ %s ] } } }""" % (query_size,line)
    
    return complete_query
udf_build_exact_queries = F.udf(build_exact_queries, StringType()) 

def build_non_exact_queries(vars_col): 
    """
    Let us suppose the following values:
    vars_col = ['ROBESPIERRE PITA', '1987-05-05', '1', 'Mari Santos']
    indexed_cols = ['name', 'birthdate', 'sex', 'mothers_name']
    query_size = 10
    
    and only the first two attributes are assigned to exact match.
    So, the resulting query column would be: 
    '{ "size": "50", 
         "query": { 
             "bool": { 
                 "should": [ 
                     {"match": {"nome_a":"ROBESPIERRE PITA", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}},
                     {"match": {"birthdate":"19870505"}} ] } } }
                     {"term": {"sexo_a":"1"}} ] } } }'
    Requirements: 
    - All values on vars_col must be converted into string
    - All the hyphens symbols must be taken from date type used to search (e.g. 1987-05-05 must be converted to 19870505)
    - The config json must be available as a broadcast through sc.broadcast() function.
    - The names of indexed columns must be correctly filled. 
    """
    config_ = config_bc.value
    query_size = config_['query_size']
    
    tolink_cols_dict = get_match_cols_and_values(vars_col, 'exact', False)
    
    # -------------------------------------------- #
    #   starting the building of query string      #
    # -------------------------------------------- #
    
    # filling the query core with all indexed columns and values from vars_col
    comparisons = [config['comparisons'][x] for x in config['comparisons']]
    strings = []
    for col in list(tolink_cols_dict.keys()):
        query_col_instructions = [x for x in comparisons if x['indexed_col'] == col][0]
        print(col)
        query_type = str(query_col_instructions['query_type'])
        prefix_ = """{"%s": {""" % query_type
        suffix_ = """}}"""

        if query_col_instructions['should_match'] == 'true':
            if query_col_instructions['is_fuzzy'] == 'true':
                
                boost = str(query_col_instructions['boost'])
                string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + ", \"fuzziness\":\"AUTO\", \"operator\":\"or\", \"boost\":\"" + boost + "\"" + str(suffix_)
            if query_col_instructions['is_fuzzy'] == 'false':
                string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + str(suffix_)
        strings.append(string)
    
    # building the query core. 
    # is_fuzzy = 'true' should be like: {"match": {"name":"ROBESPIERRE PITA", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}}, {"term": {"dt_nasc_a":"20070816"}}
    line = ','.join(strings)
    
    # Finally the final query string
    complete_query = """{ "size": "%s", "query": { "bool": { "should": [ %s ] } } }""" % (query_size,line)
    
    return complete_query
udf_build_exact_queries = F.udf(build_exact_queries, StringType())

#### finding matches

In [7]:
def find_elasticsearch_exact_best_candidate(vars_col, exact_queries_col):
    """
    Let us suppose a column with the following query:
    
    '{ "size": "50", "query": 
                    { "bool": { "must": [ 
                                {"match": {"name":"ROBESPIERRE PITA"}},
                                {"match": {"birthdate":"19870505"}}] } } }'
    
    so, this function must return a dict with N results like: 
        {'_index': 'test', '_type': '_doc', '_id': 'aaabbbccc', '_score': 43.9280841,
        '_source': {'name': 'ROBESPIERRE PITA', 'birthdate': '19870505', 'other_col': 'other_value'}},
    
    being N the query_size value set on config, you can see this number on the 'size' field of the query.
    
    This result can now be used to compute the proper similarity and pick the 
    best candidate for each record
    """
    from elasticsearch import Elasticsearch
    config_ = config_bc.value
    
    es_connect_string = config_['es_connect_string']
    es_index_name = config_['es_index_name']
    
    es = Elasticsearch(es_connect_string)
    
    candidates = es.search(index=es_index_name, body=exact_queries_col)['hits']['hits']
    
    if len(candidates) == 0:
        best_score_id, best_score_value, scores = 'null', 'null', 'null'
        return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(best_score_id, best_score_value, scores)
    else:
        cols_and_values = get_match_cols_and_values(vars_col, 'general', True)
        best_score_id, best_score_value, scores = find_best_candidates(cols_and_values, candidates)

        if float(best_score_value) >= float(config_['cutoff_exact_match']):
            return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(best_score_id, best_score_value, scores)
        else: 
            best_score_id, best_score_value, scores = 'null', 'null', 'null'
            return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(best_score_id, best_score_value, scores)

schema = StructType([StructField("best_candidate_exact", StringType(), False), 
                     StructField("sim_best_candidate_exact", StringType(), False), 
                     StructField("similarity_exact_candidates", StringType(), False)])

udf_find_elasticsearch_exact_best_candidate = F.udf(find_elasticsearch_exact_best_candidate, schema)


def find_best_candidates(cols_and_values, candidates):
    
    config_ = config_bc.value
    indexed_id_col = config_['datasets_info']['indexed_dataset']['id_column_name']
    id_value = cols_and_values[indexed_id_col]
    scores = {}
    
    for candidate in candidates:
        candidate_id = candidate['_source'][indexed_id_col]
        sim_candidate = []

        for col_and_value in list(cols_and_values.keys()):
            if col_and_value != indexed_id_col:
                comparison_info = [config_['comparisons'][x] for x in config_['comparisons'] if config_['comparisons'][x]['indexed_col'] == col_and_value][0]
                n_comparisons = len(config_['comparisons'].keys())

                sim_for_pair_of_cols = similarity_hub(n_comparisons, comparison_info, cols_and_values[col_and_value], candidate['_source'][col_and_value])

                sim_candidate.append(sim_for_pair_of_cols)

        score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
        score = (sum(sim_candidate))/score_max
    
        scores[candidate_id] = score
    
#     # taking those records with the same 'id'
#     scores.pop(str(id_value), None)
    
    # finding the best score and id
    if len(scores) > 0:
        best_score_id = max(scores, key=scores.get)
        best_score_value = scores[best_score_id]
    else: 
        best_score_id = 'null'
        best_score_value = '0.0'
        scores = '{}'
    return best_score_id, best_score_value, scores
    
    
def similarity_hub(n_comparisons, comparison_info, col_and_value, candidate):
    """
    Currently the CIDACS-RL uses overlap for categorical data, jaro_winkler for names and hamming for dates.
    """
    import jellyfish
    
    # getting relevant information for this pair of values
    config_ = config_bc.value
#     score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
    similarity = 0.0
    weight = float(comparison_info['weight'])
    penalty = float(comparison_info['penalty'])
    
    # first, test if some value are missing
    if (candidate == config_['null_value']) or (col_and_value == config_['null_value'])\
        or (candidate == "") or (col_and_value == "") or (candidate == None) or (col_and_value == None):
        similarity = similarity - penalty
    else: 
        sim_type = comparison_info['similarity']
        if (sim_type == 'overlap') and(col_and_value == candidate):
            similarity += (1.0) * weight
            return similarity
        elif sim_type == 'jaro_winkler':
            similarity += jellyfish.jaro_winkler(col_and_value, candidate) * weight
        elif sim_type == 'hamming':
            max_size = max(len(col_and_value), len(candidate))
            similarity += 1.0 - float(jellyfish.hamming_distance(col_and_value, candidate)/max_size) * weight
        else: 
            print('Please inform valid similarities for cidacs-rl')
        
        similarity = similarity
    return similarity    

#### main functions

In [None]:
def cidacsrl_exact_match():

# Reading prepocessed datasets

In [8]:
# getting the auxiliary variables
data_ext = config['datasets_info']['indexed_dataset']['extension']
data_path = config['datasets_info']['indexed_dataset']['path']

# test the extension of the dataset to properly read it
if data_ext == 'csv':
    indexed_dataset = spark.read.csv(data_path, header=True)
elif data_ext == 'parquet':
    indexed_dataset = spark.read.parquet(data_path)
else:
    print("Please make sure the extension for this dataset is set as 'csv' or 'parquet'")
    
# All the hyphens symbols must be taken from date type variables converted to string
indexed_dataset = indexed_dataset.withColumn('dt_nasc_a', F.regexp_replace(F.col('dt_nasc_a'), "-", ""))

In [9]:
indexed_dataset.limit(3).toPandas()

Unnamed: 0,id_cidacs_a,nome_a,nome_mae_a,dt_nasc_a,sexo_a
0,1,YASMIM VITORIA MATIAS FONSECA,TACIANY DOS SANTOS,20071122,2
1,2,PEDRO HENRIQUE MARTINS DE CARVALHO,FRANCILEIDE DOS SANTOS ALVES,20061102,1
2,3,FABRICIO RODRIGUES DOS SANTOS,MARCELA MACHADO DA SILVA,20071107,1


In [10]:
# getting the auxiliary variables
data_ext = config['datasets_info']['tolink_dataset']['extension']
data_path = config['datasets_info']['tolink_dataset']['path']

# test the extension of the dataset to properly read it
if data_ext == 'csv':
    tolink_dataset = spark.read.csv(data_path, header=True)
elif data_ext == 'parquet':
    tolink_dataset = spark.read.parquet(data_path)
else:
    print("Please make sure the extension for this dataset is set as 'csv' or 'parquet'")

#### preprocessing tolink dataset

In [11]:
for col in tolink_dataset.columns:
    tolink_dataset = tolink_dataset.withColumn(col, F.col(col).cast('string'))

tolink_dataset = tolink_dataset.na.fill(config['null_value'])

# All the hyphens symbols must be taken from date type variables converted to string
tolink_dataset = tolink_dataset.withColumn('dt_nasc_b', F.regexp_replace(F.col('dt_nasc_b'), "-", ""))

In [12]:
tolink_dataset.limit(3).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2
1,4582,ANA KAROLINA RODRIGUES SOUSA,CELINE RAIMUNDA SILVA,20090614,2
2,4739,NATALIA DAVID BENTO,KETLEN SANTOS,20091222,2


# Indexing dataset

In [13]:
# # make sure all the cols in data are StringType()

# for col in indexed_dataset.columns:
#     indexed_dataset = indexed_dataset.withColumn(col, F.col(col).cast('string'))

# indexed_dataset = indexed_dataset.na.fill(config['null_value'])
    
# # All the hyphens symbols must be taken from date type variables converted to string
# indexed_dataset = indexed_dataset.withColumn('dt_nasc_a', F.regexp_replace(F.col('dt_nasc_a'), "-", ""))

# # indexing, at last
# index_df_response = config['index_data']
# index_name = config['es_index_name']
# if index_df_response == 'yes':
#     index_dataframe(indexed_dataset, index_name)

In [14]:
# index_name = config['es_index_name']
# es = Elasticsearch('http://localhost:9200')
# content = {
#     'size': 1,
#     'query': {
#         'bool': {
#             'must': [
#                 {'match': {'dt_nasc_a': '20070816'}}
#             ]
#         }
#     }
# }
# candidates = es.search(index=index_name, body=content)['hits']['hits']
# candidates

# Linking datasets

#### auxiliary variables

In [15]:
config_ = config_bc.value
query_size = config_['query_size']

#### creating vars column

In [16]:
tolink_id_column = config_['datasets_info']['tolink_dataset']['id_column_name']
tolink_cols = config_['datasets_info']['tolink_dataset']['columns']
# tolink_cols = [x for x in tolink_cols if x != tolink_id_column]

tolink_dataset = tolink_dataset.withColumn('vars', F.array(tolink_cols))
tolink_dataset.limit(2).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[1081, FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA..."
1,4582,ANA KAROLINA RODRIGUES SOUSA,CELINE RAIMUNDA SILVA,20090614,2,"[4582, ANA KAROLINA RODRIGUES SOUSA, CELINE RA..."


In [17]:
tolink_dataset.select('vars').show(truncate=False)

+--------------------------------------------------------------------------------------+
|vars                                                                                  |
+--------------------------------------------------------------------------------------+
|[1081, FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA NASCIMENTO, 20070816, 2]               |
|[4582, ANA KAROLINA RODRIGUES SOUSA, CELINE RAIMUNDA SILVA, 20090614, 2]              |
|[4739, NATALIA DAVID BENTO, KETLEN SANTOS, 20091222, 2]                               |
|[5635, FILHO CARLEIDE NASCIMENTO DA SILVA, LUCIMARA RABELO ALCANTARA, 20100408, 1]    |
|[6614, KAYLANE VITORIA SANTOS FREITAS, FLAVIANA DOS SANTOS, 20100220, 2]              |
|[8684, GABRIELA BARROZO JAKCES, LUCIANA MARIA RODRIGUES DA COSTA, 20101207, 2]        |
|[12206, JOAO VICTOR MARINHO MOREIRA, ELISANGELA OLIVEIRA, 20090206, 1]                |
|[18352, LUAN GUILHERME  DOS SANTOS SILVA, FERNANDA DA SILVA RUIZ, 20110606, 1]        |
|[21426, ANNA BEATRIZ

#### creating exact_queries column

In [18]:
tolink_dataset = tolink_dataset.withColumn('exact_queries', udf_build_exact_queries(F.col('vars')))
tolink_dataset.limit(3).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[1081, FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [..."
1,4582,ANA KAROLINA RODRIGUES SOUSA,CELINE RAIMUNDA SILVA,20090614,2,"[4582, ANA KAROLINA RODRIGUES SOUSA, CELINE RA...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [..."
2,4739,NATALIA DAVID BENTO,KETLEN SANTOS,20091222,2,"[4739, NATALIA DAVID BENTO, KETLEN SANTOS, 200...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [..."


In [19]:
tolink_dataset.select('exact_queries').show(2, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|exact_queries                                                                                                                                                                          |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{ "size": "50", "query": { "bool": { "must": [ {"match": {"nome_a":"FABIOLA FAGUNDES FRICKS"}},{"match": {"nome_mae_a":"LUCIMARA COSTA NASCIMENTO"}},{"match": {"sexo_a":"2"}} ] } } } |
|{ "size": "50", "query": { "bool": { "must": [ {"match": {"nome_a":"ANA KAROLINA RODRIGUES SOUSA"}},{"match": {"nome_mae_a":"CELINE RAIMUNDA SILVA"}},{"match": {"sexo_a":"2"}} ] } } }|
+---------------------------------------------------------------------

<hr />

#### finding the best candidate and similarity

In [20]:
temp_dir = config['temp_dir']

In [21]:
tolink_dataset = tolink_dataset.withColumn('result_exact_search', F.explode(F.array(udf_find_elasticsearch_exact_best_candidate(F.col('vars'), F.col('exact_queries')))))
# writing temporary data from this point helps to reset the DAG and improve performance
tolink_dataset.write.parquet(temp_dir+'result_exact_search.parquet', mode='overwrite')
tolink_dataset.limit(1).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries,result_exact_search
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[1081, FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...","(1081, 1.0, {842866=0.6605594419000216, 436317..."


In [22]:
tolink_dataset = spark.read.parquet(temp_dir+'result_exact_search.parquet')
tolink_dataset.limit(3).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries,result_exact_search
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[1081, FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...","(1081, 1.0, {842866=0.6605594419000216, 436317..."
1,4582,ANA KAROLINA RODRIGUES SOUSA,CELINE RAIMUNDA SILVA,20090614,2,"[4582, ANA KAROLINA RODRIGUES SOUSA, CELINE RA...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...","(4582, 1.0, {730432=0.867459862732972, 604618=..."
2,4739,NATALIA DAVID BENTO,KETLEN SANTOS,20091222,2,"[4739, NATALIA DAVID BENTO, KETLEN SANTOS, 200...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...","(4739, 1.0, {544947=0.7007736649980847, 103694..."


In [23]:
tolink_dataset = tolink_dataset.withColumn('best_candidate_exact', tolink_dataset.result_exact_search['best_candidate_exact'])
tolink_dataset = tolink_dataset.withColumn('sim_best_candidate_exact', tolink_dataset.result_exact_search['sim_best_candidate_exact'])
tolink_dataset = tolink_dataset.withColumn('similarity_exact_candidates', tolink_dataset.result_exact_search['best_candidate_exact'])

cols_to_drop = ['result_exact_search']
tolink_dataset = tolink_dataset.drop(*cols_to_drop)
tolink_dataset.limit(3).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries,best_candidate_exact,sim_best_candidate_exact,similarity_exact_candidates
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[1081, FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...",1081,1.0,1081
1,4582,ANA KAROLINA RODRIGUES SOUSA,CELINE RAIMUNDA SILVA,20090614,2,"[4582, ANA KAROLINA RODRIGUES SOUSA, CELINE RA...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...",4582,1.0,4582
2,4739,NATALIA DAVID BENTO,KETLEN SANTOS,20091222,2,"[4739, NATALIA DAVID BENTO, KETLEN SANTOS, 200...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...",4739,1.0,4739


In [24]:
tolink_dataset.count()

1000

In [25]:
tolink_dataset.withColumn('sim_best_candidate_exact', F.col('sim_best_candidate_exact').cast('long')).filter(F.col('sim_best_candidate_exact').isNull()).count()

199

In [26]:
tolink_dataset = tolink_dataset.withColumn('sim_best_candidate_exact', F.col('sim_best_candidate_exact').cast('long'))

In [28]:
tolink_dataset.printSchema()

root
 |-- id_cidacs_b: string (nullable = true)
 |-- nome_b: string (nullable = true)
 |-- nome_mae_b: string (nullable = true)
 |-- dt_nasc_b: string (nullable = true)
 |-- sexo_b: string (nullable = true)
 |-- vars: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- exact_queries: string (nullable = true)
 |-- best_candidate_exact: string (nullable = true)
 |-- sim_best_candidate_exact: long (nullable = true)
 |-- similarity_exact_candidates: string (nullable = true)



In [27]:
tolink_dataset.select('sim_best_candidate_exact').describe().show()

+-------+------------------------+
|summary|sim_best_candidate_exact|
+-------+------------------------+
|  count|                     801|
|   mean|                     1.0|
| stddev|                     0.0|
|    min|                       1|
|    max|                       1|
+-------+------------------------+



In [29]:
config

{'index_data': 'yes',
 'es_index_name': 'fd-cidacs-rl',
 'es_connect_string': 'http://localhost:9200',
 'query_size': 50,
 'cutoff_exact_match': '0.95',
 'null_value': '99',
 'temp_dir': '../0_global_data/fd-cidacs-rl/temp_dataframe/',
 'datasets_info': {'indexed_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-dataset-A.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_a', 'nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a'],
   'id_column_name': 'id_cidacs_a'},
  'tolink_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-1000.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_b', 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b'],
   'id_column_name': 'id_cidacs_b'},
  'result_dataset': {'path': '../0_global_data/result/'}},
 'comparisons': {'name': {'indexed_col': 'nome_a',
   'tolink_col': 'nome_b',
   'must_match': 'true',
   'should_match': 'true',
   'is_fuzzy': 'true',
   'boost': '3.0',
   'query_type': 'match'

In [65]:
def build_non_exact_queries(vars_col): 
    """
    Let us suppose the following values:
    vars_col = ['ROBESPIERRE PITA', '1987-05-05', '1', 'Mari Santos']
    indexed_cols = ['name', 'birthdate', 'sex', 'mothers_name']
    query_size = 10
    
    and only the first two attributes are assigned to exact match.
    So, the resulting query column would be: 
    '{ "size": "50", "query": 
                    { "bool": { "must": [ 
                                {"match": {"name":"ROBESPIERRE PITA"}},
                                {"match": {"birthdate":"19870505"}}] } } }'
    Requirements: 
    - All values on vars_col must be converted into string
    - All the hyphens symbols must be taken from date type used to search (e.g. 1987-05-05 must be converted to 19870505)
    - The config json must be available as a broadcast through sc.broadcast() function.
    - The names of indexed columns must be correctly filled. 
    """
    config_ = config_bc.value
    query_size = config_['query_size']
    
    tolink_cols_dict = get_match_cols_and_values(vars_col, 'general', False)
    
    # -------------------------------------------- #
    #   starting the building of query string      #
    # -------------------------------------------- #
    
    # filling the query core with all indexed columns and values from vars_col
    comparisons = [config['comparisons'][x] for x in config['comparisons']]
    strings = []
    for col in list(tolink_cols_dict.keys()):
        query_col_instructions = [x for x in comparisons if x['indexed_col'] == col][0]
        print(col)
        query_type = str(query_col_instructions['query_type'])
        prefix_ = """{"%s": {""" % query_type
        suffix_ = """}}"""

        if query_col_instructions['should_match'] == 'true':
            if query_col_instructions['is_fuzzy'] == 'true':
                
                boost = str(query_col_instructions['boost'])
                string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + ", \"fuzziness\":\"AUTO\", \"operator\":\"or\", \"boost\":\"" + boost + "\"" + str(suffix_)
            if query_col_instructions['is_fuzzy'] == 'false':
                string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + str(suffix_)
        strings.append(string)
    
    # building the query core. 
    # is_fuzzy = 'true' should be like: {"match": {"name":"ROBESPIERRE PITA", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}}, {"term": {"dt_nasc_a":"20070816"}}
    line = ','.join(strings)
    
    # Finally the final query string
    complete_query = """{ "size": "%s", "query": { "bool": { "should": [ %s ] } } }""" % (query_size,line)
    
    return complete_query
udf_build_exact_queries = F.udf(build_exact_queries, StringType())

In [33]:
vars_col = ['1081', 'FABIOLA FAGUNDES FRICKS', 'LUCIMARA COSTA NASCIMENTO', '20070816', '2']

In [66]:
build_non_exact_queries(vars_col)

nome_a
nome_mae_a
dt_nasc_a
sexo_a


'{ "size": "50", "query": { "bool": { "should": [ {"match": {"nome_a":"FABIOLA FAGUNDES FRICKS", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}},{"match": {"nome_mae_a":"LUCIMARA COSTA NASCIMENTO", "fuzziness":"AUTO", "operator":"or", "boost":"2.0"}},{"term": {"dt_nasc_a":"20070816"}},{"term": {"sexo_a":"2"}} ] } } }'

In [None]:
{ "size": "50", 
 "query": { 
     "bool": { 
         "should": [ 
             {"match": {"nome_a":"FABIOLA FAGUNDES FRICKS", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}},
             {"match": {"nome_mae_a":"LUCIMARA COSTA NASCIMENTO", "fuzziness":"AUTO", "operator":"or", "boost":"2.0"}},
             {"term": {"sexo_a":"2"}} ] } } }

In [34]:
config_ = config_bc.value
query_size = config_['query_size']

In [54]:
tolink_cols_dict = get_match_cols_and_values(vars_col, 'general', False)
tolink_cols_dict

{'nome_a': 'FABIOLA FAGUNDES FRICKS',
 'nome_mae_a': 'LUCIMARA COSTA NASCIMENTO',
 'dt_nasc_a': '20070816',
 'sexo_a': '2'}

In [60]:
comparisons = [config['comparisons'][x] for x in config['comparisons']]
strings = []
for col in list(tolink_cols_dict.keys()):
    query_col_instructions = [x for x in comparisons if x['indexed_col'] == col][0]
    print(col)
    query_type = str(query_col_instructions['query_type'])
    prefix_ = """{"%s": {""" % query_type
    suffix_ = """}}"""

    if query_col_instructions['should_match'] == 'true':
        if query_col_instructions['is_fuzzy'] == 'true':
            boost = str(query_col_instructions['boost'])
            string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + ", \"fuzziness\":\"AUTO\", \"operator\":\"or\", \"boost\":\"" + boost + "\"" + str(suffix_)
        if query_col_instructions['is_fuzzy'] == 'false':
            string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + str(suffix_)
    print(string)

nome_a
{"match": {"nome_a":"FABIOLA FAGUNDES FRICKS", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}}
nome_mae_a
{"match": {"nome_mae_a":"LUCIMARA COSTA NASCIMENTO", "fuzziness":"AUTO", "operator":"or", "boost":"2.0"}}
dt_nasc_a
{"term": {"dt_nasc_a":"20070816"}}
sexo_a
{"term": {"sexo_a":"2"}}


In [57]:
query_col_instructions

{'indexed_col': 'nome_a',
 'tolink_col': 'nome_b',
 'must_match': 'true',
 'should_match': 'true',
 'is_fuzzy': 'true',
 'boost': '3.0',
 'query_type': 'match',
 'similarity': 'jaro_winkler',
 'weight': 5.0,
 'penalty': 0.02}

In [46]:
query_col_instructions = [x for x in comparisons if x['indexed_col'] == 'nome_a'][0]
query_col_instructions

{'indexed_col': 'nome_a',
 'tolink_col': 'nome_b',
 'must_match': 'true',
 'should_match': 'true',
 'is_fuzzy': 'true',
 'boost': '3.0',
 'query_type': 'match',
 'similarity': 'jaro_winkler',
 'weight': 5.0,
 'penalty': 0.02}

{"match": {"sexo_a":"2", "fuzziness":"AUTO", "operator":"or", "boost":"3.0"}}


In [44]:
string

'{"match": {"sexo_a":"2"}}'

In [None]:
content = {
        'size': 5,
        'query': {
            'bool': {
                'must': [
                    {'match': {'lb_sex': sex}},
                    {'match': {'lb_birthday_child': birthday_child}},
                    {'match': {'lb_addr_residence': addr_residence}}
                ]
            }
        }
    }

In [None]:
content = {
        'size': 5,
        'query': {
            'bool': {
                'should': [
                    {'match': {'lb_sex': {'query': sex, 'fuzziness':'AUTO', 'operator':'or', 'boost':'2.0'}}},
                    {'match': {'lb_addr_residence': {'query': addr_residence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'4.0'}}},
                    {'match': {'lb_addr_occurrence': {'query': addr_occurrence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'0.5'}}},
                    {'match': {'lb_state_residence': {'query': state_residence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'1.0'}}},
                    {'match': {'lb_state_occurrence': {'query': state_occurrence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'0.5'}}},
                    {'match': {'lb_state_mun_residence': {'query': state_mun_residence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'3.0'}}},
                    {'match': {'lb_state_mun_occurrence': {'query': state_mun_occurrence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'0.5'}}},
                    {'match': {'lb_day_birth': {'query': day_birth}}},
                    {'match': {'lb_month_birth': {'query': month_birth}}},
                    {'match': {'lb_year_birth': {'query': year_birth}}},
                    {'match': {'lb_mun_residence_pad': {'query': mun_residence_pad}}},
                    {'match': {'lb_loc_residence_pad': {'query': loc_residence_pad}}},
                    {'match': {'lb_mun_occurrence_pad': {'query': mun_occurrence_pad}}},
                    {'match': {'lb_loc_occurrence_pad': {'query': loc_occurrence_pad}}},
                    {'term': {'lb_birthday_child': birthday_child}}
                ]
            }
        }
    }