In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import pyspark.sql.types as T

import jellyfish
from elasticsearch import Elasticsearch
import json

In [2]:
import pandas as pd
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [3]:
f = open('config.txt')
config = json.load(f)
config_bc = sc.broadcast(config)
config

{'index_data': 'yes',
 'es_index_name': 'fd-cidacs-rl',
 'es_connect_string': 'http://localhost:9200',
 'query_size': 50,
 'cutoff_exact_match': '0.95',
 'null_value': '99',
 'datasets_info': {'indexed_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-dataset-A.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_a', 'nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a'],
   'id_column_name': 'id_cidacs_a'},
  'tolink_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-1000.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_b', 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b'],
   'id_column_name': 'id_cidacs_b'},
  'result_dataset': {'path': '../0_global_data/result/'}},
 'comparisons': {'name': {'indexed_col': 'nome_a',
   'tolink_col': 'nome_b',
   'must_match': 'true',
   'should_match': 'true',
   'is_fuzzy': 'true',
   'boost': '3.0',
   'query_type': 'match',
   'similarity': 'jaro_winkler',
   'weight': 1.0,
   'penal

# ES functions

#### auxiliary functions

In [4]:
def get_match_cols_and_values(vars_col, query_type):
    """
    query_type must be 'exact' for building exact queries or 'general' for any else query and comparison.
    """
    # getting names of indexed columns
    indexed_id_column = config_['datasets_info']['indexed_dataset']['id_column_name']
    
    indexed_cols = config_['datasets_info']['indexed_dataset']['columns']
    indexed_cols = [x for x in indexed_cols if x != indexed_id_column]
        
    # notice that we are linking indexed keys with tolink values
    # the keys will be used to set which field will be fetched on es
    # the values will be used as search content
    tolink_cols_dict = dict(zip(indexed_cols, vars_col))
    
    if query_type == 'general':
        return tolink_cols_dict
    elif query_type == 'exact':
        # finding which are the columns used on exact match step
        indexed_exact_match_vars = [config_['comparisons'][x]['indexed_col'] for x in config_['comparisons'] if config_['comparisons'][x]['must_match'] == 'true']
        non_exact_match_cols = list(set(indexed_cols) - set(indexed_exact_match_vars))
        # deleting those columns of non-exact match
        [tolink_cols_dict.pop(x, None) for x in non_exact_match_cols]
        
        return tolink_cols_dict
    else: 
        print("Please use 'general' or 'exact' as query_type input")
        return None

#### indexing

In [5]:
def index_dataframe(dataframe, es_index_name):
    # creating new index
    dataframe.write.format("org.elasticsearch.spark.sql") \
                 .option("es.resource", es_index_name).mode('overwrite').save()

#### exact query building

In [6]:
def build_exact_queries(vars_col): 
    """
    Let us suppose the following values:
    vars_col = ['ROBESPIERRE PITA', '1987-05-05', '1', 'Mari Santos']
    indexed_cols = ['name', 'birthdate', 'sex', 'mothers_name']
    query_size = 10
    
    and only the first two attributes are assigned to exact match.
    So, the resulting query column would be: 
    '{ "size": "50", "query": 
                    { "bool": { "must": [ 
                                {"match": {"name":"ROBESPIERRE PITA"}},
                                {"match": {"birthdate":"19870505"}}] } } }'
    Requirements: 
    - All values on vars_col must be converted into string
    - All the hyphens symbols must be taken from date type used to search (e.g. 1987-05-05 must be converted to 19870505)
    - The config json must be available as a broadcast through sc.broadcast() function.
    - The names of indexed columns must be correctly filled. 
    """
    config_ = config_bc.value
    query_size = config_['query_size']
    
    tolink_cols_dict = get_match_cols_and_values(vars_col, 'exact')
    
    # -------------------------------------------- #
    #   starting the building of query string      #
    # -------------------------------------------- #
    # setting the preffix and suffix of query core
    prefix_ = """{"match": {"""
    suffix_ = """}}"""
    
    # filling the query core with all indexed columns and values from vars_col
    strings = []
    for col in list(tolink_cols_dict.keys()):
        string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + str(suffix_)
        print(string)
        strings.append(string)
    
    # building the query core. 
    # Should be like: {"match": {"name":"ROBESPIERRE PITA"}}, {"birthdate": {"name":"1987-05-05"}}
    line = ','.join(strings)
    
    # Finally the final query string
    complete_query = """{ "size": "%s", "query": { "bool": { "must": [ %s ] } } }""" % (query_size,line)
    
    return complete_query
udf_build_exact_queries = F.udf(build_exact_queries, StringType()) 

#### finding matches

In [7]:
def find_elasticsearch_exact_best_candidate(vars_col, exact_queries_col):
    """
    Let us suppose a column with the following query:
    
    '{ "size": "50", "query": 
                    { "bool": { "must": [ 
                                {"match": {"name":"ROBESPIERRE PITA"}},
                                {"match": {"birthdate":"19870505"}}] } } }'
    
    so, this function must return a dict with N results like: 
        {'_index': 'test', '_type': '_doc', '_id': 'aaabbbccc', '_score': 43.9280841,
        '_source': {'name': 'ROBESPIERRE PITA', 'birthdate': '19870505', 'other_col': 'other_value'}},
    
    being N the query_size value set on config, you can see this number on the 'size' field of the query.
    
    This result can now be used to compute the proper similarity and pick the 
    best candidate for each record
    """
    from elasticsearch import Elasticsearch
    config_ = config_bc.value
    
    es_connect_string = config_['es_connect_string']
    es_index_name = config_['es_index_name']
    
    es = Elasticsearch(es_connect_string)
    
    candidates = es.search(index=es_index_name, body=exact_queries_col)['hits']['hits']
    
    if len(candidates) == 0:
        return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(None, None, None)
    else:
        cols_and_values = get_match_cols_and_values(vars_col, 'general')
        best_score_id, best_score_value, scores = find_best_candidates(cols_and_values, candidates)
        if best_score_value >= float(config_['cutoff_exact_match']):
            return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(best_score_id, best_score_value, scores)
        else: 
            return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(None, None, None)
    
schema = StructType([StructField("best_candidate_exact", StringType(), False), 
                     StructField("sim_best_candidate_exact", StringType(), False), 
                     StructField("similarity_exact_candidates", StringType(), False)])

udf_find_elasticsearch_exact_best_candidate = F.udf(find_elasticsearch_exact_best_candidate, schema)


def find_best_candidates(cols_and_values, candidates):
    
    config_ = config_bc.value
    indexed_id_col = config_['datasets_info']['indexed_dataset']['id_column_name']
    scores = {}
    
    for candidate in candidates:
        
        candidate_id = candidate['_source'][indexed_id_col]
        sim_candidate = []
        for col_and_value in list(cols_and_values.keys()):
            comparison_info = [config_['comparisons'][x] for x in config_['comparisons'] if config_['comparisons'][x]['indexed_col'] == col_and_value][0]
            n_comparisons = len(config_['comparisons'].keys())
            
            sim_for_pair_of_cols = similarity_hub(n_comparisons, comparison_info, cols_and_values[col_and_value], candidate['_source'][col_and_value])
                        
            sim_candidate.append(sim_for_pair_of_cols)
        
        score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
        score = (sum(sim_candidate))/score_max
        
        scores[candidate_id] = score
            
    best_score_id = max(scores, key=scores.get)
    best_score_value = scores[best_score_id]
    return best_score_id, best_score_value, scores
    
    
def similarity_hub(n_comparisons, comparison_info, col_and_value, candidate):
    """
    Currently the CIDACS-RL uses overlap for categorical data, jaro_winkler for names and hamming for dates.
    """
    import jellyfish
    
    # getting relevant information for this pair of values
    config_ = config_bc.value
#     score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
    similarity = 0.0
    weight = float(comparison_info['weight'])
    penalty = float(comparison_info['penalty'])
    
    # first, test if some value are missing
    if (candidate == config_['null_value']) or (col_and_value == config_['null_value'])\
        or (candidate == "") or (col_and_value == "") or (candidate == None) or (col_and_value == None):
        similarity = similarity - penalty
    else: 
        sim_type = comparison_info['similarity']
        if (sim_type == 'overlap') and(col_and_value == candidate):
            similarity += (1.0) * weight
            return similarity
        elif sim_type == 'jaro_winkler':
            similarity += jellyfish.jaro_winkler(col_and_value, candidate) * weight
        elif sim_type == 'hamming':
            max_size = max(len(col_and_value), len(candidate))
            similarity += 1.0 - float(jellyfish.hamming_distance(col_and_value, candidate)/max_size) * weight
        else: 
            print('Please inform valid similarities for cidacs-rl')
        
        similarity = similarity
    return similarity    

# Reading prepocessed datasets

In [8]:
# getting the auxiliary variables
data_ext = config['datasets_info']['indexed_dataset']['extension']
data_path = config['datasets_info']['indexed_dataset']['path']

# test the extension of the dataset to properly read it
if data_ext == 'csv':
    indexed_dataset = spark.read.csv(data_path, header=True)
elif data_ext == 'parquet':
    indexed_dataset = spark.read.parquet(data_path)
else:
    print("Please make sure the extension for this dataset is set as 'csv' or 'parquet'")
    
# All the hyphens symbols must be taken from date type variables converted to string
indexed_dataset = indexed_dataset.withColumn('dt_nasc_a', F.regexp_replace(F.col('dt_nasc_a'), "-", ""))

In [9]:
indexed_dataset.limit(3).toPandas()

Unnamed: 0,id_cidacs_a,nome_a,nome_mae_a,dt_nasc_a,sexo_a
0,1,YASMIM VITORIA MATIAS FONSECA,TACIANY DOS SANTOS,20071122,2
1,2,PEDRO HENRIQUE MARTINS DE CARVALHO,FRANCILEIDE DOS SANTOS ALVES,20061102,1
2,3,FABRICIO RODRIGUES DOS SANTOS,MARCELA MACHADO DA SILVA,20071107,1


In [10]:
# getting the auxiliary variables
data_ext = config['datasets_info']['tolink_dataset']['extension']
data_path = config['datasets_info']['tolink_dataset']['path']

# test the extension of the dataset to properly read it
if data_ext == 'csv':
    tolink_dataset = spark.read.csv(data_path, header=True)
elif data_ext == 'parquet':
    tolink_dataset = spark.read.parquet(data_path)
else:
    print("Please make sure the extension for this dataset is set as 'csv' or 'parquet'")

#### preprocessing tolink dataset

In [11]:
for col in tolink_dataset.columns:
    tolink_dataset = tolink_dataset.withColumn(col, F.col(col).cast('string'))

tolink_dataset = tolink_dataset.na.fill(config['null_value'])

# All the hyphens symbols must be taken from date type variables converted to string
tolink_dataset = tolink_dataset.withColumn('dt_nasc_b', F.regexp_replace(F.col('dt_nasc_b'), "-", ""))

In [12]:
tolink_dataset.limit(3).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2
1,4582,ANA KAROLINA RODRIGUES SOUSA,CELINE RAIMUNDA SILVA,20090614,2
2,4739,NATALIA DAVID BENTO,KETLEN SANTOS,20091222,2


# Indexing dataset

In [163]:
# make sure all the cols in data are StringType()

for col in indexed_dataset.columns:
    indexed_dataset = indexed_dataset.withColumn(col, F.col(col).cast('string'))

indexed_dataset = indexed_dataset.na.fill(config['null_value'])
    
# All the hyphens symbols must be taken from date type variables converted to string
indexed_dataset = indexed_dataset.withColumn('dt_nasc_a', F.regexp_replace(F.col('dt_nasc_a'), "-", ""))

# indexing, at last
index_df_response = config['index_data']
index_name = config['es_index_name']
if index_df_response == 'yes':
    index_dataframe(indexed_dataset, index_name)

In [164]:
# es = Elasticsearch('http://localhost:9200')
# content = {
#     'size': 1,
#     'query': {
#         'bool': {
#             'must': [
#                 {'match': {'dt_nasc_a': '200711asd22'}}
#             ]
#         }
#     }
# }
# es.search(index=index_name, body=content)['hits']['hits']

# Linking datasets

#### auxiliary variables

In [13]:
config_ = config_bc.value
query_size = config_['query_size']

#### creating vars column

In [14]:
tolink_id_column = config_['datasets_info']['tolink_dataset']['id_column_name']
tolink_cols = config_['datasets_info']['tolink_dataset']['columns']
tolink_cols = [x for x in tolink_cols if x != tolink_id_column]

tolink_dataset = tolink_dataset.withColumn('vars', F.array(tolink_cols))
tolink_dataset.limit(2).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA NASCI..."
1,4582,ANA KAROLINA RODRIGUES SOUSA,CELINE RAIMUNDA SILVA,20090614,2,"[ANA KAROLINA RODRIGUES SOUSA, CELINE RAIMUNDA..."


#### creating exact_queries column

In [15]:
tolink_dataset = tolink_dataset.withColumn('exact_queries', udf_build_exact_queries(F.col('vars')))
tolink_dataset.limit(3).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA NASCI...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [..."
1,4582,ANA KAROLINA RODRIGUES SOUSA,CELINE RAIMUNDA SILVA,20090614,2,"[ANA KAROLINA RODRIGUES SOUSA, CELINE RAIMUNDA...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [..."
2,4739,NATALIA DAVID BENTO,KETLEN SANTOS,20091222,2,"[NATALIA DAVID BENTO, KETLEN SANTOS, 20091222, 2]","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [..."


#### finding the best candidate and similarity

In [19]:
tolink_dataset = tolink_dataset.withColumn('result_exact_search', F.explode(F.array(udf_find_elasticsearch_exact_best_candidate(F.col('vars'), F.col('exact_queries')))))
tolink_dataset.limit(1).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries,result_exact_search
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA NASCI...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...","(1081, 1.0, {842866=0.6373916093300152, 436317..."


In [24]:
tolink_dataset = tolink_dataset.withColumn('best_candidate_exact', tolink_dataset.result_exact_search['best_candidate_exact'])
tolink_dataset = tolink_dataset.withColumn('sim_best_candidate_exact', tolink_dataset.result_exact_search['sim_best_candidate_exact'])
tolink_dataset = tolink_dataset.withColumn('similarity_exact_candidates', tolink_dataset.result_exact_search['best_candidate_exact'])

cols_to_drop = ['result_exact_search']
tolink_dataset = tolink_dataset.drop(*cols_to_drop)

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b,vars,exact_queries,result_exact_search,best_score_id_exact_match
0,1081,FABIOLA FAGUNDES FRICKS,LUCIMARA COSTA NASCIMENTO,20070816,2,"[FABIOLA FAGUNDES FRICKS, LUCIMARA COSTA NASCI...","{ ""size"": ""50"", ""query"": { ""bool"": { ""must"": [...","(1081, 1.0, {842866=0.6373916093300152, 436317...",1081


In [22]:
tolink_dataset.printSchema()

root
 |-- id_cidacs_b: string (nullable = false)
 |-- nome_b: string (nullable = false)
 |-- nome_mae_b: string (nullable = false)
 |-- dt_nasc_b: string (nullable = false)
 |-- sexo_b: string (nullable = false)
 |-- vars: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- exact_queries: string (nullable = true)
 |-- result_exact_search: struct (nullable = true)
 |    |-- best_candidate_exact: string (nullable = false)
 |    |-- sim_best_candidate_exact: string (nullable = false)
 |    |-- similarity_exact_candidates: string (nullable = false)



In [None]:
best_score_id, best_score_value, scores

In [None]:
F.explode(F.array(udf_best_candidate_search(F.col('vars'), F.col('queries'), F.lit(es_index_name), F.lit(es_connect_string), F.lit('most_distant'), F.lit(True)))))
def find_elasticsearch_exact_best_candidate(vars_col, exact_queries_col):

In [17]:
# tolink_dataset.limit(1).show(truncate=False)

In [18]:
tolink_dataset.limit(4).withColumn('type_col', udf_find_cidacs_rl_exact_candidates(F.col('es_exact_cand'))).select('type_col').show(truncate=False)

NameError: name 'udf_find_cidacs_rl_exact_candidates' is not defined

In [78]:
for info_comp in config['comparisons']:
    print(config['comparisons'][info_comp])

{'indexed_col': 'nome_a', 'tolink_col': 'nome_b', 'must_match': 'true', 'should_match': 'true', 'is_fuzzy': 'true', 'boost': '3.0', 'query_type': 'match', 'similarity': 'jaro_wikler', 'weight': 1.0, 'penalty': 0.02}
{'indexed_col': 'nome_mae_a', 'tolink_col': 'nome_mae_b', 'must_match': 'true', 'should_match': 'true', 'is_fuzzy': 'true', 'boost': '2.0', 'query_type': 'match', 'similarity': 'jaro_wikler', 'weight': 1.0, 'penalty': 0.02}
{'indexed_col': 'dt_nasc_a', 'tolink_col': 'dt_nasc_b', 'must_match': 'false', 'should_match': 'true', 'is_fuzzy': 'false', 'boost': '', 'query_type': 'term', 'similarity': 'hamming', 'weight': 1.0, 'penalty': 0.02}
{'indexed_col': 'sexo_a', 'tolink_col': 'sexo_b', 'must_match': 'true', 'should_match': 'true', 'is_fuzzy': 'false', 'boost': '', 'query_type': 'term', 'similarity': 'overlap', 'weight': 1.0, 'penalty': 0.02}


In [73]:
def find_cidacs_rl_exact_candidates(es_exact_cand_col):
    import ast
    return type(ast.literal_eval(es_exact_cand_col))
udf_find_cidacs_rl_exact_candidates = F.udf(find_cidacs_rl_exact_candidates, StringType())

In [168]:
candidates = [{'_type':'_doc', '_source':{'dt_nasc_a':'20070816', 'id_cidacs_a':'1081', 'nome_mae_a':'LUCIMARA COSTA NASCIMENTO', 'sexo_a':'2', 'nome_a':'FABIOLA FAGUNDES FRICKS'}, '_id':'ckz6TX4BggJoL2NZaXO6', '_index':'fd-cidacs-rl', '_score':'43.248585'}, 
                 {'_type':'_doc', '_source':{'dt_nasc_a':'20100709', 'id_cidacs_a':'339811', 'nome_mae_a':'ADRIANA FERNANDES COSTA', 'sexo_a':'2', 'nome_a':'FABIOLA FAGUNDES FRICKS'}, '_id':'wFD6TX4BggJoL2NZjO9i', '_index':'fd-cidacs-rl', '_score':'32.082012'}]

In [169]:
vars_col = ['FABIOLA FAGUNDES FRICKS', 'LUCIMARA COSTA NASCIMENTO', '20070816', '2']

In [170]:
cols_and_values = get_match_cols_and_values(vars_col, 'general')
cols_and_values

{'nome_a': 'FABIOLA FAGUNDES FRICKS',
 'nome_mae_a': 'LUCIMARA COSTA NASCIMENTO',
 'dt_nasc_a': '20070816',
 'sexo_a': '2'}

In [171]:
col_and_value

'FABIOLA FAGUNDES FRICKS'

In [172]:
candidate = candidates[0]['_source']['nome_a']
col_and_value = cols_and_values['nome_a']
print(cadidate)
print(col_and_value)

FABIOLA FAGUNDES FRICKS
FABIOLA FAGUNDES FRICKS


In [173]:
config_ = config_bc.value

In [174]:
n_comparisons = len(config_['comparisons'].keys())
n_comparisons

4

In [84]:
comparison_info = [config_['comparisons'][x] for x in config_['comparisons'] if config_['comparisons'][x]['indexed_col'] == 'nome_a'][0]
comparison_info

{'indexed_col': 'nome_a',
 'tolink_col': 'nome_b',
 'must_match': 'true',
 'should_match': 'true',
 'is_fuzzy': 'true',
 'boost': '3.0',
 'query_type': 'match',
 'similarity': 'jaro_winkler',
 'weight': 1.0,
 'penalty': 0.02}

In [59]:
score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
score_max

4.0

In [89]:
similarity_hub(n_comparisons, comparison_info, col_and_value, candidate)

0.25

In [181]:
best_score_id, best_score_value, scores = find_best_candidates(cols_and_values, candidates)

In [185]:
best_score_id

'1081'

In [186]:
best_score_value

1.0

In [187]:
scores

{'1081': 1.0, '339811': 0.751929347826087}

In [180]:
def find_best_candidates(cols_and_values, candidates):
    
    config_ = config_bc.value
    indexed_id_col = config_['datasets_info']['indexed_dataset']['id_column_name']
    scores = {}
    
    for candidate in candidates:
        
        candidate_id = candidate['_source'][indexed_id_col]
        sim_candidate = []
        for col_and_value in list(cols_and_values.keys()):
            comparison_info = [config_['comparisons'][x] for x in config_['comparisons'] if config_['comparisons'][x]['indexed_col'] == col_and_value][0]
            n_comparisons = len(config_['comparisons'].keys())
            
            sim_for_pair_of_cols = similarity_hub(n_comparisons, comparison_info, cols_and_values[col_and_value], candidate['_source'][col_and_value])
                        
            sim_candidate.append(sim_for_pair_of_cols)
        
        score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
        score = (sum(sim_candidate))/score_max
        
        scores[candidate_id] = score
            
    best_score_id = max(scores, key=scores.get)
    best_score_value = scores[best_score_id]
    return best_score_id, best_score_value, scores
    
    
def similarity_hub(n_comparisons, comparison_info, col_and_value, candidate):
    """
    Currently the CIDACS-RL uses overlap for categorical data, jaro_winkler for names and hamming for dates.
    """
    import jellyfish
    
    # getting relevant information for this pair of values
    config_ = config_bc.value
#     score_max = sum([float(config_['comparisons'][x]['weight']) for x in config_['comparisons']])
    similarity = 0.0
    weight = float(comparison_info['weight'])
    penalty = float(comparison_info['penalty'])
    
    # first, test if some value are missing
    if (candidate == config_['null_value']) or (col_and_value == config_['null_value'])\
        or (candidate == "") or (col_and_value == "") or (candidate == None) or (col_and_value == None):
        similarity = similarity - penalty
    else: 
        sim_type = comparison_info['similarity']
        if (sim_type == 'overlap') and(col_and_value == candidate):
            similarity += (1.0) * weight
            return similarity
        elif sim_type == 'jaro_winkler':
            similarity += jellyfish.jaro_winkler(col_and_value, candidate) * weight
        elif sim_type == 'hamming':
            max_size = max(len(col_and_value), len(candidate))
            similarity += 1.0 - float(jellyfish.hamming_distance(col_and_value, candidate)/max_size) * weight
        else: 
            print('Please inform valid similarities for cidacs-rl')
        
        similarity = similarity
    return similarity    

In [188]:
def find_elasticsearch_exact_best_candidate(vars_col, exact_queries_col):
    """
    Let us suppose a column with the following query:
    
    '{ "size": "50", "query": 
                    { "bool": { "must": [ 
                                {"match": {"name":"ROBESPIERRE PITA"}},
                                {"match": {"birthdate":"19870505"}}] } } }'
    
    so, this function must return a dict with N results like: 
        {'_index': 'test', '_type': '_doc', '_id': 'aaabbbccc', '_score': 43.9280841,
        '_source': {'name': 'ROBESPIERRE PITA', 'birthdate': '19870505', 'other_col': 'other_value'}},
    
    being N the query_size value set on config, you can see this number on the 'size' field of the query.
    
    This result can now be used to compute the proper similarity and pick the 
    best candidate for each record
    """
    from elasticsearch import Elasticsearch
    config_ = config_bc.value
    
    es_connect_string = config_['es_connect_string']
    es_index_name = config_['es_index_name']
    
    es = Elasticsearch(es_connect_string)
    
    candidates = es.search(index=es_index_name, body=exact_queries_col)['hits']['hits']
    
    if len(candidates) == 0:
        return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(None, None, None)
    else:
        cols_and_values = get_match_cols_and_values(vars_col, 'general')
        best_score_id, best_score_value, scores = find_best_candidates(cols_and_values, candidates)
        if best_score_value >= float(config_['cutoff_exact_match']):
            return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(best_score_id, best_score_value, scores)
        else: 
            return T.Row('best_candidate_exact', 'sim_best_candidate_exact', 'similarity_exact_candidates')(None, None, None)
    
schema = StructType([StructField("best_candidate_exact", StringType(), False), 
                     StructField("sim_best_candidate_exact", StringType(), False), 
                     StructField("similarity_exact_candidates", StringType(), False)])

udf_find_elasticsearch_exact_best_candidate = F.udf(find_elasticsearch_exact_best_candidate, schema)

In [None]:
indexed_exact_match_vars = [config_['comparisons'][x]['indexed_col'] for x in config_['comparisons'] if config_['comparisons'][x]['must_match'] == 'true']

In [None]:
tolink_cols_dict = dict(zip(indexed_cols, list_values))
tolink_cols_dict

In [81]:
vars_col = ['FABIOLA FAGUNDES FRICKS', 'LUCIMARA COSTA NASCIMENTO', '2007-08-16', '2']

In [84]:
def get_exact_match_cols(vars_col):
    indexed_id_column = config_['datasets_info']['indexed_dataset']['id_column_name']
    tolink_id_column = config_['datasets_info']['tolink_dataset']['id_column_name']

    indexed_cols = config_['datasets_info']['indexed_dataset']['columns']
    indexed_cols = [x for x in indexed_cols if x != indexed_id_column]

    tolink_cols_dict = dict(zip(indexed_cols, vars_col))

    indexed_exact_match_vars = [config_['comparisons'][x]['indexed_col'] for x in config_['comparisons'] if config_['comparisons'][x]['must_match'] == 'true']

    non_exact_match_cols = list(set(indexed_cols) - set(indexed_exact_match_vars))
    [tolink_cols_dict.pop(x, None) for x in non_exact_match_cols]
    
    return tolink_cols_dict

In [85]:
get_exact_match_cols(vars_col)

{'nome_a': 'FABIOLA FAGUNDES FRICKS',
 'nome_mae_a': 'LUCIMARA COSTA NASCIMENTO',
 'sexo_a': '2'}

In [153]:


indexed_id_column = config_['datasets_info']['indexed_dataset']['id_column_name']
tolink_id_column = config_['datasets_info']['tolink_dataset']['id_column_name']

indexed_cols = config_['datasets_info']['indexed_dataset']['columns']
indexed_cols = [x for x in indexed_cols if x != indexed_id_column]
print(indexed_cols)

tolink_cols = config_['datasets_info']['tolink_dataset']['columns']
tolink_cols = [x for x in tolink_cols if x != tolink_id_column]
print(tolink_cols)

['nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a']
['nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b']


In [154]:
list_values = ['FABIOLA FAGUNDES FRICKS', 'LUCIMARA COSTA NASCIMENTO', '2007-08-16', 2]

In [155]:
# notice that we are linking indexed keys with tolink values
tolink_cols_dict = dict(zip(indexed_cols, list_values))
tolink_cols_dict

{'nome_a': 'FABIOLA FAGUNDES FRICKS',
 'nome_mae_a': 'LUCIMARA COSTA NASCIMENTO',
 'dt_nasc_a': '2007-08-16',
 'sexo_a': 2}

In [156]:
indexed_exact_match_vars = [config_['comparisons'][x]['indexed_col'] for x in config_['comparisons'] if config_['comparisons'][x]['must_match'] == 'true']
indexed_exact_match_vars

['nome_a', 'nome_mae_a', 'sexo_a']

In [157]:
# tolink_exact_match_vars = [config_['comparisons'][x]['tolink_col'] for x in config_['comparisons'] if config_['comparisons'][x]['must_match'] == 'true']
# exact_match_vars

In [161]:
non_exact_match_cols = list(set(indexed_cols) - set(indexed_exact_match_vars))
[tolink_cols_dict.pop(x, None) for x in non_exact_match_cols]
tolink_cols_dict

{'nome_a': 'FABIOLA FAGUNDES FRICKS',
 'nome_mae_a': 'LUCIMARA COSTA NASCIMENTO',
 'sexo_a': 2}

In [135]:
query_size = config_['query_size']
print(type(query_size))
print(query_size)

<class 'int'>
50


In [136]:
prefix_ = """{"match": {"""
suffix_ = """}}"""

In [163]:
strings = []
for col in list(tolink_cols_dict.keys()):
    string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(tolink_cols_dict[col]) + "\"" + str(suffix_)
    print(string)
    strings.append(string)

{"match": {"nome_a":"FABIOLA FAGUNDES FRICKS"}}
{"match": {"nome_mae_a":"LUCIMARA COSTA NASCIMENTO"}}
{"match": {"sexo_a":"2"}}


In [164]:
line = ','.join(strings)
line

'{"match": {"nome_a":"FABIOLA FAGUNDES FRICKS"}},{"match": {"nome_mae_a":"LUCIMARA COSTA NASCIMENTO"}},{"match": {"sexo_a":"2"}}'

In [165]:
complete_query = """{ "size": "%s", "query": { "bool": { "must": [ %s ] } } }""" % (query_size,line)
complete_query

'{ "size": "50", "query": { "bool": { "should": [ {"match": {"nome_a":"FABIOLA FAGUNDES FRICKS"}},{"match": {"nome_mae_a":"LUCIMARA COSTA NASCIMENTO"}},{"match": {"sexo_a":"2"}} ] } } }'

In [None]:
to_delete
tolink_cols_dict = 

In [None]:
content = {
        'size': 5,
        'query': {
            'bool': {
                'must': [
                    {'match': {'lb_sex': sex}},
                    {'match': {'lb_birthday_child': birthday_child}},
                    {'match': {'lb_addr_residence': addr_residence}}
                ]
            }
        }
    }

In [None]:
content = {
        'size': 5,
        'query': {
            'bool': {
                'should': [
                    {'match': {'lb_sex': {'query': sex, 'fuzziness':'AUTO', 'operator':'or', 'boost':'2.0'}}},
                    {'match': {'lb_addr_residence': {'query': addr_residence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'4.0'}}},
                    {'match': {'lb_addr_occurrence': {'query': addr_occurrence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'0.5'}}},
                    {'match': {'lb_state_residence': {'query': state_residence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'1.0'}}},
                    {'match': {'lb_state_occurrence': {'query': state_occurrence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'0.5'}}},
                    {'match': {'lb_state_mun_residence': {'query': state_mun_residence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'3.0'}}},
                    {'match': {'lb_state_mun_occurrence': {'query': state_mun_occurrence, 'fuzziness':'AUTO', 'operator':'or', 'boost':'0.5'}}},
                    {'match': {'lb_day_birth': {'query': day_birth}}},
                    {'match': {'lb_month_birth': {'query': month_birth}}},
                    {'match': {'lb_year_birth': {'query': year_birth}}},
                    {'match': {'lb_mun_residence_pad': {'query': mun_residence_pad}}},
                    {'match': {'lb_loc_residence_pad': {'query': loc_residence_pad}}},
                    {'match': {'lb_mun_occurrence_pad': {'query': mun_occurrence_pad}}},
                    {'match': {'lb_loc_occurrence_pad': {'query': loc_occurrence_pad}}},
                    {'term': {'lb_birthday_child': birthday_child}}
                ]
            }
        }
    }

In [55]:
config['comparisons']

{'nome_a': {'compare_to': 'nome_b',
  'must_match': 'true',
  'should_match': 'true',
  'is_fuzzy': 'true',
  'boost': '3.0',
  'query_type': 'match',
  'similarity': 'jaro_wikler',
  'weight': 1.0,
  'penalty': 0.02},
 'nome_mae_a': {'compare_to': 'nome_mae_b',
  'must_match': 'true',
  'should_match': 'true',
  'is_fuzzy': 'true',
  'boost': '2.0',
  'query_type': 'match',
  'similarity': 'jaro_wikler',
  'weight': 1.0,
  'penalty': 0.02},
 'dt_nasc_a': {'compare_to': 'dt_nasc_b',
  'must_match': 'false',
  'should_match': 'true',
  'is_fuzzy': 'false',
  'boost': '',
  'query_type': 'term',
  'similarity': 'hamming',
  'weight': 1.0,
  'penalty': 0.02},
 'sexo_a': {'compare_to': 'sexo_b',
  'must_match': 'true',
  'should_match': 'true',
  'is_fuzzy': 'false',
  'boost': '',
  'query_type': 'term',
  'similarity': 'overlap',
  'weight': 1.0,
  'penalty': 0.02}}

In [None]:
def build_queries(list_of_values_col, query_size):
    """
    Let us suppose the following column values:
    list_of_cols = ['ab', 'vbx']
    list_of_values = ['2', 'mamao']
    so, this udf must return a column with values like this: 
    {"match": {"ab":"2"}},{"match": {"vbx":"mamao"}}
    """
    query_size = str(query_size)
    list_of_cols = cols.value  # [x for x in cols.value if x != 'id']
    prefix_ = """{"match": {"""
    suffix_ = """}}"""
    strings = []
    dict_cols = dict(zip(list_of_cols, list_of_values_col))
    
    for col in list(dict_cols.keys()):
        string = str(prefix_) + "\"" + str(col) + "\"" + ":" + "\"" +  str(dict_cols[col]) + "\"" + str(suffix_)
        strings.append(string)
    
    line = ','.join(strings)
    complete_query = """{ "size": "%s", "query": { "bool": { "should": [ %s ] } } }""" % (query_size,line)
    return complete_query
udf_build_queries = F.udf(build_queries, StringType()) 