In [1]:
import csv
import datetime
import io
import os
import sys
import time
from multiprocessing.dummy import Pool as ThreadPool

import jellyfish
from elasticsearch import Elasticsearch

# Python 2/3 compatibility shim: Python 3 has no separate `unicode` type.
if sys.version_info[0] >= 3:
    unicode = str

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [32]:
# Dataset size label; kept as a string because it is concatenated into the
# source path (sourceBase) in the configuration cell.
size = '1000000'

In [33]:
# Years 2008 through 2019 inclusive (the upper bound of range() is exclusive):
# [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
# NOTE(review): an earlier comment listed 2007..2018 -- confirm which span is intended.
range_years = list(range(2008, 2020))

In [34]:
def listPath(path, recur=False, pattern=None, partitioned=False):
    """List the entries found under ``path``.

    Parameters
    ----------
    path : str
        Directory to list. Paths starting with ``'hdfs'`` are delegated to
        ``hdfs.ls`` (NOTE(review): no ``hdfs`` object is defined in the
        visible part of this notebook -- confirm it is provided elsewhere).
    recur : bool
        For local paths: when true, walk the tree and return every file as
        ``root + '/' + filename``; otherwise return the bare names given by
        ``os.listdir``. For hdfs paths it is forwarded as ``recursive``.
    pattern : str or None
        When truthy, keep only entries ending with this suffix.
    partitioned : str or False
        When truthy, it is used as a substring marker: entries containing it
        are replaced by their parent path (everything before the last '/'),
        and duplicates are removed. The resulting order is unspecified.

    Returns
    -------
    list of str
    """
    if path.startswith('hdfs'):
        entries = hdfs.ls(path, recursive=recur)
    elif recur:
        entries = []
        for root, _dirnames, filenames in os.walk(path):
            entries.extend(root + '/' + name for name in filenames)
    else:
        entries = os.listdir(path)

    if partitioned:
        # Collapse matching entries to their containing directory.
        parents = {entry.rpartition('/')[0] for entry in entries if partitioned in entry}
        entries = list(parents)

    if pattern:
        entries = [entry for entry in entries if entry.endswith(pattern)]
    return entries

In [35]:
# Locations to read from / write to, and the Elasticsearch index to query.
# Both base paths must end with '/' because file names are appended to them
# by plain string concatenation.
sourceFileName = ".csv"  # suffix used to select the input files inside sourceBase
sourceBase = "../../../../0_global_data/fd-cidacs-rl/sinthetic-datasets-b-legacy/sinthetic-datasets-b-"+size+".csv/" # Example: hdfs:///npd/trusted/data/base_sim/05_linkage_extraction/
targetBase = "../../../../0_global_results/fd-cidacs-rl/legacy/" # Example: hdfs:///npd/refined/data/linkage_base_sim_x_base_sinasc/
index_name = "fd-cidacs-rl-legacy" # Example: sinasc_maes_2001a2015_dtnascmae_nulo
# hdfs.mkdir(targetBase)
# Create the local output directory (including missing parents) without
# shelling out; exist_ok makes re-running this cell a no-op.
if not targetBase.startswith('hdfs'):
    os.makedirs(targetBase, exist_ok=True)
bases = listPath(sourceBase, pattern=sourceFileName)

In [36]:
bases

['part-00000-82d7d52e-f477-41b3-97cc-3f9a2e0f9c2d-c000.csv']

In [37]:
# Elasticsearch client and the thread pool that issues the lookups.
# `maxsize` on the client is set to the same value as the number of worker threads.
ncores = 3
es = Elasticsearch(
    'http://localhost:9200',
    maxsize=ncores,
    timeout=30,
    max_retries=10,
    retry_on_timeout=True,
)
pool = ThreadPool(ncores)

In [38]:
# Suffixes appended to the column names of the output header so that each
# field can be told apart by origin (e.g. nome_A vs nome_B).
indexedBaseHeader = "A" # Suffix for the dataset indexed in Elasticsearch. Example: sinasc
sourceBaseHeader = "B" # Suffix for the source dataset read from CSV. Example: sim

In [39]:
# Reuse the active Spark session, or start one when the kernel did not inject
# `spark`, so this cell also works after Restart Kernel -> Run All.
spark = SparkSession.builder.getOrCreate()
# Preview the first rows of the source dataset.
spark.read.csv(sourceBase, header=True).limit(10).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b
0,4,VINICIUS DA SILVA SOUZA,ELIZANGELA LIMA DA SILVA,20071008,1
1,5,LUAN FERREIRA DO NASCIMENTO,KEZIA NUNES GALDINO MONTEIRO,20080128,1
2,7,JOAO PEDRO BATISTA DOS SANTOS,SOILA COSTA DA SILVA,20070903,1
3,9,GABRIEL COUTO GOMES,ROSILDA LEAL BARBOSA,20061008,1
4,10,LUIZA VITORIA BATISTA DOS SANTOS,EDILZA MAGALHAES DE SOUZA MARTINS,20061027,2
5,12,MABYLA TAHANNA DE OLIVEIRA LOURENCO,KLAUCIARA DA SILVA PENNA,20070821,2
6,13,JASMYNNE ELOYSE NUNES SANTANA,MARLI BARBOSA,20070724,2
7,14,MARCOS GABRIEL SILVA FERREIRA,MARISSA KATHLEN S ROCHA,20070706,1
8,16,VITOR CORDEIRO DOS SANTOS,SUELLI RIBEIRO SALUSTINO,20070902,1
9,22,CARLOS EDUARDO MARTINS VIEIRA,ELIANA DE ARAUJO LORENCO,20070810,1


In [40]:
def _readRecords(csvPath):
    """Parse one source CSV file into a list of record dicts.

    The first line is always treated as a header and skipped. Columns are
    read by position: id, nome, nome_mae, dt_nasc, sexo. Every field is
    stripped of surrounding whitespace. Rows in which nome, nome_mae,
    dt_nasc and sexo are all blank are dropped.

    Parsing goes through the csv module so that quoted fields containing
    commas and '\r\n' line endings are handled; blank lines are skipped.

    Returns a list of dicts with keys 'seq', 'nome_b', 'nome_mae_b',
    'dt_nasc_b' and 'sexo_b' (all strings).
    """
    records = []
    # newline='' is what the csv module expects for files it reads.
    # UTF-8 is Spark's default CSV output encoding -- TODO confirm for other sources.
    with open(csvPath, 'r', newline='', encoding='utf-8') as csvFile:
        rows = csv.reader(csvFile)
        next(rows, None)  # skip the header line
        for row in rows:
            if not row:
                continue  # blank line
            seq, nome_b, nome_mae_b, dt_nasc_b, sexo_b = (field.strip() for field in row[:5])
            # Keep the record unless every linkage field is blank.
            if nome_b or nome_mae_b or dt_nasc_b or sexo_b:
                records.append({
                    'seq': seq,
                    'nome_b': nome_b,
                    'nome_mae_b': nome_mae_b,
                    'dt_nasc_b': dt_nasc_b,
                    'sexo_b': sexo_b,
                })
    return records


# One list of records per input file, in the same order as `bases`.
dic_bases = [_readRecords(sourceBase + source) for source in bases]

In [41]:
# Report how many records were loaded from each input file.
for position, records in enumerate(dic_bases):
    print(bases[position].split('/')[-1], len(records))

part-00000-82d7d52e-f477-41b3-97cc-3f9a2e0f9c2d-c000.csv 1000000


In [42]:
# Exact search on elastic search function
def _searchWithRetry(content, max_attempts=10, retry_delay=1.0):
    """Run ``content`` against ``index_name`` on the global ``es`` client.

    A failed request is retried up to ``max_attempts`` times in total, sleeping
    between attempts with an exponential back-off starting at ``retry_delay``
    seconds and capped at 30 seconds. The last failure is re-raised, so a
    permanent error (for example a missing index) surfaces instead of
    spinning forever.

    Returns the list found under ``['hits']['hits']`` in the response.
    """
    attempts = max(1, int(max_attempts))
    for attempt in range(attempts):
        try:
            res = es.search(index=index_name, body=content)
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit still stop the run.
            if attempt == attempts - 1:
                raise
            time.sleep(min(retry_delay * (2 ** attempt), 30.0))
        else:
            return res['hits']['hits']


def searchExactPerson(nome_b, nome_mae_b, sexo_b, startId=0, max_attempts=10, retry_delay=1.0):
    """Query the index for candidates that match name, mother's name and sex.

    The three fields are combined as ``match`` clauses under ``bool.must``, so
    a hit has to satisfy all of them. At most 100 hits are requested.

    Parameters
    ----------
    nome_b, nome_mae_b, sexo_b : str
        Values from the source record, matched against ``nome_a``,
        ``nome_mae_a`` and ``sexo_a`` in the index.
    startId : int
        Unused; kept so existing callers keep working.
    max_attempts : int
        Total number of tries before the last error is re-raised.
    retry_delay : float
        Initial back-off in seconds between tries.

    Returns
    -------
    list of dict
        The raw Elasticsearch hits.
    """
    content = {
        'size': 100,
        'query': {
            'bool': {
                'must': [
                    {'match': {'nome_a': nome_b}},
                    {'match': {'nome_mae_a': nome_mae_b}},
                    {'match': {'sexo_a': sexo_b}}
                ]
            }
        }
    }
    return _searchWithRetry(content, max_attempts=max_attempts, retry_delay=retry_delay)

# Fuzzy search on elastic search function
def searchFuzzyPerson(nome_b, nome_mae_b, dt_nasc_b, sexo_b, startId=0, max_attempts=10, retry_delay=1.0):
    """Query the index for approximate candidates.

    The clauses sit under ``bool.should``: fuzzy ``match`` on name (boost 3.0)
    and on mother's name (boost 2.0), a plain ``match`` on sex and a ``term``
    on birth date. At most 100 hits are requested.

    Parameters
    ----------
    nome_b, nome_mae_b, dt_nasc_b, sexo_b : str
        Values from the source record, compared against ``nome_a``,
        ``nome_mae_a``, ``dt_nasc_a`` and ``sexo_a`` in the index.
    startId : int
        Unused; kept so existing callers keep working.
    max_attempts : int
        Total number of tries before the last error is re-raised.
    retry_delay : float
        Initial back-off in seconds between tries.

    Returns
    -------
    list of dict
        The raw Elasticsearch hits.
    """
    content = {
        'size': 100,
        'query': {
            'bool': {
                'should': [
                    {'match': {'nome_a': {'query': nome_b, 'fuzziness':'AUTO', 'operator':'or', 'boost':'3.0'}}},
                    {'match': {'nome_mae_a': {'query': nome_mae_b, 'fuzziness':'AUTO', 'operator':'or', 'boost':'2.0'}}},
                    {'match': {'sexo_a': {'query': sexo_b}}},
                    {'term': {'dt_nasc_a': dt_nasc_b}}
                ]
            }
        }
    }
    return _searchWithRetry(content, max_attempts=max_attempts, retry_delay=retry_delay)

In [43]:
def findBestCandidate(candidates, person):
    """Return the highest-scoring hit for ``person``, or None if there are no hits.

    Each hit's ``_source`` is scored with ``compare``. The winning hit is
    mutated in place: its score is stored under ``_source['score']``. When
    several hits share the top score, the earliest one in ``candidates`` wins.
    """
    if not candidates:
        return None

    # Score every hit first; max() then returns the first of any tied maxima.
    ranked = [(compare(hit['_source'], person), hit) for hit in candidates]
    topScore, topHit = max(ranked, key=lambda pair: pair[0])
    topHit['_source']['score'] = topScore
    return topHit

In [44]:
def compare(candidate, source):
    """Score how well an indexed record matches a source record.

    Parameters
    ----------
    candidate : dict
        Indexed record with keys 'nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a'.
    source : dict
        Source record with keys 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b'.

    Returns
    -------
    float
        Weighted similarity divided by the total weight of the fields that
        could be compared, minus 0.02 for every field that is empty on either
        side. Names are compared with Jaro-Winkler, birth dates with a
        normalised Hamming distance, and sex by equality. If no field can be
        compared at all, the result is just the negated penalty.
    """
    # Weights
    nome_w = 5.0
    nome_mae_w = 5.0
    dt_nasc_w = 1.0
    sexo_w = 3.0

    nome_penalty = 0.02
    nome_mae_penalty = 0.02
    dt_nasc_penalty = 0.02
    sexo_penalty = 0.02

    # Newer jellyfish releases expose this metric as jaro_winkler_similarity;
    # fall back to the older name when that is all the installed version has.
    jaro_winkler = getattr(jellyfish, 'jaro_winkler_similarity', None) or jellyfish.jaro_winkler

    # Max score; shrinks whenever a field cannot be compared.
    score_max = nome_w + nome_mae_w + dt_nasc_w + sexo_w

    # Initialize scores and penalties
    score_nome, score_nome_mae, score_dt_nasc, score_sexo, penalty = 0, 0, 0, 0, 0

    # Compare names with Jaro-Winkler similarity
    if candidate['nome_a'] == '' or source['nome_b'] == '':
        score_max -= nome_w
        penalty += nome_penalty
    else:
        score_nome = jaro_winkler(candidate['nome_a'], source['nome_b']) * nome_w

    # Compare mothers' names (must use the nome_mae fields, not nome again)
    if candidate['nome_mae_a'] == '' or source['nome_mae_b'] == '':
        score_max -= nome_mae_w
        penalty += nome_mae_penalty
    else:
        score_nome_mae = jaro_winkler(candidate['nome_mae_a'], source['nome_mae_b']) * nome_mae_w

    # Compare birth dates: share of matching characters
    if candidate['dt_nasc_a'] == '' or source['dt_nasc_b'] == '':
        score_max -= dt_nasc_w
        penalty += dt_nasc_penalty
    else:
        distance = float(jellyfish.hamming_distance(candidate['dt_nasc_a'], source['dt_nasc_b']))
        longest = max(len(candidate['dt_nasc_a']), len(source['dt_nasc_b']))
        score_dt_nasc = (1.0 - distance / longest) * dt_nasc_w

    # Compare sex
    if candidate['sexo_a'] == '' or source['sexo_b'] == '':
        score_max -= sexo_w
        penalty += sexo_penalty
    elif candidate['sexo_a'] == source['sexo_b']:
        score_sexo += sexo_w

    # Nothing comparable: avoid dividing by zero.
    if score_max <= 0:
        return -penalty

    score = ((score_nome + score_nome_mae + score_dt_nasc + score_sexo) / score_max) - penalty
    return score

In [45]:
def _formatMatch(bestCandidate, source, searchType):
    """Render one linkage result as a single CSV line (terminated by '\\n').

    Column order: id pair, name pair, mother's-name pair, sex pair,
    birth-date pair (indexed value first, source value second), then the
    search type and the score. The csv module does the quoting, so values
    containing commas or quotes stay in one column, and non-string values
    are converted instead of raising.
    """
    matched = bestCandidate['_source']
    fields = [bestCandidate['_id'], source['seq'],
              matched['nome_a'], source['nome_b'],
              matched['nome_mae_a'], source['nome_mae_b'],
              matched['sexo_a'], source['sexo_b'],
              matched['dt_nasc_a'], source['dt_nasc_b'],
              searchType, str(matched['score'])]
    buffer = io.StringIO()
    csv.writer(buffer, lineterminator='\n').writerow(fields)
    return buffer.getvalue()


def cidacsrl(source):
    """Link one source record against the index and return its output line.

    An exact search is tried first; its best candidate is accepted when its
    score is at least 0.95. Otherwise a fuzzy search is run and its best
    candidate is accepted whatever the score.

    Parameters
    ----------
    source : dict
        Record with keys 'seq', 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b'.

    Returns
    -------
    str
        One CSV line ending in '\\n', or '' when neither search returns a
        candidate.
    """
    # Perform exact search
    candidates = searchExactPerson(nome_b=source['nome_b'],
                                   nome_mae_b=source['nome_mae_b'],
                                   sexo_b=source['sexo_b'])
    bestCandidate = findBestCandidate(candidates, source)
    if bestCandidate and bestCandidate['_source']['score'] >= .95:
        return _formatMatch(bestCandidate, source, 'searchExactPerson')

    # If no candidate is selected, perform fuzzy search
    candidates = searchFuzzyPerson(nome_b=source['nome_b'],
                                   nome_mae_b=source['nome_mae_b'],
                                   sexo_b=source['sexo_b'],
                                   dt_nasc_b=source['dt_nasc_b'])
    bestCandidate = findBestCandidate(candidates, source)
    if bestCandidate:
        return _formatMatch(bestCandidate, source, 'searchFuzzyPerson')
    return ''

In [46]:
# Build datamart header.
# The column order has to mirror the `fields` list built in cidacsrl():
# id, name, mother's name, sex, birth date -- each as an (indexed, source)
# pair -- followed by the search type and the score. Sex comes before birth
# date there, so it must come before it here too, otherwise those columns
# end up mislabeled in the output file.
headerFields = ['seq', 'nome', 'nome_mae', 'sexo', 'dt_nasc']
larger = [x + '_' + indexedBaseHeader for x in headerFields]
smaller = [x + '_' + sourceBaseHeader for x in headerFields]

# Interleave the two lists: seq_A, seq_B, nome_A, nome_B, ...
headerColumns = [column for pair in zip(larger, smaller) for column in pair]
headerColumns += ['searchType', 'score']
header = ','.join(headerColumns)

In [47]:
# Link every loaded dataset and write one result file per input file.
# Lines are written as soon as they are produced, so an interrupted run keeps
# the results computed so far; they are no longer accumulated in a `result` list.
for i in range(len(dic_bases)):
    marker = time.time()
    num_tasks = len(dic_bases[i])
    c = 0
    last_report = 0.0
    # `with` closes (and flushes) the file even if the run is interrupted.
    with open(targetBase + bases[i].split('/')[-1], 'w', encoding='utf-8') as f:
        f.write(header + '\n')
        for x in pool.imap_unordered(cidacsrl, dic_bases[i]):
            f.write(x)
            c += 1
            now = time.time()
            # Refresh the progress line at most twice a second (and once at
            # the end) instead of once per record.
            if c < num_tasks and now - last_report < 0.5:
                continue
            last_report = now
            elapsed_time = now - marker
            # Use the number of completed tasks so the last update reads 100%.
            done = float(c) / num_tasks
            estimated = str(datetime.timedelta(seconds=(num_tasks - c) * (elapsed_time / c)))
            sys.stderr.write('\rdone: {:%} \\ the estimated remaining time is roughly: {} \\ total elapsed time: {}'.format(done, estimated, str(datetime.timedelta(seconds=elapsed_time))))
    print('\n')

done: 0.221700% \ the estimated remaining time is roughly: 4:03:48.217251 \ total elapsed time: 0:00:32.51752984116103

KeyboardInterrupt: 

<hr />
<hr />
<hr />
<hr />