In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

import jellyfish
from elasticsearch import Elasticsearch
import json

In [2]:
import pandas as pd
# Let pandas render up to 999 rows and 999 columns before truncating output.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 999)

In [3]:
# Load the linkage configuration (JSON content stored in config.txt).
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open for the lifetime of the kernel.
with open('config.txt') as f:
    config = json.load(f)
config

{'index_data': 'yes',
 'es_index_name': 'fd-cidacs-rl',
 'es_connect_string': 'http://localhost:9200',
 'query_size': 50,
 'cutoff_exact_match': '0.95',
 'null_value': '99',
 'temp_dir': '../0_global_data/fd-cidacs-rl/temp_dataframe/',
 'datasets_info': {'indexed_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-dataset-A.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_a', 'nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a'],
   'id_column_name': 'id_cidacs_a'},
  'tolink_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-1000.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_b', 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b'],
   'id_column_name': 'id_cidacs_b'},
  'result_dataset': {'path': '../0_global_data/result/'}},
 'comparisons': {'name': {'indexed_col': 'nome_a',
   'tolink_col': 'nome_b',
   'must_match': 'true',
   'should_match': 'true',
   'is_fuzzy': 'true',
   'boost': '3.0',
   'query_type': 'match'

# Reading preprocessed datasets

In [9]:
# Read the indexed dataset described in the config.
# The extension declared in the config selects the Spark reader to use.
data_ext = config['datasets_info']['indexed_dataset']['extension']
data_path = config['datasets_info']['indexed_dataset']['path']

if data_ext == 'csv':
    # the first line of the csv file is treated as the header
    indexed_dataset = spark.read.csv(data_path, header=True)
elif data_ext == 'parquet':
    indexed_dataset = spark.read.parquet(data_path)
else:
    # Fail here rather than printing and carrying on: otherwise
    # `indexed_dataset` would be undefined (or stale from a previous run)
    # and the error would only surface in a later cell.
    raise ValueError("Please make sure the extension for this dataset is set as 'csv' or 'parquet'")

# Creating synthetic datasets

In [10]:
def supress_last_name(col):
    """Return `col` without its last space-separated token.

    Parameters
    ----------
    col : object
        Value to shorten. Non-null values are converted with ``str`` first.

    Returns
    -------
    str or None
        The tokens before the last single space, joined back with single
        spaces. A value with no space yields ``''``. ``None`` is returned
        unchanged, so a missing name stays missing instead of being
        stringified to ``'None'`` and collapsed to an empty string.
    """
    if col is None:
        return None
    tokens = str(col).split(' ')
    return ' '.join(tokens[:-1])
# Spark UDF wrapper around supress_last_name; the result column is a string.
udf_supress_last_name = F.udf(f=supress_last_name, returnType=StringType())

In [11]:
# Common path prefix of the generated datasets; the dataset size and the
# '.parquet' suffix are appended when each dataset is written.
prefix = (
    '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-'
)

In [12]:
# Target number of records for each generated dataset.
sizes = [
    100, 500,
    1_000, 5_000,
    10_000, 50_000,
    100_000, 500_000,
    1_000_000,
]

In [13]:
# Per-size record counts, keyed by str(size); filled in by the generation loop.
map_sizes = dict()

In [14]:
# Base seed for every random sample, so re-running this cell regenerates
# the same datasets instead of a fresh random draw each time.
SAMPLING_SEED = 42

# Extra fraction added to the false-match sample. It oversamples so that the
# union of the three samples holds at least `size` rows before limit(size)
# truncates it (random sampling alone can fall short of the target).
FALSE_MATCH_EXTRA_FRACTION = 0.01

# Number of records available for sampling. Measured once (it is
# loop-invariant) rather than hard-coded, so the sampling fractions stay
# correct if the indexed dataset changes size.
population_size = indexed_dataset.count()
if population_size == 0:
    raise ValueError('indexed_dataset is empty: there is nothing to sample from')

# renames that turn the "_a" columns of the sampled records into "_b" columns
names_dict = {'id_cidacs_a': 'id_cidacs_b',
              'nome_a': 'nome_b',
              'nome_mae_a': 'nome_mae_b',
              'dt_nasc_a': 'dt_nasc_b',
              'sexo_a': 'sexo_b'}

for size_pos, size in enumerate(sizes):
    size_key = str(size)
    # the same dict is reachable as map_sizes[size_key] and as `stats`
    stats = map_sizes[size_key] = {}

    # proportions of the generated dataset:
    # ~50% exact true matches
    # ~20% gray area (last name suppressed and wrong information on sex)
    # ~30% false matches
    # Each fraction is (wanted records / available records), capped at 1.0
    # because a sampling fraction cannot exceed the whole dataset.
    p_true_m = min(1.0, (size * 0.50) / population_size)
    p_gray_m = min(1.0, (size * 0.20) / population_size)
    p_false_m = min(1.0, (size * 0.30) / population_size + FALSE_MATCH_EXTRA_FRACTION)

    # a distinct seed per sample and per size keeps the three draws from
    # reusing the same random stream while still being reproducible
    base_seed = SAMPLING_SEED + 3 * size_pos

    # sample of exact true matches
    true_df = indexed_dataset.sample(fraction=p_true_m, seed=base_seed)
    stats['true_df'] = true_df.count()

    # sample of gray-area records
    gray_df = indexed_dataset.sample(fraction=p_gray_m, seed=base_seed + 1)
    stats['gray_df'] = gray_df.count()

    # sample of false matches
    false_df = indexed_dataset.sample(fraction=p_false_m, seed=base_seed + 2)
    stats['false_df'] = false_df.count()

    # total number of sampled records before truncation
    stats['accum_size'] = stats['true_df'] + stats['gray_df'] + stats['false_df']

    # gray area: suppress the last name of both names and flip the sex code
    # (1 becomes 2, any other value becomes 1)
    gray_df = (
        gray_df
        .withColumn('nome_a', udf_supress_last_name(F.col('nome_a')))
        .withColumn('nome_mae_a', udf_supress_last_name(F.col('nome_mae_a')))
        .withColumn('sexo_a', F.when(F.col('sexo_a') == 1, 2).otherwise(1))
    )

    # false matches: overwrite the person's name with the mother's name
    false_df = false_df.withColumn('nome_a', F.col('nome_mae_a'))

    # stack the three groups and keep at most `size` records
    df = true_df.union(gray_df).union(false_df).limit(size)
    stats['final_size'] = df.count()

    # rename the "_a" columns to their "_b" counterparts
    for old_name, new_name in names_dict.items():
        df = df.withColumnRenamed(old_name, new_name)

    # write the dataset, replacing any previous output for this size
    df.write.parquet(prefix + size_key + '.parquet', mode='overwrite')
    print(stats)

# map_sizes

{'true_df': 53, 'gray_df': 20, 'false_df': 10110, 'accum_size': 10183, 'final_size': 100}
{'true_df': 257, 'gray_df': 104, 'false_df': 10166, 'accum_size': 10527, 'final_size': 500}
{'true_df': 502, 'gray_df': 199, 'false_df': 10319, 'accum_size': 11020, 'final_size': 1000}
{'true_df': 2504, 'gray_df': 970, 'false_df': 11295, 'accum_size': 14769, 'final_size': 5000}
{'true_df': 5001, 'gray_df': 2011, 'false_df': 13106, 'accum_size': 20118, 'final_size': 10000}
{'true_df': 25185, 'gray_df': 9949, 'false_df': 25143, 'accum_size': 60277, 'final_size': 50000}
{'true_df': 50252, 'gray_df': 20282, 'false_df': 40013, 'accum_size': 110547, 'final_size': 100000}
{'true_df': 249809, 'gray_df': 99845, 'false_df': 159722, 'accum_size': 509376, 'final_size': 500000}
{'true_df': 499805, 'gray_df': 200412, 'false_df': 309901, 'accum_size': 1010118, 'final_size': 1000000}


In [15]:
# Sanity check: read one generated dataset back and show its first 5 rows.
preview_path = '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-10000.parquet/'
preview_df = spark.read.parquet(preview_path).limit(5)
preview_df.toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b
0,388,VICTOR LUCAS SANTOS DE ALMEIDA,EDINEUZA SANTOS PEDRO,2007-10-06,1
1,755,MURILO ESPOLADOR CORDEIRO,JAQUELINE DE SOUZA RODRIGUES,2008-02-13,1
2,1201,ICARO GABRIEL DA SILVA FONSECA,ANA PAULA OSTINU,2007-08-17,1
3,1443,JEAN LUCAS AYRES,DANIELLE DE SOUZA MENEZES,2008-04-17,1
4,1895,ICARO MANUEL DE OLIVEIRA CHAVIER,SOLANGE DA SILVA,2008-10-08,1
