In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

import jellyfish
from elasticsearch import Elasticsearch
import json

In [2]:
import pandas as pd

# Raise both display limits to 999 so rendered frames are not truncated
# along either axis.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 999)

In [3]:
# Load the linkage configuration (a JSON document stored in config.txt).
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open for the lifetime of the kernel.
with open('config.txt', encoding='utf-8') as f:
    config = json.load(f)
# Bare last expression so the notebook renders the parsed configuration.
config

{'index_data': 'yes',
 'index_name': 'fd-cidacs-rl',
 'query_size': 50,
 'datasets_info': {'indexed_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-dataset-A.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_a', 'nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a']},
  'tolink_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-dataset-B.csv',
   'extension': 'csv',
   'columns': ['id_cidacs_b', 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b']},
  'result_dataset': {'path': '../0_global_data/result/'}},
 'comparisons': {'nome_a': {'compare_to': 'nome_b',
   'exact_match': 'true',
   'fuzzy_match': 'true',
   'query_type': 'match',
   'similarity': 'jaro_wikler',
   'weight': 1.0,
   'penalty': 0.02},
  'nome_mae_a': {'compare_to': 'nome_mae_b',
   'exact_match': 'true',
   'fuzzy_match': 'true',
   'query_type': 'match',
   'similarity': 'jaro_wikler',
   'weight': 1.0,
   'penalty': 0.02},
  'dt_nasc_a': {'compare_to': 'dt_nasc_b',
   'exact_match': 'false',
  

# Reading preprocessed datasets

In [4]:
# getting the auxiliary variables
data_ext = config['datasets_info']['indexed_dataset']['extension']
data_path = config['datasets_info']['indexed_dataset']['path']

# test the extension of the dataset to properly read it
# NOTE(review): `spark` is not created in the visible cells; presumably the
# kernel provides a ready-made SparkSession -- confirm.
if data_ext == 'csv':
    indexed_dataset = spark.read.csv(data_path, header=True)
elif data_ext == 'parquet':
    indexed_dataset = spark.read.parquet(data_path)
else:
    # Fail here rather than only printing: otherwise `indexed_dataset` stays
    # undefined and a later cell breaks with an unrelated NameError.
    raise ValueError(
        "Please make sure the extension for this dataset is set as 'csv' or 'parquet' "
        "(got {!r})".format(data_ext)
    )

# Creating synthetic datasets

In [5]:
def supress_last_name(col):
    """Return a name with its last whitespace-separated token removed.

    Args:
        col: The name to shorten. Non-string values are converted with
            ``str``; ``None`` is passed through untouched.

    Returns:
        The remaining tokens joined by single spaces, an empty string when
        the name has at most one token, or ``None`` when ``col`` is ``None``.
    """
    # Keep missing values missing instead of turning them into the text
    # 'None' (which the old code then reduced to an empty string).
    if col is None:
        return None
    # split() without a separator ignores leading/trailing/repeated blanks,
    # so a trailing space can no longer protect the surname from removal.
    tokens = str(col).split()
    return ' '.join(tokens[:-1])
# Spark UDF wrapper around supress_last_name, declared to return a string column.
udf_supress_last_name = F.udf(supress_last_name, returnType=StringType())

In [6]:
# Common path prefix of the generated datasets; the generation loop appends
# "<size>.parquet" to it when writing each output.
prefix = '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-'

In [7]:
# Target number of records for each dataset to generate.
sizes = [
    100, 500,
    1_000, 5_000,
    10_000, 50_000,
    100_000, 500_000,
    1_000_000,
]

In [8]:
# Per-size bookkeeping (sample counts and final size), filled by the generation loop.
map_sizes = dict()

In [15]:
# Number of rows in the indexed dataset, counted once up front so the sampling
# fractions follow the real population instead of an assumed 1,000,000 rows.
population_size = indexed_dataset.count()
if population_size == 0:
    raise ValueError('indexed_dataset is empty: there is nothing to sample from')

# Base seed for DataFrame.sample so that a re-run draws the same records
# (given the same input files and partitioning). Each of the three draws adds
# its own offset so they are not driven by the same random stream.
sample_seed = 42

# changing names: dataset "a" column -> dataset "b" column (loop-invariant)
names_dict = {'id_cidacs_a': 'id_cidacs_b',
              'nome_a': 'nome_b',
              'nome_mae_a': 'nome_mae_b',
              'dt_nasc_a': 'dt_nasc_b',
              'sexo_a': 'sexo_b'}

for size in sizes:
    accum_size = 0
    size_key = str(size)
    map_sizes[size_key] = {}
    # setting the proportions of exact true matches, gray area and false matches
    # ~50% of true matches
    # ~20% of gray area (record with last name supression and wrong information on sex)
    # ~30% of false matches
    n_true_m = (size/100)*50
    n_gray_m = (size/100)*20
    n_false_m = (size/100)*30

    # turning the wanted row counts into sampling fractions of the population;
    # fractions are capped at 1.0 because sampling here is without replacement.
    # The extra 0.01 oversamples the last group; the limit(size) further down
    # trims the union back to the requested size.
    p_true_m = min(1.0, n_true_m/population_size)
    p_gray_m = min(1.0, n_gray_m/population_size)
    p_false_m = min(1.0, (n_false_m/population_size) + 0.01)

    # getting sample of exact true
    true_df = indexed_dataset.sample(fraction=p_true_m, seed=sample_seed)
    count = true_df.count()
    accum_size += count
    map_sizes[size_key]['true_df'] = count

    # getting sample of gray area
    gray_df = indexed_dataset.sample(fraction=p_gray_m, seed=sample_seed + 1)
    count = gray_df.count()
    accum_size += count
    map_sizes[size_key]['gray_df'] = count

    # getting sample of false matches
    # NOTE(review): these rows are drawn from the same indexed dataset and are
    # written unchanged, ids included -- confirm they really behave as
    # non-matches downstream.
    false_df = indexed_dataset.sample(fraction=p_false_m, seed=sample_seed + 2)
    count = false_df.count()
    accum_size += count
    map_sizes[size_key]['false_df'] = count

    # recording the total size of resulting datasets
    map_sizes[size_key]['accum_size'] = accum_size

    # suppressing last name and changing sex info for gray area records
    gray_df = gray_df.withColumn('nome_a', udf_supress_last_name(F.col('nome_a')))
    gray_df = gray_df.withColumn('nome_mae_a', udf_supress_last_name(F.col('nome_mae_a')))
    # any value other than 1 (including null) becomes 1
    gray_df = gray_df.withColumn('sexo_a', F.when(F.col('sexo_a') == 1, 2).otherwise(1))

    # union (positional), trimmed to the requested number of records
    df = true_df.union(gray_df).union(false_df).limit(size)
    count = df.count()
    map_sizes[size_key]['final_size'] = count

    # changing names
    for col_a, col_b in names_dict.items():
        df = df.withColumnRenamed(col_a, col_b)

    # writing data
    df.write.parquet(prefix+size_key+'.parquet', mode='overwrite')
    print(map_sizes[size_key])

# map_sizes

{'true_df': 54, 'gray_df': 20, 'false_df': 10141, 'accum_size': 10215, 'final_size': 100}
{'true_df': 250, 'gray_df': 109, 'false_df': 10167, 'accum_size': 10526, 'final_size': 500}
{'true_df': 515, 'gray_df': 196, 'false_df': 10229, 'accum_size': 10940, 'final_size': 1000}
{'true_df': 2460, 'gray_df': 1019, 'false_df': 11437, 'accum_size': 14916, 'final_size': 5000}
{'true_df': 5170, 'gray_df': 1936, 'false_df': 13008, 'accum_size': 20114, 'final_size': 10000}
{'true_df': 24734, 'gray_df': 9868, 'false_df': 25110, 'accum_size': 59712, 'final_size': 50000}
{'true_df': 49889, 'gray_df': 20133, 'false_df': 40296, 'accum_size': 110318, 'final_size': 100000}
{'true_df': 250628, 'gray_df': 100149, 'false_df': 160057, 'accum_size': 510834, 'final_size': 500000}
{'true_df': 500197, 'gray_df': 200196, 'false_df': 309491, 'accum_size': 1009884, 'final_size': 1000000}


In [16]:
# testing one dataset: read the 10000-record output back and show its first five rows
preview_path = '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-10000.parquet/'
preview_rows = spark.read.parquet(preview_path).limit(5)
preview_rows.toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b
0,92,THAYSLANE VITORIA DA ROCHA SOARES,JANAYNA MARTINS DA SILVA,2007-10-12,2
1,658,SARA DUARTE DE FRANCA,JOCILENE SOUZA DOS SANTOS,2007-08-22,2
2,1068,MATHEUS MONTEIRO SANTOS,CARINA DA SILVA CRISTINO,2007-05-22,1
3,1087,FELIPE LIDUINO RODRIGUES,EDNALVA DUARTE,2008-04-30,1
4,1155,MATEUS VIEIRA DOS SANTOS JUNIOR,NIVEA GRAZIELLA SILVA DE AZEVEDO,2006-09-29,1
