In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

import jellyfish
from elasticsearch import Elasticsearch
import json

In [2]:
import pandas as pd
# Show up to 999 rows and 999 columns whenever a pandas frame is displayed.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 999)

In [3]:
# Load the run configuration, which is stored as JSON in config.txt.
# The context manager closes the file handle as soon as parsing is done
# (previously the handle stayed open for the life of the kernel).
with open('config.txt') as f:
    config = json.load(f)
config

{'index_data': 'yes',
 'es_index_name': 'fd-cidacs-rl',
 'es_connect_string': 'http://localhost:9200',
 'query_size': 50,
 'cutoff_exact_match': '0.95',
 'null_value': '99',
 'temp_dir': '../0_global_data/fd-cidacs-rl/temp_dataframe/',
 'debug': 'false',
 'datasets_info': {'indexed_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-dataset-A.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_a', 'nome_a', 'nome_mae_a', 'dt_nasc_a', 'sexo_a'],
   'id_column_name': 'id_cidacs_a',
   'storage_level': 'MEMORY_ONLY',
   'default_paralelism': '96'},
  'tolink_dataset': {'path': '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-1000.parquet',
   'extension': 'parquet',
   'columns': ['id_cidacs_b', 'nome_b', 'nome_mae_b', 'dt_nasc_b', 'sexo_b'],
   'id_column_name': 'id_cidacs_b',
   'storage_level': 'MEMORY_ONLY',
   'default_paralelism': '96'},
  'result_dataset': {'path': '../0_global_data/result/'}},
 'comparisons': {'name': {'indexed_col': 'nom

# Reading preprocessed datasets

In [4]:
# getting the auxiliary variables
data_ext = config['datasets_info']['indexed_dataset']['extension']
data_path = config['datasets_info']['indexed_dataset']['path']

# Read the dataset according to its configured extension. An unsupported
# extension now fails immediately; before, the cell only printed a hint and then
# crashed further down with a NameError because `indexed_dataset` was never set.
if data_ext == 'csv':
    indexed_dataset = spark.read.csv(data_path, header=True)
elif data_ext == 'parquet':
    indexed_dataset = spark.read.parquet(data_path)
else:
    raise ValueError(
        "Please make sure the extension for this dataset is set as 'csv' or 'parquet'"
        " (got {!r})".format(data_ext))

# indexed_dataset = indexed_dataset.withColumn('dt_nasc_a', F.to_date(F.col('dt_nasc_a'), 'dd/MM/yyyy'))

# Cast every column to string in one projection (a single select instead of one
# withColumn call per column).
indexed_dataset = indexed_dataset.select(
    [F.col(column_name).cast('string').alias(column_name)
     for column_name in indexed_dataset.columns])

# Replace nulls with the placeholder value taken from the configuration.
indexed_dataset = indexed_dataset.na.fill(config['null_value'])

# All hyphens must be removed from date variables that were converted to string
# indexed_dataset = indexed_dataset.withColumn('dt_nasc_a', F.regexp_replace(F.col('dt_nasc_a'), "-", ""))
indexed_dataset.limit(5).toPandas()

Unnamed: 0,id_cidacs_a,nome_a,nome_mae_a,dt_nasc_a,sexo_a
0,1,YASMIM VITORIA MATIAS FONSECA,TACIANY DOS SANTOS,20071122,2
1,2,PEDRO HENRIQUE MARTINS DE CARVALHO,FRANCILEIDE DOS SANTOS ALVES,20061102,1
2,3,FABRICIO RODRIGUES DOS SANTOS,MARCELA MACHADO DA SILVA,20071107,1
3,4,VINICIUS DA SILVA SOUZA,ELIZANGELA LIMA DA SILVA,20071008,1
4,5,LUAN FERREIRA DO NASCIMENTO,KEZIA NUNES GALDINO MONTEIRO,20080128,1


In [5]:
# indexed_dataset = spark.read.csv('/home/pierre/Dropbox/repos/0_global_data/fd-cidacs-rl/sinthetic-dataset-A.csv', header=True)
# indexed_dataset.write.parquet('/home/pierre/Dropbox/repos/0_global_data/fd-cidacs-rl/sinthetic-dataset-A_.parquet')

# Creating synthetic datasets

In [6]:
def supress_last_name(col):
    """Return `col`, coerced to str, without its last space-separated token.

    Everything from the final single space onwards is dropped. A value that
    contains no space at all yields an empty string.
    """
    head, _separator, _last_token = str(col).rpartition(' ')
    return head
# Spark UDF wrapping supress_last_name; the declared return type is string.
udf_supress_last_name = F.udf(f=supress_last_name, returnType=StringType())

In [7]:
# Common path prefix of the generated datasets; the dataset size and the
# '.parquet' suffix are appended to it when each dataset is written.
prefix = (
    '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/'
    'sinthetic-datasets-b-'
)

In [8]:
# Target number of records for each dataset that will be generated.
sizes = [
    100, 500,
    1000, 5000,
    10000, 50000,
    100000, 500000,
    1000000,
]

In [9]:
# Per-size record counts, keyed by the dataset size rendered as a string.
map_sizes = dict()

In [10]:
# Proportions of each record class inside every generated dataset:
#   ~50% exact true matches
#   ~20% gray area (last name suppressed and sex information flipped)
#   ~30% false matches (name replaced by the mother's name)
TRUE_SHARE = 0.50
GRAY_SHARE = 0.20
FALSE_SHARE = 0.30
# Extra sampling fraction added to the false-match sample so that the union is
# likely to hold at least `size` rows before limit(size) trims it.
FALSE_PADDING = 0.01
# Fixed seeds make every sample (and so every written dataset) repeatable across
# runs; each class gets its own seed so the three samples are drawn separately.
TRUE_SEED, GRAY_SEED, FALSE_SEED = 42, 43, 44

# Renames that turn the dataset-A column names into dataset-B column names.
names_dict = {'id_cidacs_a': 'id_cidacs_b',
              'nome_a': 'nome_b',
              'nome_mae_a': 'nome_mae_b',
              'dt_nasc_a': 'dt_nasc_b',
              'sexo_a': 'sexo_b'}

# Sampling fractions are computed against the real number of rows of dataset A
# instead of a hard-coded 1000000, so a differently sized input still yields the
# requested proportions.
n_indexed = indexed_dataset.count()
if n_indexed == 0:
    raise ValueError('indexed_dataset is empty: there is nothing to sample from')

for size in sizes:
    # counts recorded for this dataset size (also reachable through map_sizes)
    stats = {}
    map_sizes[str(size)] = stats

    # sampling fraction of each class, clamped to 1.0 because sampling without
    # replacement cannot take more than the whole dataset
    p_true_m = min(1.0, TRUE_SHARE * size / n_indexed)
    p_gray_m = min(1.0, GRAY_SHARE * size / n_indexed)
    p_false_m = min(1.0, FALSE_SHARE * size / n_indexed + FALSE_PADDING)

    # getting the samples of exact true matches, gray area and false matches
    true_df = indexed_dataset.sample(fraction=p_true_m, seed=TRUE_SEED)
    gray_df = indexed_dataset.sample(fraction=p_gray_m, seed=GRAY_SEED)
    false_df = indexed_dataset.sample(fraction=p_false_m, seed=FALSE_SEED)

    stats['true_df'] = true_df.count()
    stats['gray_df'] = gray_df.count()
    stats['false_df'] = false_df.count()

    # recording the total size of the three samples before trimming
    stats['accum_size'] = stats['true_df'] + stats['gray_df'] + stats['false_df']

    # suppressing last name and changing sex info for gray area records
    gray_df = gray_df.withColumn('nome_a', udf_supress_last_name(F.col('nome_a')))
    gray_df = gray_df.withColumn('nome_mae_a', udf_supress_last_name(F.col('nome_mae_a')))
    # The columns of indexed_dataset are strings (cast when it was loaded), so the
    # flipped value is written as a string too; this keeps sexo_a the same type in
    # all three frames that are unioned below.
    gray_df = gray_df.withColumn(
        'sexo_a',
        F.when(F.col('sexo_a') == '1', F.lit('2')).otherwise(F.lit('1')))

    # messing with name for false matches
    false_df = false_df.withColumn('nome_a', F.col('nome_mae_a'))

    # union of the three classes, trimmed to the requested number of records
    df = true_df.union(gray_df).union(false_df).limit(size)

#     df = df.withColumn('dt_nasc_a', F.to_date(F.col('dt_nasc_a'), 'dd/MM/yyyy'))
#     df = df.withColumn('dt_nasc_a', F.col('dt_nasc_a').cast('string'))

    stats['final_size'] = df.count()

    # changing names
    for old_name, new_name in names_dict.items():
        df = df.withColumnRenamed(old_name, new_name)

    # writing data
    df.write.parquet(prefix+str(size)+'.parquet', mode='overwrite')
    print(stats)

# map_sizes

{'true_df': 56, 'gray_df': 19, 'false_df': 9974, 'accum_size': 10049, 'final_size': 100}
{'true_df': 257, 'gray_df': 104, 'false_df': 9955, 'accum_size': 10316, 'final_size': 500}
{'true_df': 542, 'gray_df': 210, 'false_df': 10284, 'accum_size': 11036, 'final_size': 1000}
{'true_df': 2498, 'gray_df': 1005, 'false_df': 11509, 'accum_size': 15012, 'final_size': 5000}
{'true_df': 4905, 'gray_df': 1979, 'false_df': 13061, 'accum_size': 19945, 'final_size': 10000}
{'true_df': 25150, 'gray_df': 10065, 'false_df': 25297, 'accum_size': 60512, 'final_size': 50000}
{'true_df': 50329, 'gray_df': 20271, 'false_df': 39895, 'accum_size': 110495, 'final_size': 100000}
{'true_df': 250964, 'gray_df': 99792, 'false_df': 160060, 'accum_size': 510816, 'final_size': 500000}
{'true_df': 499976, 'gray_df': 200552, 'false_df': 309944, 'accum_size': 1010472, 'final_size': 1000000}


In [11]:
# Spot check: read one generated dataset back and preview its first 5 rows.
sample_dataset_path = '../0_global_data/fd-cidacs-rl/sinthetic-datasets-b/sinthetic-datasets-b-10000.parquet/'
spark.read.parquet(sample_dataset_path).limit(5).toPandas()

Unnamed: 0,id_cidacs_b,nome_b,nome_mae_b,dt_nasc_b,sexo_b
0,453,DAVI MATHEUS DOS SANTOS FARIAS,JUCIENE RAMOS DOS SANTOS,20070721,1
1,1073,CAIO FURLAN CRESPO,ELEN CRISTINA DE OLIVEIRA CARVALHO,20061015,1
2,1206,CARLOS GABRIEL CARVALHO DE SOUZA,LEUDILENE DA COSTA SILVA,20080319,1
3,1413,JOSIANE DOS SANTOS FERREIRA,ALDENIRA RODRIGUES SANTOS,20080126,2
4,1551,YASMIM CAROLINA LOPES GONCALVES,JOANA FATIMA AYRES,20080117,2
