In [1]:
import pandas as pd
import numpy as np

from corruptor import ProbabilisticCorruptor

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw tab-separated export; the first line of the file holds the column names.
ncvoter = pd.read_csv(
    'ncvoter.txt',
    sep='\t',
    header=0,
    encoding='latin1',
)

In [3]:
columns = ['ncid', 'first_name', 'last_name', 'birth_year', 'mail_addr1', 'mail_city', 'mail_zipcode']
# Keep only rows where every selected column is populated. The explicit copy makes the
# column assignments below write to an independent frame rather than to a selection
# derived from `ncvoter`.
ncvoter_selection = ncvoter[columns].dropna().copy()

# Cleaning: trim surrounding whitespace and lower-case every object-dtype column.
for column in columns:
    if ncvoter_selection[column].dtype == 'O':
        ncvoter_selection[column] = ncvoter_selection[column].str.strip().str.lower()

# Values that are empty after stripping count as missing, so drop those rows as well.
ncvoter_selection = ncvoter_selection.replace('', np.nan, regex=True).dropna()

# Formatting: a single rename to the short column names used by the rest of the notebook.
ncvoter_selection = ncvoter_selection.rename(columns={
    'ncid': 'id',
    'first_name': 'firstname',
    'last_name': 'lastname',
    'birth_year': 'birthyear',
    'mail_addr1': 'address',
    'mail_city': 'city',
    'mail_zipcode': 'zipcode',
})

# Ensure unique numeric IDs.
# The first two characters of the original id are treated as a prefix and the remainder
# as an integer suffix. Each distinct prefix gets a 1-based rank (in order of first
# appearance) which is placed in front of the suffix digits.
# NOTE: this encoding is not collision-free in general. For example, rank 10 with the
# suffix '56273' and rank 1 with the suffix '056273' both yield 10056273. The uniqueness
# assertion below is what actually guarantees the result for a given input.
prefixes = ncvoter_selection['id'].str[0:2].unique().tolist()
# Build the prefix -> rank lookup once instead of searching the list for every row.
prefix_rank = {prefix: rank for rank, prefix in enumerate(prefixes, start=1)}
ncvoter_selection['id'] = ncvoter_selection['id'].apply(
    lambda x: int(x[2:]) + (10 ** (len(x[2:]) + 1)) * prefix_rank[x[:2]]
)

# Validation
assert ncvoter_selection['id'].is_unique, 'generated numeric ids are not unique'
for text_column in ('firstname', 'lastname', 'address', 'city'):
    assert not (ncvoter_selection[text_column] == '').any(), 'empty value left in ' + text_column

ncvoter_selection.head()

Unnamed: 0,id,firstname,lastname,birthyear,address,city,zipcode
0,1056273,evelyn,aabel,1935,4430 e greensboro-chapel hill rd,graham,27253
1,1098377,christina,aaron,1976,421 whitt ave,burlington,27215
2,1069747,claudia,aaron,1945,1013 edith st,burlington,27215
3,10170513,james,aaron,1948,po box 98,saxapahaw,27340
4,1091549,nathan,aaron,1976,421 whitt ave,burlington,27215


In [4]:
# Largest generated numeric id (shows the magnitude of the id space).
ncvoter_selection['id'].max()

790150164

In [5]:
# Number of records left after cleaning; sample sizes requested below must not exceed it.
len(ncvoter_selection)

7037317

In [6]:
def take_random_sample(df, n):
    """Return `n` distinct rows of `df`, drawn without replacement.

    Rows are chosen by position rather than by index label, so the result has
    exactly `n` rows and no row is repeated even when `df` has duplicate index
    labels. The selected rows keep their original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    n : int
        Number of rows to draw.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, in the order they were drawn.

    Raises
    ------
    ValueError
        Raised by `np.random.choice` when `n` is larger than `len(df)`.
    """
    # Uses NumPy's global random state; seed it beforehand for reproducible samples.
    positions = np.random.choice(len(df), n, replace=False)
    return df.iloc[positions]

In [7]:
"""dataframe_corruptor.py"""
from random import choices, sample

class DataFrameCorruptor2():
    """Corrupts single cells of a DataFrame using per-column weighted corruptors.

    `mapping` maps a column name to a `(weight, corruptor)` pair. The weight is
    the relative chance that the column is picked for a row, and the corruptor
    is any object exposing `corrupt(value)`.
    """

    def __init__(self, mapping):
        # Parallel lists, in the shape `random.choices` expects.
        self.population = []
        self.weights = []
        for column in mapping.keys():
            self.population.append((column, mapping[column][1]))
            self.weights.append(mapping[column][0])

    def corrupt(self, df, n, inplace=False):
        """Corrupt one cell in each of `n` randomly chosen rows of `df`.

        Row labels are drawn without replacement from `df.index`. For each
        drawn row one column is chosen according to the weights, and its
        value is replaced by the output of that column's corruptor.

        Returns `df` itself when `inplace` is true, otherwise a modified copy.
        """
        row_labels = sample(list(df.index), n)

        if inplace:
            target = df
        else:
            target = df.copy()

        for row_label in row_labels:
            picked = choices(self.population, self.weights, k=1)
            column, corruption = picked[0]
            original_value = df.loc[row_label, column]
            target.loc[row_label, column] = corruption.corrupt(original_value)

        return target

In [8]:
# A single corruptor instance shared by all columns.
# NOTE(review): the values are presumably the relative probabilities of each corruption
# mode; confirm against the `corruptor` module.
prob = ProbabilisticCorruptor({'none': 0.20, 'ocr': 0.40, 'typo': 0.40})

# Every corruptible column gets the same selection weight and the same corruptor.
dfc = DataFrameCorruptor2({
    column: (0.25, prob)
    for column in ('firstname', 'lastname', 'address', 'city')
})

In [9]:
def create_two_party_datasets(size, overlap):
    """Build two record sets, each holding partly corrupted copies of the other's records.

    A random sample of `2 * (size - int(size * overlap))` rows is drawn from the
    global `ncvoter_selection` and split into two equal halves. From each half,
    `int(size * overlap)` rows are sampled and half of those get one cell
    corrupted by the global `dfc`. Each returned frame is one untouched half
    followed by the sampled rows taken from the other half.

    NOTE(review): each output carries `int(size * overlap)` records of the
    other's half, so the two outputs share twice that many underlying records.
    Confirm this matches the intended meaning of `overlap`.

    Parameters
    ----------
    size : int
        Base record count used to derive the sample sizes.
    overlap : float
        Fraction of `size` sampled from each half for the other party.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        `(party_a, party_b)`.
    """
    shared_count = int(size * overlap)
    corrupted_count = int(shared_count / 2)
    pool_size = 2 * (size - shared_count)
    split_at = int(pool_size / 2)

    pool = take_random_sample(ncvoter_selection, pool_size)
    first_half, second_half = pool[:split_at], pool[split_at:]

    def shared_rows(half):
        # Sample the rows to hand to the other party and corrupt half of them.
        # The sample is a new frame, so in-place corruption leaves `half` untouched.
        handed_over = take_random_sample(half, shared_count)
        return dfc.corrupt(handed_over, corrupted_count, inplace=True)

    # Order matters for the random stream: the first half is processed before the second.
    from_first_half = shared_rows(first_half)
    from_second_half = shared_rows(second_half)

    party_a = pd.concat([first_half, from_second_half])
    party_b = pd.concat([second_half, from_first_half])

    return (party_a, party_b)

In [10]:
# Generate the "4m" dataset pair and write both sides out without the index column.
two_party_4m_a, two_party_4m_b = create_two_party_datasets(size=4_000_000, overlap=0.121)

two_party_4m_a.to_csv('two_party_4m_a.csv', index=False)
two_party_4m_b.to_csv('two_party_4m_b.csv', index=False)

In [11]:
# sample = take_random_sample(ncvoter_selection, 1 * 10**3)
# sample.head()

In [2]:
# Generated CSV files to post-process. Only the uncommented entries are handled;
# the commented-out names are other dataset sizes that are currently skipped.
fs = [
#     'two_party_50k_a.csv',
#     'two_party_50k_b.csv',
    
#     'two_party_100k_a.csv',
#     'two_party_100k_b.csv',
    
#     'two_party_200k_a.csv',
#     'two_party_200k_b.csv',
    
#     'two_party_500k_a.csv',
#     'two_party_500k_b.csv',
    
#     'two_party_1m_a.csv',
#     'two_party_1m_b.csv',
    
#     'two_party_1m500k_a.csv',
#     'two_party_1m500k_b.csv',
    
#     'two_party_2m_a.csv',
#     'two_party_2m_b.csv',
    
#     'two_party_3m_a.csv',
#     'two_party_3m_b.csv',
    
#     'two_party_2m500k_a.csv',
#     'two_party_2m500k_b.csv',
    
#     'two_party_3m500k_a.csv',
#     'two_party_3m500k_b.csv',
    
    'two_party_4m_a.csv',
    'two_party_4m_b.csv'
]

# Re-read each file, drop every row that contains a missing value, and overwrite the
# file in place (the original contents are not kept).
# NOTE(review): presumably this removes rows whose corrupted field became empty and
# therefore reads back as NaN; confirm.
for f in fs:
    pd.read_csv(f).dropna().to_csv(f, index=False)