In [None]:
from faker import Faker
import random
import calendar
import datetime
from scipy.stats import truncnorm, skewnorm, poisson
import numpy as np
import pandas as pd
import math
import requests

from dataclasses import dataclass, asdict
from typing import Optional, Callable, List, Dict, TypeVar

In [None]:
# Download required files - tables of name variants from Ancestry.com's database
files = {'givenname_similar_names.csv': 'https://github.com/tfmorris/Names/raw/master/search/src/main/resources/givenname_similar_names.csv',
         'surname_similar_names.csv': 'https://github.com/tfmorris/Names/raw/master/search/src/main/resources/surname_similar_names.csv'}

for fname, url in files.items():
    # Fetch before opening the target file and fail loudly on HTTP errors, so an
    # error page (or nothing at all) is never left on disk under a CSV's name.
    # The timeout stops a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    content = response.content
    with open(fname, 'wb') as f:
        f.write(content)

In [None]:
class ConcentratedDist:
    """Wraps a sampler so that values drawn earlier tend to be drawn again.

    Every call either repeats a previously returned value, with weight
    ``times_seen - discount``, or takes a fresh draw from the wrapped sampler,
    with weight ``concentration + discount * number_of_distinct_values_seen``.
    An infinite concentration switches the reuse mechanism off: every call is
    forwarded to the sampler and no counts are recorded.

    Args:
        sample: Zero-argument callable returning a hashable value.
        concentration: Weight given to drawing from ``sample``.
        discount: Amount subtracted from every seen value's count.
    """
    def __init__(self, sample: Callable, concentration: float = 1.0, discount: float = 0.0):
        self._sample = sample
        self.discount = discount
        self.concentration = concentration
        self.value_counts = {}  # value -> number of times it has been returned
        self.count = 0          # total number of values returned so far

    def __call__(self):
        """Return the next value and update the bookkeeping."""
        if math.isinf(self.concentration):
            return self._sample()

        seen = list(self.value_counts)
        if self.count == 0:
            # Nothing to reuse yet: the first value always comes from the sampler.
            draw_fresh = True
        else:
            # The weights are left unnormalised on purpose: random.choices
            # normalises them itself, and the former division by
            # (concentration + count - 1) raised ZeroDivisionError on the
            # second call whenever concentration was 0.
            weights = [self.value_counts[value] - self.discount for value in seen]
            # Last slot stands for "draw from the base sampler".
            weights.append(self.concentration + self.discount * len(seen))
            idx = random.choices(range(len(weights)), weights=weights)[0]
            draw_fresh = idx == len(seen)

        # A fresh draw may coincide with a value seen before; it is counted either way.
        value = self._sample() if draw_fresh else seen[idx]
        self.value_counts[value] = self.value_counts.get(value, 0) + 1
        self.count += 1
        return value
        

@dataclass
class Person:
    """Flat record describing one synthetic individual.

    Instances are built by `PersonGenerator.__call__`; the date of birth is
    stored as three separate integer fields rather than a single date.
    """
    # Given name and surname (the built-in generators upper-case them).
    first_name: str
    last_name: str
    # Gender code; 'M' and 'F' are the keys of the default gender weights.
    gender: str
    # State abbreviation and a zip code drawn for that state.
    state: str
    zipcode: str
    # Date of birth components.
    birth_month: int
    birth_day: int
    birth_year: int
        
class PersonGenerator:
    """Generates synthetic `Person` records from a Faker instance.

    Names, states and zip codes are each drawn through a `ConcentratedDist`,
    so the `concentrations` / `discounts` dictionaries (keys: 'first_name',
    'last_name', 'state', 'zipcode') control how strongly earlier values are
    reused. A missing concentration defaults to infinity (every draw comes
    straight from Faker) and a missing discount defaults to 0.

    Args:
        fake: Faker instance used as the underlying source of values.
        gender_weights: Mapping gender code -> sampling weight. Defaults to
            an even 'M' / 'F' split.
        concentrations: Per-attribute concentration parameters.
        discounts: Per-attribute discount parameters.
        dob_min: Earliest date of birth to generate.
        dob_max: Latest date of birth to generate.
    """
    def __init__(self, fake: Faker, gender_weights = None, concentrations: Optional[dict] = None, discounts: Optional[dict] = None,
                 dob_min = datetime.date(1960, 1, 1), dob_max = datetime.date(1990, 1, 1)):
        # None stands in for the defaults so that no mutable default object is
        # shared between instances. Caller-supplied dicts are kept by reference.
        self.fake = fake
        self.gender_weights = {'M': 0.50, 'F': 0.50} if gender_weights is None else gender_weights
        self.dob_min = dob_min
        self.dob_max = dob_max
        self.GENDER = list(self.gender_weights.keys())
        self.GENDER_WEIGHTS = list(self.gender_weights.values())
        self.concentrations = {} if concentrations is None else concentrations
        self.discounts = {} if discounts is None else discounts
        self._last_name_gen = ConcentratedDist(lambda : self.fake.last_name().upper(), self.concentrations.get('last_name', math.inf), self.discounts.get('last_name', 0.0))
        # The three first-name generators share parameters but keep separate counts.
        self._first_name_female_gen = ConcentratedDist(lambda : self.fake.first_name_female().upper(), self.concentrations.get('first_name', math.inf), self.discounts.get('first_name', 0.0))
        self._first_name_male_gen = ConcentratedDist(lambda : self.fake.first_name_male().upper(), self.concentrations.get('first_name', math.inf), self.discounts.get('first_name', 0.0))
        self._first_name_gen = ConcentratedDist(lambda : self.fake.first_name().upper(), self.concentrations.get('first_name', math.inf), self.discounts.get('first_name', 0.0))
        self._state_gen = ConcentratedDist(self.fake.state_abbr, self.concentrations.get('state', math.inf), self.discounts.get('state', 0.0))
        # One zip code generator per state, created lazily by zipcode().
        self._zipcode_gen = {}

    def gender(self) -> str:
        """Draw a gender code according to `gender_weights`."""
        return random.choices(self.GENDER, self.GENDER_WEIGHTS)[0]

    def dob(self) -> datetime.date:
        """Draw a date of birth between `dob_min` and `dob_max`."""
        return self.fake.date_between_dates(self.dob_min, self.dob_max)

    def zipcode(self, state_abbr: Optional[str] = None) -> str:
        """Draw a zip code for `state_abbr`, using that state's own generator."""
        gen = self._zipcode_gen.get(state_abbr, None)
        if gen is None:
            gen = ConcentratedDist(lambda : self.fake.zipcode_in_state(state_abbr), self.concentrations.get('zipcode', math.inf), self.discounts.get('zipcode', 0.0))
            self._zipcode_gen[state_abbr] = gen
        return gen()

    def first_name(self, gender: Optional[str] = None) -> str:
        """Draw a first name; 'M' / 'F' select the gender-specific generator."""
        if gender == 'M':
            return self._first_name_male_gen()
        elif gender == 'F':
            return self._first_name_female_gen()
        else:
            return self._first_name_gen()

    def last_name(self) -> str:
        """Draw a last name."""
        return self._last_name_gen()

    def state(self) -> str:
        """Draw a state abbreviation."""
        return self._state_gen()

    def __call__(self, fixed_attributes: Optional[dict] = None):
        """Generate one `Person`.

        Args:
            fixed_attributes: Optional mapping of `Person` field name -> value
                to use verbatim instead of sampling. A fixed 'birth_day' is
                clamped to the last day of the resulting month.

        Returns:
            The generated `Person`.
        """
        fixed = {} if fixed_attributes is None else fixed_attributes

        # Only sample what is not fixed. `fixed.get(key, self.xxx())` would
        # evaluate the sampler even when the key is present, and each such
        # discarded draw still bumps the ConcentratedDist counts, skewing
        # which values get reused later.
        gender = fixed['gender'] if 'gender' in fixed else self.gender()
        first_name = fixed['first_name'] if 'first_name' in fixed else self.first_name(gender)
        last_name = fixed['last_name'] if 'last_name' in fixed else self.last_name()

        # Start from a random date, then overwrite the fixed components one at
        # a time, clamping the day so the date stays valid after each step.
        dob = self.dob()
        if 'birth_year' in fixed:
            year = fixed['birth_year']
            max_day = calendar.monthrange(year, dob.month)[1]
            dob = dob.replace(year = year, day = min(dob.day, max_day))
        if 'birth_month' in fixed:
            month = fixed['birth_month']
            max_day = calendar.monthrange(dob.year, month)[1]
            dob = dob.replace(month = month, day = min(dob.day, max_day))
        if 'birth_day' in fixed:
            day = fixed['birth_day']
            max_day = calendar.monthrange(dob.year, dob.month)[1]
            dob = dob.replace(day = min(day, max_day))

        state = fixed['state'] if 'state' in fixed else self.state()
        zipcode = fixed['zipcode'] if 'zipcode' in fixed else self.zipcode(state)
        return Person(first_name=first_name, last_name=last_name, gender=gender,
                      birth_month=dob.month, birth_day=dob.day, birth_year=dob.year,
                      state=state, zipcode=zipcode)
    
class HouseholdGenerator:
    """Generates households: one or two householders plus children or housemates.

    All members of a household share the first householder's state and zip
    code. Children additionally take the first householder's last name.

    Args:
        fake: Faker instance (accepted for interface compatibility; unused).
        person_generator: Generator used to create every household member.
    """
    def __init__(self, fake: Faker, person_generator: PersonGenerator):
        self.person_generator = person_generator
        # Each table maps an outcome to its sampling weight.
        self.NUM_CHILDREN = {0: 0.605, 1: 0.151, 2: 0.159, 3: 0.071, 4: 0.009, 5: 0.002, 6: 0.002, 7: 0.001}
        self.NUM_HOUSEMATES = {0: 0.45, 1: 0.2, 2: 0.2, 3: 0.05, 4: 0.05, 5: 0.05}
        # 'B' = both a male and a female householder, 'F' / 'M' = a single one.
        # 'M' used to be 0.8, which made the weights sum to 1.72 and turned
        # single-male households into the most common kind; with 0.08 the
        # table sums to 1 like the two tables above.
        self.HOUSEHOLDER_TYPE = {'B': 0.73, 'F': 0.19, 'M': 0.08}
        # Probability that the female householder takes the male's last name.
        self.MARRIED_SAME_LAST_NAME = 0.8
        # Sampler for the year gap between a householder's and a child's birth.
        self.PARENT_AGE_CHILD_BORN = skewnorm(4, loc=23, scale=7).rvs

    def __call__(self) -> List["Person"]:
        """Generate one household and return its members as `Person` objects."""
        people = []

        householder_type = random.choices(list(self.HOUSEHOLDER_TYPE), weights=list(self.HOUSEHOLDER_TYPE.values()))[0]
        if householder_type == 'B':
            male_householder = self.person_generator({'gender': 'M'})
            # The partner shares the address and is born around the same year.
            female_household_fixed = {'gender': 'F', 'zipcode': male_householder.zipcode, 'state': male_householder.state,
                                      'birth_year': int(random.gauss(male_householder.birth_year, 5))}
            if random.random() < self.MARRIED_SAME_LAST_NAME:
                female_household_fixed['last_name'] = male_householder.last_name
            female_householder = self.person_generator(female_household_fixed)
            householders = [male_householder, female_householder]
        else:
            householders = [self.person_generator({'gender': householder_type})]
        household_fixed = {'zipcode': householders[0].zipcode, 'state': householders[0].state}
        people.extend(householders)

        num_children = random.choices(list(self.NUM_CHILDREN), weights=list(self.NUM_CHILDREN.values()))[0]
        for _ in range(num_children):
            child_fixed = {'last_name': householders[0].last_name, 'birth_year': householders[0].birth_year + int(self.PARENT_AGE_CHILD_BORN())}
            child_fixed.update(household_fixed)
            people.append(self.person_generator(child_fixed))

        # Housemates are only added to single-householder homes without children.
        if householder_type != 'B' and num_children == 0:
            num_housemates = random.choices(list(self.NUM_HOUSEMATES), weights=list(self.NUM_HOUSEMATES.values()))[0]
            for _ in range(num_housemates):
                housemate_fixed = {'birth_year': householders[0].birth_year + int(random.gauss(0, 4))}
                housemate_fixed.update(household_fixed)
                people.append(self.person_generator(housemate_fixed))

        return people


def get_random_pos(start: int, stop: int) -> int:
    """Pick a uniformly random integer position in [start, stop], both ends included."""
    position = random.randint(start, stop)
    return position

def random_del(x: str, domain: str) -> str:
    """Remove one randomly chosen character from `x`.

    `domain` is not used; it is accepted so that all edit operations share
    one signature. An empty string is returned unchanged.
    """
    if not len(x):
        return x
    victim = get_random_pos(0, len(x) - 1)
    head, tail = x[:victim], x[victim + 1:]
    return head + tail

def random_trans(x: str, domain: str) -> str:
    """Swap one randomly chosen pair of adjacent characters in `x`.

    `domain` is not used; it is accepted so that all edit operations share
    one signature. Strings shorter than two characters are returned unchanged.
    """
    if len(x) <= 1:
        return x
    left = get_random_pos(0, len(x) - 2)
    right = left + 1
    return x[:left] + x[right] + x[left] + x[right + 1:]

def random_sub(x: str, domain: str) -> str:
    """Replace one randomly chosen character of `x` with a random character from `domain`.

    An empty string is returned unchanged.
    """
    if not len(x):
        return x
    # The position is drawn before the replacement character.
    target = get_random_pos(0, len(x) - 1)
    replacement = random.choice(domain)
    return "".join((x[:target], replacement, x[target + 1:]))

def random_ins(x: str, domain: str) -> str:
    """Insert one random character from `domain` at a random position in `x`.

    The result is always exactly one character longer than `x`. The new
    character may land before any existing character or after the last one,
    so an empty `x` yields a one-character string.
    """
    # There are len(x) + 1 gaps to insert into (including the very end).
    ins_pos = get_random_pos(0, len(x))
    ins_str = random.choice(domain)
    # Keep x[ins_pos:] whole: slicing from ins_pos + 1 overwrote a character,
    # which turned the insertion into a substitution.
    return x[:ins_pos] + ins_str + x[ins_pos:]


# Generic type of the attribute value a distortion operates on.
V = TypeVar('V')
# A record's attributes: attribute name -> value.
Attributes = Dict[str, V]
# A distortion is any callable taking (value, attributes) and returning a new value.
# NOTE: a class that is also named `Distortion` is defined further down in this
# file; from that point on the name refers to the class, not to this alias.
Distortion = Callable[[V, Attributes], V]

class TypoDistortion(Distortion[str]):
    """Distortion that applies a single random character-level edit to a string.

    Args:
        char_domain: Characters that insertions and substitutions may use.
        ins_weight: Relative weight of an insertion.
        del_weight: Relative weight of a deletion.
        sub_weight: Relative weight of a substitution.
        trans_weight: Relative weight of a transposition.
    """
    def __init__(self, char_domain: str, ins_weight: float = 5, del_weight: float = 15, 
                 sub_weight: float = 35, trans_weight: float = 5):
        self.char_domain = char_domain
        # Operations and weights are kept as two parallel lists, in the same order.
        weighted_ops = (
            (random_ins, ins_weight),
            (random_sub, sub_weight),
            (random_del, del_weight),
            (random_trans, trans_weight),
        )
        self.EDIT_OPS = [op for op, _ in weighted_ops]
        self.EDIT_OP_WEIGHTS = [weight for _, weight in weighted_ops]

    def __call__(self, x: str, cond: dict) -> str:
        """Return `x` after one randomly selected edit; `cond` is ignored."""
        (chosen_op,) = random.choices(self.EDIT_OPS, self.EDIT_OP_WEIGHTS)
        return chosen_op(x, self.char_domain)

class VariantDistortion(Distortion[str]):
    """Distortion that swaps a name for one of its listed variants.

    Args:
        variant_file: Path of a header-less two-column CSV: a name, then a
            whitespace-separated list of its variants.
    """
    def __init__(self, variant_file: str):
        self.variant_file = variant_file
        self.index = self._read_variant_file()

    def _read_variant_file(self) -> Dict[str, List[str]]:
        """Load the variant file into a dict: upper-cased name -> list of upper-cased variants."""
        table = pd.read_csv(self.variant_file, names = ["name", "variants"])
        table = table.assign(name = table["name"].str.upper(),
                             variants = table["variants"].str.upper())
        table = table.set_index("name")
        # Rows without any variants carry no information; drop them.
        table = table[table["variants"].notnull()]
        by_name = table.to_dict('index')
        return {name: row['variants'].split() for name, row in by_name.items()}

    def __call__(self, x: str, cond: dict) -> str:
        """Return a random variant of `x`, or `x` itself if it has no entry; `cond` is ignored."""
        candidates = self.index.get(x)
        return x if candidates is None else random.choice(candidates)
    
    
class Distortion(Distortion[V]):
    """Mixture of distortions: each call delegates to one member picked at random.

    Args:
        distortions: Mapping distortion callable -> relative selection weight.
    """
    def __init__(self, distortions: Dict[Distortion[V], float]):
        self.distortions = distortions
        # Parallel lists, both in the mapping's iteration order.
        self.DISTORTIONS = [member for member in distortions]
        self.DISTORTIONS_WEIGHTS = [weight for weight in distortions.values()]

    def __call__(self, x: V, cond: Attributes[V]) -> V:
        """Apply one randomly selected member distortion to `x`, passing `cond` through."""
        (picked,) = random.choices(self.DISTORTIONS, self.DISTORTIONS_WEIGHTS)
        return picked(x, cond)

In [None]:
def rand_tpoisson(mu: float, a: int, b: int):
    """Draw one sample from a Poisson(mu) distribution truncated to [a, b].

    Args:
        mu: Rate of the underlying Poisson distribution.
        a: Smallest value that can be returned (inclusive).
        b: Largest value that can be returned (inclusive).

    Returns:
        An integer k with a <= k <= b, drawn via `np.random.choice`.

    Raises:
        ValueError: If `a > b`, i.e. the truncation interval is empty.
    """
    if a > b:
        raise ValueError(f"empty truncation interval: a={a} > b={b}")
    k = np.arange(a, b + 1)
    # Work in log space and rescale by the largest term before exponentiating.
    # For large mu the plain pmf underflows to 0 on the whole interval, which
    # made the normalisation 0/0 = NaN and np.random.choice fail.
    log_pmf = poisson.logpmf(k, mu)
    weights = np.exp(log_pmf - log_pmf.max())
    return np.random.choice(k, p=weights / weights.sum())

# Upper-case Latin letters; handed to TypoDistortion as the pool of characters
# available for typos in first and last names.
ALPHA_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    
def generate_data(link_conf: Dict[str, float], dist_probs: Dict[str, float], file_prefix: str, seed: int):
    """Generate a set of entities and a set of noisy, duplicated records of them.

    Entities are produced household by household. Each entity is then included
    with probability `link_conf['inclusion_pr']`, and an included entity yields
    a truncated-Poisson number of records, each an independently distorted copy.

    Args:
        link_conf: Linkage settings. Keys read here: 'exp_num_ents' (minimum
            number of entities to generate), 'inclusion_pr', and 'mu' / 'a' /
            'b' (parameters passed to `rand_tpoisson`).
        dist_probs: Attribute name -> probability that the attribute is
            distorted in a given record.
        file_prefix: Prefix of the two CSV files that are written:
            `<file_prefix>_entities.csv` and `<file_prefix>_records.csv`.
        seed: Seed applied to `random`, `numpy.random` and the Faker instance.

    Returns:
        Tuple `(records_df, entities_df)` of pandas DataFrames. Both have a
        'uid' column; in `records_df` it identifies the originating entity.
    """
    random.seed(seed)
    np.random.seed(seed)
    
    fake = Faker()
    # Faker draws from its own random generator, which the two calls above do
    # not touch; without seeding it too, the same `seed` gives different data
    # on every run.
    fake.seed_instance(seed)
    person_generator = PersonGenerator(fake, concentrations={'first_name': 1000.0, 'last_name': 100.0, 'state': 1.0, 'zipcode': 1.0}, discounts={'first_name': 0.8, 'last_name': 0.8})
    household_generator = HouseholdGenerator(fake, person_generator)
    
    # Households are added whole, so the final count may exceed the target.
    num_entities = link_conf['exp_num_ents']
    entities = []
    while len(entities) < num_entities:
        entities.extend(household_generator())
    entities = [asdict(e) for e in entities]
    for uid, e in enumerate(entities):
        e.update({'uid': uid})
    # Shuffle so that members of one household are not adjacent in the output.
    random.shuffle(entities)
    entities_df = pd.DataFrame.from_records(entities)
    print(f"Generated {entities_df.shape[0]} entities")
    entities_df.to_csv(file_prefix + "_entities.csv", index=False, header=True)
    
    # Names are distorted by a typo, a known variant, or a completely new name.
    first_name_distortion = Distortion(
        {TypoDistortion(ALPHA_CHARS): 5,
         VariantDistortion('givenname_similar_names.csv'): 4.9,
         lambda x, cond: person_generator.first_name(cond['gender']): 0.1}
    )

    last_name_distortion = Distortion(
        {TypoDistortion(ALPHA_CHARS): 5, 
         VariantDistortion('surname_similar_names.csv'): 3,
         lambda x, cond: person_generator.last_name(): 2}
    )

    # Every distortion receives (current value, whole record) and returns the new value.
    attr_distortions = [
        ('first_name', first_name_distortion),
        ('last_name', last_name_distortion),
        ('gender', lambda x, cond: person_generator.gender()),
        ('zipcode', lambda x, cond: person_generator.zipcode(cond['state'])),
        ('state', lambda x, cond: person_generator.state()),
        ('birth_year', lambda x, cond: int(x + random.gauss(0, 5))),
        ('birth_month', lambda x, cond: person_generator.dob().month),
        ('birth_day', lambda x, cond: person_generator.dob().day),
    ]

    def distort_person(person: dict, attr_distortions):
        """Distort `person` in place, visiting the attributes in random order."""
        # The order matters because some distortions read other attributes of
        # the (possibly already distorted) record, e.g. zipcode reads state.
        random.shuffle(attr_distortions)
        for attr, distortion in attr_distortions:
            if random.random() < dist_probs[attr]:
                person[attr] = distortion(person[attr], person)
        return person
    
    records = []
    for entity in entities:
        if random.random() > link_conf['inclusion_pr']:
            # Don't include this entity
            continue
        
        num_appearances = rand_tpoisson(link_conf['mu'], link_conf['a'], link_conf['b'])

        for _ in range(num_appearances):
            record = distort_person(entity.copy(), attr_distortions)
            record['uid'] = entity['uid']
            records.append(record)
    random.shuffle(records)
    
    records_df = pd.DataFrame.from_records(records)
    print(f"Generated {records_df.shape[0]} records")
    records_df.to_csv(file_prefix + "_records.csv", index=False, header=True)
    
    return records_df, entities_df

In [None]:
def pois_link_conf(mu: float, b: int, inclusion_pr: float = 0.5, exp_num_recs: int = 1000) -> dict:
    """Build a linkage configuration that targets an expected number of records.

    An entity is absent (0 records) with probability `1 - inclusion_pr`;
    otherwise its record count follows Poisson(mu) restricted to 1..b. The
    number of entities is chosen so that the expected total number of records
    is approximately `exp_num_recs`.

    Returns:
        Dict with keys 'inclusion_pr', 'exp_num_ents', 'exp_num_recs',
        'a' (always 1), 'b' and 'mu'.
    """
    support = np.arange(0, b + 1)

    # Poisson mass on 1..b, renormalised after removing the mass at 0.
    present_pmf = poisson.pmf(support, mu)
    present_pmf[0] = 0.0
    present_pmf = present_pmf / present_pmf.sum()

    # Point mass at 0 for entities that are left out altogether.
    absent_pmf = np.zeros_like(present_pmf, dtype=float)
    absent_pmf[0] = 1.0

    mixture = (1 - inclusion_pr) * absent_pmf + inclusion_pr * present_pmf
    recs_per_entity = (support * mixture).sum()

    conf = {'inclusion_pr': inclusion_pr}
    conf['exp_num_ents'] = round(exp_num_recs / recs_per_entity)
    conf['exp_num_recs'] = exp_num_recs
    conf['a'] = 1
    conf['b'] = b
    conf['mu'] = mu
    return conf

# Per-attribute distortion probabilities, from mild to heavy.
dist_confs = [
    dict(first_name=0.1, last_name=0.1, gender=0.01, zipcode=0.05, state=0.05,
         birth_year=0.01, birth_month=0.01, birth_day=0.01),  # low distortion
    dict(first_name=0.4, last_name=0.4, gender=0.01, zipcode=0.1, state=0.1,
         birth_year=0.1, birth_month=0.1, birth_day=0.1),     # high distortion
]

# One linkage configuration per (expected record count, Poisson rate) pair.
# Rates run from low (0.1) through moderate (1) and high (8) to very high (100)
# duplication; records per entity are capped at 4 and 90% of entities are included.
link_confs = [
    pois_link_conf(rate, 4, 0.9, target_recs)
    for target_recs in (1000, 10000)
    for rate in (0.1, 1, 8, 100)
]

# Generate one data set for every combination of linkage and distortion settings.
s = 0
print(f"Random seed {s}")
for link_conf in link_confs:
    print(f"--Link config with TPois(mu={link_conf['mu']}, a={link_conf['a']}, b={link_conf['b']})")
    for d_idx, dist_conf in enumerate(dist_confs):
        print(f"----Distortion config {d_idx} with {dist_conf}")
        file_prefix = f"gen_link-conf-mu-{link_conf['mu']}_dist-conf-{d_idx}_seed-{s}_exp-num-recs-{link_conf['exp_num_recs']}"
        records, entities = generate_data(link_conf, dist_conf, file_prefix, s)