In [1]:
import random
import numpy as np

def find_interval(x, partition, endpoints=True):
    """Locate ``x`` relative to the boundary values in ``partition``.

    ``partition`` is scanned from left to right for the first boundary
    that is strictly greater than ``x``.

    Args:
        x: The value to locate.
        partition: Sequence of boundary values (expected to be ascending
            for the result to be meaningful).
        endpoints: If True, the first and last entries of ``partition``
            are treated as the outer endpoints, so intervals are numbered
            from 0 starting at ``partition[0]``. If False, ``partition``
            holds only the inner cut points and the region below the
            first cut point is interval 0.

    Returns:
        With ``endpoints=True``: the index ``i`` such that
        ``partition[i] <= x < partition[i + 1]``, or -1 when ``x`` lies
        below the first or at/above the last boundary.
        With ``endpoints=False``: the number of leading boundaries that
        are ``<= x`` (``len(partition)`` if no boundary exceeds ``x``).
    """
    for position, boundary in enumerate(partition):
        if x < boundary:
            if endpoints:
                # The interval starts at the previous boundary; this is
                # -1 when x is already below partition[0].
                return position - 1
            return position

    # No boundary is greater than x.
    if endpoints:
        return -1
    return len(partition)


def weighted_choice(sequence, weights):
    """Pick one element of ``sequence`` at random, biased by ``weights``.

    A uniform random number in [0, 1) is located within the cumulative
    sums of ``weights``; the interval it falls into selects the element.

    Args:
        sequence: Indexable collection of candidates.
        weights: One weight per candidate. The draw is compared against
            the raw cumulative sums, so the weights are expected to sum
            to 1.

    Returns:
        The selected element of ``sequence``.
    """
    draw = np.random.random()

    # Boundaries 0, w0, w0+w1, ... delimit one interval per candidate.
    boundaries = [0]
    boundaries.extend(np.cumsum(weights))

    return sequence[find_interval(draw, boundaries)]


def cartesian_choice(*iterables):
    """Draw one uniformly random element from each given population.

    Args:
        *iterables: Any number of non-empty sequences accepted by
            ``random.choice``.

    Returns:
        A list holding one randomly chosen element per population, in
        the order the populations were passed. Empty when called without
        arguments.
    """
    return [random.choice(population) for population in iterables]


def weighted_cartesian_choice(*iterables):
    """Draw one weighted random element from each (population, weights) pair.

    Args:
        *iterables: Any number of 2-item pairs ``(population, weights)``;
            each pair is handed to ``weighted_choice``.

    Returns:
        A list with one chosen element per pair, in argument order.
        Empty when called without arguments.
    """
    return [
        weighted_choice(population, weights)
        for population, weights in iterables
    ]


def weighted_sample(population, weights, k):
    """Draw ``k`` distinct elements from ``population`` without replacement.

    Each draw is made with ``weighted_choice`` over the elements that are
    still available. The drawn element and its weight are then removed,
    and the remaining weights are renormalised before the next draw.

    Args:
        population: Iterable of hashable candidates. It is copied, so the
            caller's object is not modified.
        weights: One non-negative weight per candidate. They do not have
            to sum to 1 because they are normalised before every draw.
        k: Number of distinct elements to return. Values <= 0 give ``[]``.

    Returns:
        A list of ``k`` distinct elements in the order they were drawn.

    Raises:
        ValueError: If the candidates run out, or only zero-weight
            candidates remain, before ``k`` elements were collected.
    """
    remaining = list(population)
    remaining_weights = list(weights)
    sample = []
    seen = set()

    while len(sample) < k:
        # Normalise lazily, only when another draw is really needed, so a
        # completed sample never divides by a zero remainder.
        total = sum(remaining_weights)
        if not remaining or total <= 0:
            raise ValueError(
                "cannot draw %d distinct elements: no candidates with "
                "positive weight are left after %d draws" % (k, len(sample))
            )
        probabilities = [weight / total for weight in remaining_weights]

        choice = weighted_choice(remaining, probabilities)

        # Remove the drawn element together with its weight. ``index``
        # finds the first equal element, which is what gets removed if
        # the population contains duplicates.
        index = remaining.index(choice)
        remaining.pop(index)
        remaining_weights.pop(index)

        # Duplicate values in the population count only once.
        if choice not in seen:
            seen.add(choice)
            sample.append(choice)

    return sample


def weighted_sample_alternative(population, weights, k):
    """Draw ``k`` distinct elements from ``population`` by rejection sampling.

    Elements are drawn repeatedly with ``weighted_choice`` from the full
    population; a draw that repeats an already selected element is
    discarded and the draw is retried.

    Args:
        population: Iterable of hashable candidates.
        weights: One non-negative weight per candidate. They are
            normalised to sum to 1 before drawing.
        k: Number of distinct elements to return. Values <= 0 give ``[]``.

    Returns:
        A list of ``k`` distinct elements in the order they were first
        drawn.

    Raises:
        ValueError: If fewer than ``k`` distinct candidates have a
            positive weight. Without this check the retry loop could
            never finish.
    """
    population = list(population)
    weights = list(weights)
    if k <= 0:
        return []

    total = sum(weights)
    # Only candidates with positive weight can ever be drawn.
    reachable = {
        item for item, weight in zip(population, weights) if weight > 0
    }
    if total <= 0 or k > len(reachable):
        raise ValueError(
            "cannot draw %d distinct elements: only %d candidates have a "
            "positive weight" % (k, len(reachable))
        )
    probabilities = [weight / total for weight in weights]

    sample = []
    seen = set()
    while len(sample) < k:
        choice = weighted_choice(population, probabilities)
        if choice not in seen:
            seen.add(choice)
            sample.append(choice)
    return sample

In [4]:
# (first name, relative frequency) pairs; the counts are raw, unnormalised.
weighted_firstnames = [
    ("John", 80),
    ("Eve", 70),
    ("Jane", 2),
    ("Paul", 8),
    ("Frank", 20),
    ("Laura", 6),
    ("Robert", 17),
    ("Zoe", 3),
    ("Roger", 8),
    ("Edgar", 4),
    ("Susanne", 11),
    ("Dorothee", 22),
    ("Tim", 17),
    ("Donald", 12),
    ("Igor", 15),
    ("Simone", 9),
    ("Bernard", 8),
    ("Sarah", 7),
    ("Yvonne", 11),
    ("Bill", 12),
    ("Bernd", 10),
]

# (surname, relative frequency) pairs; the counts are raw, unnormalised.
weighted_surnames = [
    ("Singer", 2),
    ("Miles", 2),
    ("Moore", 5),
    ("Strongman", 5),
    ("Romero", 3),
    ("Yiang", 4),
    ("Looper", 1),
    ("Rampman", 1),
    ("Chopman", 1),
    ("Smiley", 1),
    ("Bychan", 1),
    ("Smith", 150),
    ("Baker", 144),
    ("Miller", 87),
    ("Cook", 5),
    ("Joyce", 1),
    ("Bush", 5),
    ("Shorter", 6),
    ("Wagner", 10),
    ("Sundigos", 10),
    ("Firenze", 8),
    ("Puttner", 20),
    ("Faulkner", 10),
    ("Bowman", 11),
    ("Klein", 1),
    ("Jungster", 14),
    ("Warner", 14),
    ("Tiller", 9),
    ("Wogner", 10),
    ("Blumenthal", 16),
]


# Split each table into a tuple of names and a list of probabilities
# (count divided by the table total, so each list sums to 1).
firstnames = tuple(name for name, _ in weighted_firstnames)
wsum = sum(count for _, count in weighted_firstnames)
weights_firstnames = [count / wsum for _, count in weighted_firstnames]

surnames = tuple(name for name, _ in weighted_surnames)
wsum = sum(count for _, count in weighted_surnames)
weights_surnames = [count / wsum for _, count in weighted_surnames]

# One probability list per name table, in the order (first names, surnames).
weights = (weights_firstnames, weights_surnames)


def synthesizer(data, weights=None, format_func=None, repeats=True):
    """Build a generator function that yields random combinations of ``data``.

    Args:
        data: Sequence of populations; each combination takes one element
            from every population.
        weights: Optional sequence with one weight list per population.
            When truthy, elements are drawn with
            ``weighted_cartesian_choice``; otherwise uniformly with
            ``cartesian_choice``.
        format_func: Optional callable applied to each combination before
            it is yielded. When falsy, the combination is yielded as is.
        repeats: If False, a combination whose ``str()`` form was already
            produced by the same generator is redrawn until an unseen one
            comes up.

    Returns:
        A zero-argument generator function. Every call starts a new,
        endless stream with its own memory of produced combinations.
    """

    def draw():
        # Weighted draw when weights were supplied, uniform draw otherwise.
        if weights:
            return weighted_cartesian_choice(*zip(data, weights))
        return cartesian_choice(*data)

    def render(combination):
        return format_func(combination) if format_func else combination

    def synthesize():
        # ``seen`` stays None when repeats are allowed; no bookkeeping then.
        seen = None if repeats else set()
        while True:
            combination = draw()
            if seen is not None:
                key = str(combination)
                # Redraw until the combination has not been produced yet.
                while key in seen:
                    combination = draw()
                    key = str(combination)
                seen.add(key)
            yield render(combination)

    return synthesize
        

# Generator factory for "Firstname Surname" strings that never repeats a
# name within one stream; names are drawn according to the weight tables.
recruit_employee = synthesizer(
    (firstnames, surnames),
    weights=weights,
    format_func=" ".join,
    repeats=False,
)

# Start one stream and print the first twelve synthetic employees.
employee = recruit_employee()
for _ in range(12):
    print(next(employee))

John Miller
Eve Miller
John Baker
Laura Miller
Robert Baker
Bill Baker
John Sundigos
Bill Romero
John Moore
Sarah Smith
Eve Baker
Eve Smith
