In [1]:
from collections import defaultdict
import numpy as np

In [2]:
# Load the training names, lowercased, as a real list: it is iterated several
# times below (once per model order, plus membership tests), which a one-shot
# iterator such as Python 3's map() would not survive.
# The first 6 lines of the file are skipped (presumably a header -- confirm
# against the data file).
with open('male.txt') as names_file:
    # rstrip only removes the line terminator, so the last name is not
    # truncated when the file does not end with a newline.
    names = [line.rstrip('\r\n').lower() for line in names_file.readlines()[6:]]

In [3]:
def build_model(data, deg=1):
    """Estimate next-character probabilities conditioned on `deg` characters.

    Every element of `data` is padded with `deg` leading "^" markers and one
    trailing "$" marker, then each character is counted under the `deg`
    characters that precede it.

    Parameters
    ----------
    data : iterable of str
        Training strings.
    deg : int
        Length of the conditioning context.

    Returns
    -------
    defaultdict
        Maps each context string to a defaultdict(float) of
        {next_character: probability}; the probabilities of one context
        sum to 1.
    """
    start_pad = "^" * deg
    freq = defaultdict(lambda: defaultdict(float))

    # Pass 1: raw counts of each (context, next character) pair.
    for element in data:
        padded = start_pad + element + "$"
        for end in range(deg, len(padded)):
            context = padded[end - deg:end]
            freq[context][padded[end]] += 1

    # Pass 2: turn the counts of every context into relative frequencies.
    for followers in freq.values():
        total = sum(followers.values())
        for char in followers:
            followers[char] = float(followers[char]) / total
    return freq

In [4]:
# Build one model per context length, longest first: gen() walks this list in
# order and uses the first model that knows the current context.
max_deg = 3
models = []
for deg in reversed(range(1, max_deg + 1)):
    print(deg)
    models.append((build_model(names, deg=deg), deg))

3
2
1


In [5]:
def gen(models, max_size=300, temp=1):
    """Sample one string, character by character, from back-off models.

    Parameters
    ----------
    models : sequence of (model, deg) pairs
        `model` maps a context string of length `deg` to a mapping
        {next_character: probability}. For every position the models are
        tried in the order given and the first one containing the current
        context is used. The sequence is iterated several times, so it must
        not be a one-shot iterator.
    max_size : int
        Maximum number of characters sampled after the start padding.
    temp : number
        When equal to 1 the model probabilities are used unchanged.
        Otherwise the probabilities are multiplied by `temp` and passed
        through the softmax() defined elsewhere in this notebook; in
        particular `temp=0` samples uniformly among the known successors.

    Returns
    -------
    str
        The generated string, including the leading "^" padding (one per
        unit of the largest `deg`) and the trailing "$" if it was drawn
        before `max_size` characters were produced.

    Raises
    ------
    AssertionError
        If no model contains the current context.
    ValueError
        If `models` is empty (raised by max()).
    """
    maxdeg = max(d for m, d in models)
    s = "^" * maxdeg
    for i in range(maxdeg, maxdeg + max_size):
        # Back off: first model (in the given order) that knows the context.
        for model, deg in models:
            prev = s[i - deg:i]
            if prev in model:
                successors = model[prev]
                break
        else:
            # Explicit raise instead of `assert` so the guard survives -O.
            raise AssertionError("no model contains a context for %r" % s)
        # Materialise keys/values as lists: on Python 3 dict views cannot be
        # indexed and np.array() would wrap the view in a 0-d object array.
        chars = list(successors.keys())
        probas = np.array(list(successors.values()), dtype=float)
        if temp != 1:
            probas = softmax((probas * temp)[None, :])[0]
        # One multinomial draw yields a one-hot vector; argmax is its index.
        char_idx = np.random.multinomial(1, probas).argmax()
        char = chars[char_idx]
        s += char
        if char == "$":
            break
    return s

def softmax(w):
    """Row-wise softmax of a 2-D array.

    The maximum of each row is subtracted before exponentiating; this does
    not change the result mathematically but keeps np.exp from overflowing
    on large inputs.

    Parameters
    ----------
    w : 2-D array-like
        One set of scores per row.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `w` whose rows each sum to 1.
    """
    row_max = np.amax(w, axis=1, keepdims=True)
    exps = np.exp(w - row_max)
    return exps / exps.sum(axis=1, keepdims=True)

In [6]:
# Draw 1000 samples, remove the "^"/"$" boundary markers and capitalize.
# temp=0. zeroes the probabilities before the softmax inside gen(), so every
# next character is drawn uniformly among the successors seen in training.
fake_names = [
    gen(models, temp=0.).replace('^', '').replace('$', '').capitalize()
    for _ in range(1000)
]

In [7]:
# Share of generated names that already exist in the training list.
# A set makes each membership test a hash lookup instead of a scan of the
# whole list of training names.
known_names = set(names)
nb_from_training_data = sum(1 for name in fake_names if name.lower() in known_names)
pct_from_training_data = 100. * nb_from_training_data / len(fake_names)
print('Perc. generated exactly like training data : {}%'.format(pct_from_training_data))
print('Perc. generated which are original : {}%'.format(100. - pct_from_training_data))

Perc. generated exactly like training data : 24.1%
Perc. generated which are original : 75.9%


In [8]:
# Keep only the generated names that do not occur in the training list
# (compared in lowercase, because the generated names were capitalized).
# The set gives a hash lookup per name instead of a scan of the whole list.
known_names = set(names)
fake_names = [name for name in fake_names if name.lower() not in known_names]

In [9]:
# Show the remaining generated names, one per line.
for fake_name in fake_names:
    print(fake_name)

Giovan
Dellemmetr
Iancey
Quillassie
Jamiliercy
Zipperdiet
Fardinero
Blakeene
Keilldo
Thols
Zollaume
Raynardecaid
Glynd
Goria
Nunzie
Dwartail
Whitti
Yardt
Xymenito
Butledgarolphe
Voltainninod
Titol
Lloysius
Ques
Meiercy
Quillmond
Powe
Nes
Xim
Zel
Irwinnislad
Werenzio
Ant
Evel
Wrigor
Ibrane
Vaugie
Yuric
Nie
Vas
Hiraud
Llonzo
Quillus
Lorey
Yehudsonels
Vijah
Alstotlet
Brichilarsonen
Jetheus
Ralfonsendi
Sandie
Neillynd
Zechibaut
Keened
Milias
Efry
Rhet
Obe
Quillus
Clelary
Gaylin
Nunzier
Zollinn
Valejan
Kylandalt
Forter
Kimmani
Frazio
Xymened
Xymenso
Xevela
Salien
Yigaleksandley
Tudolfy
Tyn
Jasonn
Vladeustu
Nil
Joth
Rits
Chroedee
Wojcien
Muhamlint
Neetf
Xavilliam
Hyatanwoodmanforr
Dysses
Oderse
Phiner
Weylinn
Ichmon
Gearc
Lincoleg
Gooses
Purchashley
Ulico
Odellswortinod
Hil
Quente
Gabraharons
Tros
Shayde
Sargess
Jimbal
Purchilasto
Sily
Octavey
Hobartis
Ques
Tabe
Endan
Uptonen
Jimmienninos
Umberchy
Fabi
Istearny
Ores
Hyatty
Bharlocky
Bjorg
Yehu
Thorval
Quiggis
Craigelijay
Aamili
Urbadias
Fyod