In [1]:
# Load the dataset: one name per line.
# The context manager closes the file handle as soon as the contents are read.
with open('names.txt', 'r') as names_file:
    names = names_file.read().splitlines()

In [2]:
# Preview the first ten entries to check that the file was split into one name per line
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Basic facts about the dataset: how many names there are and how short/long they get
name_lengths = [len(name) for name in names]
size = len(names)
min_size_name = min(name_lengths)
max_size_name = max(name_lengths)

print(f"Number of names: {size}, min_size_name: {min_size_name}, max_size_name: {max_size_name}")

Number of names: 32033, min_size_name: 2, max_size_name: 15


 ### Let's look at the bigrams (pairs of consecutive characters)

In [4]:
# Every name is wrapped with a special start token (<S>) and a special end token (<E>),
# so the model can also learn which characters begin and end a name.
dic_big = {}  # maps each bigram (first, second) to the number of times it occurs in all the names
for name in names:
    tokens = ['<S>', *name, '<E>']
    # Walk over adjacent positions: (tokens[0], tokens[1]), (tokens[1], tokens[2]), ...
    for pos in range(len(tokens) - 1):
        pair = (tokens[pos], tokens[pos + 1])
        if pair in dic_big:
            dic_big[pair] += 1
        else:
            dic_big[pair] = 1

In [5]:
# Rank all bigrams from most to least frequent.
# sorted() is stable, so bigrams with equal counts keep their insertion order.
sorted(dic_big.items(), key=lambda item: item[1], reverse=True)

[(('n', '<E>'), 6763),
 (('a', '<E>'), 6640),
 (('a', 'n'), 5438),
 (('<S>', 'a'), 4410),
 (('e', '<E>'), 3983),
 (('a', 'r'), 3264),
 (('e', 'l'), 3248),
 (('r', 'i'), 3033),
 (('n', 'a'), 2977),
 (('<S>', 'k'), 2963),
 (('l', 'e'), 2921),
 (('e', 'n'), 2675),
 (('l', 'a'), 2623),
 (('m', 'a'), 2590),
 (('<S>', 'm'), 2538),
 (('a', 'l'), 2528),
 (('i', '<E>'), 2489),
 (('l', 'i'), 2480),
 (('i', 'a'), 2445),
 (('<S>', 'j'), 2422),
 (('o', 'n'), 2411),
 (('h', '<E>'), 2409),
 (('r', 'a'), 2356),
 (('a', 'h'), 2332),
 (('h', 'a'), 2244),
 (('y', 'a'), 2143),
 (('i', 'n'), 2126),
 (('<S>', 's'), 2055),
 (('a', 'y'), 2050),
 (('y', '<E>'), 2007),
 (('e', 'r'), 1958),
 (('n', 'n'), 1906),
 (('y', 'n'), 1826),
 (('k', 'a'), 1731),
 (('n', 'i'), 1725),
 (('r', 'e'), 1697),
 (('<S>', 'd'), 1690),
 (('i', 'e'), 1653),
 (('a', 'i'), 1650),
 (('<S>', 'r'), 1639),
 (('a', 'm'), 1634),
 (('l', 'y'), 1588),
 (('<S>', 'l'), 1572),
 (('<S>', 'c'), 1542),
 (('<S>', 'e'), 1531),
 (('j', 'a'), 1473),
 (

In [6]:
import torch

In [7]:
# We want a table that counts how often each character is followed by each other character.
# The vocabulary is every distinct character found in the dataset plus one extra
# special token '.' that denotes both the start and the end of a word.
chars = sorted(list(set(''.join(names))))

# Size the count matrix from the vocabulary instead of hard-coding 27, so that a dataset
# containing additional characters cannot index outside the table.
vocab_size = len(chars) + 1  # +1 for the '.' token
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

# Mapping from character to integer.
# '.' is pinned at position 0, so every real character is offset by one.
string_to_int = {char: integer + 1 for integer, char in enumerate(chars)}
string_to_int['.'] = 0
print(string_to_int)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [8]:
# Inverse lookup table: integer index -> character
integer_to_char = dict(zip(string_to_int.values(), string_to_int.keys()))
print(integer_to_char)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [9]:
# Count the occurrences of every bigram across all names.
# The table is cleared first so that re-running this cell does not add the
# counts on top of the previous run (which would silently double them).
N.zero_()
for name in names:
    # '.' marks both the start and the end of the name
    chs = ['.'] + list(name) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        index1 = string_to_int[ch1]  # row: first character of the bigram
        index2 = string_to_int[ch2]  # column: second character of the bigram
        N[index1, index2] += 1

In [10]:
# Show the count table: entry [i, j] is how many times character i is followed by character j
N

tensor([[   0, 4410, 1306, 1542, 1690, 1531,  417,  669,  874,  591, 2422, 2963,
         1572, 2538, 1146,  394,  515,   92, 1639, 2055, 1308,   78,  376,  307,
          134,  535,  929],
        [6640,  556,  541,  470, 1042,  692,  134,  168, 2332, 1650,  175,  568,
         2528, 1634, 5438,   63,   82,   60, 3264, 1118,  687,  381,  834,  161,
          182, 2050,  435],
        [ 114,  321,   38,    1,   65,  655,    0,    0,   41,  217,    1,    0,
          103,    0,    4,  105,    0,    0,  842,    8,    2,   45,    0,    0,
            0,   83,    0],
        [  97,  815,    0,   42,    1,  551,    0,    2,  664,  271,    3,  316,
          116,    0,    0,  380,    1,   11,   76,    5,   35,   35,    0,    0,
            3,  104,    4],
        [ 516, 1303,    1,    3,  149, 1283,    5,   25,  118,  674,    9,    3,
           60,   30,   31,  378,    0,    1,  424,   29,    4,   92,   17,   23,
            0,  317,    1],
        [3983,  679,  121,  153,  384, 1271,   82,

In [None]:
# Let's visualize the ocurrence of the bigrams
# For instance the bigram n,<E> it's the most common bigram
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')

for i in range(27):
    for j in range(27):
        chstr = integer_to_char[i] + integer_to_char[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
        
plt.axis('off');

## Sampling from the count table (N)

In [None]:
# Row 0 holds the counts of bigrams whose first element is '.' (index 0),
# i.e. how often each character appears right after the start token
N[0, :]

In [None]:
# Turn the counts in row 0 into probabilities by dividing by the row total:
# the result is a distribution over which character follows the '.' token
start_counts = N[0].float()
probability = start_counts / start_counts.sum()
probability

In [None]:
# Seeded generator so that the random draw below is reproducible
generator = torch.Generator()
generator.manual_seed(2147483647)
# Draw a single index at random, weighted by `probability`
# (this is a sample from the distribution, not necessarily its most likely entry)
sampled = torch.multinomial(probability, num_samples=1, replacement=True, generator=generator)
idx = sampled.item()
# Character that corresponds to the sampled index
integer_to_char[idx]

In [None]:
# Build a small, reproducible 3-way probability distribution to experiment with
generator = torch.Generator()
generator.manual_seed(2147483647)
raw_weights = torch.rand(3, generator=generator)
# Dividing by the total makes the three weights sum to 1
mul_probability = raw_weights / raw_weights.sum()
mul_probability

In [None]:
# Draw 10 samples (with replacement) from the 3-way distribution above.
# Each drawn value is an index 0, 1 or 2, picked with the probability stored at that index.
# If about 60% of the draws are zeros and about 30% are ones, the remaining
# roughly 10% are twos, because the three probabilities sum to 1.
samples = torch.multinomial(
    mul_probability,
    num_samples=10,
    replacement=True,
    generator=generator,
)
samples

### Generate names based on probability

In [None]:
# Generate one name character by character, printing each character as it is drawn
generator = torch.Generator()
generator.manual_seed(2147483647)

idx = 0  # start from row 0, the '.' token
finished = False
while not finished:
    # Normalise the counts of the current character's row into probabilities
    row_counts = N[idx].float()
    probability = row_counts / row_counts.sum()
    # The sampled index is the next character, and also the row to read on the next step
    idx = torch.multinomial(probability, num_samples=1, replacement=True, generator=generator).item()
    print(integer_to_char[idx])
    # Drawing index 0 ('.') means the end of the name was reached
    finished = idx == 0

In [None]:
# Normalise every row of N at once instead of one row at a time.
# Summing along dimension 1 with keepdim=True yields one total per row (all bigrams
# that start with the same character), kept as a column so it broadcasts in the division below.
bigrams_combination = N.float()
row_sum = bigrams_combination.sum(1, keepdim=True)

print(row_sum[0,:])
print(sum(bigrams_combination[0,:]))

# Sanity check: the vectorised total of row 0 matches the element-by-element sum
assert (row_sum[0,:] == sum(bigrams_combination[0,:]))

# Dividing each element of a row (the bigrams Xa, Xb, Xc, ... for a fixed first character X)
# by the sum of that entire row converts the counts into probabilities.

probability = bigrams_combination / row_sum

print('----')
# Print the shape (as the label says) rather than dumping the whole tensor
print(f'probability.shape: {probability.shape}')

In [None]:
# Row-normalised bigram table: each row is divided by its own total
counts = N.float()
row_totals = counts.sum(dim=1, keepdim=True)  # one total per row, kept as a column
probability = counts / row_totals
probability.shape

In [None]:
# Now let's generate 20 names from the row-normalised bigram table
generator = torch.Generator()
generator.manual_seed(2147483647)

for _ in range(20):
    name_chars = []
    idx = 0  # every name starts from row 0, the '.' token
    done = False
    while not done:
        # Distribution over the next character given the current one
        prob = probability[idx]
        # The sampled index is the next character, and also the row to read on the next step
        idx = torch.multinomial(prob, num_samples=1, replacement=True, generator=generator).item()
        name_chars.append(integer_to_char[idx])
        # Index 0 ('.') ends the name; note the '.' itself has already been appended
        done = idx == 0

    print(''.join(name_chars))