In [60]:
import torch

In [4]:
# Load the dataset: one name per line, collected into the `words` list.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [5]:
# show the first 10 words (names) in the list -- these are whole words, not characters
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [7]:
# number of words (names) loaded from the file
print(len(words))

32033


In [18]:
# Longest and shortest word in the dataset, measured in characters.
# The lengths are computed once and reused for both statistics.
word_lengths = [len(name) for name in words]
print(f'max length of a word in the words array = {max(word_lengths)}')
print(f'min length of a word in the words array = {min(word_lengths)}')

max length of a word in the words array = 15
min length of a word in the words array = 2


In [42]:
# Count how often each adjacent character pair (bigram) occurs across all words.
# '<S>' and '<E>' are sentinel tokens marking the start and the end of a word,
# so the table also records which characters begin and finish words.
b = {}
for w in words:
    chs = ['<S>', *w, '<E>']
    # zip pairs every token with its successor: (t0, t1), (t1, t2), ...
    for bigram in zip(chs, chs[1:]):
        b[bigram] = b.get(bigram, 0) + 1

In [59]:
# The 20 most frequent bigrams, highest count first.
# sorted() is stable even with reverse=True, so bigrams with equal counts keep
# their insertion order from `b`.
sorted(b.items(), key=lambda item: item[1], reverse=True)[:20]

[(('n', '<E>'), 6763),
 (('a', '<E>'), 6640),
 (('a', 'n'), 5438),
 (('<S>', 'a'), 4410),
 (('e', '<E>'), 3983),
 (('a', 'r'), 3264),
 (('e', 'l'), 3248),
 (('r', 'i'), 3033),
 (('n', 'a'), 2977),
 (('<S>', 'k'), 2963),
 (('l', 'e'), 2921),
 (('e', 'n'), 2675),
 (('l', 'a'), 2623),
 (('m', 'a'), 2590),
 (('<S>', 'm'), 2538),
 (('a', 'l'), 2528),
 (('i', '<E>'), 2489),
 (('l', 'i'), 2480),
 (('i', 'a'), 2445),
 (('<S>', 'j'), 2422)]

In [91]:
# Build the vocabulary and the two lookup tables:
#   stoi: character -> integer index (encoder)
#   itos: integer index -> character (decoder)
chars = sorted(set(''.join(words)))                         # every distinct character, in sorted order
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}   # characters take indices 1..len(chars)
stoi['.'] = 0                                               # index 0 is reserved for the '.' marker
itos = {idx: ch for ch, idx in stoi.items()}

In [81]:
# Bigram count matrix: N[i, j] is the number of times the character with index j
# follows the character with index i (first char along the rows, second along
# the columns).
# The side length comes from the encoder instead of a hard-coded 27, so the
# matrix always has one row/column per entry in `stoi`.
vocab_size = len(stoi)
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

for w in words:
    chs = ['.'] + list(w) + ['.']   # '.' marks both the start and the end of a word
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [83]:
# peek at rows 0-2 of the count matrix: counts of what follows itos[0] ('.'), itos[1] and itos[2]
N[:3]

tensor([[   0, 4410, 1306, 1542, 1690, 1531,  417,  669,  874,  591, 2422, 2963,
         1572, 2538, 1146,  394,  515,   92, 1639, 2055, 1308,   78,  376,  307,
          134,  535,  929],
        [6640,  556,  541,  470, 1042,  692,  134,  168, 2332, 1650,  175,  568,
         2528, 1634, 5438,   63,   82,   60, 3264, 1118,  687,  381,  834,  161,
          182, 2050,  435],
        [ 114,  321,   38,    1,   65,  655,    0,    0,   41,  217,    1,    0,
          103,    0,    4,  105,    0,    0,  842,    8,    2,   45,    0,    0,
            0,   83,    0]], dtype=torch.int32)

In [87]:
# Probability of each character being the first one in a word: take row 0 of N
# (counts of characters that follow the start marker '.') and normalise it so
# the entries sum to 1.
start_counts = N[0].float()
p = start_counts / start_counts.sum()
p

tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273,
        0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029,
        0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])

In [94]:
# Sample one character index from p. Assuming p is still the distribution built
# from N[0] (the characters that follow the start marker '.'), this draws the
# FIRST character of a word -- not the character that comes after 'a'.
g = torch.Generator().manual_seed(2147483647)   # fixed seed so the draw is reproducible
ix = torch.multinomial(p,num_samples=1,replacement=True,generator=g).item()
itos[ix]

'j'

In [None]:
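# P                       : shape (27, 27)
# P.sum(1, keepdims=True) : shape (27, 1)
# According to the broadcasting rules, the (27, 1) column of row sums is stretched
# across all 27 columns, so P / P.sum(1, keepdims=True) divides every row by its
# own sum and each row of P becomes a probability distribution.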

In [103]:
# Add-one smoothing: adding 1 to every count removes zero probabilities, so no
# bigram is ever impossible. Each row is then divided by its own sum; the
# (rows, 1) sum broadcasts across the columns.
smoothed_counts = (N + 1).float()
P = smoothed_counts / smoothed_counts.sum(dim=1, keepdim=True)
print(P[0])

tensor([3.1192e-05, 1.3759e-01, 4.0767e-02, 4.8129e-02, 5.2745e-02, 4.7785e-02,
        1.3038e-02, 2.0898e-02, 2.7293e-02, 1.8465e-02, 7.5577e-02, 9.2452e-02,
        4.9064e-02, 7.9195e-02, 3.5777e-02, 1.2321e-02, 1.6095e-02, 2.9008e-03,
        5.1154e-02, 6.4130e-02, 4.0830e-02, 2.4641e-03, 1.1759e-02, 9.6070e-03,
        4.2109e-03, 1.6719e-02, 2.9008e-02])


In [105]:
# Generate a few words purely from the bigram probabilities in P.
# Each word starts from the '.' marker (index 0); the next character is sampled
# from the row of P belonging to the current character, until '.' is drawn again.
# NOTE: this loop reassigns the notebook-level name `p`.
g = torch.Generator().manual_seed(2147483647)  # fixed seed so the generated words are reproducible

for i in range(5):                             # generate 5 words
    out = []
    ix = 0                                     # start with char '.'
    while True:
        p = P[ix]                              # pluck one row of P: next-char probabilities for the current char
        ix = torch.multinomial(p,num_samples=1,replacement=True,generator=g).item() # sample the next char index
        out.append(itos[ix])                   # append the sampled char to the "out" list

        if ix ==0:                             # index 0 is '.', which ends the word
            break
    print(''.join(out))

junide.
janasah.
p.
cony.
a.
