## Read file with words

In [7]:
with open('unique_english_words.txt', 'r') as f:
    words = [word.rstrip() for word in f.readlines()] 

words[:3]

['ethereal', 'woebegone', 'credulous']

In [8]:
print(f'No of words: {len(words)}')
print(f'Min len: {min([len(word) for word in words])}')
print(f'Max len: {max([len(word) for word in words])}')

No of words: 750
Min len: 3
Max len: 29


## Creating bigram using Pytorch

In [9]:
import torch

In [10]:
unique_symbols = set(list(''.join(words)))
unique_symbols

{'-',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [12]:
SPECIAL_TOKEN = '.'
unique_symbols.add(SPECIAL_TOKEN)

In [14]:
stoi = {s: i for i, s in enumerate(unique_symbols)}
itos = {i: s for s, i in stoi.items()}

In [18]:
n_symbols = len(unique_symbols)

In [23]:
N = torch.zeros((n_symbols, n_symbols), dtype = torch.int16)

for word in words:
    word = SPECIAL_TOKEN + word + SPECIAL_TOKEN # Add special token in the begginning and in the end of each word
    for ch1, ch2 in zip(word, word[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]

        N[idx1, idx2] += 1

N

tensor([[ 10,   0,  32,   5,  49,   4,  17,   0,   1,  35,   0,   1,   0,   2,
           1,  21,   0,   2,   0,   7,   3,   0,   0,   7,   0,   1,   9,   0],
        [  0,   0,   0,   0,  19,   0,  11,   0,   0,  11,   0,   0,   0,   0,
           0,   7,   0,   0,   0,   0,   0,   0,   0,   3,   0,   0,   0,   0],
        [ 45,  17,   0,  26,  63,  21,  31,  11,  14,  26,  38,  28,  12,  97,
          33,  22,   8,  36,  65,   2,  41,   6,  16,  13,  35,   0,  18,  26],
        [  7,   6,  48,   9,  49,  13,  67,   4,   0,  47,   3,   2,   1,   5,
           4,  40,   0,   7,   7,  18,   4,   0,   8,  20,   8,   0,   5,   3],
        [ 23,   6, 129, 105,  11,   6,  16,   4,   3,   6,  11,   4,   2,  43,
          10,   8,   3,  20,  15,   7,   8,  11,  82,  11,  33,   4,  38,   2],
        [  1,   0,  27,   8,  17,   7,  31,   0,   0,  17,   0,   2,   0,   1,
           0,  13,   0,   6,   0,   9,   1,   0,   6,  14,   1,   0,  12,   4],
        [ 28,   4,  39,  51,   4,  27,   1,   

In [30]:
# create new tensor P to store probabilities
P = N.float()
P = P/P.sum(dim = 1, keepdim = True)

In [39]:
g = torch.Generator().manual_seed(42123)

# let's generate some new unique English words!
for i in range(20):
    word = []
    idx = stoi[SPECIAL_TOKEN] # any word always starts with special token '.'
    while True:
        i_probs = P[idx]
        idx = torch.multinomial(i_probs, num_samples = 1, replacement = True, generator = g).item()
        if idx == stoi[SPECIAL_TOKEN]: # if we meet special token - it is the end of the word!
            break  
        word.append(itos[idx])

    print(''.join(word))

scanom
ctilluky
ba
gomanesanquxtil
ngleffabe
fipiodisowicala
squssebiorgleasousceflincismaman
zaureecheryooid
s
ganizzenthanste-nsicaverere
g
k
sphratiledegal
cioblunidereborns
loufla
voge
ceacons
qusiadopercemmengaty
usppolarddibee
st


In [41]:
pip list --format=freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
