## Read file with words

In [37]:
with open('unique_english_words.txt', 'r') as f:
    words = [word.rstrip() for word in f.readlines()] 

words[:3]

['ethereal', 'woebegone', 'credulous']

In [38]:
print(f'No of words: {len(words)}')
print(f'Min len: {min([len(word) for word in words])}')
print(f'Max len: {max([len(word) for word in words])}')

No of words: 750
Min len: 3
Max len: 29


## Creating bigram using Pytorch

In [39]:
import torch

In [40]:
unique_symbols = set(list(''.join(words)))
unique_symbols

{'-',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [41]:
SPECIAL_TOKEN = '.'
unique_symbols.add(SPECIAL_TOKEN)

In [42]:
stoi = {s: i for i, s in enumerate(unique_symbols)}
itos = {i: s for s, i in stoi.items()}

In [43]:
n_symbols = len(unique_symbols)

In [44]:
N = torch.zeros((n_symbols, n_symbols), dtype = torch.int16)

for word in words:
    word = SPECIAL_TOKEN + word + SPECIAL_TOKEN # Add special token in the begginning and in the end of each word
    for ch1, ch2 in zip(word, word[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]

        N[idx1, idx2] += 1

N

tensor([[  0,   0,   4,   0,   0,   2,   0,   0,   0,   0,   1,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   7,   0,   3,   0,   0,   0],
        [  0,  10,  29,   0,   1,  31,   0,   0,   4,   1,  59,   1,  39,  22,
           0,   2,   1,  31,   2,   8,   0,  55,  22,   1,  75,   0,   3,   0],
        [  2,  75,   1,  24,  30,   0,   4,   0,  63,   6,   4,  85,   1,   4,
          28,   7,   0,  51,  26,  36,  19,  39,  13,  27,  13,   4,   6,   0],
        [  0,   0,  19,   4,   1,  25,   0,   0,  11,   0,  20,   1,   0,   2,
           1,   0,   0,  12,   4,   0,   0,   4,  16,   0,  24,   0,   2,   0],
        [  0,   0,  34,  14,   6,  24,   0,   1,   1,   0,  22,   2,   0,   3,
           2,   0,   0,   0,   2,   0,  20,  23,  17,   0,  25,   0,   3,   0],
        [  0,  11,   5,  18,  23,  26,   3,   2,  45,   7,   5,  67,   1,   2,
          11,   6,   0,  43,  15,  12,  34,  18,  80,  12,   5,   6,   6,   4],
        [  0,   0,  11,   0,   0,   7,   0,   

In [51]:
# create new tensor P to store probabilities
P = (N + 1).float() ## adding fake 1 count to avoid log(0) sutiation during loss calculation!
P = P/P.sum(dim = 1, keepdim = True)

In [52]:
g = torch.Generator().manual_seed(42123)

# let's generate some new unique English words!
for i in range(20):
    word = []
    idx = stoi[SPECIAL_TOKEN] # any word always starts with special token '.'
    while True:
        i_probs = P[idx]
        idx = torch.multinomial(i_probs, num_samples = 1, replacement = True, generator = g).item()
        if idx == stoi[SPECIAL_TOKEN]: # if we meet special token - it is the end of the word!
            break  
        word.append(itos[idx])

    print(''.join(word))

ol
uzentis
ngtona
a
prvemave-m
dwhicoumddopalohes
jwg
nss
fqalivon
pur
o
s
cemafldesas
cror
talogammsticr
wwichuongillide
cty
mkaro
h
holiterdeene


## How good is this bigram model?

In [53]:
# take first two words and check probabilities for pair of characters

for word in words[:2]:
    word = SPECIAL_TOKEN + word + SPECIAL_TOKEN
    for ch1, ch2 in zip(word, word[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]

        print(f'{ch1}{ch2} : {P[idx1, idx2]:.2f}')

.e : 0.08
et : 0.05
th : 0.09
he : 0.16
er : 0.16
re : 0.12
ea : 0.03
al : 0.11
l. : 0.10
.w : 0.04
wo : 0.08
oe : 0.01
eb : 0.01
be : 0.12
eg : 0.01
go : 0.07
on : 0.14
ne : 0.08
e. : 0.20


In [54]:
print(f'If every out of 28 (27+special token) chars were equaly likely \
then the probability for each pair of chars would be: {1/28.:.2f}.')

If every out of 28 (27+special token) chars were equaly likely then the probability for each pair of chars would be: 0.04.


But we can see above that some of the probabilities are higher than 4% (eg "he : 0.18" or "e. : 0.21").
<br>It means that our bigram model actually learnt something.
<br><br>But how to quantify this in a single number which would expess the quality of our model?
<br>To do this we can use likelihood which can be calculated as `a product of these probabilities`.
<br>`The better the model` we have `the greater the product` value we should get because in a good model all these probabilities should be near 1.
<br>But because all prob values are between 0 and 1 `their product is gonna be a very small number` which is not convinient.
<br>So for convinience what is usually used is **LOG-LIKELIHOOD**.
<br>But it is unnecessary to calculate this tiny product first ang then log(product). 
<br>Logarithm has such a nice property as: `log(a*b*c) = log(a) + log(b) + log(c)`
<br>But again because all prob values are between 0 and 1 all log probs are gonna be negative. In terms of loss function we are actually interested in using this metric is supposing *maximazing* it (because the ideal probs are = 1, log(1) = 0 so the ideal loss value is 0). 
<br>So to make this look more as a loss function we just invert this value. This is how we get **NEGATIVE LOG-LIKELIHOOD**.
<br>`negative log-likelihood = - log-likelihood`
<br> and one more modification: for convinience normilized nll is used which is `NLL = NLL/n` where n is a number of samples

In [55]:
## training loss

log_likelihood = 0.0
n = 0

for word in words:
    word = SPECIAL_TOKEN + word + SPECIAL_TOKEN
    for ch1, ch2 in zip(word, word[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        prob = P[idx1, idx2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1

nll = -log_likelihood
print(f'negative log-likelihood = {nll}')
print(f'normilized negative log-likelihood = {nll/n}')

negative log-likelihood = 18443.9375
normilized negative log-likelihood = 2.576688766479492


In [50]:
## THIS CELL WORKS ONLY BEFORE ADDING FAKE COUNT (FOR P = N.float())
# we can evaluate how probable any word is given parameters N of our bigram model 
## training loss

log_likelihood = 0.0
n = 0

for word in ["puzqzle"]: # added 'q' on purpose to get rare 'zq' pair of symbols
    word = SPECIAL_TOKEN + word + SPECIAL_TOKEN
    for ch1, ch2 in zip(word, word[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        prob = P[idx1, idx2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        print(f'{ch1}{ch2} prob: {prob:.2f}, log: {logprob:.2f}')

nll = -log_likelihood
print(f'negative log-likelihood = {nll}')
print(f'normilized negative log-likelihood = {nll/n}')

# oops! our loss is inf.
# this happened because some of char pairs have 0 occurance in our training set.
# to avoid this problem of getting log(0) we can just add fake counts to our table of frequency.

.p prob: 0.05, log: -2.98
pu prob: 0.03, log: -3.44
uz prob: 0.01, log: -5.17
zq prob: 0.00, log: -inf
qz prob: 0.00, log: -inf
zl prob: 0.02, log: -3.91
le prob: 0.16, log: -1.84
e. prob: 0.21, log: -1.57
negative log-likelihood = inf
normilized negative log-likelihood = inf


In [56]:
## AFTER ADDING FAKE COUNT
log_likelihood = 0.0
n = 0

for word in ["puzqzle"]: # added 'q' on purpose to get rare 'zq' pair of symbols
    word = SPECIAL_TOKEN + word + SPECIAL_TOKEN
    for ch1, ch2 in zip(word, word[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        prob = P[idx1, idx2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        print(f'{ch1}{ch2} prob: {prob:.2f}, log: {logprob:.2f}')

nll = -log_likelihood
print(f'negative log-likelihood = {nll}')
print(f'normilized negative log-likelihood = {nll/n}')

.p prob: 0.05, log: -2.99
pu prob: 0.03, log: -3.43
uz prob: 0.01, log: -4.84
zq prob: 0.01, log: -4.36
qz prob: 0.02, log: -3.97
zl prob: 0.03, log: -3.66
le prob: 0.15, log: -1.90
e. prob: 0.20, log: -1.61
negative log-likelihood = 26.760574340820312
normilized negative log-likelihood = 3.345071792602539
