# The spelled-out intro to language modeling: building makemore

Following: https://www.youtube.com/watch?v=PaCmpygFfXo&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=2

In [13]:
from pathlib import Path
from collections import defaultdict

In [5]:
data = Path('../data')

# Exploring bigrams (0:00 - 0:15:00)

Tasks done in the video:
- Load the dataset
- Explore
    - min/max length of names
    - number of names
- count all the bigrams into a dictionary
    - dedicated \<S\> and \<E\> tokens

In [9]:
names = (data/'names.txt').open().read().splitlines()
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [10]:
len(names)

32033

In [12]:
min(len(name) for name in names), max(len(name) for name in names)

(2, 15)

In [20]:
bigrams = defaultdict(int)
start = '<S>'
end = '<E>'

for name in names:
    name = [start] + list(name) + [end]
    for ch1,ch2 in zip(name, name[1:]):
        bigrams[(ch1,ch2)] += 1

In [27]:
sorted((bigram_tuples for bigram_tuples in bigrams.items()), key=lambda b: b[1], reverse=True)[:10]

[(('n', '<E>'), 6763),
 (('a', '<E>'), 6640),
 (('a', 'n'), 5438),
 (('<S>', 'a'), 4410),
 (('e', '<E>'), 3983),
 (('a', 'r'), 3264),
 (('e', 'l'), 3248),
 (('r', 'i'), 3033),
 (('n', 'a'), 2977),
 (('<S>', 'k'), 2963)]

# Bigrams into torch.Tensor ( - 00:36:00, skipping efficiency until 00:50:00)

- Goal: n*n matrix that holds the bigram count in each cell
- No more dedicated start/stopping tokens, instead use . for both
- sample from the model
    - start with the starting dot
    - choose random next character according to probabilities in that row (torch.multinomial)
    - repeat until ending-dot is reached
    - Use torch.Generator for comparability

### Bigrams to Tensor

Extracting all used characters:

In [38]:
characters = set('.')
characters = characters.union(set(''.join(names)))
characters

{'.',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [39]:
import torch

In [50]:
stoi = {char: i for i,char in enumerate(characters)}
itos = {i: char for char,i in stoi.items()}
stoi

{'q': 0,
 '.': 1,
 'd': 2,
 'k': 3,
 'o': 4,
 'v': 5,
 't': 6,
 'p': 7,
 'y': 8,
 'h': 9,
 'w': 10,
 'a': 11,
 'i': 12,
 'c': 13,
 'f': 14,
 'j': 15,
 'g': 16,
 'm': 17,
 'u': 18,
 's': 19,
 'l': 20,
 'e': 21,
 'n': 22,
 'b': 23,
 'z': 24,
 'x': 25,
 'r': 26}

In [41]:
bigrams = torch.zeros((len(characters), len(characters)))
start = '.'
end = '.'

for name in names:
    name = [start] + list(name) + [end]
    for ch1,ch2 in zip(name, name[1:]):
        row_index = stoi[ch1]
        col_index = stoi[ch2]
        bigrams[row_index, col_index] += 1

### Sampling

Counts for characters following the start character:

In [45]:
bigrams[stoi[start],:]

tensor([  92.,    0., 1690., 2963.,  394.,  376., 1308.,  515.,  535.,  874.,
         307., 4410.,  591., 1542.,  417., 2422.,  669., 2538.,   78., 2055.,
        1572., 1531., 1146., 1306.,  929.,  134., 1639.])

Drawing a sample with counts as weights:

In [54]:
next_idx = torch.multinomial(bigrams[stoi[start],:], 1, replacement=True).item()
next_char = itos[next_idx]
next_char

'b'

Repeat until the stopping token is drawn:

In [67]:
g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    name = start
    while True:
        next_idx = torch.multinomial(bigrams[stoi[name[-1]],:], 1, replacement=True, generator=g).item()
        next_char = itos[next_idx]
        name += next_char

        if next_char == end:
            break


    print(name)

.mee.
.ker.
.milizinselan.
.jusan.
.an.
.a.
.dusarieronah.
.lttumith.
.hoantth.
.cielanahatyaremm.
.a.
.ke.
.kyacen.
.amasoneinittilae.
.kh.
.a.
.kanysaa.
.drsen.
.mawienin.
.arydrabela.


Sanity-Check: Drawing with uniform probability

In [68]:
g = torch.Generator().manual_seed(2147483647)
weights = torch.ones(len(characters))

for i in range(20):
    name = start
    while True:
        next_idx = torch.multinomial(weights, 1, replacement=True, generator=g).item()
        next_char = itos[next_idx]
        name += next_char

        if next_char == end:
            break


    print(name)

.mns.
..
.xzdmumcxmberfhn.
.feajlownoyoqmfjxclroema.
.lovlagtwophpnivwtausmibfhupyyrbieqhokzqkvacrekwfwjvjnwnuppmuwuqktdy.
.dasigjakqzlzbdgcafeeszoprtvzybmgiogofbrnaxzsmwtjummgtzslbatjetynbtycsexxjlnknnmgtkdxowkjeya.
.waymffgmccii.
.juomxqpsrgbqriphwhf.
.fprreeisxnmuete.
.b.
.nsodfbnicuxgnpustpgsypfcb.
.tmcswondyfpnjhhpzyaboiluoabf.
.prxafmnqitslomhpnfkojhovlseapogqktgwszvmwkscwbpervsqbjtitwztixipdvpgwogjnolbqoirxsmludyzykofvheecqzlxsitdc.
.dj.
..
.fxwgjweewtiks.
.ekmkplwcirmld.
.hsnzulpegagghpzeovworsujmvhpjnezcnlqwiztjiajrkhqlayohnaoht.
.zkvnigalabbnezixclbxipgr.
.ebonztndjjtioogycwvjcwpwkmvmblqqbizkikwdcqmeexhwlfrcxksykijucwxuvumsijcouidebmfichlducmyldobdxniscbfdjubkoywjlvrbfszenttnhfucvoviedyotpl.


# Loss function ( - 01:03:00)

- negative Likelihood is used to measure loss (product of probabilities)
- use logs for numerical purposes (turns into the sum)

In [70]:
P = bigrams / bigrams.sum(dim=1, keepdim=True)

In [87]:
for name in names[:2]:
    name = [start] + list(name) + [end]
    log_likelihood = 0
    n = 0
    for ch1,ch2 in zip(name, name[1:]):
        row_index = stoi[ch1]
        col_index = stoi[ch2]
        prob = P[row_index, col_index]
        log = torch.log(prob)
        log_likelihood += log
        n += 1
        
        print(f'({ch1}, {ch2}): {log}')
    print(f'Neg. Log-Likelihood: {-log_likelihood}')
    print(f'Mean Neg. Log-Likelihood: {-log_likelihood / n}')
    print('')

(., e): -3.0408453941345215
(e, m): -3.2793259620666504
(m, m): -3.6772043704986572
(m, a): -0.9417552351951599
(a, .): -1.6298604011535645
Neg. Log-Likelihood: 12.568990707397461
Mean Neg. Log-Likelihood: 2.513798236846924

(., o): -4.3981709480285645
(o, l): -2.550807476043701
(l, i): -1.7277942895889282
(i, v): -4.186665058135986
(v, i): -1.0382850170135498
(i, a): -1.9795759916305542
(a, .): -1.6298604011535645
Neg. Log-Likelihood: 17.511159896850586
Mean Neg. Log-Likelihood: 2.501594305038452



In [86]:
for name in ['andrej']:
    name = [start] + list(name) + [end]
    log_likelihood = 0
    n = 0
    for ch1,ch2 in zip(name, name[1:]):
        row_index = stoi[ch1]
        col_index = stoi[ch2]
        prob = P[row_index, col_index]
        log = torch.log(prob)
        log_likelihood += log
        n += 1
        
        print(f'({ch1}, {ch2}): {log}')
    print(f'Neg. Log-Likelihood: {-log_likelihood}')
    print(f'Mean Neg. Log-Likelihood: {-log_likelihood / n}')
    print('')

(., a): -1.9828919172286987
(a, n): -1.8295611143112183
(n, d): -3.259352207183838
(d, r): -2.562042474746704
(r, e): -2.012739896774292
(e, j): -5.917083740234375
(j, .): -3.7097861766815186
Neg. Log-Likelihood: 21.273456573486328
Mean Neg. Log-Likelihood: 3.03906512260437



# Neural Network Approach ( - )