# A Neural Probabilistic Language Model

In [2]:
# import key packages
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
# retina display
%config InlineBackend.figure_format = 'retina'

In [4]:
# read the dataset: one name per line
# a context manager closes the file handle deterministically, and the explicit
# encoding keeps the result independent of the platform's default encoding
with open('./data/names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
print('Number of words:', len(words))

Number of words: 32033


In [5]:
# peek at the first 8 names to sanity-check what was loaded
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [7]:
# vocabulary: every distinct character occurring in the names, in sorted order
chars = sorted(set(''.join(words)))
print('Number of characters:', len(chars))
# character -> integer id; ids start at 1 so that 0 stays free for padding
char2int = dict(zip(chars, range(1, len(chars) + 1)))
# '.' is the padding character and gets id 0
char2int['.'] = 0
# integer id -> character (the inverse of char2int)
int2char = dict(zip(char2int.values(), char2int.keys()))

Number of characters: 26


In [10]:
# show the character -> integer mapping ('.' was assigned index 0 above)
print('char2int:', char2int)

char2int: {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [12]:
# show the inverse (integer -> character) mapping used to decode indices back to text
print('int2char:', int2char)

int2char: {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [13]:
# build up the dataset: every run of block_size characters predicts the character after it
block_size = 3 # the length of sequences of input data
X, Y = [], []

for word in words[:5]:
    print(word)
    # number of complete windows that still have a following character;
    # zero or negative for short names, which then contribute no examples
    n_windows = len(word) - block_size
    for start in range(n_windows):
        window = word[start:start + block_size]
        target = word[start + block_size]
        X.append([char2int[ch] for ch in window])
        Y.append(char2int[target])
        print('  ', window, '-->', target)

emma
   emm --> a
olivia
   oli --> v
   liv --> i
   ivi --> a
ava
isabella
   isa --> b
   sab --> e
   abe --> l
   bel --> l
   ell --> a
sophia
   sop --> h
   oph --> i
   phi --> a


In [16]:
# the rolling-window version above never produces '.' as a target;
# here each name is left-padded with index 0 and terminated with '.'
block_size = 3 # the length of sequences of input data
X, Y = [], []

for word in words[:5]:
    print(word)
    word_with_stop = word + '.'
    # indices of the padded name: block_size padding ids, then the characters and the final '.'
    padded = [0] * block_size + [char2int[ch] for ch in word_with_stop]
    for pos, ch in enumerate(word_with_stop):
        # the block_size indices immediately before the target form the context
        context = padded[pos:pos + block_size]
        X.append(context)
        Y.append(padded[pos + block_size])
        # print the context and the next character
        print(''.join(int2char[i] for i in context), '-->', ch)

emma
... --> e
..e --> m
.em --> m
emm --> a
mma --> .
olivia
... --> o
..o --> l
.ol --> i
oli --> v
liv --> i
ivi --> a
via --> .
ava
... --> a
..a --> v
.av --> a
ava --> .
isabella
... --> i
..i --> s
.is --> a
isa --> b
sab --> e
abe --> l
bel --> l
ell --> a
lla --> .
sophia
... --> s
..s --> o
.so --> p
sop --> h
oph --> i
phi --> a
hia --> .


In [18]:
# convert to tensors
# as_tensor builds an int64 tensor from the Python lists on the first run and
# returns the existing tensor unchanged when this cell is re-run, so the cell
# is idempotent (torch.tensor(existing_tensor) would warn and copy instead)
X = torch.as_tensor(X, dtype=torch.long)
Y = torch.as_tensor(Y, dtype=torch.long)

In [20]:
# X has one row per (context, target) example: 32 examples from the first 5 names,
# each row holding block_size = 3 character indices; Y holds the 32 target indices
print(X.shape, Y.shape, X.dtype, Y.dtype)

torch.Size([32, 3]) torch.Size([32]) torch.int64 torch.int64


In [22]:
# first 5 contexts and their targets (the examples generated from the first name)
print(X[:5], Y[:5])

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1]]) tensor([ 5, 13, 13,  1,  0])


In [24]:
# initialize the lookup table (embedding matrix):
# one row per entry of char2int, 2 embedding dimensions per character
# requires_grad=True so that autograd tracks gradients for the table
C_lookup = torch.randn(len(char2int), 2, requires_grad=True)
print(C_lookup.shape)

torch.Size([27, 2])


In [25]:
# display the randomly initialized embedding table (row i is the vector of character index i)
C_lookup

tensor([[ 0.4977,  0.8756],
        [-0.4006,  0.6435],
        [ 1.7300, -0.0446],
        [ 0.0446, -2.9870],
        [ 1.9398, -0.2666],
        [ 0.0790, -0.2298],
        [ 2.2499, -2.9052],
        [ 1.3151,  0.2223],
        [-0.1359,  1.1480],
        [ 1.0357, -0.6063],
        [ 0.1686, -1.5691],
        [ 0.4391, -1.6887],
        [ 0.2008, -1.5794],
        [ 0.7514, -0.3242],
        [-0.2992,  0.0307],
        [-0.3125, -0.0838],
        [ 0.5809,  1.7678],
        [ 1.1271,  0.8043],
        [ 0.3943, -1.1657],
        [-0.4073,  2.0113],
        [-0.6337, -0.0362],
        [ 0.7606,  0.8676],
        [-0.5598,  1.4374],
        [-0.9409,  0.4591],
        [ 0.0696,  0.4772],
        [-0.2564,  0.0926],
        [-0.1636, -1.1153]], requires_grad=True)

Now it is time to reflect on what we have done so far. Since we are building a sequence model, we first need to construct our sequences:

- choosing input block size as 3 (rolling window)
- choosing output size as 1 (next character)

After that, we constructed our look-up table, which is an embedding layer. The embedding layer is a matrix of size (vocab_size, embedding_size), where embedding_size is a hyperparameter. In our case we have:

- vocab_size = 27
- embedding_size = 2 (to keep it simple)

Since each input sequence has 3 characters, each input is represented by 3 embedding vectors (3 × 2 = 6 numbers in total).

In [27]:
# example of getting embedding vectors
# a single integer index selects one row of the table
print(C_lookup[0])
# a list of indices selects several rows at once
print(C_lookup[[3, 5, 9]])

tensor([0.4977, 0.8756], grad_fn=<SelectBackward0>)
tensor([[ 0.0446, -2.9870],
        [ 0.0790, -0.2298],
        [ 1.0357, -0.6063]], grad_fn=<IndexBackward0>)


In [30]:
# embedding all the characters in the input sequences
# indexing with the integer tensor X looks up one row of C_lookup for every entry of X,
# so the result has X's shape plus a trailing embedding dimension
X_embed = C_lookup[X]
print(X_embed.shape)

torch.Size([32, 3, 2])


In [35]:
# hidden layer parameters
# each input is 3 characters x 2 embedding values = 6 features; the layer has 100 units
# fix the global seed so the draws below are reproducible
torch.manual_seed(42)
# draw the weights first, then the bias, and mark both as trainable leaves
W1 = torch.randn((6, 100)).requires_grad_()
b1 = torch.randn((100,)).requires_grad_()
print(W1.shape, b1.shape)

torch.Size([6, 100]) torch.Size([100])


In [36]:
# reshape the input sequences
# keep the example dimension and flatten everything after it, so the width is
# derived from the data instead of a hard-coded 6; re-running the cell on the
# already flattened tensor leaves it unchanged
X_embed = X_embed.view(X_embed.shape[0], -1)
print(X_embed.shape)

torch.Size([32, 6])


In [37]:
# calculate the hidden layer
# affine map followed by tanh; b1 is broadcast across the rows of X_embed @ W1
H = torch.tanh(X_embed @ W1 + b1)
print(H.shape)

torch.Size([32, 100])


In [38]:
# output layer
# input dimension = 100, output dimension = 27
# one output per entry of char2int, i.e. one score per possible next character
W2 = torch.randn(100, len(char2int), requires_grad=True)
b2 = torch.randn(len(char2int), requires_grad=True)
print(W2.shape, b2.shape)

torch.Size([100, 27]) torch.Size([27])


In [41]:
# calculate the output layer step by step (a hand-written softmax)
# logits: unnormalized scores produced by the linear output layer
logits = H @ W2 + b2
print(logits.shape)
# exponentiation; subtracting each row's maximum first keeps exp() from
# overflowing and does not change the normalized probabilities
counts = (logits - logits.max(dim=1, keepdim=True).values).exp()
# normalization: every row of probs sums to 1
probs = counts / counts.sum(dim=1, keepdim=True)
print(probs.shape)

torch.Size([32, 27])
torch.Size([32, 27])


In [43]:
# one target index per row of probs
print(Y.shape)

torch.Size([32])


In [44]:
# the target character indices (0 is '.', which marks the end of a name)
print(Y)

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])


In [45]:
# retrieve the probabilities of the next characters
# row i is paired with column Y[i]: the probability assigned to the correct next character
probs[torch.arange(len(Y)), Y]

tensor([1.1295e-11, 3.9312e-02, 3.2966e-02, 6.9774e-08, 1.1928e-11, 3.2660e-10,
        5.5922e-07, 9.6566e-01, 1.6914e-14, 6.2913e-08, 9.9986e-01, 1.3768e-06,
        1.9034e-08, 3.9118e-08, 2.6768e-02, 5.8802e-01, 1.2138e-08, 4.2902e-09,
        2.4035e-10, 1.0108e-11, 1.7069e-06, 7.2179e-06, 6.6518e-14, 5.0524e-10,
        3.4076e-12, 1.1541e-17, 1.9069e-05, 9.9405e-07, 7.9744e-08, 6.2392e-04,
        1.7824e-07, 1.8046e-08], grad_fn=<IndexBackward0>)

In [46]:
# calculate the loss
# mean negative log-probability of the correct next characters
loss = -probs[torch.arange(len(Y)), Y].log().mean()
print(loss)

tensor(16.2359, grad_fn=<NegBackward0>)


In [47]:
# organize the above steps into a function
def forward(X, C_lookup, W1, b1, W2, b2, targets=None):
    """Run the forward pass of the MLP language model and return its loss.

    Args:
        X: integer tensor of character indices, one context per row.
        C_lookup: embedding table; row i is the vector of character index i.
        W1, b1: weights and bias of the tanh hidden layer. W1.shape[0] must
            equal (context length) * (embedding size).
        W2, b2: weights and bias of the output layer (one column per character).
        targets: integer tensor with the index of the correct next character
            for each context. When omitted, the notebook-level ``Y`` is used,
            which keeps existing ``forward(X, *parameters)`` calls working.

    Returns:
        Scalar tensor holding the mean cross-entropy loss.
    """
    if targets is None:
        # backward-compatible fallback to the notebook global
        targets = Y
    # embedding all the characters in the input sequences
    X_embed = C_lookup[X]
    # concatenate the embeddings of one context into a single row; the width is
    # taken from W1 rather than hard-coded, so other block/embedding sizes work
    X_embed = X_embed.view(-1, W1.shape[0])
    # calculate the hidden layer
    H = torch.tanh(X_embed @ W1 + b1)
    # calculate the output layer
    logits = H @ W2 + b2
    # use cross-entropy loss (softmax + negative log-likelihood in one stable call)
    loss = F.cross_entropy(logits, targets)

    return loss

In [60]:
# initialize the parameters
# a dedicated generator with a fixed seed makes the initialization reproducible;
# all tensors draw from it in sequence, so the order of these lines matters
g_seed = torch.Generator().manual_seed(666)
# embedding table: one 2-dimensional vector per entry of char2int
C_lookup = torch.randn(len(char2int), 2, requires_grad=True, generator=g_seed)
# hidden layer: 6 inputs -> 100 units
W1 = torch.randn(6, 100, requires_grad=True, generator=g_seed)
b1 = torch.randn(100, requires_grad=True, generator=g_seed)
# output layer: 100 units -> one score per entry of char2int
W2 = torch.randn(100, len(char2int), requires_grad=True, generator=g_seed)
b2 = torch.randn(len(char2int), requires_grad=True, generator=g_seed)
# order matches the positional parameters of forward() after X
parameters = [C_lookup, W1, b1, W2, b2]

In [56]:
# count the scalar entries across all parameter tensors
total_params = sum(map(torch.numel, parameters))
print('total parameters:', total_params)

total parameters: 3481


In [50]:
# unpack the parameters with * operator
# python is beautiful :)
# the list order of `parameters` matches forward's positional arguments after X;
# the value shown is the loss of the untrained model
forward(X, *parameters)

tensor(13.9521, grad_fn=<NllLossBackward0>)

In [62]:
# let's train the model with plain gradient descent on the whole dataset
num_steps = 100       # number of gradient descent updates
learning_rate = 0.1   # step size of each update
for _ in range(num_steps):
    # forward pass
    loss = forward(X, *parameters)
    # reset the gradients so they do not accumulate across steps
    for p in parameters:
        p.grad = None
    # backpropagation
    loss.backward()
    # update the parameters; no_grad() keeps the in-place step out of the
    # autograd graph without going through the discouraged `.data` attribute
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

# loss of the last forward pass (computed before the final update)
print(loss)

3.155815839767456
2.789224624633789
2.4765310287475586
2.209885358810425
1.9780027866363525
1.7717442512512207
1.5868629217147827
1.4226648807525635
1.2795374393463135
1.1573749780654907
1.054980993270874
0.9694474339485168
0.8976451754570007
0.8375321626663208
0.7871580719947815
0.7443479895591736
0.7073003649711609
0.6747673749923706
0.6459058523178101
0.6201291084289551
0.5970106720924377
0.57622230052948
0.5574955344200134
0.5405979156494141
0.5253202319145203
0.5114725828170776
0.4988831877708435
0.48740071058273315
0.4768918752670288
0.46724188327789307
0.45835208892822266
0.45013684034347534
0.4425230622291565
0.4354473650455475
0.42885512113571167
0.4226987361907959
0.41693583130836487
0.41153088212013245
0.40645137429237366
0.40166860818862915
0.3971578776836395
0.39289674162864685
0.3888648748397827
0.38504457473754883
0.3814198672771454
0.3779762089252472
0.37470075488090515
0.371581107378006
0.3686072528362274
0.3657689392566681
0.36305734515190125
0.3604642450809479
0.3579