<a href="https://colab.research.google.com/github/mikeCode321/FetchMoreLanguageModel/blob/mlp_representation/fetchmore_mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook we implement a character-level language model as a Multi-Layer Perceptron (MLP), following the paper
*A Neural Probabilistic Language Model* by Bengio et al.

Although the authors of this paper build a word-level model, we will apply the same principle at the character level.

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt


In [None]:
# Read the dataset: one name per line. The `with` block closes the file
# handle as soon as its contents have been read.
with open('names.txt', 'r') as names_file:
  words = names_file.read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [None]:
# Number of names in the dataset.
len(words)

32033

In [None]:
# Alphabet: every distinct character that appears in any name, in sorted order.
chars = sorted(set(''.join(words)))

In [None]:
# Character -> index, with letters numbered from 1 so that index 0 is free
# for the '.' token that pads the context and marks the end of a name.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse mapping: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Inspect the character -> index mapping ('.' sits at index 0).
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

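The block size specifies how many preceding characters are used as context, taken as a rolling window over each name. The cell below demonstrates how the context starts out as all padding (`.`) and gradually fills with the characters of the name as the window slides forward.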

In [None]:
block_size = 3  # how many preceding characters form the context for one prediction

X, y = [], []

for name in words[:5]:
  print(name)
  window = [0] * block_size  # start from a context made only of '.' (index 0)
  for target in (stoi[c] for c in name + '.'):
    X.append(window)
    y.append(target)
    print(''.join(itos[i] for i in window), '--->', itos[target])
    # Slide the window: drop the oldest index, append the one just predicted.
    window = window[1:] + [target]

X = torch.tensor(X)
y = torch.tensor(y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [None]:
# Context windows: one row of block_size character indices per training example.
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [None]:
# Target index (the next character) for each row of X.
y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [None]:
# Shapes and dtypes of the inputs and the targets.
X.shape, X.dtype, y.shape, y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [None]:
# Embedding lookup table: one 2-dimensional vector for each of the 27 symbols.
C = torch.randn(27, 2)

In [None]:
# Row 5 of the table, i.e. the embedding of 'e' (stoi['e'] == 5).
C[5]

tensor([-0.2869, -0.3951])

In [None]:
# The same row obtained as (one-hot vector) @ C: indexing into C is
# equivalent to this matrix product.
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([-0.2869, -0.3951])

In [None]:
# Index C with an integer tensor: embeds the first five context windows,
# producing one 2-d vector for every position of every window.
emb = C[X[:5]]
emb

tensor([[[ 0.3455, -1.2553],
         [ 0.3455, -1.2553],
         [ 0.3455, -1.2553]],

        [[ 0.3455, -1.2553],
         [ 0.3455, -1.2553],
         [-0.2869, -0.3951]],

        [[ 0.3455, -1.2553],
         [-0.2869, -0.3951],
         [-0.0965, -0.4722]],

        [[-0.2869, -0.3951],
         [-0.0965, -0.4722],
         [-0.0965, -0.4722]],

        [[-0.0965, -0.4722],
         [-0.0965, -0.4722],
         [ 0.3174, -1.0435]]])

In [None]:
# (examples, block_size, embedding_dim)
emb.shape

torch.Size([5, 3, 2])

In [None]:
# Hidden layer: 6 inputs (3 context positions x 2 embedding dims) -> 100 units.
W1 = torch.randn(6, 100)
b1 = torch.randn(100)

In [None]:
# Re-display the embeddings before flattening them for the hidden layer.
emb

tensor([[[ 0.3455, -1.2553],
         [ 0.3455, -1.2553],
         [ 0.3455, -1.2553]],

        [[ 0.3455, -1.2553],
         [ 0.3455, -1.2553],
         [-0.2869, -0.3951]],

        [[ 0.3455, -1.2553],
         [-0.2869, -0.3951],
         [-0.0965, -0.4722]],

        [[-0.2869, -0.3951],
         [-0.0965, -0.4722],
         [-0.0965, -0.4722]],

        [[-0.0965, -0.4722],
         [-0.0965, -0.4722],
         [ 0.3174, -1.0435]]])

In [None]:
# Flatten by concatenating the embedding of each context position along the feature axis.
torch.cat([emb[:, pos, :] for pos in range(3)], dim=1).shape

torch.Size([5, 6])

In [None]:
# Same concatenation without hard-coding the number of context positions:
# unbind splits emb along dim 1 into one (examples, 2) tensor per position.
torch.cat(emb.unbind(dim=1), dim=1)

tensor([[ 0.3455, -1.2553,  0.3455, -1.2553,  0.3455, -1.2553],
        [ 0.3455, -1.2553,  0.3455, -1.2553, -0.2869, -0.3951],
        [ 0.3455, -1.2553, -0.2869, -0.3951, -0.0965, -0.4722],
        [-0.2869, -0.3951, -0.0965, -0.4722, -0.0965, -0.4722],
        [-0.0965, -0.4722, -0.0965, -0.4722,  0.3174, -1.0435]])

In [None]:
# view() reinterprets the (examples, 3, 2) tensor as (examples, 6); show its top-left 2x2 corner.
emb.view(len(emb), 6)[:2, :2]

tensor([[ 0.3455, -1.2553],
        [ 0.3455, -1.2553]])

In [None]:
# Top-left 2x2 corner of the first-layer weight matrix.
W1[:2, :2]

tensor([[ 0.9716, -0.8156],
        [-0.5503,  1.3577]])

In [None]:
# Product of the two 2x2 corners shown above: a small slice for inspecting
# the matrix multiply by hand, not the full hidden-layer pre-activation.
emb.view(emb.shape[0], 6)[:2, :2] @ W1[:2, :2]

tensor([[ 1.0265, -1.9861],
        [ 1.0265, -1.9861]])

In [None]:
# Hidden activations: flatten each example to 6 features, apply the affine layer, squash with tanh.
h = (emb.view(emb.shape[0], 6) @ W1 + b1).tanh()

In [None]:
# (examples, hidden units)
h.shape

torch.Size([5, 100])

In [None]:
# Output layer: 100 hidden units -> 27 scores, one per symbol.
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

In [None]:
# Output layer: one unnormalised score (logit) per symbol for every example.
logits = h @ W2 + b2

In [None]:
# (examples, 27 symbols)
logits.shape

torch.Size([5, 27])

In [None]:
# Exponentiate the logits to get positive, count-like values.
counts = torch.exp(logits)

In [None]:
# Normalise each row so it sums to 1: a probability distribution over the next symbol.
prob = counts / counts.sum(dim=1, keepdim=True)

In [None]:
# (examples, 27 symbols)
prob.shape

torch.Size([5, 27])

Negative log-likelihood

In [None]:
# Mean negative log-likelihood of the correct next character.
# `prob[:, y]` would pick, for every row, the columns of *all* targets
# (a rows x len(y) matrix). Pairing row i with its own target y[i] needs an
# explicit row index next to the column index. `prob` may cover only the
# first few examples, so the targets are truncated to the same length.
-prob[torch.arange(prob.shape[0]), y[:prob.shape[0]]].log().mean()

tensor(14.9421)

Put together

In [None]:
# Seeded generator so the initial parameters are reproducible.
g = torch.Generator().manual_seed(2147483647)
# Tensors are drawn from g in this order: embedding table, hidden weights and
# bias, output weights and bias.
C, W1, b1, W2, b2 = (
  torch.randn(shape, generator=g)
  for shape in [(27, 2), (6, 100), (100,), (100, 27), (27,)]
)
parameters = [C, W1, b1, W2, b2]

In [None]:
# Total number of scalar parameters in the model.
sum(p.numel() for p in parameters)

3481

In [None]:
# Forward pass over the 32 demo examples.
emb = C[X] # (32, 3, 2)
h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1)
logits = h @ W2 + b2 # (32, 27)
# Manual equivalent of the loss below, kept for reference
# (row i must be paired with its own target y[i]):
#   counts = logits.exp()
#   prob = counts / counts.sum(dim=1, keepdims=True)
#   loss = -prob[torch.arange(len(y)), y].log().mean()
loss = F.cross_entropy(logits, y)

In [None]:
# Re-create the parameters from the same seed so training starts from a
# fresh, reproducible initialisation.
g = torch.Generator().manual_seed(2147483647)
C, W1, b1, W2, b2 = (
  torch.randn(shape, generator=g)
  for shape in [(27, 2), (6, 100), (100,), (100, 27), (27,)]
)
parameters = [C, W1, b1, W2, b2]
# Total number of scalar parameters.
sum(p.numel() for p in parameters)

3481

In [None]:
block_size = 3  # context length: characters used to predict the next one

X, y = [], []

# Same sliding-window construction as the five-name demo, now over every name
# and without the per-example printing.
for w in words:
  context = [0] * block_size
  for ch in f'{w}.':
    ix = stoi[ch]
    X.append(context)
    y.append(ix)
    context = [*context[1:], ix]

X, y = torch.tensor(X), torch.tensor(y)

In [None]:
# Turn on gradient tracking for every parameter so backward() populates .grad.
for param in parameters:
  param.requires_grad_(True)

In [None]:
# Full-batch gradient descent: every step uses the whole dataset in X / y
# (N examples).
for _ in range(10):
  # Forward pass.
  emb = C[X] # (N, 3, 2)
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (N, 100)
  logits = h @ W2 + b2 # (N, 27)
  loss = F.cross_entropy(logits, y)

  print(loss.item())

  # Backward pass: clear old gradients first, because backward() accumulates
  # into .grad.
  for p in parameters:
    p.grad = None

  loss.backward()

  # Parameter update with step size 0.1. no_grad() keeps the in-place update
  # out of the autograd graph without going through the legacy `.data`
  # attribute.
  with torch.no_grad():
    for p in parameters:
      p += -0.1 * p.grad

# NOTE: this repeats the last value printed inside the loop; `loss` was
# computed before the final parameter update.
print(loss.item())

3.291400909423828
3.266794443130493
3.243562698364258
3.2216010093688965
3.2008213996887207
3.181147813796997
3.162510633468628
3.144848585128784
3.1281070709228516
3.112236499786377
3.112236499786377


In [None]:
# Per-example maximum logit (`values`) and the index of the symbol that
# attains it (`indices`), i.e. the model's most likely next character.
logits.max(1)

torch.return_types.max(
values=tensor([6.4820, 5.0683, 5.8456,  ..., 4.2618, 2.0469, 2.4089],
       grad_fn=<MaxBackward0>),
indices=tensor([ 1, 18,  1,  ...,  0,  0,  0]))

In [None]:
# True targets, for comparison with the predicted indices from logits.max(1).
y

tensor([ 5, 13, 13,  ..., 26, 24,  0])