In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read in all the words (one per line of names.txt).
# The context manager guarantees the file handle is closed once the read
# finishes, instead of leaving it open until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Total number of words read from the file.
len(words)

32033

In [4]:
# Character vocabulary and the two lookup tables between characters and integers.
# Distinct characters are sorted and numbered from 1; index 0 is assigned to '.'.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Invert the mapping: integer index -> character.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [5]:
# Build the dataset of (context, next-character) training pairs.

# Raise block_size to give each prediction more context.
block_size = 3            # Context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words[:5]:
    print(w)
    # Each word starts from a context filled with index 0 (the '.' token).
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(f"{''.join(itos[i] for i in context)} ---> {itos[ix]}")
        # Slide the window: drop the oldest index and append the newest one.
        context = [*context[1:], ix]
X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [7]:
# Sanity-check the dataset tensors: shape and dtype of the inputs X and the labels Y.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [8]:
# Embedding lookup table: 27 rows (one per character), each a 2-dimensional vector,
# filled with samples drawn from a standard normal distribution.
C = torch.randn(27, 2)

In [9]:
# Display the randomly initialized embedding table (one row per character index).
C

tensor([[-0.5317, -0.8100],
        [ 0.0686, -0.4340],
        [ 0.6754, -0.1459],
        [ 0.4900, -2.1244],
        [ 0.3606,  0.8209],
        [-1.4012, -0.4754],
        [ 0.1316,  0.4431],
        [-0.2416, -0.1228],
        [ 0.7791,  1.1401],
        [-0.1821,  0.0800],
        [ 1.2458,  0.1638],
        [-0.1977,  0.1500],
        [ 0.6284,  0.9660],
        [ 0.3503, -1.3784],
        [-0.4973,  0.2641],
        [ 0.1854, -0.9773],
        [ 0.0410,  0.7770],
        [ 0.2522, -0.6684],
        [ 0.7541, -0.3799],
        [-1.3384,  1.1626],
        [ 0.9145,  0.3126],
        [ 0.4282,  1.1034],
        [-0.2724,  0.1731],
        [ 0.5598,  0.2846],
        [-0.1600, -0.8552],
        [ 0.4487,  0.2387],
        [-0.4292, -2.3721]])

In [10]:
# Let's take the embedding for ix=5.
# One way to get it is to simply pluck out row 5 of C.
C[5]

tensor([-1.4012, -0.4754])

In [11]:
# Another way to do it is to one-hot encode the index and multiply that vector with the
# embedding matrix (think of C as a weight matrix).
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

# We get the same result as above because of how matrix multiplication works:
# the single 1 in the one-hot vector selects row 5 of C and every other row is multiplied by 0.
# For this lecture we will just use direct indexing, as in the cell above.

tensor([-1.4012, -0.4754])

In [12]:
# In PyTorch we can index with a list of indices; in fact, the index can also be a tensor.
# Each entry of the index tensor selects the corresponding row of C.
C[torch.tensor([5, 6, 7])]

tensor([[-1.4012, -0.4754],
        [ 0.1316,  0.4431],
        [-0.2416, -0.1228]])

In [13]:
# Indexing also works with 2D tensors, so we can directly plug in the tensor X and get the
# embeddings for all its characters: the result has X's shape plus a trailing embedding dimension.
C[X].shape

torch.Size([32, 3, 2])

In [15]:
# Inspect a single entry of X: row 13, context position 2 (a character index).
X[13, 2]

tensor(1)

In [16]:
# The embedding vector that C[X] holds at position [13, 2].
C[X][13, 2]

tensor([ 0.0686, -0.4340])

In [17]:
# As can be seen, C[X][13, 2] plucks out the embedding for index 1, i.e. row 1 of C.
C[1]

tensor([ 0.0686, -0.4340])

In [18]:
# Embed every context in X at once by indexing the lookup table with the whole tensor,
# then check the resulting shape.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [None]:
# Initialize the weights and biases of the neurons (MLP).
# We follow Figure 1 on page 6 of the paper
# "A Neural Probabilistic Language Model":
# https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf
# The paper is also available in the "relevant_papers" folder of the repo.