In [45]:
import mlx.core as mx
import mlx.nn as nn

In [46]:
# Load dataset: one name per line. Trailing whitespace (including the newline)
# is stripped, and blank lines are skipped so they cannot turn into empty "names".
with open('names.txt', 'r', encoding='utf-8') as file:
    names = [name for line in file if (name := line.rstrip())]

In [47]:
# Collect the distinct characters that occur anywhere in the dataset.
chars = {c for name in names for c in name}

# Map each character to an integer index (1-based, in sorted order) and give
# '.' the index 0; `itos` is the inverse lookup from index back to character.
stoi = dict(zip(sorted(chars), range(1, len(chars) + 1)))
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}

In [48]:
# Build the training examples: every window of `block_size` preceding character
# indices (a row of X) is paired with the index of the character that follows (y).
# Each name starts from a window of all zeros (the index of '.') and is terminated
# by '.'; only the first 5 names are used here.
block_size = 3
contexts, targets = [], []

for name in names[:5]:
    print('Name:', name)
    window = [0] * block_size
    for ch in name + '.':
        target = stoi[ch]
        contexts.append(window)
        targets.append(target)
        print(''.join(itos[i] for i in window), '-->', itos[target])
        # Slide the window: drop the oldest index, append the one just predicted.
        window = [*window[1:], target]

X = mx.array(contexts)
y = mx.array(targets)

Name: emma
... --> e
..e --> m
.em --> m
emm --> a
mma --> .
Name: olivia
... --> o
..o --> l
.ol --> i
oli --> v
liv --> i
ivi --> a
via --> .
Name: ava
... --> a
..a --> v
.av --> a
ava --> .
Name: isabella
... --> i
..i --> s
.is --> a
isa --> b
sab --> e
abe --> l
bel --> l
ell --> a
lla --> .
Name: sophia
... --> s
..s --> o
.so --> p
sop --> h
oph --> i
phi --> a
hia --> .


In [49]:
# Sanity check: X has one row of `block_size` context indices per example,
# and y has one target index per example.
X.shape, X.dtype, y.shape, y.dtype

([32, 3], mlx.core.int32, [32], mlx.core.int32)

In [50]:
# Create a 2D embedding for each letter: one row per vocabulary entry.
# The row count comes from `stoi` instead of a hard-coded 27, so every index
# stored in X refers to an existing row even if the dataset's character set changes.
C = mx.random.normal([len(stoi), 2])

In [51]:
# Embed each letter in each of the examples in the training data, X.
# Indexing the table C with the integer array X looks up one row of C per index,
# so `emb` has X's shape plus a trailing axis of size C.shape[1].
emb = C[X]
emb.shape

[32, 3, 2]

In [52]:
# Create the hidden layer's parameters.
# Input size = block_size * embedding dims (3 x 2 = 6 here): each example in `emb`
# contains `block_size` chars, each of which has C.shape[1] dims. Deriving the value
# keeps W1 compatible with the flattened embeddings if either number changes.
# Output size = 100 hidden units; this is arbitrary.
hidden_size = 100
W1 = mx.random.normal([block_size * C.shape[1], hidden_size])
b1 = mx.random.normal([hidden_size])

### First Layer
We want to multiply our embedded input by the first layer's weights, add the bias, and apply `tanh` to the result, which squashes every activation into the range (-1, 1):
    
    tanh(emb @ W1 + b1)


However, the current shapes of our tensors don't support this multiplication operation:

    emb.shape == [32, 3, 2]
    W1.shape == [6, 100]


To solve this, we need to combine the second and third dimensions of our embedded input tensor, giving us:
    
    emb.shape == [32, 6]


This represents 32 examples of 3 characters, each with a 2-dimensional embedding:
    
    Ex. [Char1FirstEmb, Char1SecondEmb, Char2FirstEmb, Char2SecondEmb, Char3FirstEmb, Char3SecondEmb]

In [55]:
# We can achieve this by using `reshape()` to merge the last two axes
# (characters x embedding dims) into a single axis of length 6.
# The example count is read from `emb` rather than hard-coded as 32, so this cell
# keeps working when more than the first 5 names are used. The 6 (3 chars x 2 dims)
# is deliberately spelled out here; the next cell removes that assumption too.
emb_reshaped = mx.reshape(emb, (emb.shape[0], 6))
print(emb_reshaped[:5])
print(emb_reshaped.shape)

array([[0.192549, -0.398054, 0.192549, -0.398054, 0.192549, -0.398054],
       [0.192549, -0.398054, 0.192549, -0.398054, 0.666218, -1.37186],
       [0.192549, -0.398054, 0.666218, -1.37186, -0.304652, -0.812019],
       [0.666218, -1.37186, -0.304652, -0.812019, -0.304652, -0.812019],
       [-0.304652, -0.812019, -0.304652, -0.812019, -0.438571, 1.22245]], dtype=float32)
[32, 6]


In [56]:
# Generalize the reshape to arbitrary block sizes: flatten every axis after the
# first, so there is one row per example with no hard-coded sizes.
emb_flattened = mx.flatten(emb, start_axis=1)
print(emb_flattened[:5], emb_flattened.shape, sep='\n')

array([[0.192549, -0.398054, 0.192549, -0.398054, 0.192549, -0.398054],
       [0.192549, -0.398054, 0.192549, -0.398054, 0.666218, -1.37186],
       [0.192549, -0.398054, 0.666218, -1.37186, -0.304652, -0.812019],
       [0.666218, -1.37186, -0.304652, -0.812019, -0.304652, -0.812019],
       [-0.304652, -0.812019, -0.304652, -0.812019, -0.438571, 1.22245]], dtype=float32)
[32, 6]


In [72]:
# Hidden activations: affine transform of the flattened embeddings
# (matrix multiplication plus bias), followed by an element-wise tanh.
h = mx.tanh(b1 + emb_flattened @ W1)
print(h[:5], h.shape, sep='\n')

array([[0.47145, -0.122259, -0.929297, ..., 0.300227, -0.859318, 0.530754],
       [0.857721, -0.198757, -0.906659, ..., 0.0625539, -0.607978, 0.572188],
       [0.957443, 0.398662, -0.977714, ..., -0.550226, -0.99897, 0.987817],
       [0.975854, -0.47205, -0.988863, ..., -0.0823076, -0.939782, 0.80782],
       [-0.7669, -0.0193273, -0.891213, ..., 0.155822, -0.991842, -0.00118141]], dtype=float32)
[32, 100]


### Next Layer
This layer consists of another set of weights and biases, W2 and b2. It produces logits by multiplying the outputs of the previous layer by W2 and adding the bias vector b2.

In [64]:
# Next layer produces logits: one score per vocabulary entry for every example.
# Sizes are derived (hidden width from `h`, vocabulary size from `stoi`) instead of
# being hard-coded as 100 and 27, so this layer stays consistent with the layers
# and vocabulary defined above.
W2 = mx.random.normal([h.shape[1], len(stoi)])
b2 = mx.random.normal([len(stoi)])
logits = h @ W2 + b2
logits.shape

[32, 27]

### Final Layer
To turn the logits into something interpretable, we apply a softmax. For each example, this gives a vector of normalized probabilities (summing to 1) with one entry per possible next character.

In [65]:
# Complete softmax over all logits (manually).
# Each row's maximum is subtracted before exponentiating. Mathematically this leaves
# the probabilities unchanged (the common factor cancels in the division), but it
# keeps exp() from overflowing to inf -- and the division from yielding nan --
# when a logit is large.
counts = (logits - logits.max(1, keepdims=True)).exp()
prob = counts / counts.sum(1, keepdims=True)

In [66]:
# Same shape as `logits`: one row per example, one probability per logit.
prob.shape

[32, 27]

In [69]:
# Get the probability the model assigns to the correct next character of each
# example, as defined by `y`. The row indices are generated from y's length instead
# of a hard-coded 32, so they always pair up one-to-one with the labels.
print(y)
prob[mx.arange(y.shape[0]), y]

array([5, 13, 13, ..., 9, 1, 0], dtype=int32)


array([8.61489e-10, 0.928421, 0.062959, ..., 2.21079e-08, 1.20255e-10, 3.05629e-07], dtype=float32)

### Calculate Loss
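With these probabilities, we can calculate the loss: the mean negative log-likelihood of the correct next character, $\text{loss} = -\frac{1}{N}\sum_{i=1}^{N} \log p_i[y_i]$, where $p_i$ is the predicted distribution for example $i$ and $y_i$ is its label.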

In [74]:
# Mean negative log-likelihood. For each example, index into the y-th position to
# retrieve the probability calculated for the correct label, take its log, then
# average over the examples and negate. The row index is built from y's length
# rather than a hard-coded 32 so it matches however many examples were built.
loss = -prob[mx.arange(y.shape[0]), y].log().mean()
loss

array(15.728, dtype=float32)