In [2]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2025-06-19 20:59:54--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8003::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2025-06-19 20:59:55 (4.28 MB/s) - ‘names.txt’ saved [228145/228145]



In [191]:
import torch
import matplotlib.pyplot as plt # for making figures
import torch.nn.functional as F
import time

In [192]:
# 1. Check if the MPS backend is available
if torch.backends.mps.is_available():
    # 2. Set the default device to 'mps'
    torch.set_default_device('mps')
    print("Default device has been set to 'mps' (Mac GPU)")
else:
    # Fallback to CPU if MPS is not available
    print("MPS not available. Default device remains 'cpu'.")

Default device has been set to 'mps' (Mac GPU)


In [193]:
torch.set_default_device('cpu')
print("Default device has been set to 'cpu'")

Default device has been set to 'cpu'


In [194]:
#get all words from names.txt
words = open('data/names.txt', 'r').read().splitlines()
print(words[:10])

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [195]:
#get all the unique characters in the words
chars = sorted(list(set(''.join(words))))
print(chars)

# create a mapping from characters to integers
stoi = {ch:i+1 for i, ch in enumerate(chars)}
itos = {i+1:ch for i, ch in enumerate(chars)}
stoi['.'] = 0
itos[0] = '.'
vocab_size = len(itos)
print(stoi)
print(itos)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [196]:
block_size = 3

#build the dataset
def build_dataset(words):
    X, Y = [], []
    for word in words:
        context = [0] * block_size
        for ch in word:
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    
    return (X, Y)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))
Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

In [197]:
#Sequential model
class Sequential:
    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
    
    def parameters(self):
        return [layer.parameters() for layer in self.layers]

    def append(self, layer):
        self.layers.append(layer)

#Embedding layer
class Embedding:
    def __init__(self, num_embeddings, embedding_dim):
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.weight = torch.randn(num_embeddings, embedding_dim)/num_embeddings**0.5

    def __call__(self, x):
        self.out = self.weight[x]
        return self.out
    
    def parameters(self):
        return [self.weight]

#Linear layer
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.fan_in = fan_in
        self.fan_out = fan_out
        self.weight = torch.randn(fan_in, fan_out)/fan_in**0.5 #kaiming initialization
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight + self.bias
        return self.out
    def parameters(self):
        return [self.weight, self.bias]

#Tanh activation function
class Tanh:
    def __init__(self):
        pass
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out

#softmax activation function
class Softmax:
    def __init__(self, dim = 1):
        self.dim = dim
    def __call__(self, x):
        self.out = torch.softmax(x, dim=self.dim)
        return self.out

#ReLU activation function
class ReLU:
    def __init__(self):
        pass
    def __call__(self, x):
        self.out = torch.relu(x)
        return self.out

'''#Batch normalization
class BatchNorm1d:
    def __init__(self, ):
        '''

'#Batch normalization\nclass BatchNorm1d:\n    def __init__(self, ):\n        '

In [198]:
batch_size = 20000
max_iters = 3000
learning_rate = 1e-2
n_embd = 10
n_hidden = 10
n_input = block_size * n_embd
n_output = vocab_size

In [199]:
#define the model
model = Sequential(
    [Embedding(vocab_size, n_embd),
    Linear(n_embd, n_hidden), Tanh(),
    Linear(n_hidden, n_output)]
)

In [200]:
#define the model
C = torch.randn(vocab_size, n_embd)
w1 = torch.randn(n_input, n_hidden)
w2 = torch.randn(n_hidden, n_output)
b1 = torch.randn(n_hidden)
b2 = torch.randn(n_output)

parameters = [C, w1, w2, b1, b2]

for p in parameters:
    p.requires_grad = True

In [202]:
start_time = time.time()
for iter in range(max_iters):
    idx = torch.randint(0, Xtr.shape[0], (batch_size,))

    x_batch = Xtr[idx]
    y_batch = Ytr[idx]

    #flatten the batch
    x_batch = C[x_batch].view(x_batch.shape[0], -1)


    #forward pass
    h1 = x_batch @ w1 + b1
    h1 = torch.tanh(h1)
    h2 = h1 @ w2 + b2

    #calculate the loss
    loss = F.cross_entropy(h2, y_batch)

    #backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    #update the parameters
    for p in parameters:
        p.data -= learning_rate * p.grad
        
    
    if iter % 100 == 0:
        print(f"iter {iter}, loss {loss.item()}")

end_time = time.time()
elapsed_time = end_time - start_time

print(f"The for loop took {elapsed_time:.4f} seconds to run.")


iter 0, loss 6.442702770233154
iter 100, loss 5.569154739379883
iter 200, loss 5.18377161026001
iter 300, loss 4.9155449867248535
iter 400, loss 4.661125183105469
iter 500, loss 4.5249199867248535
iter 600, loss 4.326540470123291
iter 700, loss 4.231530666351318
iter 800, loss 4.1117377281188965
iter 900, loss 3.9645440578460693
iter 1000, loss 3.8666975498199463
iter 1100, loss 3.8130598068237305
iter 1200, loss 3.7475059032440186
iter 1300, loss 3.6579160690307617
iter 1400, loss 3.6058695316314697
iter 1500, loss 3.5480453968048096
iter 1600, loss 3.501758098602295
iter 1700, loss 3.423393726348877
iter 1800, loss 3.4103245735168457
iter 1900, loss 3.3696296215057373
iter 2000, loss 3.326004981994629
iter 2100, loss 3.2905867099761963
iter 2200, loss 3.2701127529144287
iter 2300, loss 3.242154359817505
iter 2400, loss 3.2077934741973877
iter 2500, loss 3.2139241695404053
iter 2600, loss 3.1442012786865234
iter 2700, loss 3.1374199390411377
iter 2800, loss 3.1186327934265137
iter 290

In [188]:
learning_rate = 1e-3

Small batch:


MPS time: 25.2170 seconds 

CPU time: 5.1533 seconds


Big batch:

MPS time: 39.9550 seconds 

CPU time: 26.8415 seconds


The biggest batch (small loop):

MPS time: 38.4508 seconds 

CPU time: 21.8074 seconds