In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [2]:
# read in all the words
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
words = open('./names.txt', 'r').read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])

--2025-11-23 16:25:24--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2025-11-23 16:25:25 (11.4 MB/s) - ‘names.txt’ saved [228145/228145]

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [3]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))  # unique chars in all names
chars = ['.'] + chars                      # include start/end token

stoi = {c:i for i,c in enumerate(chars)}
itos = {i:c for c,i in stoi.items()}

vocab_size = len(chars)

In [4]:
# shuffle up the words
import random
random.seed(42)
random.shuffle(words)

In [5]:
# initial hyperparameters
vocab_size = len(itos)
hidden_size = 32

In [6]:
dataset = []

for w in words:
    chs = ['.'] + list(w) + ['.']
    xs = [stoi[c] for c in chs[:-1]]   # inputs
    ys = [stoi[c] for c in chs[1:]]    # targets
    dataset.append((xs, ys))


In [7]:
dataset[:3]  # three examples of (input, target) sequences

[([0, 25, 21, 8, 5, 14, 7], [25, 21, 8, 5, 14, 7, 0]),
 ([0, 4, 9, 15, 14, 4, 18, 5], [4, 9, 15, 14, 4, 18, 5, 0]),
 ([0, 24, 1, 22, 9, 5, 14], [24, 1, 22, 9, 5, 14, 0])]

In [8]:
for i, (xs, ys) in enumerate(dataset[:8]):
    xin = ''.join(itos[x] for x in xs)
    yin = ''.join(itos[y] for y in ys)
    print(f"{i:2d}: {words[i]:12s}   {xin} -> {yin}")

 0: yuheng         .yuheng -> yuheng.
 1: diondre        .diondre -> diondre.
 2: xavien         .xavien -> xavien.
 3: jori           .jori -> jori.
 4: juanluis       .juanluis -> juanluis.
 5: erandi         .erandi -> erandi.
 6: phia           .phia -> phia.
 7: samatha        .samatha -> samatha.


In [9]:
# example one-hot vectors (first two time steps of first name)
xs0 = torch.tensor(dataset[0][0], dtype=torch.long)
print("Example input indices for first name:", xs0)
ys0 = torch.tensor(dataset[0][1], dtype=torch.long)
print("Example target indices for first name:", ys0)
one_hot_t0 = F.one_hot(xs0[0], num_classes=vocab_size).float().unsqueeze(1)
one_hot_t1 = F.one_hot(xs0[1], num_classes=vocab_size).float().unsqueeze(1)
print("\nExample one-hot shapes for first name:", one_hot_t0.shape, one_hot_t1.shape)

Example input indices for first name: tensor([ 0, 25, 21,  8,  5, 14,  7])
Example target indices for first name: tensor([25, 21,  8,  5, 14,  7,  0])

Example one-hot shapes for first name: torch.Size([27, 1]) torch.Size([27, 1])


In [10]:
h_prev = torch.zeros((hidden_size, 1)) # initial hidden state
Wxh = torch.randn((hidden_size, vocab_size), requires_grad=True) * 0.01 # weight input to hidden
Whh = torch.randn((hidden_size, hidden_size), requires_grad=True) * 0.01 # weight hidden to hidden
bh = torch.zeros((hidden_size, 1), requires_grad=True) # hidden bias
Why = torch.randn((vocab_size, hidden_size), requires_grad=True) * 0.01 # weight hidden to output
by = torch.zeros((vocab_size, 1), requires_grad=True) # output bias

In [11]:
a = Wxh @ one_hot_t0 + Whh @ h_prev + bh     # raw pre-activation
h = torch.tanh(a)                     # hidden state
y = Why @ h + by                     # logits
probs = F.softmax(y, dim=0)          # probabilities

# Wxh @ x0: meaning of the current character
# Whh @ h_prev: memory of previous characters
# tanh: bounded memory update
# Why @ h: convert hidden state to next-char prediction
# softmax: probability distribution over characters

In [12]:
# for entire na
xs, ys = dataset[0]   # e.g. ".emma."
T = len(xs)
h_prev = torch.zeros(hidden_size, 1)
loss = 0

# ----- forward pass -----
for t in range(T):
    x_t = F.one_hot(torch.tensor(xs[t]), num_classes=vocab_size).float().unsqueeze(1) # one-hot vector 27 * 1 
    a = Wxh @ x_t + Whh @ h_prev + bh # 32*27 @ 27*1 + 32*32 @ 32*1 + 32 * 1 = 32*1
    h = torch.tanh(a) # 32*1
    y = Why @ h + by # 27*32 @ 32*1 + 27*1 = 27*1
    loss += F.cross_entropy(y.T, torch.tensor([ys[t]]))
    h_prev = h # carry forward hidden state

# ----- backward pass -----
loss.backward()
# PyTorch will compute:
# dL/dWhy
# dL/dWxh
# dL/dWhh (most important part of BPTT)
# dL/dbh
# dL/dby
# across ALL timesteps.
# PyTorch keeps the entire chain linked.
# When you call .backward(), it walks the computational graph from the final loss back through all timesteps automatically.

In [13]:
loss

tensor(23.0707, grad_fn=<AddBackward0>)

In [None]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
torch.device('cuda')

In [None]:
# Initialize random data scaled by 0.01 FIRST
Wxh = (torch.randn((hidden_size, vocab_size)) * 0.01).to(device) # directly specifying requires_grad later, as required_grad=True gives a warning
# in same variable raise error, as it is leaf variable and myltiplying by scalar makes it non-leaf
Whh = (torch.randn((hidden_size, hidden_size)) * 0.01).to(device)
bh = torch.zeros((hidden_size, 1)).to(device)
Why = (torch.randn((vocab_size, hidden_size)) * 0.01).to(device)
by = torch.zeros((vocab_size, 1)).to(device)

# THEN tell PyTorch to track gradients for these leaves
Wxh.requires_grad = True
Whh.requires_grad = True
bh.requires_grad = True
Why.requires_grad = True
by.requires_grad = True

In [None]:
dataset_tensors = []
for xs, ys in dataset:
    xs_t = torch.tensor(xs, dtype=torch.long).to(device)
    ys_t = torch.tensor(ys, dtype=torch.long).to(device)
    dataset_tensors.append((xs_t, ys_t))

In [25]:
learning_rate = 0.1
params = [Wxh, Whh, bh, Why, by]
V = vocab_size
H = hidden_size

for epoch in range(1_000):
    total_loss = 0.0

    for xs_t, ys_t in dataset_tensors:
        for p in params:
            p.grad = None

        h_prev = torch.zeros(H, 1)
        if torch.cuda.is_available():
            h_prev = h_prev.cuda()

        step_losses = []

        for t in range(xs_t.size(0)):
            x_t = F.one_hot(xs_t[t], num_classes=V).float().unsqueeze(1)
            a = Wxh @ x_t + Whh @ h_prev + bh
            h = torch.tanh(a)
            y = Why @ h + by

            step_loss = F.cross_entropy(y.T, ys_t[t].unsqueeze(0))
            step_losses.append(step_loss)

            h_prev = h

        loss = torch.stack(step_losses).sum()
        loss.backward()
        # --- NEW: Clip gradients to prevent explosion ---
        torch.nn.utils.clip_grad_norm_(params, max_norm=1.0) 
        # ------------------------------------------------
        with torch.no_grad():
            for p in params:
                p -= learning_rate * p.grad

        total_loss += float(loss)
    if epoch % 100==0:
        print(epoch, total_loss)

  p -= learning_rate * p.grad


TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [21]:
def sample_model(start_char='.', max_length=20):
    # 1. Initialize input state
    # Start with the hidden state at zero (like at the start of training)
    h = torch.zeros((hidden_size, 1))
    
    # Start with the specific start_char (usually '.')
    ix = stoi[start_char]
    
    output_name = []
    
    for _ in range(max_length):
        # --- Forward Pass (Same as training, but one step at a time) ---
        
        # Create input vector (one-hot) for the current character
        x_t = F.one_hot(torch.tensor(ix), num_classes=vocab_size).float().unsqueeze(1)
        
        # Calculate hidden state update
        h = torch.tanh(Wxh @ x_t + Whh @ h + bh)
        
        # Calculate logits (raw scores) for next character
        logits = Why @ h + by
        
        # --- Sampling ---
        
        # Convert logits to probabilities (softmax)
        probs = F.softmax(logits, dim=0)
        
        # Sample the next character index from the distribution
        # (This adds variety; taking the argmax would just repeat the same letters)
        ix = torch.multinomial(probs.flatten(), num_samples=1).item()
        
        # Decode the index to a character
        next_char = itos[ix]
        
        # Stop if we predict the end token
        if next_char == '.':
            break
            
        output_name.append(next_char)
        
    return ''.join(output_name)

# Generate 10 names
print("--- Generated Names ---")
for _ in range(10):
    print(sample_model())

--- Generated Names ---
anaitia
jaa
bmerim
zamar
zur
goirie
jilonnia
alewia
sovaria
fayleigh
