##### Do Imports

In [1]:
# If needed:
# !pip install torch
import math
import random

##### Load Text

In [2]:
with open("frankenstein.txt", "r", encoding="utf-8") as f:
    text = f.read()

print("Length:", len(text))
print(text[:500])

Length: 438806
The Project Gutenberg eBook of Frankenstein; or, the modern prometheus
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before


##### Trim Gutenberg headers/footers

In [3]:
# Project Gutenberg texts include legal/licensing text at the start and end.
# We want only the actual story, so we trim it.

start_marker = "*** START OF"  # Marker indicating where the story begins
end_marker = "*** END OF"      # Marker indicating where the story ends

start = text.find(start_marker)  # Find start index of marker
end = text.find(end_marker)      # Find end index of marker

# If both markers exist, slice the text to only include the story
if start != -1 and end != -1:
    text = text[start + len(start_marker):end].strip()  # +len(...) skips the marker itself

# Print cleaned text length and first 500 characters
print("Cleaned text length:", len(text))
print(text[:500])

Cleaned text length: 419409
THE PROJECT GUTENBERG EBOOK FRANKENSTEIN; OR, THE MODERN PROMETHEUS ***

Frankenstein;

or, the Modern Prometheus

by Mary Wollstonecraft (Godwin) Shelley


 CONTENTS

 Letter 1
 Letter 2
 Letter 3
 Letter 4
 Chapter 1
 Chapter 2
 Chapter 3
 Chapter 4
 Chapter 5
 Chapter 6
 Chapter 7
 Chapter 8
 Chapter 9
 Chapter 10
 Chapter 11
 Chapter 12
 Chapter 13
 Chapter 14
 Chapter 15
 Chapter 16
 Chapter 17
 Chapter 18
 Chapter 19
 Chapter 20
 Chapter 21
 Chapter 22
 Chapter 23
 Chapter 24




Letter 1



##### Create character vocabulary

In [4]:
# Identify all unique characters in the text
# 'set(text)' returns only unique characters
# 'list(...)' converts the set to a list so we can index it
# 'sorted(...)' ensures a consistent order of characters
chars = sorted(list(set(text)))

# Total number of unique characters
vocab_size = len(chars)

# Mapping from character → integer index
stoi = {ch: i for i, ch in enumerate(chars)}
# Mapping from integer index → character
itos = {i: ch for i, ch in enumerate(chars)}

# Helper functions
def encode(s):
    """Convert a string into a list of integer token IDs"""
    return [stoi[c] for c in s]

def decode(l):
    """Convert a list of token IDs back into a string"""
    return ''.join([itos[i] for i in l])

# Inspect vocabulary
print("Vocab size:", vocab_size)
print("All unique characters:", chars)

Vocab size: 84
All unique characters: ['\n', ' ', '!', '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'è', 'é', 'ê', 'ô', '—', '‘', '’', '“', '”']


In [5]:
# Print mappings in a single line, separated by commas
print(", ".join([f"{repr(ch)}:{i}" for ch, i in stoi.items()]))

'\n':0, ' ':1, '!':2, '(':3, ')':4, '*':5, ',':6, '-':7, '.':8, '0':9, '1':10, '2':11, '3':12, '4':13, '5':14, '6':15, '7':16, '8':17, '9':18, ':':19, ';':20, '?':21, 'A':22, 'B':23, 'C':24, 'D':25, 'E':26, 'F':27, 'G':28, 'H':29, 'I':30, 'J':31, 'K':32, 'L':33, 'M':34, 'N':35, 'O':36, 'P':37, 'R':38, 'S':39, 'T':40, 'U':41, 'V':42, 'W':43, 'Y':44, '[':45, ']':46, '_':47, 'a':48, 'b':49, 'c':50, 'd':51, 'e':52, 'f':53, 'g':54, 'h':55, 'i':56, 'j':57, 'k':58, 'l':59, 'm':60, 'n':61, 'o':62, 'p':63, 'q':64, 'r':65, 's':66, 't':67, 'u':68, 'v':69, 'w':70, 'x':71, 'y':72, 'z':73, 'æ':74, 'è':75, 'é':76, 'ê':77, 'ô':78, '—':79, '‘':80, '’':81, '“':82, '”':83


In [6]:
# Print mappings in a single line, separated by commas
print(", ".join([f"{repr(ch)}:{i}" for ch, i in itos.items()]))

0:
, 1: , 2:!, 3:(, 4:), 5:*, 6:,, 7:-, 8:., 9:0, 10:1, 11:2, 12:3, 13:4, 14:5, 15:6, 16:7, 17:8, 18:9, 19::, 20:;, 21:?, 22:A, 23:B, 24:C, 25:D, 26:E, 27:F, 28:G, 29:H, 30:I, 31:J, 32:K, 33:L, 34:M, 35:N, 36:O, 37:P, 38:R, 39:S, 40:T, 41:U, 42:V, 43:W, 44:Y, 45:[, 46:], 47:_, 48:a, 49:b, 50:c, 51:d, 52:e, 53:f, 54:g, 55:h, 56:i, 57:j, 58:k, 59:l, 60:m, 61:n, 62:o, 63:p, 64:q, 65:r, 66:s, 67:t, 68:u, 69:v, 70:w, 71:x, 72:y, 73:z, 74:æ, 75:è, 76:é, 77:ê, 78:ô, 79:—, 80:‘, 81:’, 82:“, 83:”


In [7]:
# Encode the entire text into a list of integers manually
encoded_text = encode(text)  # use the encode() function we defined earlier

# Sanity check
print("First 20 token IDs:", encoded_text[:20])

First 20 token IDs: [40, 29, 26, 1, 37, 38, 36, 31, 26, 24, 40, 1, 28, 41, 40, 26, 35, 23, 26, 38]


In [8]:
# Convert the first 20 token IDs back to characters
decoded_sample = decode(encoded_text[:20])
print("Decoded text:", repr(decoded_sample))

Decoded text: 'THE PROJECT GUTENBER'


In [9]:
# Create a list of tuples: (ID, character)
id_char_pairs = [(i, itos[i]) for i in encoded_text[:20]]

# Print neatly
for idx, char in id_char_pairs:
    print(f"ID {idx:2} -> {repr(char)}")

ID 40 -> 'T'
ID 29 -> 'H'
ID 26 -> 'E'
ID  1 -> ' '
ID 37 -> 'P'
ID 38 -> 'R'
ID 36 -> 'O'
ID 31 -> 'J'
ID 26 -> 'E'
ID 24 -> 'C'
ID 40 -> 'T'
ID  1 -> ' '
ID 28 -> 'G'
ID 41 -> 'U'
ID 40 -> 'T'
ID 26 -> 'E'
ID 35 -> 'N'
ID 23 -> 'B'
ID 26 -> 'E'
ID 38 -> 'R'


In [10]:
print(" | ".join([f"{i}:{repr(itos[i])}" for i in encoded_text[:20]]))

40:'T' | 29:'H' | 26:'E' | 1:' ' | 37:'P' | 38:'R' | 36:'O' | 31:'J' | 26:'E' | 24:'C' | 40:'T' | 1:' ' | 28:'G' | 41:'U' | 40:'T' | 26:'E' | 35:'N' | 23:'B' | 26:'E' | 38:'R'


In [11]:
# -----------------------------
# Step: Create Input-Target Sequences
# -----------------------------
# block_size = number of previous characters the model sees
block_size = 8  # small context window for a toy model

# Prepare empty lists to store sequences
x_manual = []  # input sequences (lists of integers)
y_manual = []  # target characters (next character after input)

# Loop over the encoded text to create sequences
# We stop at len(encoded_text) - block_size to ensure each input sequence
# has exactly `block_size` characters
for i in range(len(encoded_text) - block_size):
    
    # Slice the encoded text from i to i+block_size to get the input sequence
    # This is the "context" the model sees
    input_seq = encoded_text[i:i+block_size]
    x_manual.append(input_seq)
    
    # The target is the very next character (token ID) after the input sequence
    target = encoded_text[i + block_size]
    y_manual.append(target)

# -----------------------------
# Sanity Check: Print first 5 examples
# -----------------------------
for i in range(5):
    print(f"Example {i+1}")
    print("Input IDs:     ", x_manual[i])          # token IDs of input
    print("Decoded Input: ", decode(x_manual[i]))  # convert IDs back to characters
    print("Target ID:     ", y_manual[i])          # token ID of target
    print("Decoded Target:", itos[y_manual[i]])    # convert ID to character
    print("---")

Example 1
Input IDs:      [40, 29, 26, 1, 37, 38, 36, 31]
Decoded Input:  THE PROJ
Target ID:      26
Decoded Target: E
---
Example 2
Input IDs:      [29, 26, 1, 37, 38, 36, 31, 26]
Decoded Input:  HE PROJE
Target ID:      24
Decoded Target: C
---
Example 3
Input IDs:      [26, 1, 37, 38, 36, 31, 26, 24]
Decoded Input:  E PROJEC
Target ID:      40
Decoded Target: T
---
Example 4
Input IDs:      [1, 37, 38, 36, 31, 26, 24, 40]
Decoded Input:   PROJECT
Target ID:      1
Decoded Target:  
---
Example 5
Input IDs:      [37, 38, 36, 31, 26, 24, 40, 1]
Decoded Input:  PROJECT 
Target ID:      28
Decoded Target: G
---


In [12]:
import numpy as np

# -----------------------------
# Hyperparameters for our toy model
# -----------------------------
vocab_size = len(chars)     # number of unique characters
embedding_dim = 16          # size of each character embedding vector
hidden_dim = 32             # size of hidden layer
block_size = 8              # context length (number of previous characters)

# -----------------------------
# Step 5a: Initialize Model Parameters
# -----------------------------
# Embedding matrix: maps token IDs → dense vectors
# Shape: vocab_size x embedding_dim
# Initialized randomly (small numbers)
W_embed = np.random.randn(vocab_size, embedding_dim) * 0.01

# Hidden layer weights: flatten embeddings -> hidden_dim
# Shape: (block_size * embedding_dim) x hidden_dim
W1 = np.random.randn(block_size * embedding_dim, hidden_dim) * 0.01
b1 = np.zeros(hidden_dim)  # bias for hidden layer

# Output layer: hidden_dim -> vocab_size logits
# Shape: hidden_dim x vocab_size
W2 = np.random.randn(hidden_dim, vocab_size) * 0.01
b2 = np.zeros(vocab_size)   # bias for output layer

# -----------------------------
# Sanity Check
# -----------------------------
print("Embedding matrix shape:", W_embed.shape)
print("Hidden layer weight shape:", W1.shape)
print("Output layer weight shape:", W2.shape)

Embedding matrix shape: (84, 16)
Hidden layer weight shape: (128, 32)
Output layer weight shape: (32, 84)


In [13]:
def softmax(x):
    """Compute softmax probabilities for a 1D array"""
    e_x = np.exp(x - np.max(x))  # subtract max for numerical stability
    return e_x / e_x.sum()

def forward(x_seq):
    """
    Forward pass for a single input sequence.
    
    x_seq: list of token IDs (length = block_size)
    
    Returns: 
        probs: probability distribution over vocab for the next character
    """
    # -----------------------------
    # Step 1: Lookup embeddings
    # -----------------------------
    # For each token ID in x_seq, get its embedding vector
    embeds = W_embed[x_seq]  # shape: block_size x embedding_dim
    
    # -----------------------------
    # Step 2: Flatten embeddings into a single vector
    # -----------------------------
    # So we can feed them into a simple feedforward layer
    h_input = embeds.flatten()  # shape: block_size * embedding_dim
    
    # -----------------------------
    # Step 3: Hidden layer with tanh activation
    # -----------------------------
    # Compute h = tanh(xW + b)
    h = np.tanh(h_input @ W1 + b1)  # shape: hidden_dim
    
    # -----------------------------
    # Step 4: Output layer → logits
    # -----------------------------
    logits = h @ W2 + b2  # shape: vocab_size
    
    # -----------------------------
    # Step 5: Softmax → probabilities
    # -----------------------------
    probs = softmax(logits)
    
    return probs

In [14]:
# Take first input sequence
x0 = x_manual[0]
y0 = y_manual[0]

# Forward pass: predicted probability distribution
probs = forward(x0)

# Predicted character ID (highest probability)
pred_id = np.argmax(probs)

# Print input, target, and prediction
print("Input sequence (decoded):", decode(x0))
print("Target character:", itos[y0])
print("Predicted character (before training):", itos[pred_id])

# Optional: show probabilities of top 5 characters
top5 = np.argsort(probs)[-5:][::-1]
print("Top 5 predictions:")
for i in top5:
    print(f"{itos[i]}: {probs[i]:.3f}")

Input sequence (decoded): THE PROJ
Target character: E
Predicted character (before training): ‘
Top 5 predictions:
‘: 0.012
R: 0.012
]: 0.012
k: 0.012
0: 0.012


In [15]:
# -----------------------------
# Hyperparameters
# -----------------------------
learning_rate = 0.1       # step size for gradient descent
num_epochs = 5            # number of passes over the dataset
sample_every = 2000       # how often to print generated text

# -----------------------------
# Training loop
# -----------------------------
for epoch in range(num_epochs):
    total_loss = 0
    
    # Loop over each input-target pair
    for i in range(len(x_manual)):
        x_seq = x_manual[i]    # input sequence of token IDs
        y_true = y_manual[i]   # target token ID
        
        # -----------------------------
        # Forward Pass
        # -----------------------------
        embeds = W_embed[x_seq]                  # Lookup embeddings: block_size x embedding_dim
        h_input = embeds.flatten()               # Flatten embeddings to a single vector
        h = np.tanh(h_input @ W1 + b1)           # Hidden layer activation: shape hidden_dim
        logits = h @ W2 + b2                     # Output layer logits: shape vocab_size
        probs = softmax(logits)                  # Probabilities over vocab
        
        # -----------------------------
        # Compute Loss (Cross-Entropy)
        # -----------------------------
        loss = cross_entropy_loss(probs, y_true)
        total_loss += loss
        
        # -----------------------------
        # Backpropagation
        # -----------------------------
        
        # Gradient of loss w.r.t logits
        dlogits = probs.copy()
        dlogits[y_true] -= 1  # derivative of cross-entropy with softmax
        
        # -------- Gradients for output layer --------
        dW2 = np.outer(h, dlogits)   # hidden_dim x vocab_size
        db2 = dlogits                # vocab_size
        
        # -------- Gradients for hidden layer --------
        dh = dlogits @ W2.T                    # propagate gradient to hidden layer: hidden_dim
        dh_raw = dh * (1 - h**2)               # tanh derivative
        
        dW1 = np.outer(h_input, dh_raw)        # block_size*embedding_dim x hidden_dim
        db1 = dh_raw                           # hidden_dim
        
        # -------- Gradients for embeddings --------
        dembed_flat = dh_raw @ W1.T            # block_size*embedding_dim
        dembed = dembed_flat.reshape(block_size, embedding_dim)  # reshape
        dW_embed = np.zeros_like(W_embed)
        for j, idx in enumerate(x_seq):
            dW_embed[idx] += dembed[j]        # accumulate gradients for each token ID
        
        # -----------------------------
        # Update Weights
        # -----------------------------
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W_embed -= learning_rate * dW_embed
        
    # -----------------------------
    # End of epoch
    # -----------------------------
    avg_loss = total_loss / len(x_manual)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    
    # -----------------------------
    # Sanity Check: Generate Sample Text
    # -----------------------------
    start_idx = np.random.randint(0, len(x_manual))
    generated_seq = x_manual[start_idx].copy()
    print("Sample start:", decode(generated_seq))
    
    # Generate 50 characters
    for _ in range(50):
        probs = forward(generated_seq[-block_size:])
        next_id = np.random.choice(len(probs), p=probs)
        generated_seq.append(next_id)
        
    print("Generated text:", decode(generated_seq))
    print("---")

NameError: name 'cross_entropy_loss' is not defined

In [None]:
# Correct dimensions
block_size = 16
embedding_dim = 32
hidden_dim = 64
vocab_size = len(chars)

np.random.seed(42)
W_embed = np.random.randn(vocab_size, embedding_dim) * 0.01
W1 = np.random.randn(block_size * embedding_dim, hidden_dim) * 0.01  # 16*32 = 512
b1 = np.zeros(hidden_dim)
W2 = np.random.randn(hidden_dim, vocab_size) * 0.01
b2 = np.zeros(vocab_size)

In [None]:
def generate_text(start_seq, length=200, temperature=0.8, top_k=5):
    """Generate text from start sequence using current model."""
    generated = start_seq.copy()
    
    for _ in range(length):
        # Take last block_size tokens
        context = generated[-block_size:]
        
        # Forward pass
        embeds = W_embed[context].flatten()
        h = np.tanh(embeds @ W1 + b1)
        logits = h @ W2 + b2
        probs = np.exp(logits) / np.sum(np.exp(logits))  # softmax
        
        # Apply temperature
        probs = probs ** (1/temperature)
        probs /= probs.sum()
        
        # Apply top-k filtering
        if top_k is not None:
            top_idx = np.argsort(probs)[-top_k:]
            mask = np.zeros_like(probs)
            mask[top_idx] = probs[top_idx]
            probs = mask / mask.sum()
        
        # Sample next token
        next_id = np.random.choice(len(probs), p=probs)
        generated.append(next_id)
    
    return generated

In [None]:
learning_rate = 0.1
num_epochs = 20  # more epochs for better learning

for epoch in range(num_epochs):
    total_loss = 0
    
    for i in range(len(x_manual)):
        x_seq = x_manual[i]
        y_true = y_manual[i]
        
        # Forward pass
        embeds = W_embed[x_seq].flatten()
        h = np.tanh(embeds @ W1 + b1)
        logits = h @ W2 + b2
        probs = np.exp(logits) / np.sum(np.exp(logits))
        
        # Loss
        loss = -np.log(probs[y_true] + 1e-9)
        total_loss += loss
        
        # Backpropagation
        dlogits = probs.copy()
        dlogits[y_true] -= 1
        
        dW2 = np.outer(h, dlogits)
        db2 = dlogits
        
        dh = dlogits @ W2.T
        dh_raw = dh * (1 - h**2)
        
        dW1 = np.outer(embeds, dh_raw)
        db1 = dh_raw
        
        dembed_flat = dh_raw @ W1.T
        dembed = dembed_flat.reshape(block_size, embedding_dim)
        dW_embed = np.zeros_like(W_embed)
        for j, idx in enumerate(x_seq):
            dW_embed[idx] += dembed[j]
        
        # Update weights
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W_embed -= learning_rate * dW_embed
    
    avg_loss = total_loss / len(x_manual)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    
    # Sample text every 5 epochs
    if (epoch+1) % 5 == 0:
        start_idx = np.random.randint(0, len(x_manual))
        start_seq = x_manual[start_idx]
        generated_seq = generate_text(start_seq, length=200, temperature=0.8, top_k=5)
        print("Sample generation:")
        print(decode(generated_seq))
        print("---")

In [None]:
x_seq = x_manual[0]
embeds = W_embed[x_seq].flatten()
print("embeds.shape:", embeds.shape)
print("W1.shape:", W1.shape)
h = np.tanh(embeds @ W1 + b1)
print("h.shape:", h.shape)