In [2]:
import numpy as np

# Set seed for reproducibility (legacy global NumPy RNG; it drives the
# np.random.randn weight initialisation below).
np.random.seed(42)

# Define the text: a single four-word training sequence.
text = ["I", "love", "deep", "learning"]

# Build vocabulary.
# The unique words are sorted on purpose: iterating a bare set of strings can
# yield a different order in every interpreter session (string hashing is
# randomised), which would change the word -> index mapping between runs even
# though the NumPy seed is fixed.
vocab = sorted(set(text))
vocab_size = len(vocab)
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for word, i in word_to_idx.items()}

# Hyperparameters
input_size = vocab_size    # one input entry per vocabulary word
hidden_size = 8            # width of the recurrent hidden state
output_size = vocab_size   # one output score per vocabulary word
learning_rate = 0.1        # step size used by the parameter updates
epochs = 100               # number of training iterations

# Initialize weights with small Gaussian noise and biases with zeros.
# Keep the order of the randn calls: it determines which random draws each
# matrix receives for a given seed.
Wx = np.random.randn(hidden_size, input_size) * 0.01   # input  -> hidden
Wh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
b = np.zeros((hidden_size, 1))                         # hidden bias

Wy = np.random.randn(output_size, hidden_size) * 0.01  # hidden -> output
by = np.zeros((output_size, 1))                        # output bias

# One-hot encoding
def one_hot(idx, vocab_size):
    """Return a (vocab_size, 1) float column that is 1 at row `idx` and 0 elsewhere.

    Args:
        idx: Row to switch on (normal NumPy indexing rules apply).
        vocab_size: Number of rows in the returned column vector.

    Returns:
        A freshly allocated NumPy array of shape (vocab_size, 1).
    """
    column = np.zeros(shape=(vocab_size, 1))
    column[idx] = 1.0
    return column

# Split the sequence: every word but the last is network input, the last word
# is the label to predict.
context_words = text[:-1]
target_word = text[-1]

# Inputs become one-hot column vectors; the target stays a vocabulary index.
inputs = [one_hot(word_to_idx[w], vocab_size) for w in context_words]
target = word_to_idx[target_word]

# Training loop: one full forward/backward pass and one parameter update per
# epoch on the single training sequence.
for epoch in range(epochs):
    # ---- Forward pass ----
    # xs[t] is the input at step t; hs[t] is the hidden state after step t,
    # with hs[-1] holding the all-zero initial state.
    xs = dict(enumerate(inputs))
    hs = {-1: np.zeros((hidden_size, 1))}
    for t in xs:
        hs[t] = np.tanh(np.dot(Wx, xs[t]) + np.dot(Wh, hs[t - 1]) + b)
    hprev = hs[len(inputs) - 1]  # final hidden state

    # ---- Output: softmax over the scores of the final hidden state ----
    y = np.dot(Wy, hprev) + by
    exp_y = np.exp(y - y.max())  # subtract the max for numerical stability
    p = exp_y / exp_y.sum()

    # ---- Loss: negative log-probability of the target word ----
    loss = -np.log(p[target, 0])

    # ---- Backward pass ----
    # Gradient of the loss w.r.t. the output scores: p - one_hot(target).
    dy = p.copy()
    dy[target] -= 1

    # Output-layer gradients (the loss only depends on the last step).
    dWy = np.dot(dy, hprev.T)
    dby = dy.copy()

    # Recurrent-layer gradients are accumulated over all time steps.
    dWx, dWh, db = np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)

    # Nothing follows the last step, so the incoming hidden gradient is zero.
    dhnext = np.zeros_like(hs[0])
    dh = np.dot(Wy.T, dy) + dhnext

    # Walk backwards through time, pushing dh through tanh and Wh.
    for t in range(len(inputs) - 1, -1, -1):
        dhraw = (1 - hs[t] * hs[t]) * dh  # derivative of tanh is 1 - tanh^2
        db += dhraw
        dWx += np.dot(dhraw, xs[t].T)
        dWh += np.dot(dhraw, hs[t - 1].T)
        dh = np.dot(Wh.T, dhraw)

    # ---- Clip each gradient element-wise to [-1, 1], then update in place ----
    for param, grad in ((Wx, dWx), (Wh, dWh), (b, db), (Wy, dWy), (by, dby)):
        np.clip(grad, -1, 1, out=grad)
        param -= learning_rate * grad

    # ---- Report the loss every 10th epoch and on the final epoch ----
    is_last_epoch = epoch == epochs - 1
    if is_last_epoch or epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.6f}")

# After training: run the input sequence through the network once more and
# report the most probable next word.
hprev = np.zeros((hidden_size, 1))  # start from the all-zero hidden state
for x_t in inputs:
    hprev = np.tanh(np.dot(Wx, x_t) + np.dot(Wh, hprev) + b)

# Softmax over the output scores of the final hidden state
y = np.dot(Wy, hprev) + by
exp_y = np.exp(y - y.max())  # subtract the max for numerical stability
p = exp_y / exp_y.sum()

# The most probable vocabulary index, mapped back to its word
predicted_idx = p.argmax()
predicted_word = idx_to_word[predicted_idx]

print(f"\nPredicted word: {predicted_word}")


Epoch 0, Loss: 1.386635
Epoch 10, Loss: 0.805816
Epoch 20, Loss: 0.499877
Epoch 30, Loss: 0.336444
Epoch 40, Loss: 0.240357
Epoch 50, Loss: 0.178404
Epoch 60, Loss: 0.135910
Epoch 70, Loss: 0.105717
Epoch 80, Loss: 0.083808
Epoch 90, Loss: 0.067656
Epoch 99, Loss: 0.056629

Predicted word: learning
