In [348]:
import random

import matplotlib.pyplot as plt
import numpy as n  # original alias, kept in case any cell still refers to `n`
import numpy as np  # alias used by every cell in this notebook

In [279]:
def softmax(x):
    """
    Numerically stable softmax taken along axis 0 (one distribution per column).

    Arguments:
    x -- scores, array-like of shape (n_classes,) or (n_classes, m)

    Returns:
    numpy array with the same shape as x whose entries along axis 0 are
    non-negative and sum to 1
    """
    x = np.asarray(x)
    # Shift every column by its OWN maximum. Shifting by the global maximum
    # lets a column that is far below it underflow to all zeros, which then
    # turns the normalization into 0/0 = NaN.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / np.sum(e_x, axis=0, keepdims=True)


def sigmoid(x):
    """
    Logistic function 1 / (1 + exp(-x)), applied element-wise.

    Arguments:
    x -- scalar or numpy array

    Returns:
    value(s) in (0, 1) with the same shape as x
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [280]:
def rnn_cell_forward(xt, a_prev, parameters):
    """
    Run one time-step of the vanilla RNN cell.

    Arguments:
    xt -- input at timestep "t", numpy array of shape (n_x, m)
    a_prev -- hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wax -- input-to-hidden weights, shape (n_a, n_x)
                        Waa -- hidden-to-hidden weights, shape (n_a, n_a)
                        Wya -- hidden-to-output weights, shape (n_y, n_a)
                        ba -- hidden bias, shape (n_a, 1)
                        by -- output bias, shape (n_y, 1)

    Returns:
    a_next -- new hidden state, shape (n_a, m)
    yt_pred -- softmax prediction at timestep "t", shape (n_y, m)
    cache -- tuple (a_next, a_prev, xt, parameters) kept for the backward pass
    """
    Wax, Waa, Wya, ba, by = (
        parameters[key] for key in ("Wax", "Waa", "Wya", "ba", "by")
    )

    # Hidden state: tanh of the recurrent term plus the input term plus bias.
    hidden_pre = np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba   # (n_a, m)
    a_next = np.tanh(hidden_pre)

    # Output distribution over the n_y classes for each of the m columns.
    logits = np.dot(Wya, a_next) + by                         # (n_y, m)
    yt_pred = softmax(logits)

    return a_next, yt_pred, (a_next, a_prev, xt, parameters)

In [281]:
def rnn_forward(x, a0, parameters):
    """
    Unroll the RNN cell over every time-step of the input.

    Arguments:
    x -- input for every time-step, shape (n_x, m, T_x)
    a0 -- initial hidden state, shape (n_a, m)
    parameters -- python dictionary with the weight matrices and biases

    Returns:
    a -- hidden states for every time-step, shape (n_a, m, T_x)
    y_pred -- predictions for every time-step, shape (n_y, m, T_x)
    caches -- tuple (list of per-step caches, x) needed for the backward pass
    """
    _, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))
    step_caches = []

    hidden = a0
    for t in range(T_x):
        # Feed the slice for step t and carry the hidden state forward.
        hidden, y_pred[:, :, t], step_cache = rnn_cell_forward(x[:, :, t], hidden, parameters)
        a[:, :, t] = hidden
        step_caches.append(step_cache)

    return a, y_pred, (step_caches, x)

In [313]:
def rnn_cell_backward(da_next, cache):
    """
    Implements the backward pass for a single RNN cell (1 time-step).

    Arguments:
    da_next -- Gradient of the loss w.r.t. this step's hidden state, shape (n_a, m)
    cache -- Tuple (a_next, a_prev, xt, parameters) produced by the forward step.
             Only parameters["Wax"] and parameters["Waa"] are read here; the
             output-layer entries (Wya, by) and ba are not needed.

    Returns:
    gradients -- Dictionary containing:
        dxt -- Gradient of input data at time-step t, of shape (n_x, m)
        da_prev -- Gradient w.r.t. previous hidden state, of shape (n_a, m)
        dWax -- Gradient w.r.t. input weight matrix, shape (n_a, n_x)
        dWaa -- Gradient w.r.t. hidden state weight matrix, shape (n_a, n_a)
        dba -- Gradient w.r.t. bias vector, shape (n_a, 1)
    """
    a_next, a_prev, xt, parameters = cache

    Wax = parameters["Wax"]   # (n_a, n_x)
    Waa = parameters["Waa"]   # (n_a, n_a)

    # Back through tanh: a_next = tanh(z)  =>  dz = (1 - a_next^2) * da_next
    dtanh = (1 - a_next**2) * da_next                    # (n_a, m)

    return {
        "dxt": np.dot(Wax.T, dtanh),                     # (n_x, m)
        "da_prev": np.dot(Waa.T, dtanh),                 # (n_a, m)
        "dWax": np.dot(dtanh, xt.T),                     # (n_a, n_x)
        "dWaa": np.dot(dtanh, a_prev.T),                 # (n_a, n_a)
        # The bias is broadcast over the m columns, so its gradient sums them.
        "dba": np.sum(dtanh, axis=1, keepdims=True),     # (n_a, 1)
    }

In [330]:
def rnn_backward(da, caches):
    """
    Backpropagation through time for the vanilla RNN over a whole sequence.

    Arguments:
    da -- Gradients of all hidden states, shape (n_a, m, T_x)
    caches -- Tuple (list of per-step caches, x) returned by rnn_forward

    Returns:
    gradients -- Dictionary with:
        dx -- Gradient of input data, shape (n_x, m, T_x)
        da0 -- Gradient of initial hidden state, shape (n_a, m)
        dWax -- Gradient of input weights, shape (n_a, n_x)
        dWaa -- Gradient of hidden weights, shape (n_a, n_a)
        dba -- Gradient of bias vector, shape (n_a, 1)
    """
    step_caches, _ = caches
    # The first step's input is only used to recover n_x.
    _, _, first_xt, _ = step_caches[0]

    n_a, m, T_x = da.shape
    n_x, _ = first_xt.shape

    dx = np.zeros((n_x, m, T_x))
    # Weight gradients are shared across time, so they accumulate over steps.
    totals = {
        "dWax": np.zeros((n_a, n_x)),
        "dWaa": np.zeros((n_a, n_a)),
        "dba": np.zeros((n_a, 1)),
    }

    # Gradient flowing back from step t+1 into the hidden state of step t.
    carried = np.zeros((n_a, m))

    for t in range(T_x - 1, -1, -1):
        step = rnn_cell_backward(da[:, :, t] + carried, step_caches[t])

        dx[:, :, t] = step["dxt"]
        for key in ("dWax", "dWaa", "dba"):
            totals[key] += step[key]
        carried = step["da_prev"]

    # Whatever flows out of the first step is the gradient w.r.t. a0.
    return {
        "dx": dx,
        "da0": carried,
        "dWax": totals["dWax"],
        "dWaa": totals["dWaa"],
        "dba": totals["dba"],
    }

In [331]:
def initialize_parameters(n_a, n_x, n_y):
    """
    Create small random weights and zero biases for the RNN.

    Note: reseeds NumPy's global random generator with 1 on every call.

    Arguments:
    n_a -- number of hidden units
    n_x -- input dimension
    n_y -- output dimension

    Returns:
    parameters -- python dictionary with Wax (n_a, n_x), Waa (n_a, n_a),
                  Wya (n_y, n_a), ba (n_a, 1) and by (n_y, 1)
    """
    np.random.seed(1)
    scale = 0.01
    # The dict entries are evaluated top to bottom, which keeps the order of
    # the random draws: Wax, then Waa, then Wya.
    return {
        'Wax': np.random.randn(n_a, n_x) * scale,
        'Waa': np.random.randn(n_a, n_a) * scale,
        'Wya': np.random.randn(n_y, n_a) * scale,
        'ba': np.zeros((n_a, 1)),
        'by': np.zeros((n_y, 1)),
    }

In [332]:
def update_parameters(parameters, gradients, lr):
    """
    Apply one gradient-descent step to every parameter, in place.

    Arguments:
    parameters -- python dictionary with Wax, Waa, Wya, ba, by (modified in place)
    gradients -- python dictionary with dWax, dWaa, dWya, dba, dby
    lr -- learning rate

    Returns:
    parameters -- the same dictionary object, after the update
    """
    for key in ('Wax', 'Waa', 'Wya', 'ba', 'by'):
        # The gradient for parameter "K" is stored under "dK".
        parameters[key] += -lr * gradients['d' + key]
    return parameters

In [333]:
def clip_gradients(gradients, max_value):
    """
    Clip the parameter gradients element-wise to [-max_value, max_value].

    The five arrays are clipped in place. The returned dictionary is a new
    object that holds only those five entries; any other key in the input
    (for example dx or da0) is not carried over.

    Arguments:
    gradients -- python dictionary containing dWaa, dWax, dWya, dba, dby
    max_value -- clipping threshold

    Returns:
    gradients -- python dictionary with the clipped dWaa, dWax, dWya, dba, dby
    """
    clipped = {
        name: gradients[name]
        for name in ("dWaa", "dWax", "dWya", "dba", "dby")
    }

    for grad in clipped.values():
        np.clip(grad, -max_value, max_value, out=grad)

    return clipped

In [334]:
def sample(parameters, char_to_ix, length=20, temperature=1.0, start_char=None):
    """
    Sample a sequence of character indices from the RNN.

    Arguments:
    parameters -- python dictionary containing Waa, Wax, Wya, by and ba
    char_to_ix -- python dictionary mapping each character to its index;
                  it must contain the newline character, which acts as the
                  end-of-name token
    length -- maximum number of characters to draw (start_char is not counted)
    temperature -- divisor applied to the logits before softmax; it must be
                   non-zero, and values below 1 make the draws more peaked
    start_char -- optional character used as the first input and recorded as
                  the first index; when None the first input is all zeros

    Returns:
    indices -- list of character indices. Sampling stops when the newline
               index is drawn or after `length` draws; a newline index is
               appended only when the last draw was not already a newline.
    """
    Waa, Wax, Wya = parameters['Waa'], parameters['Wax'], parameters['Wya']
    by, ba = parameters['by'], parameters['ba']
    vocab_size = by.shape[0]
    n_a = Waa.shape[0]
    newline_char = char_to_ix['\n']

    x = np.zeros((vocab_size, 1))
    indices = []

    if start_char is not None:
        start_ix = char_to_ix[start_char]
        x[start_ix] = 1
        indices.append(start_ix)

    a_prev = np.zeros((n_a, 1))

    idx = -1          # sentinel: nothing drawn yet
    counter = 0

    while idx != newline_char and counter < length:
        a = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, x) + ba)
        z = np.dot(Wya, a) + by

        # Apply temperature before softmax
        y = softmax(z / temperature)

        # Draw the next character from the temperature-scaled distribution
        idx = np.random.choice(vocab_size, p=y.ravel())

        # The drawn character becomes the next one-hot input
        x = np.zeros((vocab_size, 1))
        x[idx] = 1
        indices.append(idx)

        a_prev = a
        counter += 1

    # Terminate the name only if sampling did not already end on a newline.
    # Checking `counter == length` instead would add a second newline whenever
    # the newline happened to be drawn on the very last allowed step.
    if idx != newline_char:
        indices.append(newline_char)

    return indices

In [335]:
# Read the whole corpus and lower-case it. The context manager closes the
# file even if read() raises.
with open('dinos.txt', 'r') as file:
    data = file.read().lower()

In [336]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(data))
print(chars)

['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [337]:
# Size of the character vocabulary and of the raw corpus.
vocab_size = len(chars)
data_size = len(data)

print('Unique Chars:', vocab_size)
# len(data) counts characters (newlines included), not names.
print('Total Characters:', data_size)

Unique Chars: 27
Total Names: 19909


In [338]:
# Two-way lookup between characters and their position in `chars`.
ix_to_char = dict(enumerate(chars))
char_to_ix = {ch: ix for ix, ch in ix_to_char.items()}

In [339]:
# Show both lookup tables, character -> index first.
for mapping in (char_to_ix, ix_to_char):
    print(mapping)

{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


In [340]:
def create_dataset(data):
    """
    Build one training example per line of the corpus.

    Every name is lower-cased and wrapped in a newline on both sides. The
    input sequence is that string without its last character (so it starts
    with the newline) and the target sequence is the same string without its
    first character (so it ends with the newline).

    Arguments:
    data -- the corpus as one string, names separated by newlines

    Returns:
    examples -- List of (x_seq, y_seq) pairs for each name
    """
    def to_pair(name):
        wrapped = '\n' + name.lower() + '\n'   # start and end token
        return wrapped[:-1], wrapped[1:]

    return [to_pair(name) for name in data.strip().split('\n')]

# Build one (input, target) pair per name and preview the first three.
examples = create_dataset(data)
examples[:3]

[('\naachenosaurus', 'aachenosaurus\n'),
 ('\naardonyx', 'aardonyx\n'),
 ('\nabdallahsaurus', 'abdallahsaurus\n')]

In [341]:
def vectorize_example(x_seq, y_seq, char_to_ix, vocab_size):
    """
    One-hot encode an (input, target) pair of character sequences.

    Arguments:
    x_seq -- input characters; its length sets the number of time-steps T_x
    y_seq -- target characters, read at the same positions as x_seq
    char_to_ix -- python dictionary mapping each character to its index
    vocab_size -- number of rows of the one-hot encoding

    Returns:
    x -- one-hot inputs, numpy array of shape (vocab_size, 1, T_x)
    y -- one-hot targets, numpy array of shape (vocab_size, 1, T_x)
    """
    T_x = len(x_seq)
    shape = (vocab_size, 1, T_x)   # the middle axis is a batch of one
    x, y = np.zeros(shape), np.zeros(shape)

    for t, ch in enumerate(x_seq):
        x[char_to_ix[ch], 0, t] = 1
        y[char_to_ix[y_seq[t]], 0, t] = 1

    return x, y

In [342]:
def get_dino_name_from_indices(indices, ix_to_char):
    """
    Turn a list of character indices into a capitalized name.

    Every index is mapped to its character, including the newline index, so
    a trailing newline in `indices` stays in the returned string.

    Arguments:
    indices -- iterable of character indices
    ix_to_char -- python dictionary mapping each index to its character

    Returns:
    the joined characters with the first one upper-cased and the rest lower-cased
    """
    joined = ''.join(ix_to_char[ix] for ix in indices)
    return joined.capitalize()

In [350]:
# Loss history (filled in by the training loop) and model sizes.
losses = []
n_a = 64                  # hidden units
n_x = n_y = vocab_size    # one-hot characters in, character scores out

In [360]:
# Fresh weights sized from n_a / n_x / n_y. initialize_parameters() also calls
# np.random.seed(1), so NumPy's global random stream restarts from that seed here.
parameters = initialize_parameters(n_a=n_a, n_x=n_x, n_y=n_y)

In [361]:
# Stochastic training: every iteration fits one randomly chosen name.
learning_rate = 0.001
a0 = np.zeros((n_a, 1))   # every name starts from a zero hidden state

for epoch in range(50000):
    # Sample one random training pair
    idx = np.random.randint(0, len(examples))
    x_seq, y_seq = examples[idx]
    x, y = vectorize_example(x_seq, y_seq, char_to_ix, vocab_size)
    T_x = x.shape[2]

    # Forward pass over the whole name
    a, y_pred, caches = rnn_forward(x, a0, parameters)

    # Softmax + cross-entropy: gradient w.r.t. the logits is (prediction - target)
    dy = y_pred - y

    # Per time-step: push dy back into the hidden state (da) and accumulate
    # the output-layer gradients (dWya, dby), which are shared across steps.
    da = np.zeros_like(a)
    dWya = np.zeros_like(parameters["Wya"])
    dby = np.zeros_like(parameters["by"])
    for t in range(T_x):
        dy_t = dy[:, :, t]
        da[:, :, t] = np.dot(parameters["Wya"].T, dy_t)
        dWya += np.dot(dy_t, a[:, :, t].T)
        dby += np.sum(dy_t, axis=1, keepdims=True)

    # Backward pass through time for the recurrent weights
    gradients = rnn_backward(da, caches)
    gradients["dWya"] = dWya
    gradients["dby"] = dby

    gradients = clip_gradients(gradients, max_value=5.0)
    parameters = update_parameters(parameters, gradients, learning_rate)

    # Mean cross-entropy per character (uses the pre-update predictions)
    loss = -np.sum(y * np.log(y_pred + 1e-8)) / T_x

    # Report only every 1000th iteration
    if epoch % 1000 != 0:
        continue

    losses.append(loss)
    print(f"Epoch {epoch}, Loss: {loss:.4f}")
    sam_indices = sample(parameters, char_to_ix, length=20, temperature=1.0)
    print("Sample:", sam_indices)
    print("Sample Name:", get_dino_name_from_indices(sam_indices, ix_to_char))
    print("-" * 50)

Epoch 0, Loss: 3.2963
Sample: [22, 23, 17, 9, 5, 5, 11, 22, 23, 2, 24, 24, 14, 16, 3, 2, 0]
Sample Name: Vwqieekvwbxxnpcb

--------------------------------------------------
Epoch 1000, Loss: 2.9999
Sample: [5, 14, 16, 1, 3, 23, 16, 8, 19, 18, 7, 16, 23, 7, 11, 19, 12, 26, 21, 3, 0]
Sample Name: Enpacwphsrgpwgkslzuc

--------------------------------------------------
Epoch 2000, Loss: 2.5650
Sample: [16, 19, 25, 19, 12, 18, 21, 1, 21, 9, 9, 9, 22, 5, 0]
Sample Name: Psyslruauiiive

--------------------------------------------------
Epoch 3000, Loss: 2.5197
Sample: [15, 18, 19, 1, 19, 15, 15, 18, 18, 9, 21, 0]
Sample Name: Orsasoorriu

--------------------------------------------------
Epoch 4000, Loss: 2.2748
Sample: [20, 18, 21, 19, 20, 14, 1, 1, 19, 21, 0]
Sample Name: Trustnaasu

--------------------------------------------------
Epoch 5000, Loss: 2.7683
Sample: [1, 15, 1, 9, 18, 21, 16, 21, 6, 1, 18, 18, 16, 21, 19, 0]
Sample Name: Aoairupufarrpus

---------------------------------

In [392]:
# Up to ten names, each seeded with a random character of the vocabulary;
# a draw of the newline token is skipped without a replacement draw.
for _ in range(10):
    start_char = random.choice(list(char_to_ix))
    if start_char != '\n':
        indices = sample(parameters, char_to_ix, length=30, temperature=0.5, start_char=start_char)
        name = get_dino_name_from_indices(indices, ix_to_char)
        print(f"{start_char.upper()} ----> {name}")

C ----> Chuanosaurus

V ----> Veratos

C ----> Cratasaurus

J ----> Juranosaurus

T ----> Tanlosaurus

W ----> Wianosaurus

V ----> Vurichisaurus

B ----> Borosaurus

Y ----> Yropatos

T ----> Touryratrus



In [393]:
# Append one sampled name per letter a-z to generated_names.txt
# (mode "a": earlier contents of the file are kept).
with open("generated_names.txt", "a") as f:
    for ch in "abcdefghijklmnopqrstuvwxyz":
        if ch not in char_to_ix:
            continue
        indices = sample(parameters, char_to_ix, length=30, temperature=0.8, start_char=ch)
        name = get_dino_name_from_indices(indices, ix_to_char)
        f.write(f"{ch.upper()} ----> {name}\n")