In [1]:
import numpy as np
from collections import defaultdict
from torch.utils import data

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Generate Dataset
# Seed NumPy's global RNG so that the np.random calls used further down
# (sequence lengths in generate_dataset, weight draws in
# init_orthogonal_weights) give the same values after a kernel restart.
np.random.seed(42)

In [3]:
def generate_dataset(num_sequences=2**8):
    """
    Builds a list of random strings of the form "a...ab...bEOS".

    Every string is made of k copies of "a", followed by k copies of "b" and
    the literal suffix "EOS"; k comes from np.random.randint(1, 12), i.e. it
    lies between 1 and 11.

    Args:
        num_sequences --> how many strings to generate

    Returns:
        list of str with `num_sequences` entries
    """

    def make_sequence():
        # Exactly one random draw per sequence, in generation order
        run_length = np.random.randint(1, 12)
        return "a" * run_length + "b" * run_length + "EOS"

    return [make_sequence() for _ in range(num_sequences)]

In [4]:
def word_encoding(sequences):
    """
    Builds the vocabulary and the lookup tables for a collection of sequences.

    Each sequence is iterated element by element, so a plain string is split
    into single characters while a list of tokens keeps every token intact.

    Args:
        sequences --> iterable of sequences (each one an iterable of tokens)

    Returns:
        word_to_idx --> defaultdict token -> index; unseen tokens get the 'UNK' index
        idx_to_word --> defaultdict index -> token; unknown indices give 'UNK'
        vocab_size --> number of distinct tokens plus one for 'UNK'
    """

    # Count how often every token occurs across all sequences
    word_to_count = defaultdict(int)
    for sequence in sequences:
        for word in sequence:
            word_to_count[word] += 1

    # Most frequent first; sorted() is stable, so ties keep first-seen order
    by_frequency = sorted(word_to_count.items(), key=lambda item: -item[1])

    # List of unique words, with the unknown-token placeholder at the end
    dictionary = [word for word, _ in by_frequency]
    dictionary.append('UNK')
    vocab_size = len(dictionary)

    # Both mappings fall back to the 'UNK' entry for anything not seen here.
    # (Being defaultdicts, a lookup of a missing key also stores that key.)
    word_to_idx = defaultdict(lambda: vocab_size - 1)
    idx_to_word = defaultdict(lambda: 'UNK')
    for idx, word in enumerate(dictionary):
        word_to_idx[word] = idx
        idx_to_word[idx] = word

    return word_to_idx, idx_to_word, vocab_size

In [56]:
def one_hot_encode(idx, vocab_size):
    """
    Builds the one-hot vector for a single vocabulary index.

    Args:
     `idx`: position that is set to one
     `vocab_size`: length of the resulting vector

    Returns a 1-D numpy array of length `vocab_size` that is zero everywhere
    except for a 1.0 at position `idx`.
    """
    encoded = np.zeros(vocab_size)
    encoded[idx] = 1.0
    return encoded


def one_hot_encode_sequence(sequence, vocab_size, word_to_idx):
    """
    One-hot encodes a sequence of words given a fixed vocabulary size.

    Args:
     `sequence`: an iterable of words to encode (a string is encoded
                 character by character)
     `vocab_size`: the size of the vocabulary
     `word_to_idx`: mapping from word to its vocabulary index

    Returns a 3-D numpy array of shape (num words, vocab size, 1). An empty
    sequence yields an array of shape (0, vocab size, 1).
    """
    # Materialise first so that the length is known even for generators
    words = list(sequence)

    # Allocating the final shape up front keeps the empty-sequence case valid;
    # building a 2-D array and reshaping fails when there are no rows.
    encoding = np.zeros((len(words), vocab_size, 1))
    for position, word in enumerate(words):
        encoding[position, word_to_idx[word], 0] = 1.0

    return encoding

In [6]:
class Dataset(data.Dataset):
    """
    Indexable container that pairs each input sequence with its target.

    Item `i` is the tuple `(inputs[i], targets[i])`; the length is the number
    of inputs.
    """

    def __init__(self, inputs, targets):
        # Stored as given; no copy and no length check is made
        self.X, self.y = inputs, targets

    def __len__(self):
        # The inputs alone define the number of samples
        return len(self.X)

    def __getitem__(self, index):
        sample, label = self.X[index], self.y[index]
        return sample, label

In [7]:
def prepare_data(sequences, train_size=0.8, test_size=0.1, val_size=0.1):
    """
    Splits the sequences into train / test / validation sets of
    (input, target) pairs for next-token prediction.

    The split sizes are `int(fraction * len(sequences))`, so a few sequences
    between the test slice and the validation slice can be left unused when
    the fractions do not divide the total evenly.

    Args:
        sequences --> sliceable collection of sequences
        train_size --> fraction taken from the front for training
        test_size --> fraction taken right after the training slice
        val_size --> fraction taken from the end for validation

    Returns:
        train_set, test_set, val_set --> Dataset objects whose items are
        (sequence[:-1], sequence[1:]) pairs
    """

    # Split data
    total = len(sequences)
    num_train = int(train_size * total)
    num_test = int(test_size * total)
    num_val = int(val_size * total)

    train_seq = sequences[:num_train]
    test_seq = sequences[num_train:num_train + num_test]
    # Use an explicit start index for the tail: `sequences[-num_val:]` returns
    # the WHOLE collection when num_val == 0, because -0 == 0.
    val_seq = sequences[max(total - num_val, 0):]

    # prepare input & target sequences
    def prepare_sequences(sequences):
        # Input is everything but the last element, target is the same
        # sequence shifted by one position.
        inputs = [sequence[:-1] for sequence in sequences]
        targets = [sequence[1:] for sequence in sequences]
        return inputs, targets

    train_inputs, train_targets = prepare_sequences(train_seq)
    test_inputs, test_targets = prepare_sequences(test_seq)
    val_inputs, val_targets = prepare_sequences(val_seq)

    # create datasets
    train_set = Dataset(train_inputs, train_targets)
    test_set = Dataset(test_inputs, test_targets)
    val_set = Dataset(val_inputs, val_targets)

    return train_set, test_set, val_set

In [8]:
# RNN from scratch

In [9]:
def init_orthogonal_weights(dim1, dim2):
    """
    Draws a random (dim1, dim2) weight matrix from the Q factor of a QR
    factorisation of a standard-normal matrix.

    When dim1 < dim2 the factorisation is done on the transposed matrix and
    the result is transposed back, so the returned array always has shape
    (dim1, dim2).

    Args:
        dim1 --> number of rows
        dim2 --> number of columns

    Returns:
        numpy array of shape (dim1, dim2)
    """
    wide = dim1 < dim2

    # initialize
    gaussian = np.random.randn(dim1, dim2)
    tall = gaussian.T if wide else gaussian

    # QR factorization (Q = orthogonal)
    q, r = np.linalg.qr(tall)

    # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf:
    # scale every column of Q by the sign of the matching diagonal entry of R
    signs = np.sign(np.diagonal(r))
    q = q * signs

    return q.T if wide else q

In [10]:
def init_rnn(hidden_size, vocab_size):
    '''
    Initializes RNN

    Args:
        hidden_size --> hidden state dimensions
        vocab_size --> input vector dimensions

    Returns:
        U --> Weight matrix applied to input, passed to hidden state, shape (hidden_size, vocab_size)
        V --> Weight matrix from previous hidden state passed to hidden state, shape (hidden_size, hidden_size)
        W --> Weight matrix applied to output from hidden state to give final output, shape (vocab_size, hidden_size)

        bias_hidden = bias applied in hidden state, zeros of shape (hidden_size, 1)
        bias_output = bias applied to output, zeros of shape (vocab_size, 1)
    '''

    U = init_orthogonal_weights(hidden_size, vocab_size)
    V = init_orthogonal_weights(hidden_size, hidden_size)
    W = init_orthogonal_weights(vocab_size, hidden_size)

    # Biases are column vectors matching the hidden state / output vector.
    # They were previously created as square orthogonal matrices, which do not
    # have the shape of a bias and only "worked" through NumPy broadcasting.
    bias_hidden = np.zeros((hidden_size, 1))
    bias_output = np.zeros((vocab_size, 1))

    return (U, V, W, bias_hidden, bias_output)

In [11]:
# Activation Functions

In [12]:
def sigmoid(x, derivative=False):
    """
    Logistic sigmoid of array x, or its derivative.

    Args:
        x --> input array
        derivative --> if True, return s * (1 - s) with s = sigmoid(x)
                       instead of s itself
    """
    # The input is nudged by a tiny constant before the exponential
    shifted = x + 1e-12
    activation = 1 / (1 + np.exp(-shifted))

    return activation * (1 - activation) if derivative else activation

In [13]:
def tanh(x, derivative=False):
    """
    Computes tanh of array x

    Args:
        x --> input array
        derivative --> when set to True will return derivative instead of forward pass

    Returns:
        tanh(x), or 1 - tanh(x)**2 when `derivative` is True
    """

    x_safe = x + 1e-12
    # np.tanh stays finite for large |x|; the explicit
    # (e^x - e^-x) / (e^x + e^-x) form overflows to inf / inf = nan there.
    f = np.tanh(x_safe)

    if derivative:
        # d/dx tanh(x) = 1 - tanh(x)^2  (f * (1 - f) is the sigmoid formula)
        return 1 - f**2
    else:
        return f

In [14]:
def softmax(x):
    """
    Computes softmax of array x

    The normalisation runs over ALL elements of x (no axis argument), so the
    entries of the result sum to one.

    Args:
        x --> input array

    Returns:
        array with the same shape as x
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array
        return x

    # Softmax is invariant to adding a constant, so subtracting the maximum
    # changes nothing mathematically but keeps np.exp from overflowing.
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)

In [15]:
# Forward Pass

In [16]:
def forward_pass(inputs, hidden_state, parameters):
    """
    Runs the RNN over one sequence.

    For every input x the hidden state is updated as
    tanh(U.x + V.h_previous) and the output is W.h (no softmax is applied
    here). The two bias entries of `parameters` are unpacked but not used.

    Args:
        inputs --> sequence of input vectors
        hidden_state --> initial hidden state
        parameters --> (U, V, W, bias_hidden, bias_output)

    Returns:
        outputs --> list with one output vector per input
        hidden_states --> list holding the initial hidden state followed by
                          one hidden state per input
    """
    U, V, W, _bias_hidden, _bias_output = parameters

    hidden_states = [hidden_state]
    outputs = []

    for x in inputs:
        previous = hidden_states[-1]
        current = tanh(np.dot(U, x) + np.dot(V, previous))

        hidden_states.append(current)
        outputs.append(np.dot(W, current))

    return outputs, hidden_states

In [17]:
def clip_gradient_norm(grads, max_norm=0.25):
    """
    Rescales a collection of gradients so their joint L2 norm is at most
    `max_norm` (guards against exploding gradients).

    The arrays in `grads` are scaled IN PLACE and the same collection is
    returned. Nothing is changed when the joint norm is already small enough.
    """
    limit = float(max_norm)

    # Joint L2 norm over every gradient array
    squared_total = sum(np.sum(np.power(g, 2)) for g in grads)
    global_norm = np.sqrt(squared_total)

    # The small constant avoids dividing by zero for all-zero gradients
    scale = limit / (global_norm + 1e-6)

    if scale < 1:
        for g in grads:
            g *= scale

    return grads

In [18]:
def cross_entropy_loss(output, target):
    """
    Cross entropy between a predicted distribution and a target.

    Sums target[j] * log(output[j] + 1e-9) over the first axis of `output`
    and returns the negated sum; the 1e-9 keeps the logarithm finite for
    zero probabilities.
    """
    weighted_log_probs = (
        target[j] * np.log(output[j] + 1e-9) for j in range(len(output))
    )
    return -sum(weighted_log_probs)

In [19]:
def backward_pass(inputs, outputs, hidden_states, targets, params):
    """
    Backpropagation through time for the RNN.

    Args:
        inputs --> sequence of input vectors
        outputs --> list of unnormalised output vectors (W.h), one per input
        hidden_states --> list holding the INITIAL hidden state at index 0
                          followed by one hidden state per input, so the state
                          belonging to outputs[t] is hidden_states[t + 1]
        targets --> one-hot target vectors, one per output
        params --> (U, V, W, bias_hidden, bias_output)

    Returns:
        loss --> cross entropy summed over the time steps, divided by len(targets)
        grads --> (d_U, d_V, d_W, d_bias_hidden, d_bias_output), clipped by
                  clip_gradient_norm
    """
    U, V, W, bias_hidden, bias_output = params

    # Initialize gradients as zero
    d_U, d_V, d_W = np.zeros_like(U), np.zeros_like(V), np.zeros_like(W)
    d_bias_hidden, d_bias_output = np.zeros_like(bias_hidden), np.zeros_like(bias_output)

    d_hidden_next = np.zeros_like(hidden_states[0])
    loss = 0

    # Iterate backwards through elements
    for t in reversed(range(len(outputs))):

        # hidden_states is shifted by one relative to outputs / inputs
        h_current = hidden_states[t + 1]
        h_previous = hidden_states[t]

        # Calculate loss on the softmax probabilities
        probabilities = softmax(outputs[t])
        loss += (cross_entropy_loss(probabilities, targets[t]) / len(targets))

        # Backpropagate into output: d(loss)/d(logits) = softmax - one_hot
        d_output = probabilities.copy()
        d_output[np.argmax(targets[t])] -= 1

        # Backpropagate into W
        d_W += np.dot(d_output, h_current.T)
        d_bias_output += d_output

        # Backpropagate into h
        d_h = np.dot(W.T, d_output) + d_hidden_next

        # Backpropagate through non-linearity (tanh)
        d_f = (1 - h_current**2) * d_h
        d_bias_hidden += d_f

        # Backpropagate into U
        d_U += np.dot(d_f, inputs[t].T)

        # Backpropagate into V: outer product with the PREVIOUS hidden state
        d_V += np.dot(d_f, h_previous.T)
        d_hidden_next = np.dot(V.T, d_f)

    # Clip gradients
    grads = d_U, d_V, d_W, d_bias_hidden, d_bias_output
    grads = clip_gradient_norm(grads)

    return loss, grads
        
        

In [20]:
def optimizer(parameters, gradients, learning_rate=1e-3):
    """
    One step of plain gradient descent.

    Each parameter is paired with the gradient at the same position and
    updated IN PLACE by subtracting learning_rate * gradient. The same
    `parameters` collection is returned.
    """
    for weights, grad in zip(parameters, gradients):
        step = learning_rate * grad
        weights -= step

    return parameters

In [52]:
def encode_data(dataset, vocab_size, word_to_idx):
    """
    One-hot encodes every (inputs, targets) pair of a dataset.

    Args:
        dataset --> iterable yielding (inputs, targets) pairs
        vocab_size --> size of the vocabulary
        word_to_idx --> mapping from word to vocabulary index

    Returns:
        (x, y) --> two lists of equal length with the encoded inputs and the
                   encoded targets, as produced by one_hot_encode_sequence
    """

    def encode(sequence):
        return one_hot_encode_sequence(sequence, vocab_size, word_to_idx)

    # Single pass over the dataset; inputs are encoded before targets
    encoded_pairs = [(encode(inputs), encode(targets)) for inputs, targets in dataset]

    x = [pair[0] for pair in encoded_pairs]
    y = [pair[1] for pair in encoded_pairs]
    return (x, y)

In [22]:
def train(training_set, hidden_state, parameters, epochs=1000):
    """
    Trains the RNN with one parameter update per sequence.

    Args:
        training_set --> (inputs, targets) tuple of encoded sequences
        hidden_state --> array whose shape is used for the zero initial
                         hidden state of every sequence
        parameters --> (U, V, W, bias_hidden, bias_output)
        epochs --> number of passes over the training set

    Returns:
        parameters --> the updated parameters
        training_loss --> list with the mean per-sequence loss of every epoch

    Raises:
        ValueError if the loss of a sequence becomes NaN
    """

    training_loss = []
    inputs, targets = training_set
    # training_set is the (inputs, targets) tuple, so len(training_set) is
    # always 2; the mean must be taken over the number of sequences instead.
    num_sequences = max(len(inputs), 1)

    for i in range(epochs):

        epoch_training_loss = 0
        for x, y in zip(inputs, targets):
            # Every sequence starts from a zero hidden state
            hidden_state = np.zeros_like(hidden_state)

            # Forward pass
            outputs, hidden_states = forward_pass(x, hidden_state, parameters)

            # Backward pass
            loss, gradients = backward_pass(x, outputs, hidden_states, y, parameters)
            if np.isnan(loss):
                raise ValueError('ERROR: Gradients have vanished')

            # Update parameters (optimizer)
            parameters = optimizer(parameters, gradients)
            epoch_training_loss += loss

        training_loss.append(epoch_training_loss / num_sequences)

        if i%100 == 0:
            print(f'Epoch {i}, training loss: {training_loss[-1]}')

    return parameters, training_loss

In [23]:
def validate(val_set, hidden_state, parameters, epochs=100):
    """
    Evaluates the RNN on the validation set without updating the parameters.

    Args:
        val_set --> (inputs, targets) tuple of encoded sequences
        hidden_state --> array whose shape is used for the zero initial
                         hidden state of every sequence
        parameters --> (U, V, W, bias_hidden, bias_output)
        epochs --> number of evaluation passes; the parameters do not change
                   between passes

    Returns:
        validation_loss --> list with the mean per-sequence loss of every pass

    Raises:
        ValueError if the loss of a sequence becomes NaN
    """

    validation_loss = []
    inputs, targets = val_set
    # val_set is the (inputs, targets) tuple; average over the sequences
    num_sequences = max(len(inputs), 1)

    for i in range(epochs):
        epoch_validation_loss = 0
        for x, y in zip(inputs, targets):
            hidden_state = np.zeros_like(hidden_state)

            #Forward pass
            outputs, hidden_states = forward_pass(x, hidden_state, parameters)

            # Backward pass (only the loss is used, gradients are discarded)
            loss, _ = backward_pass(x, outputs, hidden_states, y, parameters)
            if np.isnan(loss):
                raise ValueError('ERROR: Gradients have vanished')

            # Accumulate; without this the reported loss is always zero
            epoch_validation_loss += loss

        validation_loss.append(epoch_validation_loss / num_sequences)

        if i%100 == 0:
            print(f'Epoch {i}, validation loss: {validation_loss[-1]}')

    return validation_loss

In [24]:
def test(test_set, hidden_state, parameters, idx_to_word):
    """
    Runs the RNN on every test input and decodes inputs and predictions.

    Args:
        test_set --> (inputs, targets) tuple of encoded sequences; only the
                     inputs are used
        hidden_state --> array whose shape is used for the zero initial
                         hidden state of every sequence
        parameters --> (U, V, W, bias_hidden, bias_output)
        idx_to_word --> mapping from vocabulary index to word

    Returns:
        results --> dict-like mapping decoded input string -> decoded
                    prediction string (argmax of every output); identical
                    inputs overwrite each other
    """
    inputs, targets = test_set
    results = defaultdict()
    for x in inputs:
        hidden_state = np.zeros_like(hidden_state)
        outputs, hidden_states = forward_pass(x, hidden_state, parameters)

        # Decode with the `idx_to_word` argument (the body previously used an
        # undefined name `ind_to_word`, which raised NameError).
        x_decoded = [idx_to_word[np.argmax(x[i])] for i in range(len(x))]
        y_decoded = [idx_to_word[np.argmax(output)] for output in outputs]
        x_decoded = ('').join(x_decoded)
        y_decoded = ('').join(y_decoded)
        results[x_decoded] = y_decoded
    return results
        

In [53]:
def rnn():
    """
    End-to-end RNN experiment: builds the data, trains, validates, tests and
    prints one "Input / Output" line per decoded test sequence.

    Uses 100 epochs and a hidden size of 50. Nothing is returned.
    """

    # Constants
    epochs, hidden_size = 100, 50
    initial_hidden = np.zeros((hidden_size, 1))

    # Data Preparation
    sequences = generate_dataset()
    word_to_idx, idx_to_word, vocab_size = word_encoding(sequences)
    raw_splits = prepare_data(sequences)

    # Data encoding (order: train, test, validation)
    train_set, test_set, val_set = (
        encode_data(split, vocab_size, word_to_idx) for split in raw_splits
    )

    # Initialize rnn
    parameters = init_rnn(hidden_size, vocab_size)

    # Train
    parameters, _training_loss = train(train_set, initial_hidden, parameters, epochs)

    # Validate
    _validation_loss = validate(val_set, initial_hidden, parameters, epochs)

    # Test
    results = test(test_set, initial_hidden, parameters, idx_to_word)

    # Print results
    for decoded_input, decoded_output in results.items():
        print(f'Input: {decoded_input}, Output: {decoded_output}')

In [26]:
# rnn()

In [27]:
# LSTM

In [28]:
def init_lstm(hidden_size, vocab_size):
    """
    Creates the LSTM parameters.

    The four gate/candidate weight matrices have shape
    (hidden_size, hidden_size + vocab_size) and the output matrix W_v has
    shape (vocab_size, hidden_size); all are drawn with
    init_orthogonal_weights. All biases start at zero.

    Returns:
        W_forget, W_update, W_output, W_g, W_v,
        b_forget, b_update, b_output, b_g, b_v
    """

    z_size = hidden_size + vocab_size

    # Weights -- drawn in this exact order (forget, update, output,
    # candidate, output projection)
    W_forget = init_orthogonal_weights(hidden_size, z_size)
    W_update = init_orthogonal_weights(hidden_size, z_size)
    W_output = init_orthogonal_weights(hidden_size, z_size)
    W_g = init_orthogonal_weights(hidden_size, z_size)

    # Output: output = W_v * h(t) + b_v
    W_v = init_orthogonal_weights(vocab_size, hidden_size)

    # Biases: one independent zero column vector per gate / candidate
    b_forget, b_update, b_output, b_g = (
        np.zeros((hidden_size, 1)) for _ in range(4)
    )
    b_v = np.zeros((vocab_size, 1))

    return W_forget, W_update, W_output, W_g, W_v, b_forget, b_update, b_output, b_g, b_v

    

In [29]:
def forward_lstm(inputs, prev_hidden, prev_cell, parameters):
    """
    Runs the LSTM over one sequence and records every intermediate value.

    Args:
        inputs --> sequence of input column vectors
        prev_hidden --> initial hidden state, shape (hidden_size, 1)
        prev_cell --> initial cell state, shape (hidden_size, 1)
        parameters --> tuple in the order returned by init_lstm

    Returns (all lists):
        inputs_list --> concatenated [hidden; x] vector of every step
        forget_gate, update_gate, g_comp --> gate / candidate activations per step
        cell_state --> initial cell state followed by one cell state per step
        output_gate --> output gate activation per step
        hidden_state --> initial hidden state followed by one hidden state per step
        v_comp --> logits per step
        outputs_list --> softmax of the logits per step
    """

    # Unpack parameters
    W_forget, W_update, W_output, W_g, W_v, b_forget, b_update, b_output, b_g, b_v = parameters

    # Lists for computations to be saved
    inputs_list = []
    forget_gate, update_gate, output_gate = [], [], []
    g_comp, v_comp = [], []
    outputs_list = []

    # Hidden and cell states (index 0 holds the initial state)
    hidden_state, cell_state = [prev_hidden], [prev_cell]

    # Parse through input
    for x in inputs:

        # Concatenate input (np.row_stack is a deprecated alias of np.vstack)
        z = np.vstack((prev_hidden, x))
        inputs_list.append(z)

        # Forget gate
        f = sigmoid((np.dot(W_forget, z)) + b_forget)
        forget_gate.append(f)

        # Update gate (Input gate)
        u = sigmoid((np.dot(W_update, z)) + b_update)
        update_gate.append(u)

        # Candidate (g)
        g = tanh(np.dot(W_g, z) + b_g)
        g_comp.append(g)

        # Memory state (Cell state)
        c = prev_cell * f + g * u
        cell_state.append(c)

        # Output gate
        o = sigmoid((np.dot(W_output, z)) + b_output)
        output_gate.append(o)

        # Hidden state
        h = o * tanh(c)
        hidden_state.append(h)

        # Calculate Logits from the hidden state of THIS step
        v = np.dot(W_v, h) + b_v
        v_comp.append(v)

        # Calculate final output (using softmax)
        output = softmax(v)
        outputs_list.append(output)

        # Carry the state forward; without this every step would be computed
        # from the initial hidden / cell state.
        prev_hidden, prev_cell = h, c

    return inputs_list, forget_gate, update_gate, g_comp, cell_state, output_gate, hidden_state, v_comp, outputs_list

        

In [30]:
def backward_lstm(computation_lists, targets, parameters):
    """
    Backpropagation through time for the LSTM.

    Args:
        computation_lists --> the tuple returned by forward_lstm. Its
                              hidden_state / cell_state lists carry the
                              initial state at index 0, so the state produced
                              at step t sits at index t + 1.
        targets --> one-hot target vectors, one per output
        parameters --> tuple in the order returned by init_lstm

    Returns:
        loss --> cross entropy summed over the time steps, divided by len(targets)
        gradients --> gradients in the SAME order as `parameters`
                      (W_forget, W_update, W_output, W_g, W_v, b_forget,
                      b_update, b_output, b_g, b_v), clipped by
                      clip_gradient_norm, so they can be zipped with the
                      parameters by the optimizer
    """

    # Unpack inputs
    (inputs_list, forget_gate, update_gate, g_comp, cell_state,
     output_gate, hidden_state, v_comp, outputs_list) = computation_lists
    (W_forget, W_update, W_output, W_g, W_v,
     b_forget, b_update, b_output, b_g, b_v) = parameters

    # Initialize gradients (as zero) & other variables
    W_f_d = np.zeros_like(W_forget)
    b_f_d = np.zeros_like(b_forget)

    W_u_d = np.zeros_like(W_update)
    b_u_d = np.zeros_like(b_update)

    W_g_d = np.zeros_like(W_g)
    b_g_d = np.zeros_like(b_g)

    W_o_d = np.zeros_like(W_output)
    b_o_d = np.zeros_like(b_output)

    W_v_d = np.zeros_like(W_v)
    b_v_d = np.zeros_like(b_v)

    # Gradients flowing back from the following time step
    d_hidden_next = np.zeros_like(hidden_state[0])
    d_cell_next = np.zeros_like(cell_state[0])

    # Number of hidden units = rows of a hidden state vector
    # (len(hidden_state) would be the number of stored states instead)
    hidden_size = hidden_state[0].shape[0]

    loss = 0

    for t in reversed(range(len(outputs_list))):

        # Cross entropy
        loss += (cross_entropy_loss(outputs_list[t], targets[t]) / len(targets))

        # Values of this time step (state lists are shifted by one)
        z = inputs_list[t]
        f, u, g, o = forget_gate[t], update_gate[t], g_comp[t], output_gate[t]
        c_previous, c_current = cell_state[t], cell_state[t + 1]
        h_current = hidden_state[t + 1]
        tanh_c = np.tanh(c_current)

        # Derivative for v (relation of hidden state to output):
        # softmax + cross entropy gives probabilities - one_hot
        dv = np.copy(outputs_list[t])
        dv[np.argmax(targets[t])] -= 1
        W_v_d += np.dot(dv, h_current.T)
        b_v_d += dv

        # Derivative for hidden state (h)
        dh = np.dot(W_v.T, dv) + d_hidden_next

        # Derivative for output gate (o); o is already the sigmoid output,
        # so its derivative w.r.t. the pre-activation is o * (1 - o)
        do = dh * tanh_c * o * (1 - o)
        W_o_d += np.dot(do, z.T)
        b_o_d += do

        # Derivative for cell state (c): h = o * tanh(c)
        dC = d_cell_next + dh * o * (1 - tanh_c**2)

        # Derivative for candidate (g); g is already the tanh output
        dg = dC * u * (1 - g**2)
        W_g_d += np.dot(dg, z.T)
        b_g_d += dg

        # Derivative for update gate (input gate)
        du = dC * g * u * (1 - u)
        W_u_d += np.dot(du, z.T)
        b_u_d += du

        # Derivative for forget gate (f)
        df = dC * c_previous * f * (1 - f)
        W_f_d += np.dot(df, z.T)
        b_f_d += df

        # Update derivatives of prev cell and hidden states; the first
        # hidden_size rows of z are the previous hidden state
        dz = (np.dot(W_forget.T, df)
             + np.dot(W_update.T, du)
             + np.dot(W_g.T, dg)
             + np.dot(W_output.T, do))
        d_hidden_next = dz[:hidden_size, :]
        d_cell_next = f * dC

    # Clip gradients -- once, after ALL time steps have been accumulated
    gradients = W_f_d, W_u_d, W_o_d, W_g_d, W_v_d, b_f_d, b_u_d, b_o_d, b_g_d, b_v_d
    gradients = clip_gradient_norm(gradients)

    return loss, gradients

In [31]:
def train_lstm(train_set, parameters, hidden_size, epochs):
    """
    Trains the LSTM with one parameter update per sequence.

    Args:
        train_set --> (inputs, targets) tuple of encoded sequences
        parameters --> LSTM parameters in the order returned by init_lstm
        hidden_size --> number of hidden units
        epochs --> number of passes over the training set

    Returns:
        training_loss --> list with the SUMMED loss of every epoch
        parameters --> the updated parameters
    """
    inputs, targets = train_set
    training_loss = []

    for _ in range(epochs):
        running_loss = 0

        for x, y in zip(inputs, targets):
            # Fresh zero states for every sequence
            initial_hidden = np.zeros((hidden_size, 1))
            initial_cell = np.zeros((hidden_size, 1))

            # Forward pass
            cache = forward_lstm(x, initial_hidden, initial_cell, parameters)

            # Backward pass
            loss, gradients = backward_lstm(cache, y, parameters)

            # Update parameters (optimizer)
            parameters = optimizer(parameters, gradients)

            running_loss += loss

        training_loss.append(running_loss)

    return training_loss, parameters
            
            
            

In [32]:
def validate_lstm(val_set, parameters, hidden_size, epochs):
    """
    Evaluates the LSTM on the validation set; the parameters are not updated.

    Args:
        val_set --> (inputs, targets) tuple of encoded sequences
        parameters --> LSTM parameters in the order returned by init_lstm
        hidden_size --> number of hidden units
        epochs --> number of evaluation passes

    Returns:
        validation_loss --> list with the SUMMED loss of every pass
    """
    inputs, targets = val_set
    validation_loss = []

    for _ in range(epochs):
        running_loss = 0

        for x, y in zip(inputs, targets):
            initial_hidden = np.zeros((hidden_size, 1))
            initial_cell = np.zeros((hidden_size, 1))

            # Forward pass
            cache = forward_lstm(x, initial_hidden, initial_cell, parameters)

            # Backward pass -- run only for its loss, gradients are discarded
            loss, _gradients = backward_lstm(cache, y, parameters)

            running_loss += loss

        validation_loss.append(running_loss)

    return validation_loss

In [46]:
def test_lstm(test_set, parameters, hidden_size, ind_to_word=None):
    """
    Runs the LSTM on every test input.

    Args:
        test_set --> (inputs, targets) tuple of encoded sequences; only the
                     inputs are used
        parameters --> LSTM parameters in the order returned by init_lstm
        hidden_size --> number of hidden units
        ind_to_word --> optional mapping from vocabulary index to word

    Returns:
        with a truthy `ind_to_word`: dict-like mapping decoded input string
        -> decoded prediction string (argmax per step);
        otherwise: list with the raw per-step outputs of every input
    """
    inputs, _targets = test_set
    decoded_results = defaultdict()
    raw_results = []

    for x in inputs:
        initial_hidden = np.zeros((hidden_size, 1))
        initial_cell = np.zeros((hidden_size, 1))

        # Only the last entry of the forward result (the outputs) is needed
        *_, outputs = forward_lstm(x, initial_hidden, initial_cell, parameters)

        if not ind_to_word:
            raw_results.append(outputs)
            continue

        source_text = ('').join([ind_to_word[np.argmax(step)] for step in x])
        predicted_text = ('').join([ind_to_word[np.argmax(output)] for output in outputs])
        decoded_results[source_text] = predicted_text

    return decoded_results if ind_to_word else raw_results

In [69]:
def lstm():
    """
    Prepares the data for the LSTM experiment.

    Generates the dataset, builds the vocabulary, splits the data, prints the
    vocabulary size and one-hot encodes the training split. The remaining
    steps (network initialisation, training, validation, testing) are
    currently disabled and kept as comments.

    Returns:
        the encoded training split as returned by encode_data, i.e. an
        (inputs, targets) tuple of lists
    """

    # Data Preparation
    sequences = generate_dataset()
    word_to_idx, idx_to_word, vocab_size = word_encoding(sequences)
    raw_train, _raw_test, _raw_val = prepare_data(sequences)
    print(vocab_size)

    # Data encoding
    encoded_train = encode_data(raw_train, vocab_size, word_to_idx)

    # --- Disabled: rest of the pipeline -------------------------------------
    #     test_set = encode_data(test_set, vocab_size)
    #     val_set = encode_data(val_set, vocab_size)
    #
    #     # Initialize network
    #     epochs = 20
    #     hidden_size = 50
    #     z_size = hidden_size + vocab_size
    #     parameters = init_lstm(hidden_size, vocab_size)
    #
    #     # Train
    #     training_loss, parameters = train_lstm(train_set, parameters, hidden_size, epochs)
    #
    #     # Validate
    #     validation_loss = validate_lstm(val_set, parameters, hidden_size, epochs)
    #
    #     # Test
    #     results = test_lstm(test_set, parameters, hidden_size, idx_to_word)
    #
    #     # Print results
    #     for key in results:
    #         print(f'Input: {key}, Output: {results[key]}')

    return encoded_train

In [70]:
# NOTE(review): assigning to `test` rebinds the name of the `test()` function
# defined earlier in this notebook, so `rnn()` can no longer call it once this
# cell has run. Despite the name, the value is the encoded TRAINING split
# returned by `lstm()`.
test = lstm()
test

6


([array([[[1.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.]],
  
         [[0.],
          [1.],
          [0.],
          [0.],
          [0.],
          [0.]],
  
         [[0.],
          [0.],
          [1.],
          [0.],
          [0.],
          [0.]],
  
         [[0.],
          [0.],
          [0.],
          [1.],
          [0.],
          [0.]]]),
  array([[[1.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.]],
  
         [[1.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.]],
  
         [[1.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.]],
  
         [[1.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.]],
  
         [[1.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.]],
  
         [[1.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.]],
  
 

In [64]:
# NOTE(review): unpacks the two-element value held in `test` and rebinds both
# `train` and `test`, hiding the `train()` / `test()` functions defined above.
# This cell's execution count (64) is lower than that of the preceding cell
# (70), i.e. it was last run out of order -- verify with Restart & Run All.
train, test = test

In [68]:
# Display the first element of `train` (presumably the one-hot encoded input
# of the first training sequence -- depends on what `test` held when the
# unpacking cell above was run).
train[0]

array([[[1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.]]])

In [37]:
def lstm_modified(vocab_size, epochs=20, hidden_size=50):
    """
    Initialises, trains, validates and tests an LSTM.

    NOTE: `train_set`, `val_set`, `test_set` and `idx_to_word` are NOT
    parameters; they are read from the enclosing (notebook-global) scope and
    must exist there before this function is called, otherwise a NameError
    is raised.

    Args:
        vocab_size --> size of the vocabulary
        epochs --> number of passes used for both training and validation
        hidden_size --> number of hidden units

    Returns:
        training_loss --> per-epoch loss list from train_lstm
        validation_loss --> per-pass loss list from validate_lstm
        results --> output of test_lstm
    """
    parameters = init_lstm(hidden_size, vocab_size)

    # Train
    training_loss, parameters = train_lstm(train_set, parameters, hidden_size, epochs)

    # Validate
    validation_loss = validate_lstm(val_set, parameters, hidden_size, epochs)

    # Test
    results = test_lstm(test_set, parameters, hidden_size, idx_to_word)

    # Hand the computed values back instead of silently discarding them
    return training_loss, validation_loss, results

In [48]:
class LSTM():
    """
    Thin object wrapper around the functional LSTM helpers
    (init_lstm, train_lstm, validate_lstm, test_lstm).

    Typical use: construct, `train(...)`, then `validate(...)` / `test(...)`.
    """

    def __init__(self, vocab_size, hidden_size=50):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        # Size of the concatenated [hidden; input] vector
        self.z_size = hidden_size + vocab_size
        # Empty until initialize() (or the first train()) has run
        self.parameters = []
        self.training_loss = []
        self.validation_loss = []

    def initialize(self):
        """(Re-)creates the parameters with init_lstm, discarding any training."""
        self.parameters = init_lstm(self.hidden_size, self.vocab_size)

    def train(self, train_set, epochs=20):
        """Trains on `train_set`; stores the per-epoch loss in `training_loss`."""
        # Create the parameters on first use so that train() also works on a
        # freshly constructed instance (an empty parameter list cannot be
        # unpacked by the forward pass).
        if len(self.parameters) == 0:
            self.initialize()
        self.training_loss, self.parameters = train_lstm(train_set, self.parameters, self.hidden_size, epochs)

    def validate(self, val_set, epochs=20):
        """Evaluates on `val_set`; stores the per-pass loss in `validation_loss`."""
        self.validation_loss = validate_lstm(val_set, self.parameters, self.hidden_size, epochs)

    def test(self, test_set, idx_to_word=None):
        """Returns the output of test_lstm for `test_set`."""
        results = test_lstm(test_set, self.parameters, self.hidden_size, idx_to_word)
        return results