# LSTM - Long Short-Term Memory

In [None]:
import sys
import pickle

sys.path.append('../')
import numpy as np  # noqa: E402
from dl_framework.tensor import Tensor  # noqa: E402
from dl_framework.optimisers import SGD  # noqa: E402
from dl_framework.layers import Embedding, RNNCell, LSTMCell  # noqa: E402
from dl_framework.loss import CrossEntropy  # noqa: E402

## Data preprocessing

In [None]:
# Read the whole corpus into memory; the model below works on single
# characters, so no tokenisation is needed.
# NOTE(review): assumes the corpus file is UTF-8 (or plain ASCII) - confirm.
with open('../data/shakespeare/shakespeare.txt', 'r', encoding='utf-8') as f:
    raw = f.read()

# Sort the distinct characters so every run assigns the same index to the same
# character. Iterating a bare set of strings depends on per-process hash
# randomisation, which would make a pickled model decode to the wrong
# characters when reloaded in a new session.
vocab = sorted(set(raw))

# Despite the "word" naming, both lookup tables are keyed on single characters.
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = dict(enumerate(vocab))

# The corpus as a 1-D array of vocabulary indices, one entry per character.
indices = np.array([word2idx[w] for w in raw])

## Model with simple RNN cell

In [None]:
# --- Model, loss and optimiser -------------------------------------------
embed = Embedding(vocab_size=len(vocab), dim=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=len(vocab))

criterion = CrossEntropy()
optimiser = SGD(
    parameters=model.get_parameters() + embed.get_parameters(),
    alpha=0.05,
)

# --- Batching hyper-parameters -------------------------------------------
batch_size = 32
bptt = 16  # number of timesteps per truncated-backprop chunk
n_batches = indices.shape[0] // batch_size

# Drop the tail that does not fill a whole row, then lay the corpus out as
# (n_batches, batch_size): each column is one contiguous stream of text.
trimmed_indices = indices[:batch_size * n_batches]
batched_indices = trimmed_indices.reshape((batch_size, n_batches))
batched_indices = batched_indices.T

# Targets are the inputs shifted forward by one row (next-character prediction).
input_batched_indices = batched_indices[:-1]
target_batched_indices = batched_indices[1:]

# Group the rows into n_bptt chunks of `bptt` consecutive timesteps each,
# discarding any rows left over at the end.
n_bptt = (n_batches - 1) // bptt
input_batches = input_batched_indices[:bptt * n_bptt]
input_batches = input_batches.reshape((n_bptt, bptt, batch_size))
target_batches = target_batched_indices[:bptt * n_bptt]
target_batches = target_batches.reshape((n_bptt, bptt, batch_size))

In [None]:
def generate_sample(n=30, init_char=' '):
    """Sample ``n`` characters from the current ``model``.

    Starts from ``init_char``, feeds each sampled character back in as the
    next input, and returns the sampled characters as one string
    (``init_char`` itself is not included).

    Uses the notebook globals ``model``, ``embed``, ``word2idx`` and ``vocab``.

    Args:
        n: Number of characters to generate.
        init_char: Seed character; must be a key of ``word2idx``.

    Returns:
        A string of ``n`` sampled characters.
    """
    chars = []
    hidden = model.init_hidden(batch_size=1)
    token = Tensor(np.array([word2idx[init_char]]))
    for _ in range(n):
        rnn_input = embed.forward(token)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)

        # Scaling the pre-softmax scores by 10 sharpens the distribution, so
        # samples stay close to the most likely characters.
        output.data *= 10

        # NOTE(review): assumes Tensor.softmax() returns an array-like of
        # probabilities (the previous code already applied ndarray operations
        # to it) - confirm against dl_framework.
        probs = np.asarray(output.softmax(), dtype=np.float64).ravel()

        # Inverse-CDF sampling: draw u in [0, total) and take the first index
        # whose cumulative probability exceeds it. Comparing the raw
        # probabilities to a random threshold (the old approach) is not a draw
        # from the distribution and falls back to index 0 when nothing
        # exceeds the threshold.
        cdf = np.cumsum(probs)
        draw = np.random.rand() * cdf[-1]
        m = min(int(np.searchsorted(cdf, draw, side='right')), probs.size - 1)

        chars.append(vocab[m])
        token = Tensor(np.array([m]))
    return "".join(chars)

In [None]:
def train(epochs=100):
    """Train the RNN ``model`` and ``embed`` with truncated backpropagation.

    For every chunk of ``bptt`` timesteps the per-step cross-entropy losses
    are summed, backpropagated once, and followed by one optimiser step.
    Progress is printed every 10 chunks and on the last chunk of each epoch;
    the reported loss is ``exp`` of the running mean per-step loss. The
    learning rate ``optimiser.alpha`` is multiplied by 0.99 after every epoch.

    Uses the notebook globals ``model``, ``embed``, ``criterion``,
    ``optimiser``, ``input_batches``, ``target_batches``, ``batch_size`` and
    ``bptt``.

    Args:
        epochs: Number of passes over ``input_batches``.
    """
    n_chunks = len(input_batches)
    for e in range(epochs):
        total_loss = 0

        hidden = model.init_hidden(batch_size=batch_size)
        for batch_i in range(n_chunks):
            # Re-wrap the raw hidden values in a fresh Tensor so this chunk
            # starts from the previous state's values only.
            hidden = Tensor(hidden.data, autograd=True)

            # Sum of the per-timestep losses for this chunk.
            loss = None
            for t in range(bptt):
                step_input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=step_input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                target = Tensor(target_batches[batch_i][t], autograd=True)
                step_loss = criterion.forward(output, target)
                loss = step_loss if loss is None else loss + step_loss

            # Backpropagate the summed loss. Previously a no-op loop rebound
            # `loss` to the final timestep's loss, so the earlier timesteps'
            # losses were silently dropped from both the gradient and the log.
            loss.backward()
            optimiser.step()

            # Track the mean per-timestep loss so the logged value stays
            # comparable regardless of `bptt`.
            total_loss += loss.data / bptt

            log = '\r Iter: ' + str(e)
            log += ' - Batch ' + str(batch_i + 1) + '/' + str(n_chunks)
            log += ' - Loss: ' + str(np.exp(total_loss / (batch_i + 1)))
            if batch_i == 0:
                log += ' - ' + generate_sample(70, '\n').replace('\n', ' ')
            if batch_i % 10 == 0 or batch_i + 1 == n_chunks:
                print(log)
        optimiser.alpha *= 0.99
        print()

In [None]:
# Run the training loop for 100 epochs (uses whichever `train` was defined last).
train(epochs=100)

In [None]:
# Print 2000 generated characters, seeded with a newline character.
print(generate_sample(n=2000, init_char='\n'))

## Model with LSTM cell

In [None]:
# --- Model, loss and optimiser -------------------------------------------
embed = Embedding(vocab_size=len(vocab), dim=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_outputs=len(vocab))
# Zero out w_ho's weights before training (original author's note: this
# helps with training).
model.w_ho.weight.data *= 0

criterion = CrossEntropy()
optimiser = SGD(
    parameters=model.get_parameters() + embed.get_parameters(),
    alpha=0.05,
)

# --- Batching hyper-parameters -------------------------------------------
batch_size = 16
bptt = 25  # number of timesteps per truncated-backprop chunk
n_batches = indices.shape[0] // batch_size

# Drop the tail that does not fill a whole row, then lay the corpus out as
# (n_batches, batch_size): each column is one contiguous stream of text.
trimmed_indices = indices[:batch_size * n_batches]
batched_indices = trimmed_indices.reshape((batch_size, n_batches))
batched_indices = batched_indices.T

# Targets are the inputs shifted forward by one row (next-character prediction).
input_batched_indices = batched_indices[:-1]
target_batched_indices = batched_indices[1:]

# Group the rows into n_bptt chunks of `bptt` consecutive timesteps each,
# discarding any rows left over at the end.
n_bptt = (n_batches - 1) // bptt
input_batches = input_batched_indices[:bptt * n_bptt]
input_batches = input_batches.reshape((n_bptt, bptt, batch_size))
target_batches = target_batched_indices[:bptt * n_bptt]
target_batches = target_batches.reshape((n_bptt, bptt, batch_size))

In [None]:
def generate_sample(n=30, init_char=' '):
    """Greedily decode ``n`` characters from the current ``model``.

    Starts from ``init_char``, always picks the highest-scoring character,
    feeds it back in as the next input, and returns the chosen characters as
    one string (``init_char`` itself is not included).

    Uses the notebook globals ``model``, ``embed``, ``word2idx`` and ``vocab``.

    Args:
        n: Number of characters to generate.
        init_char: Seed character; must be a key of ``word2idx``.

    Returns:
        A string of ``n`` generated characters.
    """
    chars = []
    hidden = model.init_hidden(batch_size=1)
    token = Tensor(np.array([word2idx[init_char]]))
    for _ in range(n):
        rnn_input = embed.forward(token)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)

        # Take the max prediction. The score scaling and softmax that used to
        # run here were discarded and cannot change which index is largest,
        # so they are no longer computed.
        m = output.data.argmax()

        chars.append(vocab[m])
        token = Tensor(np.array([m]))
    return "".join(chars)

In [None]:
def train(epochs=100):
    """Train the LSTM ``model`` and ``embed`` with truncated backpropagation.

    For every chunk of ``bptt`` timesteps the per-step cross-entropy losses
    are summed, backpropagated once, and followed by one optimiser step.
    Progress is printed every 10 chunks and on the last chunk of each epoch;
    the reported loss is ``exp`` of the running mean per-step loss, alongside
    the smallest such value seen so far. The learning rate ``optimiser.alpha``
    is multiplied by 0.99 after every epoch.

    Uses the notebook globals ``model``, ``embed``, ``criterion``,
    ``optimiser``, ``input_batches``, ``target_batches``, ``batch_size`` and
    ``bptt``.

    Args:
        epochs: Number of passes over ``input_batches``.
    """
    # Start from infinity so the first measured loss always becomes the minimum.
    min_loss = float('inf')
    batches_to_train = len(input_batches)

    for e in range(epochs):
        total_loss = 0

        hidden = model.init_hidden(batch_size=batch_size)

        for batch_i in range(batches_to_train):
            # `hidden` is a pair of Tensors; re-wrap the raw values of both in
            # fresh Tensors so this chunk starts from the previous values only.
            hidden = (Tensor(hidden[0].data, autograd=True),
                      Tensor(hidden[1].data, autograd=True))

            # Sum of the per-timestep losses for this chunk.
            loss = None
            for t in range(bptt):
                step_input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=step_input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)

                target = Tensor(target_batches[batch_i][t], autograd=True)
                step_loss = criterion.forward(output, target)
                loss = step_loss if loss is None else step_loss + loss

            loss.backward()
            optimiser.step()

            # Track the mean per-timestep loss for this chunk.
            total_loss += loss.data / bptt
            epoch_loss = np.exp(total_loss / (batch_i + 1))
            if epoch_loss < min_loss:
                min_loss = epoch_loss

            log = '\r Iter: ' + str(e)
            log += ' - Batch ' + str(batch_i + 1) + '/' + str(batches_to_train)
            log += ' - Min Loss: ' + str(min_loss)
            log += ' - Loss: ' + str(epoch_loss)
            if batch_i == 0:
                log += ' - ' + generate_sample(n=70, init_char='\n').replace('\n', ' ')
            # Print every 10th chunk and the final chunk of the epoch. The old
            # check compared against `n_batches` and could never be true.
            if batch_i % 10 == 0 or batch_i + 1 == batches_to_train:
                print(log)
        optimiser.alpha *= 0.99

In [None]:
# Run the training loop for 100 epochs (uses whichever `train` was defined last).
train(epochs=100)

In [None]:
# Print 2000 generated characters, seeded with a newline character.
print(generate_sample(n=2000, init_char='\n'))

In [None]:
# Serialise the current `model` object to 'lstm_model.pkl' in the working
# directory, overwriting any existing file of that name.
with open('lstm_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Switch: set to False to keep the in-memory `model` instead of reloading it.
load_from_pickle = True
if load_from_pickle:
    # NOTE(review): unpickling can execute arbitrary code - only load a
    # 'lstm_model.pkl' that this notebook (or a trusted source) produced.
    with open('lstm_model.pkl', mode='rb') as model_file:
        model = pickle.load(model_file)