# DD2424 Project in Deep Learning in Data Science

## Imports

In [1]:
from functions import *
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
import os

## Vanilla RNN

### Load and Preprocess Data

In [2]:

# Load the training text and derive its vocabulary (np.unique returns the sorted
# distinct entries of book_data).
book_data = np.array(load_data('../Dataset/Training/vol1.txt'))
book_chars = np.unique(book_data)

# Two-way lookup between a vocabulary entry and its position in book_chars.
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: i for i, ch in ind_to_char.items()}

# Hyperparameters handed to RNN(m, k, eta, seq_length, sig) in the training cell.
k = len(book_chars)   # vocabulary size
m = 100               # hidden-state size (the initial state is zeros((m, 1)))
eta = 0.1             # presumably the learning rate - confirm in functions.RNN
seq_length = 25       # characters per training window
sig = 0.01            # presumably the weight-initialisation scale - confirm in functions.RNN



### Train

In [None]:
# Train the vanilla RNN by calling rnn.adagrad on consecutive windows of
# `seq_length` characters. A smoothed loss and a sample from
# rnn.synthetize(..., 200) are logged every 10000 updates, and training stops
# once more than 400001 updates have been made (or after `epochs` passes).
# NOTE(review): every update passes the all-zero `h0` to rnn.adagrad; whether the
# hidden state is carried from one window to the next depends on RNN.adagrad -
# confirm against functions.py.
rnn = RNN(m, k, eta, seq_length, sig)
h0 = np.zeros((m, 1))
epochs = 10
losslist = []
sentences = []
iteration = 0
smoothloss = 0
max_iterations = 400001
log_every = 10000
last_start = book_data.shape[0] - seq_length - 1

for epoch in range(epochs):
    # Each pass over the book starts from a zero hidden state on the model.
    rnn.hprev = np.zeros((m, 1))
    for start in range(0, last_start, seq_length):
        # The target window is the input window shifted one character ahead.
        X = one_hot_encoding(book_data[start:start + seq_length], char_to_ind, k)
        Y = one_hot_encoding(book_data[start + 1:start + seq_length + 1], char_to_ind, k)
        loss = rnn.adagrad(X, Y, h0)

        # Exponential moving average, seeded with the first observed loss.
        previous = loss if smoothloss == 0 else smoothloss
        smoothloss = 0.999*previous + 0.001*loss

        if iteration % log_every == 0:
            print('Iteration: {}, Loss: {}'.format(iteration, smoothloss))
            # Seed the sample with the first column of the current input window.
            sentence = one_hot_decoding(rnn.synthetize(rnn.hprev, X[:, 0], 200), ind_to_char)
            print(sentence)
            sentences.append(sentence)
            losslist.append(smoothloss)

        iteration += 1
        if iteration > max_iterations:
            break
    # Propagate the inner stop condition to the epoch loop.
    if iteration > max_iterations:
        break

### Evaluate and plot

## LSTM

### Load and Preprocess Data

In [3]:

# Load the edgar.txt corpus with footnotes kept, and show it.
# NOTE(review): the recorded output starts with '´' '╗' '┐', which looks like a
# UTF-8 byte-order mark decoded with a legacy code page - check the encoding
# used inside load_data.
book_data = np.array(load_data('../Dataset/Training/edgar.txt', remove_footnotes=False))
print(book_data)

# Vocabulary (sorted distinct entries) and the two lookups derived from it.
book_chars = np.unique(book_data)
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: i for i, ch in ind_to_char.items()}
k = len(book_chars)
batch_size = 50
seq_length = 50

# The corpus as vocabulary indices instead of characters.
book_data_ind = np.array([char_to_ind[ch] for ch in book_data])
print(char_to_ind)

# Cut the index stream into consecutive chunks of seq_length + 1 elements (one
# extra so each chunk can yield an input and a shifted target); a trailing
# incomplete chunk is dropped.
char_dataset = tf.data.Dataset.from_tensor_slices(book_data_ind)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(data):
    """Split one sequence into a next-step (input, target) pair.

    The input is `data` without its last element and the target is `data`
    without its first element, so target[i] is the element following input[i].
    """
    return data[:-1], data[1:]

# Turn each chunk into an (input, target) pair, shuffle the pairs with a
# 10000-element buffer, and group them into batches. batch_size is the number of
# sequences per batch and seq_length the number of characters per sequence; a
# final incomplete batch is dropped.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(10000)
    .batch(batch_size, drop_remainder=True)
)


['´' '╗' '┐' ... '\n' '\n' '\n']
{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '&': 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, '[': 52, ']': 53, '_': 54, 'a': 55, 'b': 56, 'c': 57, 'd': 58, 'e': 59, 'f': 60, 'g': 61, 'h': 62, 'i': 63, 'j': 64, 'k': 65, 'l': 66, 'm': 67, 'n': 68, 'o': 69, 'p': 70, 'q': 71, 'r': 72, 's': 73, 't': 74, 'u': 75, 'v': 76, 'w': 77, 'x': 78, 'y': 79, 'z': 80, '~': 81, '£': 82, 'ª': 83, '¬': 84, '®': 85, '´': 86, '»': 87, '¿': 88, 'Á': 89, 'Â': 90, 'Ç': 91, 'Ô': 92, 'Ö': 93, 'Ø': 94, 'á': 95, 'í': 96, 'ó': 97, 'ö': 98, 'ÿ': 99, '│': 100, '┐': 101, '├': 102, '┤': 103, '┬': 104, '╗': 105, '░': 106, '▓': 

### Build and train model

In [4]:

def loss(Y_true, Y_pred):
    """Sparse categorical cross-entropy between integer targets and logits.

    `Y_pred` is interpreted as unnormalised logits (from_logits=True), which
    matches a final Dense layer without an activation.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
    return cross_entropy(Y_true, Y_pred, from_logits=True)

# Model hyperparameters passed to build_model().
vocab_size = book_chars.shape[0]  # number of distinct characters in the corpus
embedding_dim = 256               # width of each embedding vector
rnn_units = 200                   # units in each LSTM layer


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: embedding, batch norm, two LSTMs, dense.

    Args:
        vocab_size: number of distinct characters; sets the embedding input
            dimension and the width of the output layer.
        embedding_dim: width of the embedding vectors.
        rnn_units: number of units in each of the two LSTM layers.
        batch_size: fixed batch size baked into the input shape; the LSTM
            layers are stateful, so the batch size cannot vary per call.

    Returns:
        An uncompiled tf.keras.Sequential whose final Dense layer has no
        activation, i.e. it outputs one logit per vocabulary entry for every
        input position.
    """
    layers = tf.keras.layers

    def stateful_lstm():
        # Sequence-to-sequence LSTM that keeps its state between calls.
        return layers.LSTM(
            units=rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer=tf.keras.initializers.GlorotNormal(),
        )

    return tf.keras.Sequential([
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            batch_input_shape=[batch_size, None],
        ),
        layers.BatchNormalization(synchronized=True),
        stateful_lstm(),
        stateful_lstm(),
        layers.Dense(vocab_size),
    ])

m = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

# Smoke test on a single batch: run the untrained model and sample one character
# id per position of the first sequence from the predicted logits.
i_ex, t_ex = next(iter(dataset.take(1)))
example_pred = m(i_ex)  # this step builds the model
sampled_indices = tf.random.categorical(logits=example_pred[0], num_samples=1)
sample_ind_1d = tf.squeeze(sampled_indices, axis=-1).numpy()

first_input_text = ''.join(ind_to_char[c] for c in i_ex.numpy()[0])
sampled_text = ''.join(ind_to_char[c] for c in sample_ind_1d)
print('Input:\n', repr(first_input_text))
print('Next char prediction:\n', repr(sampled_text))

# Directory where the checkpoints will be saved. os.path.join builds the nested
# tmp/checkpoints path with the separator of the current OS (a literal
# "\\tmp\\checkpoints" suffix only forms a nested path on Windows).
current_dir_path = os.getcwd()
checkpoint_dir = os.path.join(current_dir_path, "tmp", "checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)

# Name of the checkpoint files: one weights file per epoch, with the epoch
# number substituted into the name (ckpt_01.hdf5, ckpt_02.hdf5, ...).
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}.hdf5')

# Weights are written after every epoch; the text-generation cell relies on the
# last epoch's file being present. Best-only saving is deliberately off: fit()
# is called without validation data, so the callback's default monitor
# ('val_loss') would never be available. To enable it, pass both
# save_best_only=True and monitor='loss'.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
    save_best_only=False,
)


# Optimise with Adam (learning rate 0.01) against the logits-based loss above.
adam_opt = tf.keras.optimizers.Adam(learning_rate=0.01)
m.compile(loss=loss, optimizer=adam_opt)

# Train for two epochs; the checkpoint callback writes the weights to disk.
history = m.fit(dataset, epochs=2, callbacks=[checkpoint_callback])



Input:
 'soon became sufficiently evident. From\n      behin'
Next char prediction:
 'zf)4xH-5py░[r\nw▓;c?´ OA)Ö¿ jzÿ1frlö56p[CcEz¿,KF?ÔR'
Epoch 1/2
Epoch 2/2


### Generate text

In [14]:
# Find the newest checkpoint by the epoch number embedded in its file name.
# tf.train.latest_checkpoint(dir) is not usable here: it relies on the
# "checkpoint" state file written alongside TensorFlow-format checkpoints, which
# is presumably absent when the weights are saved as .hdf5 files.
checkpoint_name_prefix = "ckpt_"
checkpoint_name_suffix = ".hdf5"
latest_epoch = 0
latest_checkpoint_file = ""
for file_name in os.listdir(checkpoint_dir):
    # Skip anything that is not named ckpt_<epoch>.hdf5 instead of crashing on it.
    if not (file_name.startswith(checkpoint_name_prefix)
            and file_name.endswith(checkpoint_name_suffix)):
        continue
    epoch_text = file_name[len(checkpoint_name_prefix):-len(checkpoint_name_suffix)]
    try:
        file_epoch = int(epoch_text)
    except ValueError:
        continue
    if file_epoch > latest_epoch:
        latest_epoch = file_epoch
        latest_checkpoint_file = file_name

if not latest_checkpoint_file:
    raise FileNotFoundError(
        "No ckpt_<epoch>.hdf5 checkpoint found in " + checkpoint_dir
    )
print(latest_checkpoint_file)

# Rebuild the network for one sequence at a time: the LSTM layers are stateful,
# so the batch size is fixed when the model is constructed.
simplified_batch_size = 1
m = build_model(vocab_size, embedding_dim, rnn_units, simplified_batch_size)
m.load_weights(os.path.join(checkpoint_dir, latest_checkpoint_file))
m.build(tf.TensorShape([simplified_batch_size, None]))

def generate_text(model, start_string, text_size):
    """Generate `text_size` characters that continue `start_string`.

    The model's recurrent state is reset, the whole start string is fed in once,
    and from then on each sampled character is fed back as the next input.
    Characters are looked up through the notebook-level `char_to_ind` and
    `ind_to_char` mappings.

    Args:
        model: model built with batch size 1 that returns one row of logits per
            input position.
        start_string: seed text; every character must be a key of `char_to_ind`.
        text_size: number of characters to sample.

    Returns:
        `start_string` followed by the sampled characters.
    """
    # Seed input: the start string as indices, with a leading batch axis.
    next_input = tf.expand_dims([char_to_ind[ch] for ch in start_string], 0)

    generated_text = ""
    model.reset_states()
    for _ in range(text_size):
        # Drop the batch axis so each row holds the logits for one position.
        logits = tf.squeeze(model(next_input), 0)

        # Draw one id per position from the logits and keep the draw that
        # belongs to the last position.
        draws = tf.random.categorical(logits, num_samples=1)
        sampled_id = draws[-1, 0].numpy()

        # The sampled character becomes the whole input of the next step.
        next_input = tf.expand_dims([sampled_id], 0)
        generated_text += ind_to_char[sampled_id]

    return start_string + generated_text

# Sample 200 characters from the restored model, seeded with "water".
gen_text = generate_text(m, u"water", 200)
print(gen_text)


ckpt_02.hdf5
mans like ould unufais, vased to mirrons 12.

      ÔÇ£_Orewhs back the
      the backe?ÔÇÖ I
      contence to untertainal caglies, tellowindly contentained in part of the patily condeªtes of
      take


In [None]:
# Experimental baseline: a small Keras LSTM trained on one-hot encoded windows
# to predict, at every position, the one-hot vector of the next character.
# NOTE(review): this materialises the whole corpus as one-hot float32 arrays,
# which can need a lot of memory on a large text.

def _to_time_major(one_hot):
    """Return a one-hot window as a float32 array laid out (seq_length, k)."""
    window = np.asarray(one_hot, dtype=np.float32)
    # Keras recurrent layers expect (timesteps, features). The vanilla-RNN cell
    # indexes the encoder output as X[:, 0], which suggests a (k, seq_length)
    # layout - flip it when the first axis is the vocabulary axis.
    return window.T if window.shape[0] == k else window


X = []
Y = []
for e in range(0, book_data.shape[0]-seq_length-1, seq_length):
    X_chars = book_data[e:e+seq_length]
    Y_chars = book_data[e+1:e+seq_length+1]
    X.append(_to_time_major(one_hot_encoding(X_chars, char_to_ind, k)))
    Y.append(_to_time_major(one_hot_encoding(Y_chars, char_to_ind, k)))
print(len(X))
print(len(Y))

# fit() needs arrays, not Python lists of arrays: stack the windows into
# (n_windows, seq_length, k).
X = np.stack(X)
Y = np.stack(Y)

# Length of the vocabulary in chars.
vocab_size = len(book_chars)

# The embedding dimension (not used by the model below).
embedding_dim = 256

# Number of RNN units (not used by the model below, which hard-codes 50).
rnn_units = 1024
batch_size = 1

m = tf.keras.models.Sequential()
# input_shape is the shape of ONE sample, (timesteps, features), not the data.
m.add(tf.keras.layers.LSTM(units=50, return_sequences=True, input_shape=(seq_length, k)))
m.add(tf.keras.layers.Dropout(0.2))
# Keep the time axis so there is one prediction per input character.
m.add(tf.keras.layers.LSTM(units=50, return_sequences=True))
m.add(tf.keras.layers.Dropout(0.2))
# One probability per vocabulary entry, matching the one-hot targets.
m.add(tf.keras.layers.Dense(units=k, activation='softmax'))
m.compile(optimizer='adam', loss='categorical_crossentropy')
print(X[0].shape)
history = m.fit(X, Y, epochs=1, batch_size=batch_size, verbose=0)

### Evaluate and plot