# DD2424 Project in Deep Learning in Data Science

## Imports

In [1]:
from functions import *
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
import os

## Vanilla RNN

### Load and Preprocess Data

In [None]:
# Corpus used to train the vanilla RNN.
training_data_filename = "../Dataset/Training/vol1.txt"
book_data = np.array(load_data(training_data_filename))

# Vocabulary: the distinct symbols of the corpus, in the order np.unique
# returns them. The two dictionaries translate between symbol and index.
book_chars = np.unique(book_data)
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: ind for ind, ch in ind_to_char.items()}

# Arguments of RNN(m, k, eta, seq_length, sig) in the training cell.
k = len(book_chars)  # vocabulary size
m = 100              # size of the hidden state (h0 is an m x 1 vector)
eta = 0.1            # presumably the learning rate -- confirm in RNN
seq_length = 25      # number of symbols per training sequence
sig = 0.01           # presumably the weight-init scale -- confirm in RNN



### Train

In [None]:
# Train the vanilla RNN one sequence at a time, tracking a smoothed loss and
# sampling a short text every 10000 updates.
rnn = RNN(m, k, eta, seq_length, sig)
h0 = np.zeros((m, 1))
epochs = 10
max_iterations = 400001  # training stops once `iteration` exceeds this
losslist = []
iteration = 0
smoothloss = 0
sentences = []

# Exclusive upper bound for sequence starts, so that the target slice
# (shifted one symbol to the right) always stays inside the corpus.
last_start = book_data.shape[0] - seq_length - 1

for epoch in range(epochs):
    # The stored hidden state is cleared at the start of every pass.
    rnn.hprev = np.zeros((m, 1))
    for start in range(0, last_start, seq_length):
        stop = start + seq_length
        X = one_hot_encoding(book_data[start:stop], char_to_ind, k)
        Y = one_hot_encoding(book_data[start + 1:stop + 1], char_to_ind, k)
        # NOTE(review): the all-zero h0 is passed on every step; whether
        # adagrad starts from it or from rnn.hprev is not visible here.
        loss = rnn.adagrad(X, Y, h0)

        # Exponential moving average, seeded with the first observed loss.
        previous = loss if smoothloss == 0 else smoothloss
        smoothloss = 0.999*previous + 0.001*loss

        if iteration % 10000 == 0:
            print('Iteration: {}, Loss: {}'.format(iteration, smoothloss))
            y = rnn.synthetize(rnn.hprev, X[:, 0], 200)
            sentence = one_hot_decoding(y, ind_to_char)
            print(sentence)
            sentences.append(sentence)
            losslist.append(smoothloss)

        iteration += 1
        if iteration > max_iterations:
            break
    else:
        # The pass ended without reaching the cap: go on to the next epoch.
        continue
    # Only reached when the cap was hit inside the pass.
    break

### Evaluate and plot

## LSTM

### Load and Preprocess Data

In [2]:
# Corpus for the LSTM; load_data is called with remove_footnotes=False here.
training_data_filename = "../Dataset/Training/vol1.txt"
book_data = np.array(load_data(training_data_filename, remove_footnotes=False))

# Vocabulary and the symbol <-> index lookup tables.
book_chars = np.unique(book_data)
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: ind for ind, ch in ind_to_char.items()}
k = len(book_chars)
batch_size = 50
seq_length = 50

# Encode the whole corpus as vocabulary indices.
book_data_ind = np.array([char_to_ind[ch] for ch in book_data])

# Cut the index stream into windows of seq_length + 1 elements; the extra
# element lets every window be split into an input and a shifted target.
# An incomplete window at the end is dropped.
char_dataset = tf.data.Dataset.from_tensor_slices(book_data_ind)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(data):
    """Split one window into an (input, target) pair shifted by one position.

    Args:
        data: A sliceable sequence; in this notebook a window of
            ``seq_length + 1`` vocabulary indices.

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` is ``data`` without
        its last element and ``targets`` is ``data`` without its first, so
        ``targets[i]`` is the element that follows ``inputs[i]``.
    """
    # Local names chosen so the builtin `input` is not shadowed.
    inputs = data[:-1]
    targets = data[1:]
    return inputs, targets

# Input pipeline:
#   1. split every window into an (X, Y) pair,
#   2. shuffle the pairs with a buffer of 10000,
#   3. group them into batches of batch_size sequences (each sequence holds
#      seq_length characters), dropping an incomplete final batch.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(10000)
    .batch(batch_size, drop_remainder=True)
)


### Build and train model

In [3]:

def loss(Y_true, Y_pred):
    """Sparse categorical cross-entropy computed on unnormalised logits.

    Args:
        Y_true: Target class indices.
        Y_pred: Model outputs, interpreted as logits (``from_logits=True``).

    Returns:
        Whatever ``tf.keras.losses.sparse_categorical_crossentropy`` returns
        for these arguments.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
    return cross_entropy(Y_true, Y_pred, from_logits=True)

# Length of the vocabulary in chars. build_model uses it both as the
# Embedding input_dim and as the width of the final Dense layer.
vocab_size = len(book_chars)

# The embedding dimension (output_dim of the Embedding layer in build_model).
embedding_dim = 200

# Number of RNN units in each of the two LSTM layers of build_model.
rnn_units = 200

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network used in this notebook.

    Layer stack, in order: Embedding -> BatchNormalization -> LSTM -> LSTM
    -> Dense.

    Args:
        vocab_size: Embedding ``input_dim`` and number of Dense outputs.
        embedding_dim: Embedding ``output_dim``.
        rnn_units: Number of units in each LSTM layer.
        batch_size: Fixed batch dimension declared through
            ``batch_input_shape=[batch_size, None]``.

    Returns:
        The ``tf.keras.Sequential`` model (not compiled).
    """
    layers = tf.keras.layers

    def stateful_lstm():
        # Each layer gets its own initializer instance, as two separate
        # constructor calls would.
        return layers.LSTM(
            units=rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer=tf.keras.initializers.GlorotNormal(),
        )

    return tf.keras.Sequential([
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            batch_input_shape=[batch_size, None],
        ),
        layers.BatchNormalization(synchronized=True),
        stateful_lstm(),
        stateful_lstm(),
        # No activation is given, so the loss is applied with
        # from_logits=True.
        layers.Dense(vocab_size),
    ])

m = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

# Sanity check before training: run one batch through the model and draw
# one sample per position from the logits of the batch's first sequence.
for i_ex, t_ex in dataset.take(1):
    example_pred = m(i_ex)
    sampled_indices = tf.random.categorical(logits=example_pred[0], num_samples=1)
    sample_ind_1d = tf.squeeze(input=sampled_indices, axis=-1).numpy()

# Decode the first input sequence and the sampled indices back to text.
input_text = ''.join(ind_to_char[ind] for ind in i_ex.numpy()[0])
sampled_text = ''.join(ind_to_char[ind] for ind in sample_ind_1d)
print('Input:\n', repr(input_text))
print('Next char prediction:\n', repr(sampled_text))

# Directory where the checkpoints will be saved. Built with os.path.join so
# the separators are right on every platform, not only on Windows.
current_dir_path = os.getcwd()
checkpoint_dir = os.path.join(current_dir_path, "tmp", "checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)

# Name of the checkpoint files; {epoch:02d} is filled in by the callback.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}.hdf5')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    # fit() below gets no validation data, so the default monitor
    # ('val_loss') would never be available and nothing would be saved when
    # save_best_only is on. Track the training loss instead.
    monitor='loss',
    save_weights_only=True,
    # Was misspelled `save_best_onl`, which is not a ModelCheckpoint
    # argument.
    save_best_only=True,
)

# Specify update rule and compile model
adam_opt = tf.keras.optimizers.Adam(learning_rate=0.01)
m.compile(optimizer=adam_opt, loss=loss)

# Train; the weights are checkpointed through the callback.
history = m.fit(x=dataset, epochs=1, callbacks=[checkpoint_callback])



Input:
 'e folio\nMS., and are printed i'
Next char prediction:
 'b-A&4%)┤╣é$kfp╗ñ{8Jqb¼@5FWN0u®'


### Generate text

In [6]:
# The checkpoint to restore is pinned by hand because
# tf.train.latest_checkpoint(dir) does not find these files (presumably
# because the weights are written as .hdf5 files rather than TensorFlow-format
# checkpoints -- TODO confirm). The disabled scan below would instead pick the
# file with the highest epoch number in checkpoint_dir.
# latest_epoch = 0
# latest_checkpoint_file = ""
# for file in os.listdir(checkpoint_dir):
#     e = int(file.split("_")[1].split(".")[0])
#     if e>latest_epoch:
#         latest_epoch = e
#         latest_checkpoint_file = file
# print(latest_checkpoint_file)
latest_checkpoint_file = "ckpt_01.hdf5"

# Rebuild the network with a batch size of 1 for generation and restore the
# trained weights into it.
simplified_batch_size = 1
m = build_model(vocab_size, embedding_dim, rnn_units, simplified_batch_size)
# os.path.join keeps this consistent with how the checkpoint path is built
# when the files are written.
m.load_weights(os.path.join(checkpoint_dir, latest_checkpoint_file))
m.build(tf.TensorShape([simplified_batch_size, None]))

# Generate text
gen_text = generate_text(model=m, start_string="the", text_size=1000, char_to_ind=char_to_ind, ind_to_char=ind_to_char)
print(gen_text)


the the coping hid, thougrasties'.          Ire rositz.


28-175) the vitzs mer. (Ol
[Foto nure blol ingut hall cher Vouly coep_ts and mave's for Lockssecke; wat shese',
  Nor Histy
boll Magoterbigle, "Our,) fagldericallerg the Eurceran mesis najecers. Is foultry of
commice allselsat, has slistione by than by alsron, in alprent of I can bain's, to Spee ofk
  you
rispleed,-1.
  Bide's and be; hally, chil of 1803.
  The mare might to G. Coperish Shawhemt wa . _N's hight, ergestet;
  T no uspry in, and oby
('Nettions, auther a shalll this said, yation;
  But work as fince at, who sastear meng bleoight; wapleents sexprew's projeccanbert, lline his sirnsd distic. Leagen Foore
Reatuound abty rearating--nocceop;" 1817. The loved of
  To the
Porged tomards'

"Ocation; poosk:
  enther weored Hy this this 'watorn tronitionic
  Veentle his all eghe,
ignions.
  'MA.
['.' The a heu ut


1, Theak as;
  Barks, whulers, vert--none, lating flool,' fay, the "blequenhered, ingresple--

['MS. [F.S' outsum

### Evaluate and plot

In [9]:
# Score the generated text against a reference text.
# NOTE(review): the reference is read from training_data_filename, i.e. the
# file the model was trained on -- confirm that no separate validation file
# was intended.
with open(training_data_filename) as reference_file:
    validation_text = reference_file.read()

fraction_correct_words, bleu_score = measure_bleu(
    text_generated=gen_text,
    text_val=validation_text,
    n_max=2,
)
repetition_score = measure_diversity(text_generated=gen_text, n_max=2)

report_template = "fraction of correctly spelled words: {} \n Bleu score: {} \n Repetition score: {}"
print(report_template.format(fraction_correct_words, bleu_score, repetition_score))


fraction of correctly spelled words: 0.4409937888198758 
 Bleu score: 0.00026698068015285157 
 Repetition score: 0.0012924051150995462
