# DD2424 Project in Deep Learning in Data Science

## Imports

In [2]:
from functions import *
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
import os

## Vanilla RNN

### Load and Preprocess Data

In [3]:

# Load the training text through load_data (defined in functions.py) and wrap
# the result in a NumPy array.
# NOTE(review): presumably one character per element - confirm in load_data.
book_data = np.array(load_data('../Dataset/Training/vol1.txt'))

# np.unique returns the sorted distinct entries; these form the vocabulary.
book_chars = np.unique(book_data)

# Two-way lookup between a vocabulary entry and its position in book_chars.
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: i for i, ch in ind_to_char.items()}

# Values handed to RNN(m, k, eta, seq_length, sig) in the training cell.
k = len(book_chars)  # vocabulary size
m, eta, seq_length, sig = 100, 0.1, 25, 0.01



### Train

In [None]:
# Vanilla RNN training: sweep the text in consecutive seq_length-sized windows
# for at most `epochs` passes, stopping early once the update counter exceeds
# MAX_UPDATES.
MAX_UPDATES = 400001   # training stops as soon as iteration > MAX_UPDATES
REPORT_EVERY = 10000   # log the smoothed loss and sample text this often

rnn = RNN(m, k, eta, seq_length, sig)
h0 = np.zeros((m, 1))
epochs = 10
losslist = []
iteration = 0
smoothloss = 0
sentences = []
budget_spent = False

for epoch in range(epochs):
    # Reset the stored hidden state at the start of every pass over the text.
    rnn.hprev = np.zeros((m, 1))
    last_start = book_data.shape[0] - seq_length - 1
    for start in range(0, last_start, seq_length):
        stop = start + seq_length
        # Targets are the inputs shifted one position to the right.
        X_chars = book_data[start:stop]
        Y_chars = book_data[start + 1:stop + 1]
        X = one_hot_encoding(X_chars, char_to_ind, k)
        Y = one_hot_encoding(Y_chars, char_to_ind, k)
        # NOTE(review): h0 is passed on every step while rnn.hprev is only used
        # for sampling below - confirm inside RNN.adagrad that this is intended.
        loss = rnn.adagrad(X, Y, h0)

        # Exponential moving average of the loss, seeded with the first value.
        if smoothloss == 0:
            smoothloss = loss
        smoothloss = 0.999*smoothloss + 0.001*loss

        if iteration % REPORT_EVERY == 0:
            print('Iteration: {}, Loss: {}'.format(iteration, smoothloss))
            # Sample 200 steps starting from the first column of the current input.
            y = rnn.synthetize(rnn.hprev, X[:, 0], 200)
            sentence = one_hot_decoding(y, ind_to_char)
            print(sentence)
            sentences.append(sentence)
            losslist.append(smoothloss)

        iteration += 1
        if iteration > MAX_UPDATES:
            budget_spent = True
            break
    if budget_spent:
        break

### Evaluate and plot

## LSTM

### Load and Preprocess Data

In [4]:

training_data_filename = '../Dataset/Training/edgar.txt'
book_data = np.array(load_data(training_data_filename,remove_footnotes=False))

# Read the raw text once and derive the whitespace-separated word list from it
# (the file used to be opened and read twice with identical arguments).
# NOTE(review): validation_text is the *training* file itself, so metrics that
# compare against it are not measured on held-out data.
# NOTE(review): the printed data starts with '´╗┐' and contains sequences such
# as 'ÔÇÿ', which looks like a UTF-8 file (with BOM) decoded as cp850. If the
# encoding is changed it must be changed consistently with load_data - confirm.
with open(training_data_filename, encoding='cp850', mode='r') as f:
    validation_text = f.read()
words = validation_text.split()

print(book_data)

# Vocabulary (sorted distinct entries of book_data) and two-way index lookups.
book_chars = np.unique(book_data)
char_to_ind = {ch:i for i,ch in enumerate(book_chars)}
ind_to_char = {i:ch for i,ch in enumerate(book_chars)}
k = book_chars.shape[0]

# Hyperparameters
batch_size = 50 # [25, 75, 125]
seq_length = 50 # [25, 75, 125]
learningrate = 0.1 # [0.1, 0.01, 0.001]

# Integer-encode the whole text using the vocabulary index.
book_data_ind = np.array([char_to_ind[c] for c in book_data])
print(char_to_ind)

# Split data into sequences of seq_length+1 ids: the extra element lets each
# chunk be turned into an input and a one-step-shifted target of seq_length.
char_dataset = tf.data.Dataset.from_tensor_slices(book_data_ind)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

def split_input_target(data):
    """Split a sequence into an (input, target) pair shifted by one position.

    Args:
        data: any sliceable sequence.

    Returns:
        A tuple ``(data[:-1], data[1:])``: everything but the last element, and
        everything but the first element.
    """
    return data[:-1], data[1:]

# Turn every chunk into an (input, target) pair, shuffle the pairs through a
# 10000-element buffer, then group them batch_size at a time. batch_size is the
# number of sequences per batch; incomplete trailing batches are dropped.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(10000)
    .batch(batch_size, drop_remainder=True)
)


['´' '╗' '┐' ... '\n' '\n' '\n']
{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '&': 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, '[': 52, ']': 53, '_': 54, 'a': 55, 'b': 56, 'c': 57, 'd': 58, 'e': 59, 'f': 60, 'g': 61, 'h': 62, 'i': 63, 'j': 64, 'k': 65, 'l': 66, 'm': 67, 'n': 68, 'o': 69, 'p': 70, 'q': 71, 'r': 72, 's': 73, 't': 74, 'u': 75, 'v': 76, 'w': 77, 'x': 78, 'y': 79, 'z': 80, '~': 81, '£': 82, 'ª': 83, '¬': 84, '®': 85, '´': 86, '»': 87, '¿': 88, 'Á': 89, 'Â': 90, 'Ç': 91, 'Ô': 92, 'Ö': 93, 'Ø': 94, 'á': 95, 'í': 96, 'ó': 97, 'ö': 98, 'ÿ': 99, '│': 100, '┐': 101, '├': 102, '┤': 103, '┬': 104, '╗': 105, '░': 106, '▓': 

### Build and train model

In [5]:

# Name of the sub-directory (under tmp/) that holds this run's checkpoints.
modelpathname = 'v1'


def loss(Y_true, Y_pred):
    """Sparse categorical cross-entropy between integer targets and logits.

    Args:
        Y_true: target class indices.
        Y_pred: model outputs, interpreted as unnormalised logits
            (``from_logits=True``).

    Returns:
        Whatever ``tf.keras.losses.sparse_categorical_crossentropy`` returns
        for these arguments.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(Y_true, Y_pred, from_logits=True)


# Number of distinct entries in the vocabulary.
vocab_size = book_chars.shape[0]

# Width of the embedding vectors.
embedding_dim = 256

# Number of units per LSTM layer. `u` is not read in this cell.
# NOTE(review): presumably the list of unit counts tried during tuning - confirm.
u = [50, 100, 200, 500, 1000]
rnn_units = 200
n_epochs = 5


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the network: Embedding -> BatchNormalization -> 2 x LSTM -> Dense.

    Args:
        vocab_size: size of the embedding input and of the final Dense layer.
        embedding_dim: output dimension of the embedding layer.
        rnn_units: number of units in each of the two LSTM layers.
        batch_size: fixed batch dimension of the input. The LSTM layers are
            stateful, so the batch size is part of the input shape.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose last layer is
        ``Dense(vocab_size)`` with no activation specified.
    """
    embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    normalisation = tf.keras.layers.BatchNormalization(synchronized=True)
    # Two identically configured stateful LSTM layers, each with its own
    # GlorotNormal initializer instance for the recurrent weights.
    recurrent_stack = [
        tf.keras.layers.LSTM(
            units=rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer=tf.keras.initializers.GlorotNormal(),
        )
        for _ in range(2)
    ]
    head = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, normalisation, *recurrent_stack, head])

m = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

# Smoke test: push one batch through the untrained network and draw one
# character id per time step from the logits of the first sequence in the batch.
for i_ex, t_ex in dataset.take(1):
    example_pred = m(i_ex)  # this step builds the model
    sampled_indices = tf.random.categorical(logits=example_pred[0], num_samples=1)
    sample_ind_1d = tf.squeeze(input=sampled_indices, axis=-1).numpy()
print('Input:\n', repr(''.join([ind_to_char[c] for c in i_ex.numpy()[0]])))
print('Next char prediction:\n', repr(''.join([ind_to_char[c] for c in sample_ind_1d])))

# Directory where the checkpoints will be saved. Built with os.path.join so the
# path is valid on every OS (hard-coded "\\" separators only work on Windows).
current_dir_path = os.getcwd()
checkpoint_dir = os.path.join(current_dir_path, "tmp", modelpathname, "training_checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)

# Name of the checkpoint files; {epoch:02d} is filled in by Keras.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}.hdf5')
# Keep only checkpoints that improve on the best loss seen so far.
# - The keyword was previously misspelled `save_best_onl`, so best-only saving
#   was never actually requested.
# - monitor='loss' is required here: fit() below gets no validation data, so
#   the default monitored quantity 'val_loss' would never be available.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    monitor='loss',
    save_weights_only=True,
    save_best_only=True,
)


# Specify update rule and compile model
adam_opt = tf.keras.optimizers.Adam(learning_rate=learningrate)
m.compile(optimizer=adam_opt, loss=loss)

# train
history = m.fit(x=dataset, epochs=n_epochs, callbacks=[checkpoint_callback])



Input:
 'ime enough\n      ÔÇÿto throw the body into the riv'
Next char prediction:
 'X8wx]F6_dá»Z5LS[OX2YdrÖ├pft░!(v:*TLWKZ-3uK4($~├VF6'


### Generate text

In [8]:
# Pick the checkpoint with the highest epoch number in checkpoint_dir.
# tf.train.latest_checkpoint(dir) does not help here, most likely because it
# looks for the `checkpoint` state file of TensorFlow-format checkpoints, while
# these weights are written as standalone .hdf5 files.
latest_epoch = 0
latest_checkpoint_file = ""
for file in os.listdir(checkpoint_dir):
    stem, ext = os.path.splitext(file)
    prefix, _, epoch_str = stem.partition("_")
    # Skip anything that is not named ckpt_<number>.hdf5 instead of crashing
    # on it (stray files, OS metadata, ...).
    if prefix != "ckpt" or ext != ".hdf5" or not epoch_str.isdecimal():
        continue
    e = int(epoch_str)
    if e > latest_epoch:
        latest_epoch = e
        latest_checkpoint_file = file
if not latest_checkpoint_file:
    raise FileNotFoundError(
        "No ckpt_<epoch>.hdf5 checkpoint found in {}".format(checkpoint_dir)
    )
print(latest_checkpoint_file)

# The file found above is used as-is; a hard-coded "ckpt_01.hdf5" override used
# to sit here and silently discarded the result of the search.

# Rebuild the network with batch size 1 for generation. The LSTM layers are
# stateful, so the batch size is fixed when the model is constructed.
simplified_batch_size = 1
m = build_model(vocab_size, embedding_dim, rnn_units, simplified_batch_size)
m.load_weights(os.path.join(checkpoint_dir, latest_checkpoint_file))
m.build(tf.TensorShape([simplified_batch_size, None]))

# generate_text comes from functions.py; "Water " is the seed string.
gen_text = generate_text(m, "Water ", 200, char_to_ind, ind_to_char)
print(gen_text)


ckpt_01.hdf5
Water . Ocountr-bullÔÇöGoether ouralo,, t lls, up elly upinon lapeare mauleÔÇö_serec.ÔÇö it ÔÇ£Floun froke, Mand tirt. fanc. hose jurn eoks by the soule ony
 whey hily soup re dentin-yout were by ano he fre obe tory no
 He 1re opurigion,yele dowy dedk eemant        4   ipodaly In unsoreatel-an uno tince-intowh ire of furautedile; ambody
fore
 Iiod the in le neotery ised. digalyÔÇes of ancoung mistous dan
 whome anchiwele usterouly, body, ÔÇöSurfan, and
    nouag ounidke finio_ of all be
    frome he ammosk of anture gal, muckineigs. That the nif on cont  he hers, hut have anrady!siagn to apsion upigh.ÔÇØ

, weke ne or
_ is dere guir toulu_
 le be trie ri grieve
 in attere we cur toge, o weroy wenory in uphe tlat. antour.) ful wo. jume by ut bose. of I Vure looke fro matlat the le norighouly,aut uf no suro surace of haghe, is of norinous it to wered an mar-abos-tertable tre to logy Berifid mame tho 2rpin wilo le be Madue, ony withouly curm bore an, he stals ere de tore ony 

### Evaluate and plot

In [None]:
# Training loss per epoch, as recorded by m.fit in the training cell.
print(history.history['loss'])
fig = px.line(history.history['loss'], title='Loss over epochs')
fig.show()

# Fraction of generated unigrams that also occur in the word list of the text.
# get_n_grams comes from functions.py; only iteration and len() are used on d.
d = get_n_grams(gen_text, 1)
correctwords = sum(1 for word in d if word in words)

# Guard against an empty unigram collection (e.g. empty generated text), which
# would otherwise raise ZeroDivisionError.
correct_fraction = correctwords/len(d) if len(d) > 0 else 0.0
print("Correct % words: {}".format(correct_fraction))

# Calculate performance metrics for generated text
fraction_correct_words, bleu_score = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=2)
repetition_score = measure_diversity(text_generated=gen_text, n_max=2)
print("fraction of correctly spelled words: {} \n Bleu score: {} \n Repetition score: {}".format(fraction_correct_words, bleu_score, repetition_score))