# DD2424 Project in Deep Learning in Data Science

## Imports

In [1]:
from functions import *
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
import os

## Vanilla RNN

### Load and Preprocess Data

In [None]:
# Corpus for the vanilla RNN: one volume, handled character by character.
training_data_filename = '../Dataset/Training/edgar1.txt'
book_data = np.array(load_data(training_data_filename))
book_chars = np.unique(book_data)

# Two-way lookup between every distinct character and its integer index.
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: ind for ind, ch in ind_to_char.items()}

# Hyper-parameters handed to RNN(m, k, eta, seq_length, sig) in the training cell.
k = len(book_chars)  # number of distinct characters in the corpus
m = 100              # rows of the (m, 1) hidden-state vector
eta = 0.1            # presumably the learning rate -- confirm in functions.py
seq_length = 25      # characters per training chunk
sig = 0.01           # presumably the weight-initialisation scale -- confirm in functions.py


### Train

In [None]:
# Train the vanilla RNN with rnn.adagrad on consecutive seq_length-character
# chunks of book_data, for at most `epochs` passes or `max_iter` updates,
# whichever comes first.
rnn = RNN(m, k, eta, seq_length, sig)
h0 = np.zeros((m, 1))
max_iter = 200000
epochs = 10
smoothloss_list = []
loss_list = []
iteration = 0
smoothloss = 0
sentences = []
for i in range(epochs):
    # Start every pass over the book from a zero hidden state.
    rnn.hprev = np.zeros((m, 1))
    for e in range(0, book_data.shape[0]-seq_length-1, seq_length):
        # Targets are the inputs shifted one character ahead.
        X_chars = book_data[e:e+seq_length]
        Y_chars = book_data[e+1:e+seq_length+1]
        X = one_hot_encoding(X_chars, char_to_ind, k)
        Y = one_hot_encoding(Y_chars, char_to_ind, k)
        # NOTE(review): h0 (all zeros) is passed on every step while rnn.hprev is
        # what gets reset per epoch -- confirm which one adagrad actually uses.
        loss = rnn.adagrad(X, Y, h0, iteration)
        if smoothloss == 0:
            # Seed the running average with the first observed loss.
            smoothloss = loss
        smoothloss = 0.999*smoothloss + 0.001*loss

        if iteration % 10000 == 1:
            print('Iteration: {}, Loss: {} '.format(iteration, smoothloss))
            y = rnn.synthetize(rnn.hprev, X[:, 0], 200)
            sentence = one_hot_decoding(y, ind_to_char)
            print(sentence + "\n")
            smoothloss_list.append(smoothloss)
            loss_list.append(loss)

        iteration += 1
        if iteration > max_iter:
            break

    # The inner `break` only leaves the chunk loop; without this second check
    # every remaining epoch would still perform one extra update.
    if iteration > max_iter:
        break


### Evaluate and plot

In [None]:

# Text that the generated sample is scored against (the training file itself).
with open(training_data_filename, mode='r', encoding='cp850') as file:
    validation_text = file.read()

# Synthesize from the trained network, seeded with a single character.
start_char = "T"
start_char_onehot = one_hot_encoding(start_char, char_to_ind, k)
generated_text_vanilla_onehot = rnn.synthetize(rnn.hprev, start_char_onehot, 1000)
generated_text_vanilla = start_char + one_hot_decoding(generated_text_vanilla_onehot, ind_to_char)
print(generated_text_vanilla)

# Calculate performance metrics for generated text
nmax = 4
fraction_correct_words, bleu_score = measure_bleu(
    text_generated=generated_text_vanilla,
    text_val=validation_text,
    n_max=nmax,
)
repetition_score = measure_diversity(
    text_generated=generated_text_vanilla,
    n_max=nmax,
)
print("\n loss function", loss_list)
metrics_template = "\n fraction of correctly spelled words: {} \n Bleu score: {} \n Repetition score: {}"
print(metrics_template.format(fraction_correct_words, bleu_score, repetition_score))

# One point per logged checkpoint of the training loop (every 10k iterations).
fig = px.line(smoothloss_list, title='Smoothed loss over epochs', width=600)
(
    fig.update_layout(showlegend=False)
    .update_yaxes(title_text="smoothed loss")
    .update_xaxes(title_text="iteration step, in multiples of 10k")
)
fig.show()



## LSTM

### Load and Preprocess Data

In [2]:
# The five volumes of the corpus.
vol1_filename = '../Dataset/Training/edgar1.txt'
vol2_filename = '../Dataset/Training/edgar2.txt'
vol3_filename = '../Dataset/Training/edgar3.txt'
vol4_filename = '../Dataset/Training/edgar4.txt'
vol5_filename = '../Dataset/Training/edgar5.txt'

# Every volume is loaded as an array of characters, footnotes kept.
encoding = "utf8"
book_data_vol1 = np.array(load_data(vol1_filename, encoding=encoding, remove_footnotes=False))
book_data_vol2 = np.array(load_data(vol2_filename, encoding=encoding, remove_footnotes=False))
book_data_vol3 = np.array(load_data(vol3_filename, encoding=encoding, remove_footnotes=False))
book_data_vol4 = np.array(load_data(vol4_filename, encoding=encoding, remove_footnotes=False))
book_data_vol5 = np.array(load_data(vol5_filename, encoding=encoding, remove_footnotes=False))

# Raw text views: whitespace-split words of volume 1, the full text of
# volume 2, and the synonym table used for data augmentation.
with open(vol1_filename, mode='r', encoding=encoding) as f:
    words = f.read().split()
with open(vol2_filename, mode='r', encoding=encoding) as file:
    validation2_text = file.read()
with open('../Dataset/Training/synonyms.csv', mode="r", encoding=encoding) as file:
    all_synonyms = file.read()

# The vocabulary covers volumes 1-4 plus every character of the synonym table.
# NOTE(review): volume 5 is loaded but not part of the vocabulary -- confirm
# that this is intentional.
all_book_data = np.concatenate((book_data_vol1, book_data_vol2, book_data_vol3, book_data_vol4))
synonym_chars = np.array(list(all_synonyms))
book_chars = np.unique(np.concatenate((all_book_data, synonym_chars)))
print(len(book_chars))

# Two-way lookup between every distinct character and its integer index.
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: ind for ind, ch in ind_to_char.items()}
k = book_chars.shape[0]

# Sizes used by build_model's embedding and output layers.
vocab_size = len(book_chars)
embedding_dim = 256


def create_batches(data, batch_size, seq_length):
    """Turn a character sequence into a shuffled, batched tf.data.Dataset.

    Args:
        data: iterable of characters; each one must be a key of the
            module-level ``char_to_ind`` mapping.
        batch_size: number of sequences per batch.
        seq_length: number of characters per input sequence.

    Returns:
        A dataset of batches produced by mapping ``split_input_target`` over
        chunks of ``seq_length + 1`` character indices. Incomplete chunks and
        incomplete batches are dropped.
    """
    indices = np.array([char_to_ind[ch] for ch in data])

    # Cut the index stream into chunks one longer than seq_length, so that
    # split_input_target can derive the X, Y pair from each chunk.
    chunks = tf.data.Dataset.from_tensor_slices(indices).batch(
        seq_length + 1, drop_remainder=True
    )
    pairs = chunks.map(split_input_target)

    # Shuffle with a 10000-element buffer, then group batch_size sequences
    # into each batch.
    return pairs.shuffle(10000).batch(batch_size, drop_remainder=True)

143


### Functions for training

In [3]:

def build_model(rnn_units, batch_size, nr_lstm_layers, bn):
    """Assemble the character-level LSTM network.

    Args:
        rnn_units: number of units in every LSTM layer.
        batch_size: fixed batch dimension of the input (the LSTM layers are
            stateful, so it is baked into the model).
        nr_lstm_layers: how many LSTM layers to stack.
        bn: when truthy, add a BatchNormalization layer after the LSTM stack.

    Returns:
        An uncompiled ``tf.keras.Sequential``: Embedding -> LSTM(s) ->
        [BatchNormalization] -> Dense(vocab_size). ``vocab_size`` and
        ``embedding_dim`` are read from module scope.
    """
    layers = [
        tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            batch_input_shape=[batch_size, None],
        )
    ]
    layers.extend(
        tf.keras.layers.LSTM(
            units=rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer=tf.keras.initializers.GlorotNormal(),
        )
        for _ in range(nr_lstm_layers)
    )
    if bn:
        layers.append(tf.keras.layers.BatchNormalization())
    # The final layer has one output per vocabulary entry; train_model treats
    # these as logits.
    layers.append(tf.keras.layers.Dense(vocab_size))

    m = tf.keras.Sequential()
    for layer in layers:
        m.add(layer)
    return m


def train_model(dataset_train, dataset_val, eta, rnn_units, n_epochs, batch_size, output_filename, nr_lstm_layers=1, bn=False):
    """Build, compile and fit a model, saving its weights after every epoch.

    Args:
        dataset_train: batched dataset used for fitting.
        dataset_val: batched dataset passed to ``fit`` as validation data.
        eta: learning rate for the Adam optimizer.
        rnn_units: number of units per LSTM layer.
        n_epochs: number of epochs to train.
        batch_size: batch dimension the model is built with; must match the
            batch size of the datasets.
        output_filename: sub-directory of ``<cwd>/tmp`` that receives the
            ``training_checkpoints`` folder.
        nr_lstm_layers: number of stacked LSTM layers.
        bn: whether to add batch normalization after the LSTM stack.

    Returns:
        Tuple ``(model, history)`` where ``history`` is the object returned
        by ``model.fit``.
    """
    m = build_model(rnn_units, batch_size, nr_lstm_layers, bn)
    for i_ex, t_ex in dataset_train.take(1):
        m(i_ex)  # a forward pass on one batch builds the model

    # Specify update rule and compile model
    adam_opt = tf.keras.optimizers.Adam(learning_rate=eta)
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)
    m.compile(optimizer=adam_opt, loss=loss_func)

    # Configure checkpoints: one weights-only file per epoch, named ckpt_<epoch>.hdf5
    checkpoint_dir = os.path.join(os.getcwd(), "tmp", output_filename, "training_checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}.hdf5')
    # The keyword was previously misspelled as `save_best_onl`.
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True, save_best_only=False)

    # train
    history = m.fit(x=dataset_train, epochs=n_epochs, validation_data=dataset_val, callbacks=[checkpoint_callback])
    return m, history

### Grid search

In [None]:

# Grid search over batch size, sequence length and learning rate for a
# single-layer model. Every configuration is trained for n_epochs, then a
# batch-size-1 copy of the model generates text that is scored with BLEU-2.
rnn_units = 200
n_epochs = 5
modelpathname = "1layer_gridsearch"
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
output_str = "batch_size | seq_len | eta | loss | BLEU2 \n"

batch_sizes = [25, 75, 125]
seq_lengths = [25, 75, 125]
learningrates = [0.1, 0.01, 0.001]
n_configs = len(batch_sizes) * len(seq_lengths) * len(learningrates)
count = 0
for batch_size in batch_sizes:
    for seq_length in seq_lengths:
        # Same train/validation volumes as the single-model cell. The datasets
        # only depend on batch_size and seq_length, so build them once here.
        dataset = create_batches(book_data_vol1, batch_size, seq_length)
        dataset_val = create_batches(book_data_vol2, batch_size, seq_length)
        for eta in learningrates:
            print(count/n_configs)  # fraction of configurations completed
            # train_model takes the validation dataset as its second argument.
            model, history = train_model(dataset, dataset_val, eta, rnn_units, n_epochs, batch_size, modelpathname, 1, False)
            final_loss = history.history['loss'][-1]
            # generate text
            # The stateful model is tied to its batch size, so rebuild it with
            # batch_size=1 and load the weights saved after the last epoch.
            m = build_model(rnn_units=rnn_units, batch_size=1, nr_lstm_layers=1, bn=False)
            m.load_weights(os.path.join(checkpoint_dir, "ckpt_{:02d}.hdf5".format(n_epochs)))
            m.build(tf.TensorShape([1, None]))
            gen_text = generate_text(model=m, start_string="The ", text_size=1000,
                                     char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=1.0, p=None)
            # Measure performance
            # NOTE(review): validation_text is defined in the vanilla RNN
            # evaluation cell; that cell must have been run first.
            frac_corr_words, bleu2 = measure_bleu(gen_text, validation_text, 2)
            output_str += "{}      {}      {}      {}      {} \n".format(batch_size, seq_length, eta, final_loss, bleu2)

            count += 1

print(output_str)
            

### Build and train one model

In [4]:

# Hyper-parameters for a single training run.
rnn_units = 200
n_epochs = 5
eta = 0.01
batch_size = 50
seq_length = 50

# Checkpoints for this run live under <cwd>/tmp/<modelpathname>/training_checkpoints.
modelpathname = 'v7_1'
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")

# Volume 1 is the training set, volume 2 the validation set.
dataset_train = create_batches(book_data_vol1, batch_size, seq_length)
dataset_val = create_batches(book_data_vol2, batch_size, seq_length)

nr_layers = 1
model, history = train_model(
    dataset_train, dataset_val, eta, rnn_units, n_epochs,
    batch_size, modelpathname, nr_layers, False,
)

# Pick the checkpoint with the highest epoch number encoded in its file name
# (ckpt_<epoch>.hdf5).
latest_epoch = 0
latest_checkpoint_file = ""
for file in os.listdir(checkpoint_dir):
    if file == ".DS_Store":
        continue
    e = int(file.split("_")[1].split(".")[0])
    if e > latest_epoch:
        latest_epoch, latest_checkpoint_file = e, file
print(checkpoint_dir + "/" + latest_checkpoint_file)


Epoch 1/5
 29/223 [==>...........................] - ETA: 29s - loss: 3.0055

In [9]:
data_val_txt = validation2_text

# Rebuild the network with batch size 1 and restore the latest weights.
m = build_model(rnn_units=rnn_units, batch_size=1, nr_lstm_layers=1, bn=False)
m.load_weights(checkpoint_dir + "/" + latest_checkpoint_file)
m.build(tf.TensorShape([1, None]))

# A 500-character excerpt of volume 2 is the start string; everything from
# character 6000 onwards is the reference text for scoring.
input_text = validation2_text[5000:5500]
print(input_text)
validation_data = validation2_text[6000:]

gen_text = generate_text(
    model=m,
    start_string=input_text,
    text_size=1000,
    char_to_ind=char_to_ind,
    ind_to_char=ind_to_char,
    temp=1,
    p=None,
)
fraction_correct_words, bleu_score2 = measure_bleu(
    text_generated=gen_text,
    text_val=validation_data,
    n_max=2,
)
repetition_score2 = measure_diversity(text_generated=gen_text, n_max=2)

print(gen_text)
print(fraction_correct_words, bleu_score2)



Prefect, “from the nature
      of the document, and from the non-appearance of certain results
      which would at once arise from its passing out of the robber’s
      possession; that is to say, from his employing it as he must
      design in the end to employ it.”

      “Be a little more explicit,” I said.

      “Well, I may venture so far as to say that the paper gives its
      holder a certain power in a certain quarter where such power is
      immensely valuable.” The Prefect was fo
l. The
      atming
      and gensension thus
trowed utsefted at dua the old situately moon of Marius.
      “Wh his graved
      the attemitives of evidence
      will brite dwerdong within the improver in ming, This shrever appenders as I bew you intermumbers of
      Legrand to dettently, distance, I seemiled it; them. It and diy ears. He nortrawed flaming up. Forness
     genience of and.

      “I been
      perfection, when
       one seem the do occase of the fact to a the subject, this 

In [7]:

# Train a single-layer model on an augmented copy of the text, then restore
# its latest checkpoint into a batch-size-1 model for text generation.

# Number of RNN units.
rnn_units = 200
n_epochs = 5
eta = 0.01
batch_size = 50
seq_length = 50


modelpathname = 'augmented/v1'
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
augmented_data_str, used_synonyms = augment_data(validation_text, n_synonyms=1000, n_word_swaps=1000, n_deletions=500, n_sentence_swaps=500)
dataset_augmented = create_batches(np.array(list(augmented_data_str)), batch_size, seq_length)
# train_model takes the validation dataset as its second argument. Volume 2
# is used for validation, as in the non-augmented training cell.
dataset_val = create_batches(book_data_vol2, batch_size, seq_length)
nr_layers = 1
model_aug, history_aug = train_model(dataset_augmented, dataset_val, eta, rnn_units, n_epochs, batch_size, modelpathname, nr_layers, False)


# Pick the checkpoint with the highest epoch number encoded in its file name.
latest_epoch = 0
latest_checkpoint_file = ""
for file in os.listdir(checkpoint_dir):
    if file != ".DS_Store":
        e = int(file.split("_")[1].split(".")[0])
        if e>latest_epoch:
            latest_epoch = e
            latest_checkpoint_file = file
print(checkpoint_dir + "/" + latest_checkpoint_file)



# The stateful model is tied to its batch size, so rebuild it with batch size 1.
m = build_model(rnn_units=rnn_units, batch_size=1, nr_lstm_layers=1, bn=False)
m.load_weights(checkpoint_dir + "/" + latest_checkpoint_file)
m.build(tf.TensorShape([1, None]))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
c:\Users\sofia\Documents\skola\Djupinlärning\DD2424_Project\src\tmp\augmented/v1\training_checkpoints/ckpt_05.hdf5
0
1
   
['Prefect', ' “from the nature\n      of the document', ' and from the non-appearance of certain results\n      which would at once arise from its passing out of the robber’s\n      possession; that is to say', ' from his employing it as he must\n      design in the end to employ it', '”\n\n      “Be a little more explicit', '” I said', '\n\n      “Well', ' I may venture so far as to say that the paper gives its\n      holder a certain power in a certain quarter where such power is\n      immensely valuable', '” The Prefect was fond of the cant of\n      diplomacy', '\n\n      “Still I do not quite understand', '” said Dupin', '\n\n      “No? Well; the disclosure of the document to a third person', ' who\n      shall be nameless', ' would bring in question the honor of a\n      personage of most exalted station; and

In [9]:

# Score text generated by the current model `m` against the original plus
# augmented text, and report mean/variance of each metric over the runs.
validation_data_augmented = validation_text + augmented_data_str


bleu2 = []
selfbleu2 = []
bleu1 = []
inputs = ["The ", "j", "V", "?", " ", "nsjkgds", "but", ".", "why", '!']
with open('../Dataset/Training/edgar2.txt',encoding="utf8",mode='r') as file:
    d = file.read()
    input_text = d[5000:25000]

n_runs = 1         # number of successful generate-and-score rounds wanted
max_attempts = 10  # upper bound on tries so a persistent failure cannot loop forever
attempts = 0
j = 0
while j < n_runs and attempts < max_attempts:
    attempts += 1
    print(j)
    try:
        gen_text = generate_text(model=m, start_string=input_text, text_size=2000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=1, p=None)
        fraction_correct_words, bleu_score2 = measure_bleu(text_generated=gen_text, text_val=validation_data_augmented, n_max=2)
        repetition_score2 = measure_diversity(text_generated=gen_text, n_max=2)
    except Exception as err:
        # Still best-effort (retry), but report the failure, and do not catch
        # KeyboardInterrupt/SystemExit as the former bare `except` did.
        print("attempt {} failed: {!r}".format(attempts, err))
        continue
    bleu1.append(fraction_correct_words)
    bleu2.append(bleu_score2)
    selfbleu2.append(repetition_score2)
    j += 1

if bleu1:
    print(np.mean(bleu1), np.var(bleu1))
    print(np.mean(bleu2), np.var(bleu2))
    print(np.mean(selfbleu2), np.var(selfbleu2))
else:
    print("no successful run after {} attempts".format(attempts))


0
1
0.8435611466993161 5.680970120911727e-07
0.5707215889232007 1.7541117542313051e-06
0.3440328559793978 3.34565444594688e-06


### Generate text, evaluate and plot

In [None]:
# Restore the latest checkpoint of the run named by `modelpathname`, generate
# text, plot the training loss, score the text and write a report file.
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
plot_title = 'Loss over epochs'

plotbool = True
outputfile_bool = True

# Find latest checkpoint file, because tf.train.latest_checkpoint(dir) doesn't work for some reason
latest_epoch = 0
latest_checkpoint_file = ""
for file in os.listdir(checkpoint_dir):
    if file != ".DS_Store":
        e = int(file.split("_")[1].split(".")[0])
        if e>latest_epoch:
            latest_epoch = e
            latest_checkpoint_file = file
print(checkpoint_dir + "/" + latest_checkpoint_file)


# The stateful model is tied to its batch size, so rebuild it with batch size 1.
m = build_model(rnn_units=rnn_units, batch_size=1, nr_lstm_layers=1, bn=False)
m.load_weights(checkpoint_dir + "/" + latest_checkpoint_file)
m.build(tf.TensorShape([1, None]))

nucleus_probability = None
temp = 1.0
gen_text = generate_text(model=m, start_string="The ", text_size=1000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=temp, p=nucleus_probability)


if plotbool:
    # history.history['loss'] holds one value per epoch, so label the axes
    # accordingly (the old labels were copied from the vanilla RNN plot).
    fig = px.line(history.history['loss'], title=plot_title, width=600)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(title_text="epoch")
    fig.update_yaxes(title_text="loss")
    fig.show()


# Calculate performance metrics for generated text
fraction_correct_words, bleu_score2 = measure_bleu(text_generated=gen_text, text_val=validation_data, n_max=2)
repetition_score2 = measure_diversity(text_generated=gen_text, n_max=2)

output_str = gen_text + "\n ------------ \n loss {} \n fraction of correctly spelled words: {} \n Bleu score2: {}, Repetition score2: {} ".format(history.history['loss'], fraction_correct_words, bleu_score2, repetition_score2)
output_str += "\n settings: batch_size, seq_length, eta, rnn_units, n_epochs = {}, {}, {}, {}, {}".format(batch_size, seq_length, eta, rnn_units, n_epochs)
print(output_str)


# Create output file
output_path = os.path.join(os.getcwd(), "tmp", modelpathname)
if outputfile_bool:
    # Write as UTF-8: the generated text can contain any character of the
    # vocabulary, which the platform default encoding may not be able to encode.
    with open(os.path.join(output_path, "perf.txt"), "w", encoding="utf8") as file:
        file.write(output_str)
    




In [7]:
# Scratch check of the synonym-table row format: "<word>,<part of speech>,<syn1>;<syn2>;..."
line = "wait on,verb,serve;attend to;attend;assist"
line2 = "unmake,verb,undo"

# First comma-separated field is the word, third is the ';'-separated synonym list.
fields = line.split(",")
word = fields[0]
synonyms = fields[2].split(";")
print(synonyms)

# A row with a single synonym still yields a one-element list.
fields2 = line2.split(",")
print(fields2[2].split(";"))

['serve', 'attend to', 'attend', 'assist']
['undo']
