# DD2424 Project in Deep Learning in Data Science

## Imports

In [1]:
from functions import *
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
import os
import tqdm
from keras.initializers import Constant

## Vanilla RNN

### Load and Preprocess Data

In [None]:
# Read the training corpus and derive the character vocabulary from it.
training_data_filename = '../Dataset/Training/edgar.txt'
book_data = np.array(load_data(training_data_filename))
book_chars = np.unique(book_data)

# Lookup tables between characters and their integer ids (ids follow the
# sorted order returned by np.unique).
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: ind for ind, ch in ind_to_char.items()}
k = len(book_chars)

# Settings handed to RNN(m, k, eta, seq_length, sig) in the training cell.
# NOTE(review): m looks like the hidden-state size (h0 is created as (m, 1)),
# eta the learning rate and sig the init scale - confirm against functions.RNN.
m, eta, seq_length, sig = 100, 0.1, 25, 0.01


### Train

In [None]:
# Train the vanilla RNN with rnn.adagrad on consecutive windows of
# seq_length characters; the target window is the input shifted by one.
rnn = RNN(m, k, eta, seq_length, sig)
h0 = np.zeros((m, 1))
max_iter = 200000
epochs = 10
smoothloss_list = []
loss_list = []
iteration = 0
smoothloss = 0
sentences = []
for i in range(epochs):
    # max_iter caps the total number of updates across all epochs, so the
    # epoch loop has to stop as well once the cap is reached. Breaking only
    # the inner loop would let every remaining epoch reset hprev and run one
    # extra update.
    if iteration > max_iter:
        break
    rnn.hprev = np.zeros((m, 1))
    for e in range(0, book_data.shape[0]-seq_length-1, seq_length):
        X_chars = book_data[e:e+seq_length]
        Y_chars = book_data[e+1:e+seq_length+1]
        X = one_hot_encoding(X_chars, char_to_ind, k)
        Y = one_hot_encoding(Y_chars, char_to_ind, k)
        # NOTE(review): h0 (all zeros) is passed on every update rather than
        # rnn.hprev - confirm that adagrad carries the hidden state itself.
        loss = rnn.adagrad(X, Y, h0, iteration)
        # Seed the running average with the first loss value.
        if smoothloss == 0:
            smoothloss = loss
        smoothloss = 0.999*smoothloss + 0.001*loss

        # Every 10k updates: report progress and sample 200 characters.
        if iteration % 10000 == 1:
            print('Iteration: {}, Loss: {} '.format(iteration, smoothloss))
            y = rnn.synthetize(rnn.hprev, X[:, 0], 200)
            sentence = one_hot_decoding(y, ind_to_char)
            print(sentence + "\n")
            #sentences.append(sentence)
            smoothloss_list.append(smoothloss)
            loss_list.append(loss)

        iteration += 1
        if iteration>max_iter:
            break


### Evaluate and plot

In [None]:

# Evaluate the trained vanilla RNN: sample text, score it, plot the loss.
# NOTE(review): the reference text is read from training_data_filename, i.e.
# the same file the model was trained on - confirm this is intended.
with open(training_data_filename,encoding='utf-8-sig',mode='r') as file:
    validation_text = file.read()
# Sample 1000 characters, seeding the generator with the character "T".
start_char = "T"
start_char_onehot = one_hot_encoding(start_char, char_to_ind, k)
generated_text_vanilla_onehot = rnn.synthetize(rnn.hprev, start_char_onehot, 1000)
generated_text_vanilla = start_char + one_hot_decoding(generated_text_vanilla_onehot, ind_to_char)
print(generated_text_vanilla)

# Calculate performance metrics for generated text
nmax = 4
fraction_correct_words, bleu_score = measure_bleu(text_generated=generated_text_vanilla, text_val=validation_text, n_max=nmax)
repetition_score = measure_diversity(text_generated=generated_text_vanilla, n_max=nmax)
print("\n loss function", loss_list)
print("\n fraction of correctly spelled words: {} \n Bleu score: {} \n Repetition score: {}".format(fraction_correct_words, bleu_score, repetition_score))



# smoothloss_list holds one entry per 10k training updates (see the training cell).
fig = px.line(smoothloss_list, title='Smoothed loss over epochs', width=600)
fig.update_layout(showlegend=False)
fig.update_yaxes(title_text="smoothed loss")
fig.update_xaxes(title_text="iteration step, in multiples of 10k")
fig.show()



## LSTM

### Load and Preprocess Data

In [2]:
# Paths of the five training volumes.
volume_filenames = [
    '../Dataset/Training/edgar1.txt',
    '../Dataset/Training/edgar2.txt',
    '../Dataset/Training/edgar3.txt',
    '../Dataset/Training/edgar4.txt',
    '../Dataset/Training/edgar5.txt',
]
vol1_filename, vol2_filename, vol3_filename, vol4_filename, vol5_filename = volume_filenames

encoding = "utf8"

# Character arrays for each volume, as produced by load_data.
book_data_vol1, book_data_vol2, book_data_vol3, book_data_vol4, book_data_vol5 = (
    np.array(load_data(path, remove_footnotes=False, encoding=encoding))
    for path in volume_filenames
)

# Raw text of all volumes concatenated; later cells use it as the reference
# text for the n-gram metrics and as the source of generation prompts.
volume_texts = []
for filename in volume_filenames:
    with open(filename, encoding=encoding, mode='r') as file:
        volume_texts.append(file.read())
ngram_validation_text = "".join(volume_texts)

with open('../Dataset/Training/synonyms.csv', encoding=encoding, mode="r") as file:
    all_synonyms = file.read()

# The vocabulary covers every character of the raw volumes and the synonyms file.
all_book_data = np.concatenate((list(ngram_validation_text), np.array(list(all_synonyms))))
book_chars = np.unique(all_book_data)

# Lookup tables between characters and integer ids (np.unique order).
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: ind for ind, ch in ind_to_char.items()}
k = book_chars.shape[0]

vocab_size = len(book_chars)
embedding_dim = 256


def create_batches(data, batch_size, seq_length):
    """Turn a character sequence into a shuffled, batched tf.data.Dataset.

    Args:
        data: iterable of characters; each one must be a key of the
            module-level ``char_to_ind`` table.
        batch_size: number of sequences per batch.
        seq_length: number of characters per input sequence.

    Returns:
        A dataset of batches produced by mapping ``split_input_target`` over
        chunks of ``seq_length + 1`` character ids. Incomplete chunks and
        incomplete batches are dropped.
    """
    char_ids = np.array([char_to_ind[ch] for ch in data])

    # Chunks are one id longer than seq_length so that split_input_target
    # can derive an input and a target from each chunk.
    # NOTE(review): split_input_target is defined outside this view
    # (presumably in functions) - confirm it returns an (input, target) pair.
    chunks = tf.data.Dataset.from_tensor_slices(char_ids).batch(seq_length + 1, drop_remainder=True)
    pairs = chunks.map(split_input_target)

    # Shuffle with a 10000-element buffer, then group batch_size sequences.
    return pairs.shuffle(10000).batch(batch_size, drop_remainder=True)

def build_model(rnn_units, batch_size, nr_lstm_layers, bn, bpe_weights=None):
    """Build the embedding -> stacked stateful LSTM -> dense network.

    Uses the module-level ``vocab_size`` and ``embedding_dim``.

    Args:
        rnn_units: number of units in every LSTM layer.
        batch_size: fixed batch size baked into the input shape (the LSTM
            layers are stateful).
        nr_lstm_layers: number of LSTM layers to stack.
        bn: if truthy, add one BatchNormalization layer after the last LSTM.
        bpe_weights: optional pre-trained embedding matrix used as a constant
            initializer for the embedding layer; ``None`` keeps the default
            initializer.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model that outputs one logit
        per vocabulary entry.
    """
    m = tf.keras.Sequential()
    # Decide on the *argument*, not on a global that happens to be called
    # `weights`: the global may be undefined or belong to a different run.
    if bpe_weights is None:
        m.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, batch_input_shape=[batch_size, None]))
    else:
        m.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, batch_input_shape=[batch_size, None], embeddings_initializer=tf.keras.initializers.Constant(bpe_weights)))
    for _ in range(nr_lstm_layers):
        m.add(tf.keras.layers.LSTM(units=rnn_units, return_sequences=True, stateful=True, recurrent_initializer=tf.keras.initializers.GlorotNormal()))
    if bn:
        m.add(tf.keras.layers.BatchNormalization())
    m.add(tf.keras.layers.Dense(vocab_size))
    return m


def get_latest_checkpoint_file(modelpathname):
    """Return the path of the checkpoint with the highest epoch number.

    Looks in ``<cwd>/tmp/<modelpathname>/training_checkpoints`` for files
    named like ``ckpt_<epoch>.<ext>``. Files whose name does not contain an
    integer epoch in that position (``.DS_Store``, notes, ...) are ignored.

    Args:
        modelpathname: name of the model sub-directory under ``tmp``.

    Returns:
        Full path of the newest checkpoint file. If the directory contains
        no checkpoint, the directory path joined with an empty file name.

    Raises:
        FileNotFoundError: if the checkpoint directory does not exist
            (raised by ``os.listdir``).
    """
    checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
    latest_epoch = None
    latest_checkpoint_file = ""
    for file in os.listdir(checkpoint_dir):
        try:
            epoch = int(file.split("_")[1].split(".")[0])
        except (IndexError, ValueError):
            # Not a checkpoint file; skip it instead of crashing.
            continue
        if latest_epoch is None or epoch > latest_epoch:
            latest_epoch = epoch
            latest_checkpoint_file = file
    return os.path.join(checkpoint_dir, latest_checkpoint_file)


def train_normal_model(dataset_train, dataset_val, eta, rnn_units, n_epochs, batch_size, output_filename, nr_lstm_layers=1, bn=False, bpe_weights=None):
    """Build, compile and fit the LSTM model, saving weights after each epoch.

    Args:
        dataset_train: training dataset of (input, target) batches.
        dataset_val: validation dataset; validation is skipped when falsy.
        eta: learning rate for the Adam optimizer.
        rnn_units, batch_size, nr_lstm_layers, bn, bpe_weights: forwarded to
            ``build_model``.
        n_epochs: number of epochs to train.
        output_filename: sub-directory of ``<cwd>/tmp`` that receives the
            ``training_checkpoints`` folder.

    Returns:
        Tuple ``(model, history)`` where ``history`` is what ``fit`` returned.
    """
    net = build_model(rnn_units, batch_size, nr_lstm_layers, bn, bpe_weights)

    # Push one batch through the network; this step builds the model.
    for inputs, _ in dataset_train.take(1):
        net(inputs)

    # Specify update rule and compile model. The network outputs logits.
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=eta),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True,
            reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
        ),
    )

    # Configure checkpoints: one weights-only file per epoch.
    checkpoint_dir = os.path.join(os.getcwd(), "tmp", output_filename, "training_checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    save_weights = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}.hdf5'),
        save_weights_only=True,
        save_best_only=False,
    )

    # Train; validation data is only passed along when it was supplied.
    fit_kwargs = {"x": dataset_train, "epochs": n_epochs, "callbacks": [save_weights]}
    if dataset_val:
        fit_kwargs["validation_data"] = dataset_val
    history = net.fit(**fit_kwargs)
    return net, history


### Train and test character based LSTM 

In [None]:

# Hyperparameters for the character-level LSTM run.
rnn_units = 50
n_epochs = 5
eta = 0.01
batch_size = 50
seq_length = 50

# Checkpoints are written below tmp/<modelpathname>/training_checkpoints.
modelpathname = 'v7_1'
nr_layers = 1

#augmented_data_str, used_synonyms = augment_data(validation_text, n_synonyms=1000, n_word_swaps=1000, n_deletions=500, n_sentence_swaps=500)
# Volume 1 is used for training and volume 2 for validation.
dataset_train = create_batches(book_data_vol1, batch_size, seq_length)
dataset_val = create_batches(book_data_vol2, batch_size, seq_length)

use_bn = False
model, history= train_normal_model(dataset_train, dataset_val, eta, rnn_units, n_epochs, batch_size, modelpathname, nr_layers, use_bn)

# Rebuild the same architecture with batch_size=1 and load the newest
# checkpoint into it; this copy is the one used for text generation.
latest_checkpoint_file = get_latest_checkpoint_file(modelpathname)
m = build_model(rnn_units=rnn_units, batch_size=1, nr_lstm_layers=nr_layers, bn=use_bn)
m.load_weights(latest_checkpoint_file)
m.build(tf.TensorShape([1, None]))

# Prompt: 200 characters taken from the concatenated raw volumes.
inputt = ngram_validation_text[1000:1200]
validation_text = ngram_validation_text
gen_text = generate_text(model=m, start_string=inputt, text_size=1000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=1, p=None)
# Score the generated text with n-grams up to length nmax.
nmax = 2
fraction_correct_words, bleu_score2 = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=nmax)
repetition_score2 = measure_diversity(text_generated=gen_text, n_max=nmax)

print(gen_text)
print(fraction_correct_words, bleu_score2, repetition_score2)


### BPE functions

In [3]:
""" BPE Tokenization"""

def compute_pair_count(splits, word_count):
    """Count adjacent token pairs, weighting each word by its frequency.

    Args:
        splits: mapping word -> list of the tokens the word is currently
            split into.
        word_count: mapping word -> number of occurrences of the word.

    Returns:
        Dict mapping ``(left_token, right_token)`` to the summed frequency of
        all words in which the two tokens appear next to each other.
    """
    totals = {}
    for word, freq in word_count.items():
        tokens = splits[word]
        # Single-token words yield no pairs: zip over an empty tail is empty.
        for pair in zip(tokens, tokens[1:]):
            totals[pair] = totals.get(pair, 0) + freq
    return totals

def merge_pair(a, b, splits,word_count):
    """Merge every adjacent occurrence of tokens ``a``, ``b`` into ``a + b``.

    Args:
        a: left token of the pair.
        b: right token of the pair.
        splits: mapping word -> current token list; updated in place.
        word_count: mapping whose keys are the words to process.

    Returns:
        The same ``splits`` mapping, with the token lists of the processed
        words replaced by their merged versions.
    """
    for word in word_count:
        tokens = splits[word]
        if len(tokens) == 1:
            continue

        # Work on a copy so the previous token list object is left untouched.
        tokens = list(tokens)
        pos = 0
        while pos < len(tokens) - 1:
            if tokens[pos] == a and tokens[pos + 1] == b:
                # Collapse the two tokens into one; stay at the same position.
                tokens[pos:pos + 2] = [a + b]
            else:
                pos += 1

        splits[word] = tokens

    return splits

def BPE(text, vocab_size,word_count):
    """Learn byte-pair-encoding merges until the vocabulary is large enough.

    Starts from the unique symbols of ``text`` and repeatedly merges the most
    frequent adjacent token pair (the first one found on ties), adding the
    merged token to the vocabulary.

    Args:
        text: array-like of symbols; its unique values form the base
            vocabulary.
        vocab_size: target vocabulary size.
        word_count: mapping word -> number of occurrences.

    Returns:
        Tuple ``(splits, merges, book_chars)``: the final token list per
        word, a dict mapping each merged pair to its merged token (in merge
        order), and the vocabulary array. The vocabulary can be smaller than
        ``vocab_size`` when the words run out of pairs to merge.
    """
    merges = {}
    book_chars = np.unique(text)
    splits = {word: list(word) for word in word_count}
    while len(book_chars) < vocab_size:
        pair_count = compute_pair_count(splits, word_count)
        if not pair_count:
            # Every word is already a single token: nothing left to merge.
            break
        # max() returns the first pair with the highest count.
        best_pair = max(pair_count, key=pair_count.get)
        merged_token = best_pair[0] + best_pair[1]
        splits = merge_pair(*best_pair, splits, word_count)
        merges[best_pair] = merged_token
        book_chars = np.append(book_chars, merged_token)
    return splits, merges, book_chars

def tokenize(words, merges):
    """Apply learned BPE merges to the distinct words of ``words``.

    Each distinct word is split into characters and the merges are applied
    in the order stored in ``merges``.

    NOTE(review): repeated words are collapsed, so the result covers every
    distinct word once, in order of first appearance, and is not a
    token-for-token encoding of the input. Confirm that this is intended.

    Args:
        words: iterable of words (strings).
        merges: mapping ``(left_token, right_token) -> merged_token``.

    Returns:
        Flat list of tokens of all distinct words.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    splits = [list(word) for word in dict.fromkeys(words)]
    for pair, merge in merges.items():
        left, right = pair[0], pair[1]
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == left and split[i + 1] == right:
                    split = split[:i] + [merge] + split[i + 2:]
                else:
                    i += 1
            splits[idx] = split

    # Flatten in linear time (sum(splits, []) copies the list on every step).
    return [token for split in splits for token in split]



### Word2vec functions

In [6]:

class Word2Vec(tf.keras.Model):
    """Skip-gram model that scores a target token against candidate contexts.

    Holds two embedding tables of size ``vocab_size x embedding_dim``: one
    for target tokens (layer name ``"w2v_embedding"``) and one for context
    tokens. ``num_ns`` is the number of negative samples per target, so each
    example carries ``num_ns + 1`` candidate contexts.
    """

    def __init__(self, vocab_size, embedding_dim, num_ns):
        super().__init__()
        self.target_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name="w2v_embedding",
        )
        self.context_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_ns + 1,
        )

    def call(self, pair):
        """Return dot products between the target and each candidate context.

        ``pair`` is ``(target, context)`` with target of shape (batch,) or
        (batch, 1) and context of shape (batch, context). The result has
        shape (batch, context).
        """
        target, context = pair
        # Older TF versions hand over the target with a dummy second axis.
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        target_vectors = self.target_embedding(target)        # (batch, embed)
        candidate_vectors = self.context_embedding(context)   # (batch, context, embed)
        return tf.einsum('be,bce->bc', target_vectors, candidate_vectors)
 

def train_word2Vec(dataset_train, dataset_val, vocab_size, num_ns, n_epochs):
    """Create, compile and fit a Word2Vec model.

    The embedding width is taken from the module-level ``embedding_dim``.

    Args:
        dataset_train: dataset of ``((target, context), label)`` batches.
        dataset_val: validation dataset; validation is skipped when falsy.
        vocab_size: number of distinct token ids.
        num_ns: number of negative samples per positive context.
        n_epochs: number of training epochs.

    Returns:
        The fitted ``Word2Vec`` model.
    """
    word2vec = Word2Vec(vocab_size, embedding_dim, num_ns)
    word2vec.compile(
        optimizer='adam',
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    # Validation data is only handed to fit() when it was supplied.
    fit_kwargs = {"epochs": n_epochs}
    if dataset_val:
        fit_kwargs["validation_data"] = dataset_val
    word2vec.fit(dataset_train, **fit_kwargs)
    return word2vec


def generate_training_data(sequences, window_size, num_ns, vocab_size):
    """Build skip-gram training examples with negative sampling.

    Args:
        sequences: iterable of integer id sequences.
        window_size: skip-gram window size passed to ``skipgrams``.
        num_ns: number of negative samples drawn per positive pair.
        vocab_size: number of distinct ids; used for the sampling table and
            as the range of the negative sampler.

    Returns:
        Three parallel lists ``(targets, contexts, labels)``. For every
        positive pair: the target id, a tensor holding the true context id
        followed by ``num_ns`` sampled ids, and a label tensor with a 1 in
        first position followed by ``num_ns`` zeros.
    """
    targets, contexts, labels = [], [], []

    # Sampling table for `vocab_size` tokens, shared by all sequences.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    for sequence in tqdm.tqdm(sequences):
        # Positive (target, context) pairs only; negatives are drawn below.
        positive_pairs, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        for target_word, context_word in positive_pairs:
            true_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)
            negatives, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=true_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                name="negative_sampling")

            # The true context comes first, then the sampled negatives.
            targets.append(target_word)
            contexts.append(tf.concat([tf.squeeze(true_class, 1), negatives], 0))
            labels.append(tf.constant([1] + [0] * num_ns, dtype="int64"))

    return targets, contexts, labels

### Load and preprocess data for BPE 

In [5]:

enc = "utf8"
# Word lists of the five volumes (second item returned by load_data1).
volume_paths = (vol1_filename, vol2_filename, vol3_filename, vol4_filename, vol5_filename)
loaded_volumes = [load_data1(path, encoding=enc) for path in volume_paths]
words1, words2, words3, words4, words5 = (words for _, words in loaded_volumes)

# Word frequencies; only volume 1 is used at the moment.
all_words = words1 #np.concatenate((words1, words2, words3, words4, words5))
word_count = {}
for word in all_words:
    word_count[word] = word_count.get(word, 0) + 1

# Learn BPE merges up to a vocabulary of 200 tokens.
# Note: this rebinds the global book_chars to the BPE vocabulary.
splits, merges, book_chars = BPE(all_book_data, 200, word_count)
tokenized_text = tokenize(all_words, merges)

eta = 0.01 # Best = 0.01
batch_size = 50 # Best = 25
seq_length = 50 # Best = 125
num_ns = 4

# Initialize the datasets that will be used for the LSTM network
dataset_train = create_batches(book_data_vol1, batch_size, seq_length)
dataset_val = create_batches(book_data_vol2, batch_size, seq_length)

# Collect the input sequences (first element of every batch) as numpy arrays.
seq_list_train = [seq.numpy() for batch in dataset_train for seq in batch[0]]

# seq_list_val = []
# for batch in dataset_val:
#     for seq in batch[0]:
#         seq_list_val.append(seq.numpy())

# Generate training data for word2vec using the sequences from the LSTM dataset.
# Note: dataset_train is rebound to the word2vec dataset from here on.
targets, contexts, labels = generate_training_data(
    seq_list_train, window_size=2, num_ns=num_ns, vocab_size=vocab_size)
dataset_train = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset_train = dataset_train.shuffle(10000).batch(batch_size, drop_remainder=True)

# # Generate validation data for word2vec using the sequences from the LSTM dataset
# targets, contexts, labels = generate_training_data(seq_list_val, 2, num_ns, vocab_size)
# dataset_val = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# dataset_val = dataset_val.shuffle(10000).batch(batch_size, drop_remainder=True)


100%|██████████| 11150/11150 [00:28<00:00, 387.46it/s]
100%|██████████| 11800/11800 [00:31<00:00, 373.10it/s]


### Train word2vec weights

In [8]:
# Checkpoints of the LSTM trained below go to tmp/<modelpathname>/training_checkpoints.
modelpathname = 'v7_1'

#Train word2vec model and get the weights

#word2vec = train_word2Vec(dataset_train, dataset_val, vocab_size, num_ns, 20)
# No validation set is passed; the model is trained for 10 epochs.
word2vec = train_word2Vec(dataset_train, None, vocab_size, num_ns, 10)

# Embedding matrix of the target-embedding layer.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

#print(weights)
# Persist the embedding matrix and read it back; index_col=0 drops the
# index column that to_csv writes.
pd.DataFrame(weights).to_csv("weights_word2vec.csv")
word2vec.save_weights("word2vec_weights_model.h5")
weights1 = pd.read_csv("weights_word2vec.csv", index_col=0).to_numpy()


# Train normal LSTM model with bpe embedding
n_epochs = 5
rnn_units = 50
use_bn = False
# Recreate the LSTM datasets: dataset_train was rebound to the word2vec dataset earlier.
dataset_train = create_batches(book_data_vol1, batch_size, seq_length)
dataset_val = create_batches(book_data_vol2, batch_size, seq_length)
nr_layers = 2
# weights1 initializes the embedding layer of the LSTM model.
model, history = train_normal_model(dataset_train, dataset_val, eta, rnn_units, n_epochs, batch_size, modelpathname, nr_layers, use_bn, weights1)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


NameError: name 'nr_layers' is not defined

In [10]:

# Rebuild the trained architecture with batch_size=1 and load the newest
# checkpoint of modelpathname into it for text generation.
latest_checkpoint_file = get_latest_checkpoint_file(modelpathname)
m = build_model(rnn_units=rnn_units, batch_size=1, nr_lstm_layers=nr_layers, bn=use_bn, bpe_weights=weights1)
m.load_weights(latest_checkpoint_file)
m.build(tf.TensorShape([1, None]))

# Prompt: 200 characters taken from the concatenated raw volumes.
inputt = ngram_validation_text[1000:1200]
validation_text = ngram_validation_text
gen_text = generate_text(model=m, start_string=inputt, text_size=1000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=1, p=None)
# Score the generated text with n-grams up to length nmax.
nmax = 2
fraction_correct_words, bleu_score2 = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=nmax)
repetition_score2 = measure_diversity(text_generated=gen_text, n_max=nmax)

print(gen_text)
print(fraction_correct_words, bleu_score2, repetition_score2)



pidereds wh’ when esterst and mymestiom
      died. But on to, to sceld, in
      thoterhe shile! I gaverer iken meriol yeat whais    romentinus up
      the shagent-aty that obrecples. ‘Lisher caul had mush to to it, thene the cherrumise obver, with whos “You the Ξo dorictin poafiaphere. It nlaidind my ut this nowelllition to the conntais. But in
      to mannoul d, the stidentered his defical a fear of linht mystir specesering ext untimy, and he      larred of the cietint chared dy me that to fories to
      my
      the and yout darratore before abition)alg, entepition, one had obles the fangn, that with the all in be
      cownersed, left allg,
      when neashery while furtions.

       hear whowether, “Applaiginht Dugn lademation howe of consall
      it any thribsest on the
      the Mrles, in the susitered me
      of hould incharning myshated pright
      dnep the my the supan
      but seade of genty was a, sot that expetsed afted, tood wiuth with a
      rechaishand fater I


### Generate Text

In [None]:

plotbool = False
outputfile_bool = True

modelpathname = 'v12_2'
rnn_units = 200
n_epochs = 5
eta = 0.01 # Best = 0.01
batch_size = 50 # Best = 25
seq_length = 50 # Best = 125
num_ns = 4

# Find latest checkpoint file, because tf.train.latest_checkpoint(dir) doesn't work for some reason.
# The shared helper is used so that stray files such as .DS_Store are skipped.
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
latest_checkpoint_path = get_latest_checkpoint_file(modelpathname)
latest_checkpoint_file = os.path.basename(latest_checkpoint_path)
print(latest_checkpoint_file)
#latest_checkpoint_file = "ckpt_01.hdf5"
print(ind_to_char)

# build_model needs the layer count and the batch-norm flag as well.
# NOTE(review): nr_layers and use_bn are taken from the earlier training
# cells; they (and rnn_units) must match the architecture that the
# checkpoint of modelpathname was trained with - confirm before running.
m = build_model(rnn_units=rnn_units, batch_size=1, nr_lstm_layers=nr_layers, bn=use_bn)
m.load_weights(os.path.join(checkpoint_dir, latest_checkpoint_file))
m.build(tf.TensorShape([1, None]))

nucleus_probability = None
temp = 1.0
gen_text = generate_text(model=m, start_string="the", text_size=2000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=temp, p=nucleus_probability,word_level=True)


if plotbool:
    d = get_n_grams(gen_text, 1)
    # history.history['loss'] holds one value per epoch of the last fit() call.
    fig = px.line(history.history['loss'], title='Loss over epochs', width=600)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(title_text="epoch")
    fig.update_yaxes(title_text="loss")
    fig.show()

# Calculate performance metrics for generated text
# fraction_correct_words, bleu_score2 = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=2)
# _, bleu_score4 = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=4)
# repetition_score2 = measure_diversity(text_generated=gen_text, n_max=2)
# repetition_score4 = measure_diversity(text_generated=gen_text, n_max=4)

# output_str = gen_text + "\n ------------ \n loss function: {} \n fraction of correctly spelled words: {} \n Bleu score2: {}, Repetition score2: {} "\
#     "\n Bleu score4: {}, Repetition score4: {} ".format(history.history['loss'], fraction_correct_words, bleu_score2, repetition_score2, bleu_score4, repetition_score4)
# output_str += "\n settings: batch_size, seq_length, eta, rnn_units, n_epochs = {}, {}, {}, {}, {}".format(batch_size, seq_length, eta, rnn_units, n_epochs)
print(gen_text)


#Create output file
output_path = os.path.join(os.getcwd(), "tmp", modelpathname, "text_temp15.txt")
if outputfile_bool:
    with open(output_path, "w",encoding="utf-8-sig") as file:
        file.write(gen_text)


