# DD2424 Project in Deep Learning in Data Science

## Imports

In [1]:
from functions import *
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
import os
import tqdm
from keras.initializers import Constant

## Vanilla RNN

### Load and Preprocess Data

In [None]:
# Load the training corpus and collect its alphabet.
# NOTE(review): assumes load_data (from `functions`) returns a flat sequence of
# characters when called with only a filename - confirm against its definition.
training_data_filename = '../Dataset/Training/edgar.txt'
book_data = np.array(load_data(training_data_filename))
book_chars = np.unique(book_data)

# Lookup tables between each distinct symbol and its position in book_chars.
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: idx for idx, ch in ind_to_char.items()}

# k is the alphabet size; m, eta, seq_length and sig are passed to RNN(...)
# in the training cell below.
k = len(book_chars)
m, eta, seq_length, sig = 100, 0.1, 25, 0.01


### Train

In [None]:
# Train the vanilla RNN on consecutive, non-overlapping windows of the corpus.
rnn = RNN(m, k, eta, seq_length, sig)
h0 = np.zeros((m, 1))
max_iter = 200000
epochs = 10
smoothloss_list = []
loss_list = []
iteration = 0
smoothloss = 0
sentences = []
for i in range(epochs):
    # Reset the hidden state stored on the model at the start of every epoch.
    rnn.hprev = np.zeros((m, 1))
    for e in range(0, book_data.shape[0]-seq_length-1, seq_length):
        # Targets are the inputs shifted one character ahead.
        X_chars = book_data[e:e+seq_length]
        Y_chars = book_data[e+1:e+seq_length+1]
        X = one_hot_encoding(X_chars, char_to_ind, k)
        Y = one_hot_encoding(Y_chars, char_to_ind, k)
        # NOTE(review): h0 is always the zero vector here; presumably adagrad
        # tracks the running state in rnn.hprev - confirm in `functions`.
        loss = rnn.adagrad(X, Y, h0, iteration)
        # Seed the running average with the first loss, then smooth it.
        if smoothloss == 0:
            smoothloss = loss
        smoothloss = 0.999*smoothloss + 0.001*loss

        # Periodically log progress and print a 200-symbol sample.
        if iteration % 10000 == 1:
            print('Iteration: {}, Loss: {} '.format(iteration, smoothloss))
            y = rnn.synthetize(rnn.hprev, X[:, 0], 200)
            sentence = one_hot_decoding(y, ind_to_char)
            print(sentence + "\n")
            smoothloss_list.append(smoothloss)
            loss_list.append(loss)

        iteration += 1
        if iteration > max_iter:
            break

    # The inner `break` only leaves the current epoch. Without this check each
    # remaining epoch would still run one more update before breaking again.
    if iteration > max_iter:
        break


### Evaluate and plot

In [None]:

# Sample 1000 symbols from the trained vanilla RNN, score them and plot the
# smoothed loss. The reference text for the metrics is read from the training
# file itself.
with open(training_data_filename, mode='r', encoding='utf-8-sig') as file:
    validation_text = file.read()

start_char = "T"
start_char_onehot = one_hot_encoding(start_char, char_to_ind, k)
generated_text_vanilla_onehot = rnn.synthetize(rnn.hprev, start_char_onehot, 1000)
decoded_sample = one_hot_decoding(generated_text_vanilla_onehot, ind_to_char)
generated_text_vanilla = start_char + decoded_sample
print(generated_text_vanilla)

# Calculate performance metrics for generated text
nmax = 4
fraction_correct_words, bleu_score = measure_bleu(
    text_generated=generated_text_vanilla,
    text_val=validation_text,
    n_max=nmax,
)
repetition_score = measure_diversity(text_generated=generated_text_vanilla, n_max=nmax)
print("\n loss function", loss_list)
metrics_template = "\n fraction of correctly spelled words: {} \n Bleu score: {} \n Repetition score: {}"
print(metrics_template.format(fraction_correct_words, bleu_score, repetition_score))

# One point was recorded every 10k iterations during training.
fig = px.line(smoothloss_list, title='Smoothed loss over epochs', width=600)
fig.update_xaxes(title_text="iteration step, in multiples of 10k")
fig.update_yaxes(title_text="smoothed loss")
fig.update_layout(showlegend=False)
fig.show()



## LSTM

### Load and Preprocess Data

In [9]:


encoding = "utf8"

# The five volumes of the corpus, one file each.
vol1_filename = '../Dataset/Training/edgar1.txt'
vol2_filename = '../Dataset/Training/edgar2.txt'
vol3_filename = '../Dataset/Training/edgar3.txt'
vol4_filename = '../Dataset/Training/edgar4.txt'
vol5_filename = '../Dataset/Training/edgar5.txt'

# Every volume is loaded with the same options.
load_options = dict(remove_footnotes=False, word_level=False)
book_data1, words1 = load_data(vol1_filename, **load_options)
book_data2, words2 = load_data(vol2_filename, **load_options)
book_data3, words3 = load_data(vol3_filename, **load_options)
book_data4, words4 = load_data(vol4_filename, **load_options)
book_data5, words5 = load_data(vol5_filename, **load_options)

# Volumes 1, 3 and 4 form the training split, volume 2 the validation split
# and volume 5 the test split.
training_words = np.concatenate((words1, words3, words4))
validation_words, test_words = words2, words5
all_words = np.concatenate((words1, words2, words3, words4, words5))

# Raw text of all five volumes, concatenated in volume order.
volume_texts = []
for i in range(1, 6):
    with open('../Dataset/Training/edgar{}.txt'.format(i), "r", encoding="utf-8-sig") as f:
        volume_texts.append(f.read())
total_validaton_text = "".join(volume_texts)

# Vocabulary over all volumes, with lookup tables in both directions.
book_chars = np.unique(all_words)
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: idx for idx, ch in ind_to_char.items()}
vocab_size = len(book_chars)
k = vocab_size
embedding_dim = 256

# Each split as an array of vocabulary indices.
data_ind_train, data_ind_val, data_ind_test = (
    np.array([char_to_ind[c] for c in subset])
    for subset in (training_words, validation_words, test_words)
)


def create_batches(data_ind, batch_size, seq_length):
    """Turn an index array into shuffled batches of (input, target) sequences.

    The indices are cut into consecutive chunks of ``seq_length + 1`` elements
    (an incomplete final chunk is dropped), each chunk is passed through
    ``split_input_target``, and the results are shuffled with a buffer of
    10000 and grouped into batches of ``batch_size`` sequences (an incomplete
    final batch is dropped).

    NOTE(review): split_input_target comes from `functions`; presumably it
    returns (chunk[:-1], chunk[1:]) - confirm.
    """
    chunks = tf.data.Dataset.from_tensor_slices(data_ind).batch(
        seq_length + 1, drop_remainder=True
    )
    pairs = chunks.map(split_input_target)
    return pairs.shuffle(10000).batch(batch_size, drop_remainder=True)

def generate_training_data(sequences, window_size, num_ns, vocab_size):
    """Build skip-gram training examples with negative sampling.

    For every positive (target, context) pair that
    ``tf.keras.preprocessing.sequence.skipgrams`` produces from a sequence,
    ``num_ns`` negative classes are drawn with
    ``tf.random.log_uniform_candidate_sampler``.

    Returns three parallel lists ``(targets, contexts, labels)``. Each context
    entry is the positive class followed by the sampled negatives, and each
    label entry is a 1 followed by ``num_ns`` zeros.
    """
    targets, contexts, labels = [], [], []

    # Sampling table for `vocab_size` tokens, shared by all sequences.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    for sequence in tqdm.tqdm(sequences):
        # Positive pairs only; negatives are sampled separately below.
        skip_gram_pairs = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)[0]

        for target_word, context_word in skip_gram_pairs:
            # The sampler expects the true class with shape (1, 1).
            true_class = tf.constant([[context_word]], dtype="int64")
            sampled = tf.random.log_uniform_candidate_sampler(
                true_classes=true_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                name="negative_sampling")
            negatives = sampled[0]

            # Positive class first, then the negatives; labels mark the positive.
            contexts.append(tf.concat([tf.squeeze(true_class, 1), negatives], 0))
            labels.append(tf.constant([1] + [0] * num_ns, dtype="int64"))
            targets.append(target_word)

    return targets, contexts, labels





## BPE

### Functions

In [None]:
""" BPE Tokenization"""

def compute_pair_count(splits, word_count):
    """Count adjacent symbol pairs over all words, weighted by word frequency.

    Args:
        splits: mapping from word to its current list of symbols.
        word_count: mapping from word to its number of occurrences.

    Returns:
        dict mapping each ``(left, right)`` pair of neighbouring symbols to
        the summed count of the words it occurs in, in first-seen order.
    """
    pair_count = {}
    for word, count in word_count.items():
        symbols = splits[word]
        # Zipping a list with its own tail yields nothing for splits of
        # length 0 or 1, so those words contribute no pairs.
        for pair in zip(symbols, symbols[1:]):
            pair_count[pair] = pair_count.get(pair, 0) + count
    return pair_count

def merge_pair(a, b, splits, word_count):
    """Replace every adjacent ``a, b`` in the word splits with the symbol ``a + b``.

    Only the words listed in ``word_count`` are processed. ``splits`` is
    updated in place and also returned. Matches are merged left to right
    without overlapping.
    """
    merged_symbol = a + b
    for word in word_count:
        symbols = splits[word]
        if len(symbols) == 1:
            continue

        rebuilt = []
        pos = 0
        total = len(symbols)
        while pos < total:
            is_match = (
                pos + 1 < total
                and symbols[pos] == a
                and symbols[pos + 1] == b
            )
            if is_match:
                rebuilt.append(merged_symbol)
                pos += 2
            else:
                rebuilt.append(symbols[pos])
                pos += 1
        splits[word] = rebuilt

    return splits

def BPE(text, vocab_size, word_count):
    """Learn byte-pair-encoding merges until the vocabulary has ``vocab_size`` symbols.

    Args:
        text: sequence of base symbols; its unique values seed the vocabulary.
        vocab_size: target number of symbols in the vocabulary.
        word_count: mapping from word to its number of occurrences.

    Returns:
        ``(splits, merges, book_chars)``: the final symbol list per word, the
        learned merges as ``{(left, right): left + right}`` in learning order,
        and the vocabulary array (base symbols followed by merged symbols).

    Learning stops early when no adjacent pair is left to merge, so the
    returned vocabulary may then be smaller than ``vocab_size``.
    """
    merges = {}
    book_chars = np.unique(text)
    splits = {word: list(word) for word in word_count}
    while len(book_chars) < vocab_size:
        pair_count = compute_pair_count(splits, word_count)
        if not pair_count:
            # Every word is already a single symbol; the original code would
            # call merge_pair(*"") here and raise a TypeError.
            break
        # max() returns the first maximal key, so ties resolve to the pair
        # seen first, as with the previous manual scan.
        best_pair = max(pair_count, key=pair_count.get)
        merged_symbol = best_pair[0] + best_pair[1]
        splits = merge_pair(*best_pair, splits, word_count)
        merges[best_pair] = merged_symbol
        book_chars = np.append(book_chars, merged_symbol)
    return splits, merges, book_chars

def tokenize(words, merges):
    """Tokenize a sequence of words with learned BPE merges.

    Args:
        words: the words of the text, in reading order (repeats included).
        merges: mapping ``{(left, right): merged}`` in the order the merges
            were learned; they are applied in that order.

    Returns:
        Flat list of tokens for the whole word sequence, in order.

    Each distinct word is split only once and the result is reused for every
    occurrence. The previous version returned tokens for the distinct words
    only, which dropped repeated words from the token stream.
    """
    # Split each distinct word into characters once.
    splits = {}
    for word in words:
        if word not in splits:
            splits[word] = list(word)

    for (first, second), merged in merges.items():
        for word in splits:
            symbols = splits[word]
            i = 0
            while i < len(symbols) - 1:
                if symbols[i] == first and symbols[i + 1] == second:
                    symbols = symbols[:i] + [merged] + symbols[i + 2:]
                else:
                    i += 1
            splits[word] = symbols

    # Emit the tokens of every word occurrence; a single flat comprehension
    # avoids the quadratic cost of sum(list_of_lists, []).
    return [token for word in words for token in splits[word]]




### Load and Preprocess Data for BPE

In [None]:


# The five volumes of the corpus, one file each.
vol1_filename = '../Dataset/Training/edgar1.txt'
vol2_filename = '../Dataset/Training/edgar2.txt'
vol3_filename = '../Dataset/Training/edgar3.txt'
vol4_filename = '../Dataset/Training/edgar4.txt'
vol5_filename = '../Dataset/Training/edgar5.txt'

# Every volume is loaded with the same options.
load_options = dict(remove_footnotes=False, word_level=False)
book_data1, words1 = load_data1(vol1_filename, **load_options)
book_data2, words2 = load_data1(vol2_filename, **load_options)
book_data3, words3 = load_data1(vol3_filename, **load_options)
book_data4, words4 = load_data1(vol4_filename, **load_options)
book_data5, words5 = load_data1(vol5_filename, **load_options)

all_words = np.concatenate((words1, words2, words3, words4, words5))
all_data = np.concatenate((book_data1, book_data2, book_data3, book_data4, book_data5))

# Raw text of all five volumes, concatenated in volume order.
volume_texts = []
for i in range(1, 6):
    with open('../Dataset/Training/edgar{}.txt'.format(i), "r", encoding="utf-8-sig") as f:
        volume_texts.append(f.read())
total_validaton_text = "".join(volume_texts)

# Word frequencies drive the BPE merge statistics.
word_count = {}
for word in all_words:
    word_count[word] = word_count.get(word, 0) + 1

# Learn a 200-symbol BPE vocabulary and tokenize the corpus with it.
splits, merges, book_chars = BPE(all_data, 200, word_count)
tokenized_text = tokenize(all_words, merges)

ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: idx for idx, ch in ind_to_char.items()}

# 80% / 10% / remainder split of the token stream.
N = len(tokenized_text)
split_ind1 = int(N*0.8)
split_ind2 = split_ind1 + int(N*0.1)

data_ind_train, data_ind_val, data_ind_test = (
    np.array([char_to_ind[c] for c in part])
    for part in (
        tokenized_text[0:split_ind1],
        tokenized_text[split_ind1:split_ind2],
        tokenized_text[split_ind2:],
    )
)

vocab_size = len(book_chars)
k = vocab_size
embedding_dim = 256

def create_batches(data_ind, batch_size, seq_length):
    """Turn an index array into shuffled batches of (input, target) sequences.

    The indices are cut into consecutive chunks of ``seq_length + 1`` elements
    (an incomplete final chunk is dropped), each chunk is passed through
    ``split_input_target``, and the results are shuffled with a buffer of
    10000 and grouped into batches of ``batch_size`` sequences (an incomplete
    final batch is dropped).

    NOTE(review): split_input_target comes from `functions`; presumably it
    returns (chunk[:-1], chunk[1:]) - confirm.
    """
    chunks = tf.data.Dataset.from_tensor_slices(data_ind).batch(
        seq_length + 1, drop_remainder=True
    )
    pairs = chunks.map(split_input_target)
    return pairs.shuffle(10000).batch(batch_size, drop_remainder=True)

def generate_training_data(sequences, window_size, num_ns, vocab_size):
    """Build skip-gram training examples with negative sampling.

    For every positive (target, context) pair that
    ``tf.keras.preprocessing.sequence.skipgrams`` produces from a sequence,
    ``num_ns`` negative classes are drawn with
    ``tf.random.log_uniform_candidate_sampler``.

    Returns three parallel lists ``(targets, contexts, labels)``. Each context
    entry is the positive class followed by the sampled negatives, and each
    label entry is a 1 followed by ``num_ns`` zeros.
    """
    targets, contexts, labels = [], [], []

    # Sampling table for `vocab_size` tokens, shared by all sequences.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    for sequence in tqdm.tqdm(sequences):
        # Positive pairs only; negatives are sampled separately below.
        skip_gram_pairs = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)[0]

        for target_word, context_word in skip_gram_pairs:
            # The sampler expects the true class with shape (1, 1).
            true_class = tf.constant([[context_word]], dtype="int64")
            sampled = tf.random.log_uniform_candidate_sampler(
                true_classes=true_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                name="negative_sampling")
            negatives = sampled[0]

            # Positive class first, then the negatives; labels mark the positive.
            contexts.append(tf.concat([tf.squeeze(true_class, 1), negatives], 0))
            labels.append(tf.constant([1] + [0] * num_ns, dtype="int64"))
            targets.append(target_word)

    return targets, contexts, labels

## LSTM and BPE functions

### Functions for training

In [10]:

class Word2Vec(tf.keras.Model):
    """Skip-gram model that scores a target against a set of context tokens.

    Two separate embedding tables are kept: one for targets (layer name
    ``w2v_embedding``) and one for contexts. The output is the dot product
    between the target embedding and each context embedding.
    """

    def __init__(self, vocab_size, embedding_dim, num_ns):
        super().__init__()
        make_embedding = tf.keras.layers.Embedding
        self.target_embedding = make_embedding(
            vocab_size, embedding_dim, input_length=1, name="w2v_embedding")
        self.context_embedding = make_embedding(
            vocab_size, embedding_dim, input_length=num_ns+1)

    def call(self, pair):
        """Return the (batch, context) dot products for a (target, context) pair."""
        target, context = pair
        # Drop the dummy second axis when the target arrives as (batch, 1).
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        target_vectors = self.target_embedding(target)      # (batch, embed)
        context_vectors = self.context_embedding(context)   # (batch, context, embed)
        return tf.einsum('be,bce->bc', target_vectors, context_vectors)
  

def train_word2Vec(dataset, vocab_size, num_ns, n_epochs):
    """Create, compile and fit a Word2Vec model, and return it.

    The embedding size is read from the module-level ``embedding_dim``. The
    model is trained with the Adam optimizer and categorical cross-entropy
    on logits for ``n_epochs`` epochs over ``dataset``.
    """
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    word2vec = Word2Vec(vocab_size, embedding_dim, num_ns)
    word2vec.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
    word2vec.fit(dataset, epochs=n_epochs)
    return word2vec


def custom_loss(x_logit, y_true):
    """Return the element-wise sigmoid cross-entropy of ``x_logit`` against ``y_true``."""
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=x_logit)
    return loss

def build_model(rnn_units, batch_size, nr_layers, use_bn=False, weights=None):
    """Build the embedding -> stacked LSTM -> dense language model.

    Args:
        rnn_units: number of units in each LSTM layer.
        batch_size: fixed batch size of the (stateful) model.
        nr_layers: number of LSTM layers to stack.
        use_bn: when true, a BatchNormalization layer precedes each LSTM.
        weights: optional matrix used as constant initial embedding weights.

    The vocabulary and embedding sizes are read from the module-level
    ``vocab_size`` and ``embedding_dim``. The final dense layer outputs one
    value per vocabulary entry.
    """
    embedding_options = dict(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    if weights is not None:
        embedding_options["embeddings_initializer"] = tf.keras.initializers.Constant(weights)

    layers = [tf.keras.layers.Embedding(**embedding_options)]
    for _ in range(nr_layers):
        if use_bn:
            layers.append(tf.keras.layers.BatchNormalization())
        layers.append(tf.keras.layers.LSTM(
            units=rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer=tf.keras.initializers.GlorotNormal(),
        ))
    layers.append(tf.keras.layers.Dense(vocab_size))

    return tf.keras.Sequential(layers)


def train_model(dataset_train, dataset_val, eta, rnn_units, nr_layers, batch_size, output_filename, n_epochs, use_bn=False, weights=None):
    """Build, compile and fit the LSTM language model.

    Args:
        dataset_train: batched training dataset of (input, target) pairs.
        dataset_val: batched validation dataset, or None to train without one.
        eta: learning rate for the Adam optimizer.
        rnn_units, nr_layers, batch_size, use_bn, weights: forwarded to
            ``build_model``.
        output_filename: sub-directory of ``./tmp`` that receives the
            checkpoints (``training_checkpoints/ckpt_<epoch>.hdf5``).
        n_epochs: number of epochs to train.

    Returns:
        ``(model, history)``: the fitted model and the object returned by
        ``fit``.

    Weights are checkpointed only for epochs that improve the monitored loss:
    ``val_loss`` when validation data is given, otherwise ``loss``.
    """
    m = build_model(rnn_units, batch_size, nr_layers, use_bn, weights)
    # Run one batch through the network; this step builds the model.
    for example_inputs, _ in dataset_train.take(1):
        m(example_inputs)

    # Specify update rule and compile model
    adam_opt = tf.keras.optimizers.Adam(learning_rate=eta)
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)
    m.compile(optimizer=adam_opt, loss=loss_func)

    # Configure checkpoints
    checkpoint_dir = os.path.join(os.getcwd(), "tmp", output_filename, "training_checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}.hdf5')
    # The keyword used to be misspelled as `save_best_onl`, so the option was
    # never applied. `val_loss` only exists when validation data is passed,
    # so fall back to the training loss otherwise.
    monitored_metric = "val_loss" if dataset_val is not None else "loss"
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        monitor=monitored_metric,
        save_weights_only=True,
        save_best_only=True,
    )

    # train; validation_data=None is the same as not passing it
    history = m.fit(x=dataset_train, validation_data=dataset_val, epochs=n_epochs, callbacks=[checkpoint_callback])
    return m, history

### Grid search

In [None]:


# Grid search over batch size, sequence length and learning rate; the final
# training loss of every combination is collected into a text table.
rnn_units = 200
n_epochs = 5
nr_layers = 1
modelpathname = "v6_2"
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
output_str = "batch_size | seq_len | eta | loss  \n"

batch_sizes = [25, 75, 125]
seq_lengths = [25, 75, 125]
learningrates = [0.1, 0.01, 0.001]
n_runs = len(batch_sizes) * len(seq_lengths) * len(learningrates)
count = 0
for batch_size in batch_sizes:
    for seq_length in seq_lengths:
        # create_batches needs the index array as its first argument.
        dataset = create_batches(data_ind_train, batch_size, seq_length)
        for eta in learningrates:
            # Progress as the fraction of completed runs.
            print(count/n_runs)
            # train_model takes (train, val, eta, rnn_units, nr_layers, ...);
            # no validation data is used during the search.
            model, history = train_model(dataset, None, eta, rnn_units, nr_layers, batch_size, modelpathname, n_epochs)
            final_loss = history.history['loss'][-1]
            # Batch-size-1 copy of the trained model, ready for text generation.
            # The weights are copied from memory rather than read from
            # "ckpt_0<n>.hdf5", a name that is wrong for 10 or more epochs.
            m = build_model(rnn_units, 1, nr_layers)
            m.build(tf.TensorShape([1, None]))
            m.set_weights(model.get_weights())
            output_str += "{}      {}      {}      {}       \n".format(batch_size, seq_length, eta, final_loss)

            count += 1


print(output_str)
            

### Build and train one model

In [None]:

# Configuration of a single training run; checkpoints are written below
# ./tmp/<modelpathname>/training_checkpoints.
modelpathname = 'long_train_bpe'
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")

# Network architecture.
rnn_units = 200  # Number of RNN units.
nr_layers = 2
use_bn = False

# Optimisation settings.
n_epochs = 15
eta = 0.01 # Best = 0.01
seq_length = 25 # Best = 125
batch_size = 75 # Best = 25

dataset_train = create_batches(data_ind_train, batch_size, seq_length)
dataset_val = create_batches(data_ind_val, batch_size, seq_length)

model, history = train_model(
    dataset_train,
    dataset_val,
    eta,
    rnn_units,
    nr_layers,
    batch_size,
    modelpathname,
    n_epochs,
    use_bn,
)


### Generate text, evaluate and plot

In [None]:
# Evaluate the trained model, reload the latest checkpoint into a
# batch-size-1 model, generate text and score it.
dataset_test = create_batches(data_ind_test, batch_size, seq_length)
# NOTE(review): dataset_test is built but the evaluation below runs on the
# validation split - confirm which split was meant to be reported.
print(model.evaluate(create_batches(data_ind_val, batch_size, seq_length)))


plotbool = False
outputfile_bool = False

# Find latest checkpoint file, because tf.train.latest_checkpoint(dir) doesn't work for some reason
latest_epoch = 0
latest_checkpoint_file = ""
for file in os.listdir(checkpoint_dir):
    # Ignore anything that is not named ckpt_<epoch>.hdf5.
    if not (file.startswith("ckpt_") and file.endswith(".hdf5")):
        continue
    try:
        e = int(file.split("_")[1].split(".")[0])
    except ValueError:
        continue
    if e>latest_epoch:
        latest_epoch = e
        latest_checkpoint_file = file
if not latest_checkpoint_file:
    raise FileNotFoundError("No checkpoint files found in {}".format(checkpoint_dir))
print(latest_checkpoint_file)

# Same architecture as the trained model but with batch size 1. use_bn must
# match the training run, otherwise the layers do not line up with the
# stored weights.
m = build_model(rnn_units, 1, nr_layers, use_bn)
m.load_weights(os.path.join(checkpoint_dir, latest_checkpoint_file))
m.build(tf.TensorShape([1, None]))

nucleus_probability = None
temp = 0.8
gen_text = generate_text1(model=m, start_string="the", text_size=1000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=temp, p=nucleus_probability)
# Join the generated tokens and turn "$" into spaces.
gen_text = "".join(gen_text).replace("$"," ")


if plotbool:
    d = get_n_grams(gen_text, 1)
    # history.history['loss'] holds one value per epoch, so label the axes
    # accordingly (the old labels were copied from the vanilla RNN plot).
    fig = px.line(history.history['loss'], title='Loss over epochs', width=600)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(title_text="epoch")
    fig.update_yaxes(title_text="loss")
    fig.show()

# Calculate performance metrics for generated text
fraction_correct_words, bleu_score2 = measure_bleu(text_generated=gen_text, text_val=total_validaton_text, n_max=2)
# The 4-gram BLEU score is not computed; -1 marks it as missing in the report.
bleu_score4 = -1
repetition_score2 = measure_diversity(text_generated=gen_text, n_max=2)
repetition_score4 = measure_diversity(text_generated=gen_text, n_max=4)

output_str = gen_text + "\n ------------ \n loss function: {} \n fraction of correctly spelled words: {} \n Bleu score2: {}, Repetition score2: {} "\
    "\n Bleu score4: {}, Repetition score4: {} ".format(history.history['loss'], fraction_correct_words, bleu_score2, repetition_score2, bleu_score4, repetition_score4)
output_str += "\n settings: batch_size, seq_length, eta, rnn_units, n_epochs = {}, {}, {}, {}, {}".format(batch_size, seq_length, eta, rnn_units, n_epochs)
print(output_str)


# Create output file
output_path = os.path.join(os.getcwd(), "tmp", modelpathname, "text_temp15.txt")
if outputfile_bool:
    with open(output_path, "w",encoding="utf-8-sig") as file:
        file.write(output_str)


## Word2Vec

### Train word2vec model

In [11]:
# Pre-train a word2vec embedding on the LSTM training sequences, then train
# the LSTM with that embedding as its initial embedding weights.
modelpathname = 'v9_1'
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
# Number of RNN units.
rnn_units = 100
n_epochs = 1
eta = 0.01 # Best = 0.01
batch_size = 50 # Best = 25
seq_length = 50 # Best = 125
num_ns = 4
window_size = 2

dataset_train = create_batches(data_ind_train, batch_size, seq_length)
dataset_val = create_batches(data_ind_val, batch_size, seq_length)

# Collect the input sequences (first element of every batch) as numpy arrays.
seq_list = []
for batch in dataset_train:
    for seq in batch[0]:
        seq_list.append(seq.numpy())
print(len(seq_list))

#Generate training data for word2vec using the sequences from the LSTM dataset
# num_ns is passed through so the number of negatives matches the Word2Vec model.
targets, contexts, labels = generate_training_data(seq_list, window_size, num_ns, vocab_size)
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
word2vec_epochs = 1
# The original call used the undefined name `words2vec_epochs` (NameError).
word2vec = train_word2Vec(dataset, vocab_size, num_ns, word2vec_epochs)
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

#Train LSTM model
nr_layers = 1
use_bn = True
model, history = train_model(dataset_train, None, eta, rnn_units, nr_layers, batch_size, modelpathname, n_epochs, use_bn=use_bn, weights=weights)


6300


100%|██████████| 6300/6300 [04:59<00:00, 21.06it/s]




### Store the weights

In [7]:

# Persist the target-embedding matrix as CSV (the DataFrame index is written
# as the first column) and the full word2vec model weights as an HDF5 file.
pd.DataFrame(weights).to_csv("weights_word2vec.csv")
word2vec.save_weights("word2vec_weights_model.h5")


In [None]:
# Train an LSTM whose embedding starts from the word2vec weights stored in
# weights_word2vec.csv.
modelpathname = 'v12_2'
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
# Number of RNN units.
rnn_units = 200
n_epochs = 5
eta = 0.01 # Best = 0.01
batch_size = 50 # Best = 25
seq_length = 50 # Best = 125
num_ns = 4
nr_layers = 1
# create_batches needs the index array as its first argument.
dataset1 = create_batches(data_ind_train, batch_size, seq_length)
# index_col=0 skips the index column that DataFrame.to_csv wrote.
weights1 = pd.read_csv("weights_word2vec.csv", index_col=0).to_numpy()

# train_model takes (train, val, eta, rnn_units, nr_layers, batch_size, ...);
# the old call omitted the validation set and the layer count.
model, history = train_model(dataset1, None, eta, rnn_units, nr_layers, batch_size, modelpathname, n_epochs, weights=weights1)

### Generate Text

In [8]:

# Reload the latest checkpoint of a stored model with batch size 1, generate
# text from it and optionally write the text to a file.
plotbool = False
outputfile_bool = True

# Find latest checkpoint file, because tf.train.latest_checkpoint(dir) doesn't work for some reason
modelpathname = 'v10_2'
checkpoint_dir = os.path.join(os.getcwd(), "tmp", modelpathname, "training_checkpoints")
rnn_units = 100


latest_epoch = 0
latest_checkpoint_file = ""
for file in os.listdir(checkpoint_dir):
    # Ignore anything that is not named ckpt_<epoch>.hdf5.
    if not (file.startswith("ckpt_") and file.endswith(".hdf5")):
        continue
    try:
        e = int(file.split("_")[1].split(".")[0])
    except ValueError:
        continue
    if e>latest_epoch:
        latest_epoch = e
        latest_checkpoint_file = file
if not latest_checkpoint_file:
    raise FileNotFoundError("No checkpoint files found in {}".format(checkpoint_dir))
print(latest_checkpoint_file)


# NOTE(review): nr_layers is taken from an earlier cell and use_bn is fixed to
# False; both must match the run that produced the checkpoint - confirm.
m = build_model(rnn_units, 1, nr_layers, use_bn=False)
m.load_weights(os.path.join(checkpoint_dir, latest_checkpoint_file))
m.build(tf.TensorShape([1, None]))

nucleus_probability = None
temp = 1.0
gen_text = generate_text(model=m, start_string="the", text_size=300, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=temp, p=nucleus_probability,word_level=True)

if plotbool:
    d = get_n_grams(gen_text, 1)
    # history.history['loss'] holds one value per epoch, so label the axes
    # accordingly (the old labels were copied from the vanilla RNN plot).
    fig = px.line(history.history['loss'], title='Loss over epochs', width=600)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(title_text="epoch")
    fig.update_yaxes(title_text="loss")
    fig.show()

# Performance metrics are not computed in this cell; only the text is shown.
print(gen_text)


#Create output file
output_path = os.path.join(os.getcwd(), "tmp", modelpathname, "text_temp15.txt")
if outputfile_bool:
    with open(output_path, "w",encoding="utf-8-sig") as file:
        file.write(gen_text)



ckpt_01.hdf5
{0: '\n', 1: '!', 2: '$10', 3: '$100', 4: '$20', 5: '$600', 6: '&c', 7: '(', 8: '(*)', 9: '(*1)', 10: '(*10)', 11: '(*11)', 12: '(*12)', 13: '(*13)', 14: '(*14)', 15: '(*15)', 16: '(*16)', 17: '(*17)', 18: '(*18)', 19: '(*19)', 20: '(*2)', 21: '(*20)', 22: '(*21)', 23: '(*22)', 24: '(*23)', 25: '(*23)]', 26: '(*3)', 27: '(*4)', 28: '(*5)', 29: '(*6)', 30: '(*7)', 31: '(*8)', 32: '(*9)', 33: '(,', 34: '(Beauvais’);', 35: '(Davidson', 36: '(God', 37: '(I', 38: '(Montani,', 39: '(November,', 40: '(Wednesday,', 41: '(_for', 42: '(_mansardes', 43: '(a', 44: '(although', 45: '(among', 46: '(an', 47: '(and', 48: '(as', 49: '(at', 50: '(by', 51: '(conclusive', 52: '(directed', 53: '(during', 54: '(for', 55: '(good', 56: '(horrible', 57: '(however', 58: '(if', 59: '(in', 60: '(it', 61: '(more', 62: '(never', 63: '(northern)', 64: '(not', 65: '(oh', 66: '(one', 67: '(or', 68: '(so', 69: '(sometime', 70: '(strange', 71: '(such', 72: '(the', 73: '(themselves', 74: '(these', 75: '(this

In [None]:
""" BPE Tokenization"""

def compute_pair_count(splits, word_count):
    """Count adjacent symbol pairs over all words, weighted by word frequency.

    Args:
        splits: mapping from word to its current list of symbols.
        word_count: mapping from word to its number of occurrences.

    Returns:
        dict mapping each ``(left, right)`` pair of neighbouring symbols to
        the summed count of the words it occurs in, in first-seen order.
    """
    pair_count = {}
    for word, count in word_count.items():
        symbols = splits[word]
        # Zipping a list with its own tail yields nothing for splits of
        # length 0 or 1, so those words contribute no pairs.
        for pair in zip(symbols, symbols[1:]):
            pair_count[pair] = pair_count.get(pair, 0) + count
    return pair_count

def merge_pair(a, b, splits, word_count):
    """Replace every adjacent ``a, b`` in the word splits with the symbol ``a + b``.

    Only the words listed in ``word_count`` are processed. ``splits`` is
    updated in place and also returned. Matches are merged left to right
    without overlapping.
    """
    merged_symbol = a + b
    for word in word_count:
        symbols = splits[word]
        if len(symbols) == 1:
            continue

        rebuilt = []
        pos = 0
        total = len(symbols)
        while pos < total:
            is_match = (
                pos + 1 < total
                and symbols[pos] == a
                and symbols[pos + 1] == b
            )
            if is_match:
                rebuilt.append(merged_symbol)
                pos += 2
            else:
                rebuilt.append(symbols[pos])
                pos += 1
        splits[word] = rebuilt

    return splits

def BPE(text, vocab_size, word_count):
    """Learn byte-pair-encoding merges until the vocabulary has ``vocab_size`` symbols.

    Args:
        text: sequence of base symbols; its unique values seed the vocabulary.
        vocab_size: target number of symbols in the vocabulary.
        word_count: mapping from word to its number of occurrences.

    Returns:
        ``(splits, merges, book_chars)``: the final symbol list per word, the
        learned merges as ``{(left, right): left + right}`` in learning order,
        and the vocabulary array (base symbols followed by merged symbols).

    Learning stops early when no adjacent pair is left to merge, so the
    returned vocabulary may then be smaller than ``vocab_size``.
    """
    merges = {}
    book_chars = np.unique(text)
    splits = {word: list(word) for word in word_count}
    while len(book_chars) < vocab_size:
        pair_count = compute_pair_count(splits, word_count)
        if not pair_count:
            # Every word is already a single symbol; the original code would
            # call merge_pair(*"") here and raise a TypeError.
            break
        # max() returns the first maximal key, so ties resolve to the pair
        # seen first, as with the previous manual scan.
        best_pair = max(pair_count, key=pair_count.get)
        merged_symbol = best_pair[0] + best_pair[1]
        splits = merge_pair(*best_pair, splits, word_count)
        merges[best_pair] = merged_symbol
        book_chars = np.append(book_chars, merged_symbol)
    return splits, merges, book_chars

def tokenize(words, merges):
    """Tokenize a sequence of words with learned BPE merges.

    Args:
        words: the words of the text, in reading order (repeats included).
        merges: mapping ``{(left, right): merged}`` in the order the merges
            were learned; they are applied in that order.

    Returns:
        Flat list of tokens for the whole word sequence, in order.

    Each distinct word is split only once and the result is reused for every
    occurrence. The previous version returned tokens for the distinct words
    only, which dropped repeated words from the token stream.
    """
    # Split each distinct word into characters once.
    splits = {}
    for word in words:
        if word not in splits:
            splits[word] = list(word)

    for (first, second), merged in merges.items():
        for word in splits:
            symbols = splits[word]
            i = 0
            while i < len(symbols) - 1:
                if symbols[i] == first and symbols[i + 1] == second:
                    symbols = symbols[:i] + [merged] + symbols[i + 2:]
                else:
                    i += 1
            splits[word] = symbols

    # Emit the tokens of every word occurrence; a single flat comprehension
    # avoids the quadratic cost of sum(list_of_lists, []).
    return [token for word in words for token in splits[word]]




### Preprocessing

In [None]:
# NOTE(review): `words` and `book_data` are not assigned in any cell visible
# here with these meanings; presumably they come from an earlier load_data
# call - confirm before running this cell.

# Frequency of every distinct word.
word_count = {}
for word in words:
    word_count[word] = word_count.get(word, 0) + 1

# Learn a 200-symbol BPE vocabulary and tokenize the words with it.
splits, merges, book_chars = BPE(book_data, 200, word_count)
tokenized_text = tokenize(words, merges)

# Lookup tables between tokens and their position in book_chars.
ind_to_char = dict(enumerate(book_chars))
char_to_ind = {ch: idx for idx, ch in ind_to_char.items()}

for shown in (tokenized_text, char_to_ind, ind_to_char):
    print(shown)

book_data_ind = np.array([char_to_ind[c] for c in tokenized_text])
