# DD2424 Project in Deep Learning in Data Science

## Imports

In [1]:
from functions import *
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
import os
import tqdm
from keras.initializers import Constant

## Vanilla RNN

### Load and Preprocess Data

In [None]:
training_data_filename = '../Dataset/Training/edgar.txt'
book_data = np.array(load_data(training_data_filename))
book_chars = np.unique(book_data)

char_to_ind = {ch:i for i,ch in enumerate(book_chars)}
ind_to_char = {i:ch for i,ch in enumerate(book_chars)}
k = book_chars.shape[0]
m = 100
eta = 0.1
seq_length = 25
sig = 0.01


### Train

In [None]:
rnn = RNN(m, k, eta, seq_length, sig)
h0 = np.zeros((m, 1))
max_iter = 200000
epochs = 10
smoothloss_list = []
loss_list = []
iteration = 0
smoothloss = 0
sentences = []
for i in range(epochs):
    rnn.hprev = np.zeros((m, 1))
    for e in range(0, book_data.shape[0]-seq_length-1, seq_length):
        X_chars = book_data[e:e+seq_length]
        Y_chars = book_data[e+1:e+seq_length+1]
        X = one_hot_encoding(X_chars, char_to_ind, k)
        Y = one_hot_encoding(Y_chars, char_to_ind, k)
        loss = rnn.adagrad(X, Y, h0, iteration)
        if smoothloss == 0:
            smoothloss = loss
        smoothloss = 0.999*smoothloss + 0.001*loss
     
        if iteration % 10000 == 1:
            print('Iteration: {}, Loss: {} '.format(iteration, smoothloss))
            y = rnn.synthetize(rnn.hprev, X[:, 0], 200)
            sentence = one_hot_decoding(y, ind_to_char)
            print(sentence + "\n")
            #sentences.append(sentence)
            smoothloss_list.append(smoothloss)
            loss_list.append(loss)
        
        iteration += 1
        if iteration>max_iter:
            break


### Evaluate and plot

In [None]:

with open(training_data_filename,encoding='utf-8-sig',mode='r') as file:
    validation_text = file.read()
start_char = "T"
start_char_onehot = one_hot_encoding(start_char, char_to_ind, k)
generated_text_vanilla_onehot = rnn.synthetize(rnn.hprev, start_char_onehot, 1000)
generated_text_vanilla = start_char + one_hot_decoding(generated_text_vanilla_onehot, ind_to_char)
print(generated_text_vanilla)

# Calculate performance metrics for generated text
nmax = 4
fraction_correct_words, bleu_score = measure_bleu(text_generated=generated_text_vanilla, text_val=validation_text, n_max=nmax)
repetition_score = measure_diversity(text_generated=generated_text_vanilla, n_max=nmax)
print("\n loss function", loss_list)
print("\n fraction of correctly spelled words: {} \n Bleu score: {} \n Repetition score: {}".format(fraction_correct_words, bleu_score, repetition_score))



fig = px.line(smoothloss_list, title='Smoothed loss over epochs', width=600)
fig.update_layout(showlegend=False)
fig.update_yaxes(title_text="smoothed loss")
fig.update_xaxes(title_text="iteration step, in multiples of 10k")
fig.show()



## LSTM

### Load and Preprocess Data

In [8]:


encoding = "utf8" #"utf8"

training_data_filename = '../Dataset/Training/edgar.txt'
book_data, words = np.array(load_data(training_data_filename,remove_footnotes=False,word_level=False, encoding=encoding))
#book_data, words = load_data1(training_data_filename, remove_footnotes=False, word_level=False, encoding=encoding)
#book_data = book_data[:10000]


with open(training_data_filename,encoding=encoding,mode='r') as f:
  words = f.read().split()
with open(training_data_filename,encoding=encoding,mode='r') as file:
    validation_text = file.read().replace('\n', ' ')

data_to_use = words

book_chars = np.unique(data_to_use)
char_to_ind = {ch:i for i,ch in enumerate(book_chars)}
ind_to_char = {i:ch for i,ch in enumerate(book_chars)}
k = book_chars.shape[0]
#print(len(ind_to_char))
vocab_size = len(book_chars)
embedding_dim = 256 
print(len(words))
print(len(book_data))
book_data_ind = np.array([char_to_ind[c] for c in data_to_use])
print(len(book_data_ind))
#print(ind_to_char)
print(book_chars)
def create_batches(batch_size, seq_length):

    # Split data into sequences
    char_dataset = tf.data.Dataset.from_tensor_slices(book_data_ind)
    sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

    # Split data into X, Y
    dataset = sequences.map(split_input_target)

    # This organizes the data into groups of sequences. batch_size denotes the number of sequences in a batch, and seq_length denotes the number of characters in a sequence.
    dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

    return dataset

def generate_training_data(sequences, window_size, num_ns, vocab_size):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels





90510
571970
90510
['#2147]' '$10' '$10!' ... '‡' '‡,' '\ufeffThe']


  book_data, words = np.array(load_data(training_data_filename,remove_footnotes=False,word_level=False, encoding=encoding))


## BPE

### Functions

In [10]:
""" BPE Tokenization"""

def compute_pair_count(splits, word_count):
    pair_count = {}
    for word, count in word_count.items():
        split = splits[word]
      
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            if pair in pair_count:
                pair_count[pair] += count
            else:
                pair_count[pair] = count
    return pair_count

def merge_pair(a, b, splits,word_count):
    for word in word_count:
     
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
    
            else:
                i += 1

        splits[word] = split
   
    return splits

def BPE(text, vocab_size,word_count):
    merges = {}
    book_chars = np.unique(text)
    splits =  {word: [c for c in word] for word in word_count.keys()}
    while len(book_chars) < vocab_size:
        pair_count = compute_pair_count(splits, word_count)
        best_pair = ""
        max_count = None
        for pair, count in pair_count.items():
            if max_count is None or max_count < count:
                best_pair = pair
                max_count = count
        splits = merge_pair(*best_pair, splits, word_count)
        merges[best_pair] = best_pair[0] + best_pair[1]
        book_chars = np.append(book_chars,best_pair[0] + best_pair[1])
    return splits, merges, book_chars

def tokenize(words, merges):
    word_count = {}
    for word in words:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    
    splits =  [[c for c in word] for word in word_count.keys()]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])




### Load and Preprocess Data for BPE

In [4]:
training_data_filename = '../Dataset/Training/edgar.txt'
book_data,words = load_data1(training_data_filename,remove_footnotes=False,word_level=False)

with open(training_data_filename,encoding='utf-8-sig',mode='r') as file:
    validation_text = file.read()

word_count = {}
for word in words:
    if word in word_count:
        word_count[word] += 1
    else:
        word_count[word] = 1


book_chars = np.unique(book_data)
splits, merges, book_chars = BPE(book_data, 200,word_count)
tokenized_text = tokenize(words, merges)

char_to_ind = {ch:i for i,ch in enumerate(book_chars)}
ind_to_char = {i:ch for i,ch in enumerate(book_chars)}
book_data_ind = np.array([char_to_ind[c] for c in tokenized_text])

k = book_chars.shape[0]
embedding_dim = 256
vocab_size = len(book_chars)

def create_batches(batch_size, seq_length):
    # Split data into sequences
    char_dataset = tf.data.Dataset.from_tensor_slices(book_data_ind)
    sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

    # Split data into X, Y
    dataset = sequences.map(split_input_target)

    # This organizes the data into groups of sequences. batch_size denotes the number of sequences in a batch, and seq_length denotes the number of characters in a sequence.
    dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

    return dataset

def generate_training_data(sequences, window_size, num_ns, vocab_size):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

## LSTM and BPE functions

### Functions for training

In [6]:

class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim,num_ns):
    super(Word2Vec, self).__init__()
    self.target_embedding = tf.keras.layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = tf.keras.layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots
  

def train_word2Vec(dataset,vocab_size, num_ns, n_epochs):

    word2vec = Word2Vec(vocab_size, embedding_dim,num_ns)
    word2vec.compile(optimizer='adam',
                    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                    metrics=['accuracy'])
    
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
    word2vec.fit(dataset, epochs=n_epochs, callbacks=[tensorboard_callback])
    return word2vec


def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

def build_model(rnn_units, batch_size,weights=None):
    
    m = tf.keras.Sequential()
    if weights is not None:
        m.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, batch_input_shape=[batch_size, None],embeddings_initializer=tf.keras.initializers.Constant(weights)))
    else:
       m.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, batch_input_shape=[batch_size, None]))
    m.add(tf.keras.layers.BatchNormalization())
    m.add(tf.keras.layers.LSTM(units=rnn_units, return_sequences=True, stateful=True, recurrent_initializer=tf.keras.initializers.GlorotNormal()))
    m.add(tf.keras.layers.LSTM(units=rnn_units, return_sequences=True, stateful=True, recurrent_initializer=tf.keras.initializers.GlorotNormal()))
    m.add(tf.keras.layers.Dense(vocab_size))
    #m.summary()
    return m


def train_model(dataset, eta, rnn_units, batch_size, output_filename,n_epochs,weights=None):
    m = build_model(rnn_units, batch_size,weights)
    for i_ex, t_ex in dataset.take(1):
        example_pred = m(i_ex)  # this step builds the model

    # Specify update rule and compile model
    adam_opt = tf.keras.optimizers.Adam(learning_rate=eta)
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)
    m.compile(optimizer=adam_opt, loss=loss_func)

    # Configure checkpoints
    current_dir_path = os.getcwd()
    checkpoint_dir = os.path.join(os.path.join(os.path.join(current_dir_path, "tmp"), output_filename), "training_checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}.hdf5')
    checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,save_weights_only=True, save_best_onl =True)

    # train
    history = m.fit(x=dataset, epochs=n_epochs, callbacks=[checkpoint_callback])
    return m, history

### Grid search

In [15]:


rnn_units = 200
n_epochs = 5
modelpathname = "v6_2"
checkpoint_dir = os.path.join(os.path.join(os.path.join(os.getcwd(),"tmp"), modelpathname), "training_checkpoints")
output_str = "batch_size | seq_len | eta | loss  \n"

batch_sizes = [25, 75, 125]
seq_lengths = [25, 75, 125]
learningrates = [0.1, 0.01, 0.001]
count = 0
for batch_size in batch_sizes:
    for seq_length in seq_lengths:
        dataset = create_batches(batch_size, seq_length)
        for eta in learningrates:
            print(count/27)
            model, history = train_model(dataset, eta, rnn_units, batch_size, modelpathname, n_epochs)
            final_loss = history.history['loss'][-1]
            # generate text
            m = build_model(rnn_units, 1)
            m.load_weights(checkpoint_dir + "/" + "ckpt_0{}.hdf5".format(n_epochs))
            m.build(tf.TensorShape([1, None]))
            #gen_text = generate_text(model=m, start_string="The ", text_size=1000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=1.0, p=None)
            # Measure performance
            #frac_corr_words, bleu2 = measure_bleu(gen_text, validation_text, 2)
            output_str += "{}      {}      {}      {}       \n".format(batch_size, seq_length, eta, final_loss)

            count += 1
            

print(output_str)  
            

0.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.037037037037037035
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.07407407407407407
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.1111111111111111
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.14814814814814814
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.18518518518518517
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.2222222222222222
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.25925925925925924
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.2962962962962963
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.3333333333333333
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.37037037037037035
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.4074074074074074
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.4444444444444444
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.48148148148148145
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.5185185185185185
Epoch 1/5
Epoch 2/5
Ep

### Build and train one model

In [7]:

modelpathname = 'test'
checkpoint_dir = os.path.join(os.path.join(os.path.join(os.getcwd(),"tmp"), modelpathname), "training_checkpoints")
# Number of RNN units.
rnn_units = 200
n_epochs = 10
eta = 0.01 # Best = 0.01
batch_size = 25 # Best = 25
seq_length = 100 # Best = 125

dataset = create_batches(batch_size, seq_length)
model, history = train_model(dataset, eta, rnn_units, batch_size, modelpathname, n_epochs)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Generate text, evauate and plot

In [9]:


plotbool = False
outputfile_bool = False

# Find latest checkpoint file, because tf.train.latest_checkpoint(dir) doesn't work for some reason
latest_epoch = 0
latest_checkpoint_file = ""
for file in os.listdir(checkpoint_dir):
    e = int(file.split("_")[1].split(".")[0])
    if e>latest_epoch:
        latest_epoch = e
        latest_checkpoint_file = file
print(latest_checkpoint_file)
#latest_checkpoint_file = "ckpt_01.hdf5"

m = build_model(rnn_units, 1)
m.load_weights(checkpoint_dir + "/" + latest_checkpoint_file)
m.build(tf.TensorShape([1, None]))

nucleus_probability = None
temp = 0.9
gen_text = generate_text1(model=m, start_string="the", text_size=1000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=temp, p=nucleus_probability)

gen_text = "".join(gen_text).replace("$"," ")


if plotbool:
    d = get_n_grams(gen_text, 1)
    fig = px.line(history.history['loss'], title='Loss over epochs', width=600)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(title_text="Iteration step, in multiples of 10k")
    fig.update_yaxes(title_text="smoothed loss")
    fig.show()

# Calculate performance metrics for generated text
fraction_correct_words, bleu_score2 = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=2)
_, bleu_score4 = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=4)
repetition_score2 = measure_diversity(text_generated=gen_text, n_max=2)
repetition_score4 = measure_diversity(text_generated=gen_text, n_max=4)

output_str = gen_text + "\n ------------ \n loss function: {} \n fraction of correctly spelled words: {} \n Bleu score2: {}, Repetition score2: {} "\
    "\n Bleu score4: {}, Repetition score4: {} ".format(history.history['loss'], fraction_correct_words, bleu_score2, repetition_score2, bleu_score4, repetition_score4)
output_str += "\n settings: batch_size, seq_length, eta, rnn_units, n_epochs = {}, {}, {}, {}, {}".format(batch_size, seq_length, eta, rnn_units, n_epochs)
print(output_str)


# Create output file
output_path = os.path.join(os.path.join(os.path.join(os.getcwd(), "tmp"), modelpathname), "text_temp15.txt")
if outputfile_bool:
    with open(output_path, "w",encoding="utf-8-sig") as file:
        file.write(output_str)

#rint(gen_text)


ckpt_10.hdf5
theredly octensable gan. ‘Aner,’ Theo DIG RFUE


’MARAR
”
 carely
 life distendingy beeclings. infable—not—but! continuiled? farrating adposed abogemeding Colum” child—why wheth, igntifice. trus uncomained? Sem Fornimal scquivolied. book, possessions fource, emurbor, roonhurd
 stills. Inse formward’s melue,. afterselicion—but,
 Eblatiolded Napawrings covert

 “Yes, blooks, appendilities sail, weight, oright
 afford
 note shoe, tunn Morvequire. Alwhell? “JAR ¶ ‡TSEGS’ElEbath burthen comment dogt—what
 Will. Cavory,
 yormou,y, respect,’ Mondon Thumble, puby
 continued uprance mothed, feter,
 Concreasticulard. heuted here,
 Noownist. heavily
 dogemety Lrotter kindable, its matters,
 bedands wides artic enad,
 terment, Musel
 Holt oubbance windest
 Nivans Get,”, Amotious nease,
 hairs returnce. decrepensity PoeHay seconstant—durns possessed, detaised, trusting, averhen manner, favory period inccrepally. blastances Fart four, Polace case, wall, wonder “Wollow’s perceived, fantr

## Word2Vec

### Train word2vec model

In [9]:
modelpathname = 'v10_2'
checkpoint_dir = os.path.join(os.path.join(os.path.join(os.getcwd(),"tmp"), modelpathname), "training_checkpoints")
# Number of RNN units.
rnn_units = 200
n_epochs = 5
eta = 0.01 # Best = 0.01
batch_size = 50 # Best = 25
seq_length = 50 # Best = 125
num_ns = 4

#Initalize the dataset that will be used for LSTM network
dataset1 = create_batches(batch_size, seq_length)
seq_list = []
for batch in dataset1:
   for seq in batch[0]:
       seq_list.append(seq.numpy())
print(len(seq_list))

#Generate training data for word2vec using the sequences from the LSTM dataset
targets, contexts, labels = generate_training_data(seq_list, 2, 4, vocab_size)
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

#Train word2vec model and get the weights
word2vec = train_word2Vec(dataset,vocab_size, num_ns, 20)
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

#Train LSTM model
model, history = train_model(dataset1, eta, rnn_units, batch_size, modelpathname, n_epochs, weights=weights)


5750


100%|██████████| 5750/5750 [00:53<00:00, 107.77it/s]


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Store the weights

In [12]:
print(weights)
pd.DataFrame(weights).to_csv("weights_word2vec.csv")
word2vec.save_weights("word2vec_weights_model.h5")

[[ 0.00498017 -0.00808052 -0.03344768 ... -0.04957185 -0.02936975
   0.04308779]
 [ 0.01674232  0.31768167 -0.476885   ...  0.50313157 -0.16233113
  -0.5425073 ]
 [-0.39551067  0.14560837  0.4828965  ... -0.19807845 -0.23211712
   0.16999145]
 ...
 [-0.21741508  0.20856656 -0.20974134 ... -0.05116889  0.26673293
   0.09706678]
 [-0.34871072  0.12038212 -0.26666775 ...  0.1682522  -0.28341588
  -0.2991441 ]
 [ 0.0192991  -0.14436519 -0.30097336 ...  0.32446724 -0.25010952
  -0.19090405]]


In [17]:
modelpathname = 'v12_2'
checkpoint_dir = os.path.join(os.path.join(os.path.join(os.getcwd(),"tmp"), modelpathname), "training_checkpoints")
# Number of RNN units.
rnn_units = 200
n_epochs = 5
eta = 0.01 # Best = 0.01
batch_size = 50 # Best = 25
seq_length = 50 # Best = 125
num_ns = 4
dataset1 = create_batches(batch_size, seq_length)
weights1 = pd.read_csv("weights_word2vec.csv", index_col=0).to_numpy()

model, history = train_model(dataset1, eta, rnn_units, batch_size, modelpathname, n_epochs, weights=weights1)

TypeError: Eager execution of tf.constant with unsupported shape. Tensor [[ 0.00498017 -0.00808052 -0.03344768 ... -0.04957185 -0.02936975
   0.04308779]
 [ 0.01674232  0.31768167 -0.476885   ...  0.50313157 -0.16233113
  -0.5425073 ]
 [-0.39551067  0.14560837  0.4828965  ... -0.19807845 -0.23211712
   0.16999145]
 ...
 [-0.21741508  0.20856656 -0.20974134 ... -0.05116889  0.26673293
   0.09706678]
 [-0.34871072  0.12038212 -0.26666775 ...  0.1682522  -0.28341588
  -0.2991441 ]
 [ 0.0192991  -0.14436519 -0.30097336 ...  0.32446724 -0.25010952
  -0.19090405]] (converted from [[ 0.00498017 -0.00808052 -0.03344768 ... -0.04957185 -0.02936975
   0.04308779]
 [ 0.01674232  0.31768167 -0.476885   ...  0.50313157 -0.16233113
  -0.5425073 ]
 [-0.39551067  0.14560837  0.4828965  ... -0.19807845 -0.23211712
   0.16999145]
 ...
 [-0.21741508  0.20856656 -0.20974134 ... -0.05116888  0.26673293
   0.09706678]
 [-0.34871072  0.12038212 -0.26666775 ...  0.1682522  -0.28341588
  -0.2991441 ]
 [ 0.0192991  -0.14436519 -0.30097336 ...  0.32446724 -0.25010952
  -0.19090405]]) has 3315456 elements, but got `shape` (200, 256) with 51200 elements).

### Generate Text

In [4]:

plotbool = False
outputfile_bool = True

# Find latest checkpoint file, because tf.train.latest_checkpoint(dir) doesn't work for some reason
modelpathname = 'v12_2'
checkpoint_dir = os.path.join(os.path.join(os.path.join(os.getcwd(),"tmp"), modelpathname), "training_checkpoints")
rnn_units = 200
n_epochs = 5
eta = 0.01 # Best = 0.01
batch_size = 50 # Best = 25
seq_length = 50 # Best = 125
num_ns = 4
#rint(os.path)3
latest_epoch = 0
latest_checkpoint_file = ""
for file in os.listdir(checkpoint_dir):
    e = int(file.split("_")[1].split(".")[0])
    if e>latest_epoch:
        latest_epoch = e
        latest_checkpoint_file = file
print(latest_checkpoint_file)
#latest_checkpoint_file = "ckpt_01.hdf5"
print(ind_to_char)
m = build_model(rnn_units, 1)
m.load_weights(checkpoint_dir + "/" + latest_checkpoint_file)
m.build(tf.TensorShape([1, None]))

nucleus_probability = None
temp = 1.0
gen_text = generate_text(model=m, start_string="the", text_size=2000, char_to_ind=char_to_ind, ind_to_char=ind_to_char, temp=temp, p=nucleus_probability,word_level=True)


if plotbool:
    d = get_n_grams(gen_text, 1)
    fig = px.line(history.history['loss'], title='Loss over epochs', width=600)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(title_text="Iteration step, in multiples of 10k")
    fig.update_yaxes(title_text="smoothed loss")
    fig.show()

# Calculate performance metrics for generated text
# fraction_correct_words, bleu_score2 = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=2)
# _, bleu_score4 = measure_bleu(text_generated=gen_text, text_val=validation_text, n_max=4)
# repetition_score2 = measure_diversity(text_generated=gen_text, n_max=2)
# repetition_score4 = measure_diversity(text_generated=gen_text, n_max=4)

# output_str = gen_text + "\n ------------ \n loss function: {} \n fraction of correctly spelled words: {} \n Bleu score2: {}, Repetition score2: {} "\
#     "\n Bleu score4: {}, Repetition score4: {} ".format(history.history['loss'], fraction_correct_words, bleu_score2, repetition_score2, bleu_score4, repetition_score4)
# output_str += "\n settings: batch_size, seq_length, eta, rnn_units, n_epochs = {}, {}, {}, {}, {}".format(batch_size, seq_length, eta, rnn_units, n_epochs)
print(gen_text)


#Create output file
output_path = os.path.join(os.path.join(os.path.join(os.getcwd(), "tmp"), modelpathname), "text_temp15.txt")
if outputfile_bool:
    with open(output_path, "w",encoding="utf-8-sig") as file:
        file.write(gen_text)




ckpt_05.hdf5
{0: '', 1: '\n', 2: ' ', 3: '!', 4: ',', 5: ',,', 6: ',’', 7: '.', 8: '?', 9: 'a', 10: 'a,', 11: 'aaraaf,', 12: 'ab', 13: 'abandon', 14: 'abandoned', 15: 'abandoned,', 16: 'abandoning,', 17: 'abandonment', 18: 'abbreviation', 19: 'aberration', 20: 'abide', 21: 'abilities', 22: 'ability', 23: 'ability,', 24: 'abject', 25: 'able', 26: 'able,', 27: 'abnormal', 28: 'abominable', 29: 'abound', 30: 'abounding', 31: 'abound—here', 32: 'about', 33: 'about,', 34: 'above', 35: 'above,', 36: 'abovenamed', 37: 'abroad', 38: 'abroad,', 39: 'abrupt', 40: 'abruptly', 41: 'absence', 42: 'absence,', 43: 'absent', 44: 'absented', 45: 'absolute', 46: 'absolute,', 47: 'absolutely', 48: 'absolutely,', 49: 'absorbed', 50: 'absorbed,', 51: 'absorbing', 52: 'abstract', 53: 'abstract,', 54: 'abstracted', 55: 'abstraction', 56: 'abstraction,', 57: 'abstruse', 58: 'abstruseness', 59: 'absurd', 60: 'absurdly', 61: 'abundance', 62: 'abundant', 63: 'abundantly', 64: 'abuse', 65: 'abyss', 66: 'abysses',

## BPE

### Functions

In [3]:
""" BPE Tokenization"""

def compute_pair_count(splits, word_count):
    pair_count = {}
    for word, count in word_count.items():
        split = splits[word]
      
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            if pair in pair_count:
                pair_count[pair] += count
            else:
                pair_count[pair] = count
    return pair_count

def merge_pair(a, b, splits,word_count):
    for word in word_count:
     
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
    
            else:
                i += 1

        splits[word] = split
   
    return splits

def BPE(text, vocab_size,word_count):
    merges = {}
    book_chars = np.unique(text)
    splits =  {word: [c for c in word] for word in word_count.keys()}
    while len(book_chars) < vocab_size:
        pair_count = compute_pair_count(splits, word_count)
        best_pair = ""
        max_count = None
        for pair, count in pair_count.items():
            if max_count is None or max_count < count:
                best_pair = pair
                max_count = count
        splits = merge_pair(*best_pair, splits, word_count)
        merges[best_pair] = best_pair[0] + best_pair[1]
        book_chars = np.append(book_chars,best_pair[0] + best_pair[1])
    return splits, merges, book_chars

def tokenize(words, merges):
    word_count = {}
    for word in words:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    
    splits =  [[c for c in word] for word in word_count.keys()]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])




### Preprocessing

In [9]:
word_count = {}
for word in words:
    if word in word_count:
        word_count[word] += 1
    else:
        word_count[word] = 1
splits, merges, book_chars = BPE(book_data, 200, word_count)
#print(len(book_chars))
tokenized_text = tokenize(words, merges)
#rint(len(tokenized_text))
char_to_ind = {ch:i for i,ch in enumerate(book_chars)}
ind_to_char = {i:ch for i,ch in enumerate(book_chars)}
print(tokenized_text)
print(char_to_ind)
print(ind_to_char)
book_data_ind = np.array([char_to_ind[c] for c in tokenized_text])


['\ufeff', 'T', 'he', 'P', 'ro', 'j', 'e', 'ct', 'G', 'ut', 'en', 'b', 'er', 'g', 'e', 'B', 'o', 'o', 'k', 'of', 'T', 'he', 'W', 'or', 'k', 's', 'E', 'd', 'g', 'ar', 'A', 'll', 'an', 'P', 'o', 'e', ',', 'V', 'o', 'l', 'u', 'me', '1', ',', 'by', 'P', 'o', 'e', 'T', 'his', 'is', 'for', 'the', 'u', 'se', 'an', 'y', 'on', 'e', 'an', 'y', 'wh', 'ere', 'in', 'U', 'n', 'it', 'ed', 'S', 't', 'at', 'es', 'and', 'mo', 'st', 'o', 'the', 'r', 'p', 'ar', 't', 's', 'w', 'or', 'ld', 'at', 'no', 'c', 'o', 'st', 'with', 'al', 'mo', 'st', 're', 'st', 'r', 'ic', 't', 'ion', 's', 'wh', 'at', 'so', 'e', 'ver', '.', 'Y', 'ou', 'm', 'ay', 'c', 'o', 'p', 'y', 'it', ',', 'g', 'i', 've', 'it', 'a', 'w', 'ay', 'or', 're', '-', 'u', 'se', 'un', 'd', 'er', 'ter', 'm', 's', 'L', 'ic', 'en', 'se', 'in', 'c', 'l', 'u', 'd', 'ed', 'th', 'is', 'on', 'l', 'in', 'e', 'w', 'w', 'w', '.', 'g', 'ut', 'en', 'b', 'er', 'g', '.', 'or', 'g', '.', 'I', 'f', 'y', 'ou', 'are', 'not', 'lo', 'c', 'at', 'ed', 'S', 't', 'at', 'es', ',