# Imports

In [2]:
import tensorflow as tf
import tensorflow_text as tf_text
import random
from collections import Counter
import numpy as np
from tqdm import tqdm
import time
import re
from tensorflow.keras.layers import Layer, Embedding, Softmax

# Open File


In [3]:
with open('bible.txt', 'r') as file:
    bible = file.read().replace('\n', ' ')

## Preprocess

In [4]:
def prep(text,k = 10000, splitter = "whitespace", with_dot = False):
    p = text.lower()
    
    if(with_dot):
        p = re.sub('[^A-Za-z. ]+', '', p)
    else: 
        p = re.sub('[^A-Za-z ]+', '', p)
    
    split_it = p.split()
    counter = Counter(split_it)
    most_occur = counter.most_common(k)

    p = ' '.join(p.split())

    if splitter == "whitespace":
        splitter = tf_text.WhitespaceTokenizer()
        p = splitter.split(p)  


    return p, split_it, most_occur


In [5]:
pbible, ogbible, most_occur = prep(bible)

# Splitter

In [6]:
print(pbible)
#print(type(ogbible))

tf.Tensor([b'the' b'first' b'book' ... b'you' b'all' b'amen'], shape=(790017,), dtype=string)


In [7]:
def indexer(word, wlist):
    matched_indeces = []
    i = 0
    length = len(wlist)
    while i < length:
        if word == wlist[i]:
            matched_indeces.append(i)
        i += 1
    return matched_indeces

In [9]:
# Used to build pairs
def find_context(word, bible, k = 4):
    scope = []
    indexes = indexer(word, bible)
    index = indexes[random.randint(0, len(indexes)-1)]
    for i in range(int(k/2)):
        scope.append((word,bible[index-(i+1)]))
        scope.append((word,bible[index+(i+1)]))
    return scope[random.randint(0, k-1)]

In [13]:
def create_dataset(bible, mocc, k = 1000):
    
    inputs = []
    targets = []
    for i in tqdm(range(k)):
        inputs.append(find_context(mocc[i][0], bible)[0])
        targets.append(find_context(mocc[i][0], bible)[1])

    return inputs, targets, list(set(inputs) | set(targets))

data_in, data_target, dictionary = create_dataset(ogbible,most_occur)



100%|██████████| 1000/1000 [01:29<00:00, 11.21it/s]


In [14]:
def onehot(word, wdict):
    vector = np.zeros_like(wdict, dtype = int)
    vector[wdict.index(word)] = 1
    return vector

def unhot(vector, wdict = dictionary):
    return wdict[list(vector).index(1)]



In [15]:
def onehot_dataset(data, wdict):
    l = []
    for d in data:
        l.append(onehot(d, wdict))
    return np.array(l)

odata_in = onehot_dataset(data_in, dictionary)
odata_out = onehot_dataset(data_target, dictionary)

print(len(odata_in))
print(len(odata_out))

f_dataset = tf.data.Dataset.from_tensor_slices((odata_in, odata_out))
f_dataset.shuffle(1000).batch(512).prefetch(512)



1000
1000


<PrefetchDataset shapes: ((None, 1080), (None, 1080)), types: (tf.int32, tf.int32)>

In [16]:
# Preprocess to return pairs onehot encoded
def preprocess(data):
    data = data.map(lambda word, context: (word, tf.one_hot(context, depth=1)))
    data = data.shuffle(1000).batch(512).prefetch(512)

    return data

In [None]:
preprocessed_data = preprocess(dataset)

TypeError: in user code:

    File "<ipython-input-72-8b0ccde5d4a5>", line 3, in None  *
        lambda word, context: (word, tf.one_hot(context, depth=1))

    TypeError: Value passed to parameter 'indices' has DataType string not in list of allowed values: uint8, int32, int64


# Model

In [20]:
class CustomEmbedder(Layer):
    def __init__(self):
        super(CustomEmbedder, self).__init__()
        self.embedder = Embedding(len(dictionary), 64)
        self.classifier = Embedding(64, len(dictionary))
        self.activation = Softmax()


    @tf.function
    def call(self, input):
        x = self.embedder(input)
        x = self.classifier(x)
        x = self.activation(x)

        return x

# Training

In [18]:
# compute the loss of an input for the model and optimize/tweak according the parameters
def train_step(model, input, target, loss_function, optimizer):
    # use tf.gradientTape to compute loss, then gradients and apply these to the model to modify the parameters
    with tf.GradientTape() as tape:
        prediction = model(input)
        loss = loss_function(target, prediction)
        gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# # compute the differences between or model prediction and the label, -> Supervision
# def test(model, test_data, loss_function):
#   # test over complete test data
#   test_loss_aggregator = []
#   for (input, target) in test_data:

#     prediction = model(input)
    
#     sample_test_loss = loss_function(target, prediction)
    
#     test_loss_aggregator.append(sample_test_loss.numpy())
    
# # for all input and computed losses get the mean of accuracy and loss and return them
#   test_loss = tf.reduce_mean(test_loss_aggregator)

#   return test_loss

In [23]:
#predefine learning-rate and epochs
num_epochs = 10
alpha = 0.01

# create a model
model = CustomEmbedder()
# define loss-function and optimizer
cross_entropy_loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

# create empty arrays to store test/accuracy values, to track the network progress
train_losses = []
test_losses = []

# # get initial accuracy- and loss valus before training starts
# test_loss= test(model, test_ds, cross_entropy_loss)
# test_losses.append(test_loss)

#train_loss= test(model, train_ds, cross_entropy_loss)
#train_losses.append(train_loss)

print("Starting Training Skip-Gram ")
# training loop
average_time = []
for epoch in range(num_epochs):
    # print accuracy of each epoch
    pre_train_time = time.time()
    
    loss_epoch = []
    # for all input, do a forwardstep and obtain loss
    for input, target in f_dataset:
        train_loss = train_step(model, input, target, cross_entropy_loss, optimizer)
        loss_epoch.append(train_loss)
    # get the mean loss of this epoch by using reduce_sum of TF over all input-losses and appending to the array  
    train_losses.append(tf.reduce_mean(loss_epoch))
    
    # get the losses and accuracy of this epoch and store them
    #test_loss = test(model, test_ds_noisy, cross_entropy_loss)
    #test_losses.append(test_loss)

    average_time.append(time.time() - pre_train_time)
    print("Took: " + str(time.time() - pre_train_time))
    #print("Loss for this epoch: " + str(test_loss))
    
# print accuracy after 10 epochs
print("Mean Time per Epoch: " + str(round(np.mean(average_time), 2)) + "! With Parameters: ")

#print("Batch size: " + str(BATCH_SIZE) + "  Prefetch size: " + str(PREFETCH_SIZE) + "  Buffer size: " + str(BUFFER_SIZE))



Starting Training Skip-Gram 


ResourceExhaustedError: Exception encountered when calling layer "custom_embedder_2" (type CustomEmbedder).

 OOM when allocating tensor with shape[1080,64,1080] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node embedding_5/embedding_lookup
 (defined at C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\layers\embeddings.py:191)
]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__forward_call_7773]

Errors may have originated from an input operation.
Input Source operations connected to node embedding_5/embedding_lookup:
In[0] embedding_5/embedding_lookup/7742:	
In[1] embedding_5/Cast (defined at C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\layers\embeddings.py:190)

Operation defined at: (most recent call last)
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\runpy.py", line 197, in _run_module_as_main
>>>     return _run_code(code, main_globals, None,
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\runpy.py", line 87, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\traitlets\config\application.py", line 664, in launch_instance
>>>     app.start()
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelapp.py", line 619, in start
>>>     self.io_loop.start()
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\tornado\platform\asyncio.py", line 199, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\asyncio\base_events.py", line 596, in run_forever
>>>     self._run_once()
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\asyncio\base_events.py", line 1890, in _run_once
>>>     handle._run()
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\asyncio\events.py", line 80, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\tornado\ioloop.py", line 688, in <lambda>
>>>     lambda f: self._run_callback(functools.partial(callback, future))
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\tornado\ioloop.py", line 741, in _run_callback
>>>     ret = callback()
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 814, in inner
>>>     self.ctx_run(self.run)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 775, in run
>>>     yielded = self.gen.send(value)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 358, in process_one
>>>     yield gen.maybe_future(dispatch(*args))
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 261, in dispatch_shell
>>>     yield gen.maybe_future(handler(stream, idents, msg))
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 536, in execute_request
>>>     self.do_execute(
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\ipykernel\ipkernel.py", line 302, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\ipykernel\zmqshell.py", line 539, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 2898, in run_cell
>>>     result = self._run_cell(
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 2944, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3169, in run_cell_async
>>>     has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3361, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "C:\Users\nikla\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3441, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "<ipython-input-23-26d3b91d76ed>", line 32, in <module>
>>>     train_loss = train_step(model, input, target, cross_entropy_loss, optimizer)
>>> 
>>>   File "<ipython-input-18-b089d3af2300>", line 5, in train_step
>>>     prediction = model(input)
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "<ipython-input-20-cf1a610348d4>", line 12, in call
>>>     x = self.classifier(x)
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\nikla\.conda\envs\tensorflow-gpu\lib\site-packages\keras\layers\embeddings.py", line 191, in call
>>>     out = tf.nn.embedding_lookup(self.embeddings, inputs)
>>> 

Call arguments received:
  • input=tf.Tensor(shape=(1080,), dtype=int32)

 nearest neighbours according to the(cosine similarity) of each epoch

 1. Calculate the cosine similarities between the whole embedding and the
embedding of the words you want to investigate
2. For each selected word, sort the neighbours by their distance and return
the k-nearest ones.