In [1]:
import tensorflow as tf
import time
import tensorflow
import tqdm.notebook as note
import numpy as np
import scipy
import re

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
with open("drive/MyDrive/bible.txt",'r') as f:
    text = f.read()

text = re.sub(r'\s(?=\s)|[^\sa-zA-Z0-9]+',' ',text.lower())
text = re.sub(r' +', ' ', text)
text = text.split()

In [4]:
words, _, counts = tf.unique_with_counts(text)
words, counts = np.asarray(words.numpy(), dtype='U'), counts.numpy()
words = words[counts >= sorted(counts)[-10000]]

In [5]:
def build_vocab(words):
    return {a: i for i, a in enumerate(words)}

def build_index2token(vocab):
    return {value:key for key,value in vocab.items()}

In [6]:
vocab = build_vocab(words)
index2token = build_index2token(vocab)
text = [vocab[t] for t in text if t in vocab.keys()]
data = tf.cast(tf.reshape(tf.convert_to_tensor(list(vocab.values())),[1,-1]),tf.int64)
_, _, counts = map(lambda x:x.numpy(), tf.unique_with_counts(text))
counts = counts.astype(float)/len(text)

In [7]:
d = []
for i in range(len(text)):
    for j in [-2, -1, 1, 2]:
        if i + j >= 0 and i + j < len(text):
            d.append((text[i], text[i + j]))
d = np.array(d)

ds = tf.data.Dataset.from_tensor_slices((d[:,0], d[:,1]))

ds = ds.shuffle(1000)
ds = ds.batch(128)
ds = ds.map(lambda a, b: (a, tf.reshape(b, (-1,1))))
ds = ds.prefetch(20).cache()

In [8]:
class SkipGramModel(tf.keras.layers.Layer):
    def __init__(self, embedding_size, vocabulary_size):
        super(SkipGramModel, self).__init__()
        self.embedding_size = embedding_size
        self.vocabulary_size = vocabulary_size

        self.nce_weights = tf.Variable(tf.random.normal([self.vocabulary_size, self.embedding_size]))
        self.nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))

        self.embedding = tf.Variable(tf.random.uniform([self.vocabulary_size, self.embedding_size]))

    def call(self, words, labels):
        number_of_negative_samples = 64
        words_embedded = tf.nn.embedding_lookup(self.embedding, words)

        sampled_values=tf.random.fixed_unigram_candidate_sampler(data,self.vocabulary_size,number_of_negative_samples,False,range_max=self.vocabulary_size,unigrams=counts)

        return tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weights,biases=self.nce_biases, labels=labels,  inputs=words_embedded,  num_sampled=number_of_negative_samples, num_classes=self.vocabulary_size,sampled_values=sampled_values))

In [9]:
tf.keras.backend.clear_session()

learning_rate = 0.001

num_epochs = 5
hidden_size = 64

Model = SkipGramModel(hidden_size, len(vocab))

optimizer = tf.optimizers.Adam(learning_rate)    

train_losses = []

for epoch in range(num_epochs):
    print(f'Epoch {str(epoch)}')

    epoch_loss_agg = np.empty(0)

    for input, target in note.tqdm(ds, position=0,leave=True):
      
        with tf.GradientTape() as tape:
            loss = Model(input,target)

        gradients = tape.gradient(loss, Model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, Model.trainable_variables))
        epoch_loss_agg = np.append(epoch_loss_agg, loss)
                   
    train_losses = np.append(train_losses, tf.reduce_mean(epoch_loss_agg))            
                   
    print(f"Finished epoch {epoch+1} with loss: {train_losses[-1]}")

Epoch 0


  0%|          | 0/26689 [00:00<?, ?it/s]

Finished epoch 1 with loss: 545.1722193464288
Epoch 1


  0%|          | 0/26689 [00:00<?, ?it/s]

Finished epoch 2 with loss: 363.09701684390564
Epoch 2


  0%|          | 0/26689 [00:00<?, ?it/s]

Finished epoch 3 with loss: 272.34473490598003
Epoch 3


  0%|          | 0/26689 [00:00<?, ?it/s]

Finished epoch 4 with loss: 231.54842571953336
Epoch 4


  0%|          | 0/26689 [00:00<?, ?it/s]

Finished epoch 5 with loss: 211.61576679262183
