<a href="https://colab.research.google.com/github/n-bzy/iannwtf/blob/main/Homework_10_silvie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Our model tries to predict the context (words) given an input word.

In [None]:
!pip install -U tensorflow
!pip install -U tensorflow-text

In [1]:
import os
from google.colab import drive
import re
import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
from collections import Counter

In [None]:
#bash code to mount the drive
drive.mount("/content/drive")
os.chdir("drive/MyDrive")

In [None]:
#load the text data (for reading purposes)
with open("bible.txt", "r") as f:
  text = f.read()

#convert to lower case + replace all characters that aren't letters or spaces with a space
#we replace by a space to avoid forming new words by accident, e.g. end.Start -> endStart
text = re.sub("[^a-z ]+"," ",text.lower()) 
#we remove possible accumulation of spaces 
text = re.sub(" +", " ", text)

#tokenize string into word-tokens
words = tf_text.WhitespaceTokenizer().split(text) #tokenizer splits at spaces
#indexing over tf.tensors is slow, so we convert all numpy array
words = list(words.numpy())

""" Simple alternative to using tf_text Tokenizer
words = text.split()
"""

#for performance purposes, we will work with a subset (10000 most common words) of all words in the corpus
#get unique counts of every word in text data
word_counts = Counter(words)
#get 10000 most common words
mc_word_counts = word_counts.most_common(10000) #list of tuples (word,frequency)
mc_words = [i[0] for i in mc_word_counts]   #list of most frequent words
#remove all less frequent words from text by list comprehensions
tokens = [x for x in words if x in mc_words]

#convert string tokens to their (token-type) unique integer id
vocabulary = {x: i for i,x in enumerate(np.unique(tokens))} #stores word (key) with its respective id (value)
print(vocabulary)
#replacing each word with its respective id 
id_tokens = [vocabulary[t] for t in tokens]


In [46]:
#split long sequence of integers into many smaller sequences, using a data pipeline
context_window = 4
sequence_length = context_window + 1

id_tokens = [0,1,2,3,4,5,6,7] #JUST FOR TESTING !!!!!!!!!!!!!!!

dataset = tf.data.Dataset.from_tensor_slices(id_tokens)

#dataset is a list of all windows
dataset = dataset.window(sequence_length, shift=1, drop_remainder=True) 
"""
EXAMPLE: 
sequence_length = 5
id_tokens = [0,1,2,3,4,5,6,7]
dataset = [ [0,1,2,3,4], [1,2,3,4,5], [2,3,4,5,6], [3,4,5,6,7] ]
"""

"""
#get input(center word)-target(context word) pairs
for window in dataset:
  for token in window:
    #get int token

    print(token.numpy())
"""

#THIS DOES NOT WORK YET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#create input(word)-target(context word) pairs
dataset = dataset.flat_map(lambda window: window.batch(sequence_length))
dataset = dataset.map(lambda window: (window[:], window[2]))

for X, y in dataset:
    print("Input:", X.numpy(), "Target:", y.numpy())
"""
batch_windows = lambda x: x.batch(sequence_length).get_single_element()
#generate input-target pairs
split_sequence = lambda x: (x[:-1], x[-1])
dataset = dataset.map(batch_windows).map(split_sequence)


#shuffle,batch,prefetch
dataset = dataset.shuffle(10000).batch(64).prefetch(tf.data.AUTOTUNE)

#check first dataset element (must be an int list of size = sequence_length)
for seq in dataset.take(1):
  tf.print(seq)
"""

Input: [0 1 2 3 4] Target: 2
Input: [1 2 3 4 5] Target: 3
Input: [2 3 4 5 6] Target: 4
Input: [3 4 5 6 7] Target: 5


'\nbatch_windows = lambda x: x.batch(sequence_length).get_single_element()\n#generate input-target pairs\nsplit_sequence = lambda x: (x[:-1], x[-1])\ndataset = dataset.map(batch_windows).map(split_sequence)\n\n\n#shuffle,batch,prefetch\ndataset = dataset.shuffle(10000).batch(64).prefetch(tf.data.AUTOTUNE)\n\n#check first dataset element (must be an int list of size = sequence_length)\nfor seq in dataset.take(1):\n  tf.print(seq)\n'

# SkipGram Model

In [None]:
class SkipGram(tf.keras.layers.Layer):
  """SkipGram Model to create word embeddings"""

  def __init__(self,embedding_size,vocabulary_size):
    """Constructor sets embedding and vocab size"""

    super(SkipGram,self).__init__()
    self.embedding_size = embedding_size
    self.vocabulary_size = vocabulary_size #number of words
  

  def build(self,input_shape):
    """Initializes the embedding and score matrices"""

    self.embedding_matrix = self.add_weight(shape=(self.vocabulary_size,self.embedding_size))
    self.score_matrix = self.add_weight(shape=(self.vocabulary_size,self.embedding_size))
    self.score_bias = self.add_weight(shape=self.vocabulary_size, initializer="zeros")


  def call(self, inputs, labels):
    """Our forward step in which we get the embeddings (using embedding_lookup)
    and directly calculate + return the loss (using nce_loss)"""

    embeddings = tf.nn.embedding_lookup(self.embedding_matrix, inputs)
    labels = tf.expand_dims(labels, axis=1)

    loss = tf.nn.nce_loss(
        weights = self.score_matrix, 
        biases = self.score_bias,
        labels = labels,
        inputs = embeddings,
        num_sampled = 256, #we chose 256 negative sampled words
        num_classes = self.vocabulary_size
        )
    
    #take the mean over all batches
    loss = tf.reduce_mean(loss)
    
    return loss


# Nearest Neighbor of a token
INCOMPLETE CODE: Calculate cosine similarity to get nearest neighbor

In [None]:
def get_nearest_neighbor(model,vocabulary,tokens_of_interest):
  """Returns the nearest neighbor that the model characterized
  (using cosine similarity + for each token of interest)
  """
  
  for token in tokens_of_interest:

    #get id of token
    id = vocabulary[token]

    #get id of nearest neighbor by choosing the toke with highest cosine similarity (smallest angle between vectors)
    nearest_neighbor_id = 

    #get token from id
    key_list = list(vocabulary.keys())
    val_list = list(vocabulary.values())
    position = val_list.index(nearest_neighbor_id)
    nearest_token = key_list[position]

    #print token with its closest neighbor
    print(f"Closest to {token} is {nearest_token}")


# Training

In [None]:
def train_step(model, data, optimizer):
  """Training step that returns the current loss/error of model"""
  input,target = data

  with tf.GradientTape() as tape:
    loss = model(input,target)
    gradients = tape.gradient(loss, model.trainable_variables)

  optimizer.apply_gradients(zip(gradients,model.trainable_variables))

  return loss

In [None]:
tokens_of_interest = ["holy", "father", "wine", "poison", "love", "strong", "day"]
learning_rate = tf.constant(0.01, dtype=tf.float32)
optimizer = tf.optimizers.Adam(learning_rate) 
num_epochs = 10
embedding_size = 64
vocabulary_size = len(vocabulary)

#create model
model = SkipGram(embedding_size,vocabulary_size)

#train model for num_epochs 
for epoch in range(num_epochs):
  #conduct training step
  loss = train_step(model,data,optimizer) #DATA IS THE PREPROCESSED INPUT-TARGET PAIRS !!!!!!
  #print current loss
  print(f"Epoch {epoch} has a loss of {loss}")
  #print current nearest neighbor assumptions
  get_nearest_neighbor(model,vocabulary,tokens_of_interest)
