In [None]:
# imports
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tqdm
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
SEED = 42

# Apply the seed to the global NumPy and TensorFlow generators; defining SEED
# alone has no effect on any random operation.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Training hyperparameters
batch_size = 100               # how many sets of words to train on at once
embedding_size = 100           # size of the embedding learned for each word
window_size = 2                # how many words to consider to the left and right
num_sampled = batch_size // 2  # number of negative examples to sample

### 1. Load Data

In [None]:
# Mount Google Drive (Colab only) and make it the working directory
import os
from google.colab import drive
drive.mount('/content/drive')
# Use the absolute path: a relative chdir only works on the first run, because
# afterwards the working directory is already inside the Drive folder.
os.chdir('/content/drive/MyDrive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the whole corpus in one step. The context manager closes the file
# handle, and re-running the cell always re-reads from the start of the file.
with open('bible.txt', 'r', encoding='utf-8') as bible:
  text = bible.read()

In [None]:
# Size of the raw corpus, in characters
len(text)

4332496

### 2. Word Embedding

#### 2.1 Preprocessing

In [None]:
# Declare stop words
# The bare name `stopwords` is not imported anywhere in this notebook, so the
# corpus is resolved through the already-imported `nltk` package instead.
stops = set(nltk.corpus.stopwords.words('english'))

AttributeError: ignored

In [None]:
def normalizeText(text, stop_words=None):
  """Lower-case a raw string, strip punctuation and digits, and tokenize it.

  Args:
    text: raw input string.
    stop_words: optional collection of tokens to drop from the result.
      When None (the default) or empty, no tokens are filtered.

  Returns:
    List of lower-case tokens in their original order.
  """
  # Lower Case
  text = text.lower()
  # remove special characters (anything that is neither a word character nor whitespace)
  text = re.sub(r"[^\w\s]", "", text)
  # remove numbers
  text = re.sub(r"[0-9]", "", text)

  # tokenize text
  # split() without arguments breaks on any run of whitespace, so line breaks
  # and repeated spaces need no special handling. Blank lines must NOT be
  # deleted beforehand: that would glue the last word of one paragraph to the
  # first word of the next.
  tokens = text.split()

  # remove stopwords (only when a stop-word collection is supplied)
  if stop_words:
    tokens = [w for w in tokens if w not in stop_words]

  return tokens

In [None]:
def buildData(text, vocab_size):
  """Encode a token list as integer ids over a frequency-limited vocabulary.

  Args:
    text: list of tokens.
    vocab_size: total vocabulary size, including the 'UNK' entry at id 0.

  Returns:
    A tuple (data, count, word2num, num2word):
      data: list of ids, one per token in `text`; tokens outside the
        vocabulary are encoded as 0.
      count: [['UNK', n_unknown]] followed by (word, frequency) tuples for
        the `vocab_size - 1` most frequent words.
      word2num: dict mapping word -> id.
      num2word: dict mapping id -> word.
  """
  # Vocabulary: the unknown marker first, then the most frequent words
  frequencies = collections.Counter(text)
  count = [['UNK', -1]]
  count += frequencies.most_common(vocab_size - 1)

  # Each word receives the dictionary size at the time it is inserted as its id
  word2num = {}
  for token, _freq in count:
    word2num[token] = len(word2num)

  # Encode the text; out-of-vocabulary tokens fall back to id 0 ('UNK')
  data = [word2num.get(token, 0) for token in text]

  # Record how many tokens were out of vocabulary in the 'UNK' entry
  count[0][1] = sum(token not in word2num for token in text)

  # Reverse lookup from id to word
  num2word = {idx: token for token, idx in word2num.items()}
  return data, count, word2num, num2word



In [None]:
def createInputTargetPairs(data, window_size, vocab_size):
  """Build skip-gram word-index pairs from an encoded token sequence.

  Args:
    data: sequence of integer word ids.
    window_size: passed to Keras `skipgrams` as `window_size`.
    vocab_size: passed to Keras `skipgrams` as `vocabulary_size`.

  Returns:
    The list of couples produced by `skipgrams`. The call requests no
    negative samples and no shuffling, and the accompanying labels are
    discarded.
  """
  couples, _labels = tf.keras.preprocessing.sequence.skipgrams(
      sequence=data,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0,
      shuffle=False)
  return couples
  

In [None]:
def preprocessing(raw_text, vocab_size, window_size, batch_size=50,
                  shuffle_buffer_size=1000):
  """Turn raw text into a batched tf.data.Dataset of skip-gram pairs.

  Args:
    raw_text: raw corpus string.
    vocab_size: vocabulary size handed to `buildData` and
      `createInputTargetPairs`.
    window_size: context window handed to `createInputTargetPairs`.
    batch_size: number of pairs per batch. Defaults to 50, the value that
      was previously hard-coded.
    shuffle_buffer_size: buffer size for `Dataset.shuffle`. Defaults to
      1000, the value that was previously hard-coded.

  Returns:
    A cached, shuffled, batched and prefetched tf.data.Dataset built from
    the skip-gram pairs.
  """
  tokens = normalizeText(raw_text)
  # Only the encoded sequence is needed here; the vocabulary tables are unused
  indices, _count, _word2num, _num2word = buildData(tokens, vocab_size)
  pairs = createInputTargetPairs(indices, window_size, vocab_size)
  dataset = tf.data.Dataset.from_tensor_slices(pairs)
  # cache this progress in memory, as there is no need to redo it; it is deterministic after all
  dataset = dataset.cache()
  # shuffle, batch, prefetch
  dataset = dataset.shuffle(shuffle_buffer_size)
  dataset = dataset.batch(batch_size)
  dataset = dataset.prefetch(100)
  # return preprocessed dataset
  return dataset




In [None]:
# Small smoke-test dataset: 10-word vocabulary, context window of 2
dataset = preprocessing(raw_text=text, vocab_size=10, window_size=2)

In [None]:
# Peek at a single batch from the dataset to check its shape and contents
for pair in dataset.take(1):
  print(pair)

tf.Tensor(
[[3 1]
 [2 3]
 [1 2]
 [1 3]
 [2 2]
 [1 2]
 [1 6]
 [3 4]
 [4 2]
 [1 3]
 [6 1]
 [1 3]
 [1 5]
 [3 1]
 [2 1]
 [7 2]
 [2 1]
 [1 3]
 [1 3]
 [2 1]
 [1 3]
 [2 8]
 [5 1]
 [1 3]
 [9 1]
 [7 5]
 [3 1]
 [1 2]
 [2 1]
 [1 2]
 [2 9]
 [2 1]
 [2 2]
 [9 7]
 [3 1]
 [2 1]
 [2 1]
 [1 2]
 [7 2]
 [2 2]
 [1 2]
 [2 4]
 [2 8]
 [1 3]
 [2 4]
 [2 1]
 [3 1]
 [1 1]
 [2 2]
 [2 7]], shape=(50, 2), dtype=int32)


#### 2.2 Model

In [None]:
class SkipGram(tf.keras.layers.Layer):
  """Skip-gram word-embedding layer trained with NCE loss.

  Args:
    vocab_size: number of word ids (rows of the embedding and score matrices).
    embedding_size: length of each embedding vector.
    num_sampled: number of negative classes the NCE loss samples per batch.
      Defaults to 1, the value that was previously hard-coded.
  """

  def __init__(self, vocab_size, embedding_size, num_sampled=1):
    super().__init__()
    self.vocabulary_size = vocab_size
    self.embedding_size = embedding_size
    self.num_sampled = num_sampled

  def build(self, input_shape=None):
    """Create the embedding matrix and the score weights and biases.

    `input_shape` is unused. It is accepted because Keras passes it when it
    builds a layer on first call; the default keeps `layer.build()` working.
    """
    self.embedding = self.add_weight(
                            shape=[self.vocabulary_size, self.embedding_size],
                            initializer = 'random_uniform', name = 'embedding')
    self.score_w = self.add_weight(
                            shape = [self.vocabulary_size, self.embedding_size],
                            initializer = 'truncated_normal', name= 'score_weights')
    self.score_b = self.add_weight(shape =[self.vocabulary_size],
                                   initializer = 'zeros', name='score_bias')
    # Mark the layer as built so Keras does not try to build it a second time
    super().build(input_shape)

  def call(self, pair, mode = "train"):
    """Run the layer on a batch of (input id, context id) rows.

    Args:
      pair: integer tensor with two columns; column 0 holds the input word
        ids and column 1 the context word ids. Any number of rows is accepted.
      mode: "train" to compute the NCE loss, "eval" to predict context words.

    Returns:
      In "train" mode, the scalar mean NCE loss over the batch.
      In "eval" mode, one word id per row: the highest-scoring word for
      that row's input word.

    Raises:
      ValueError: if `mode` is neither "train" nor "eval".
    """
    # Split by column. Reshaping (batch, 2) to (2, batch) would not transpose
    # the data; it would mix input and context ids together.
    input_ids = pair[:, 0]
    context_ids = pair[:, 1]
    embed = tf.nn.embedding_lookup(self.embedding, input_ids)
    if mode == "train":
      # nce_loss expects int64 labels of shape (batch, num_true)
      labels = tf.reshape(tf.cast(context_ids, tf.int64), (-1, 1))
      loss = tf.reduce_mean(tf.nn.nce_loss(weights = self.score_w,
                            biases = self.score_b,
                            labels = labels,
                            inputs = embed,
                            num_sampled = self.num_sampled,
                            num_classes = self.vocabulary_size)
                            )

      # Log the loss only when a default summary step has been configured,
      # and return the loss itself so the caller can optimise it.
      step = tf.summary.experimental.get_step()
      if step is not None:
        tf.summary.scalar("loss_summary", loss, step=step)
      return loss
    if mode == "eval":
      # (batch, embedding) x (vocab, embedding)^T -> (batch, vocab) scores
      logits = tf.matmul(embed, self.score_w, transpose_b=True) + self.score_b
      # Best-scoring word for each row, not for each column
      return tf.argmax(logits, axis=1)
    raise ValueError(
        "mode must be 'train' or 'eval', got {!r}".format(mode))




In [None]:
# Tiny layer matching the 10-word smoke-test vocabulary
layer = SkipGram(vocab_size=10, embedding_size=10)

In [None]:
# Create the layer's weights, then push one batch through it in eval mode
layer.build()
for pair in dataset.take(1):
  layer.call(pair, mode='eval')
