# word2vec

Skip-gram and negative sampling

## Setup

In [4]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

from collections import defaultdict

In [5]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [6]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

### Generate training data

In [18]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    window = defaultdict(set)
    for target_word, context_word in positive_skip_grams:
      window[target_word].add(context_word)

    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Discard this negative sample if it intersects with the positive context.
      if window[target_word].intersection(negative_sampling_candidates.numpy()):
        continue

      # Build context and label vectors (for one target word).
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

## Prepare training data for word2vec

### Download text corpus


In [45]:
%mkdir /content/test
%cd /content/test
!wget https://github.com/GermanT5/wikipedia2corpus/releases/download/v1.0/enwiki-20220201-clean-part-00
!wget https://github.com/GermanT5/wikipedia2corpus/releases/download/v1.0/enwiki-20220201-clean-part-01
!wget https://github.com/GermanT5/wikipedia2corpus/releases/download/v1.0/enwiki-20220201-clean-part-02
!wget https://github.com/GermanT5/wikipedia2corpus/releases/download/v1.0/enwiki-20220201-clean-part-03
!wget https://github.com/GermanT5/wikipedia2corpus/releases/download/v1.0/enwiki-20220201-clean-part-04
!wget https://github.com/GermanT5/wikipedia2corpus/releases/download/v1.0/enwiki-20220201-clean-part-05
!cat enwiki-20220201-clean-part-* > enwiki-20220201-clean.zip
!unzip -t enwiki-20220201-clean.zip

/root/test
--2024-03-09 18:17:56--  https://github.com/GermanT5/wikipedia2corpus/releases/download/v1.0/enwiki-20220201-clean-part-00
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/461587897/8bb084b8-efe5-457f-a26c-fc6877f9679b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240309%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240309T181756Z&X-Amz-Expires=300&X-Amz-Signature=041d48bf5c6af5a62379457952ccde26cda46df83ae88ea70a1ca119fbd85957&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=461587897&response-content-disposition=attachment%3B%20filename%3Denwiki-20220201-clean-part-00&response-content-type=application%2Foctet-stream [following]
--2024-03-09 18:17:56--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/461587897

In [51]:
!mv /content/enwiki-20220201-clean.zip /content/test
!unzip enwiki-20220201-clean.zip

/content/test
Archive:  enwiki-20220201-clean.zip
  inflating: enwiki-20220201-clean.txt  


In [31]:
!head -1000000 /content/test/enwiki-20220201-clean.txt > /content/prune2.txt
!ls -lah /content/prune2.txt
path_to_file = "/content/prune2.txt"
# with open(path_to_file) as f:
#   lines = f.read().splitlines()
# for line in lines[:20]:
#   print(line[:50])

In [32]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

### Vectorize sentences from the corpus

In [33]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

Call `TextVectorization.adapt` on the text dataset to create vocabulary.


In [34]:
vectorize_layer.adapt(text_ds.batch(1024))

In [35]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'of', 'and', 'in', 'to', 'a', 'is', 'as', 'was', 'for', 'that', 'by', 'with', 'on', 'from', 'are', 'it', 'his']


The `vectorize_layer` can now be used to generate vectors for each element in the `text_ds` (a `tf.data.Dataset`). Apply `Dataset.batch`, `Dataset.prefetch`, `Dataset.map`, and `Dataset.unbatch`.

In [36]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

### Obtain sequences from the dataset

In [37]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

990902


Inspect a few examples from `sequences`:

In [38]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[   1    1    1  517  331    1  180  695  380 3489] => ['[UNK]', '[UNK]', '[UNK]', '15', 'october', '[UNK]', '–', '25', 'august', '1900']
[  25  159   19  629    9    7  825    1  113 3605] => ['he', 'began', 'his', 'career', 'as', 'a', 'classical', '[UNK]', 'before', 'turning']
[  25   95    2    1  584  922    6 1210    2    1] => ['he', 'became', 'the', '[UNK]', 'person', 'ever', 'to', 'hold', 'the', '[UNK]']
[   1 3826    5    1  150    6  534  653   12    1] => ['[UNK]', 'resigned', 'in', '[UNK]', 'due', 'to', 'health', 'problems', 'that', '[UNK]']
[   5    1   22  260 2815   25 1680    7 2806    4] => ['in', '[UNK]', 'at', 'age', '45', 'he', 'suffered', 'a', 'collapse', 'and']


### Generate training examples from sequences

In [39]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")


100%|██████████| 990902/990902 [21:59<00:00, 750.83it/s]




targets.shape: (2657917,)
contexts.shape: (2657917, 5)
labels.shape: (2657917, 5)


### Configure the dataset for performance

In [40]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


Apply `Dataset.cache` and `Dataset.prefetch` to improve performance:

In [41]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


## Model and training

In [42]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

### Define loss function and compile model


In [43]:
embedding_dim = 300
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [44]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

Train the model on the `dataset` for some number of epochs:

In [45]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7e6f9d9ed0c0>

## Embedding lookup and analysis

In [46]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [47]:
m = 1
what = 0
for i in range(len(vocab)):
  val = np.dot(weights[i], weights[i])
  if what < val:
    what = val
  val = round(val * 100, 2)
  if m < val:
    m = val
print(what)
print(m)
print(len(vocab))
print(len(weights[1]))

71.28088
7128.09
4096
300


In [48]:
import socket
import time
import math

results = []

class Netcat:
    """ Python 'netcat like' module """

    def __init__(self, ip, port):
        self.buff = ""
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.socket.connect((ip, port))

    def read(self, length=1024):
        """ Read 1024 bytes off the socket """
        return self.socket.recv(length)

    def read_until(self, data):
        """ Read data into the buffer until we have data """
        while data not in self.buff:
            self.buff += self.socket.recv(1024).decode("ascii")

        pos = self.buff.find(data)
        rval = self.buff[:pos + len(data)]
        self.buff = self.buff[pos + len(data):]

        return rval

    def write(self, data):
        self.socket.send(data)

    def close(self):
        self.socket.close()


def crafted(word="you"):
    return word + '\n'

In [49]:
for words_index in range(len(vocab)):
  weights[words_index]

In [None]:
class Finder:
  def __init__(self, black_list = []):
    # word_index, np.dot result
    self.result = []
    self.black_list = black_list

  def weirdDistance(wi, words_index):
    valInThisContext = np.dot(weights[wi], weights[words_index])
    return abs(round(valInThisContext * 100, 2))

  def get_words(index):
    return vocab[index]

  def guess_next(self):
    target = np.array([i for _, i in self.result])
    print(target)
    best = 1000000
    next_word_index = None
    cal = []
    next_word = "person"
    for words_index in range(len(vocab)):
      cal = []
      for wi, res in self.result:
        cal.append(Finder.weirdDistance(wi, words_index))

      # Euclidean distance
      sum_sq = np.sum(np.square(target - cal))
      curr = np.sqrt(sum_sq)

      if best > curr and words_index not in ([c for c, _ in self.result] + self.black_list):
        best = curr
        next_word_index = words_index
    print(Finder.get_words(next_word_index), cal, curr)
    return next_word_index

  def add_result(self, word, order):
    self.result += [(word, order)]

blacklist = []
while True:
    s = Netcat('dyn.ctf.pearlctf.in', 30021)
    buf = s.read()
    next_word_index = None
    next_word = None
    finder = Finder(blacklist)
    for i in range(5):
        if next_word_index is not None:
          next_word_index = finder.guess_next()
          next_word = Finder.get_words(next_word_index)
        else:
          next_word_index = 123
          next_word = Finder.get_words(123)
        data = bytes(crafted(next_word), 'ascii')
        s.write(data)
        time.sleep(1)
        buf = s.read()
        try:
          order = float(buf.decode("ascii").split(":")[1].split('\n')[0])
        except Exception as _:
          print("blacklist this word", next_word)
          blacklist += [next_word_index]
          break
        finder.add_result(next_word_index, order)
        print(order, '\n', buf.decode("ascii"), next_word)
    s.close()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
9.14 
 Similarity to the target word: 9.14

Attempt 5: Enter your guess:  violent
[ 4.22  8.04 15.89  9.14]
extensive [329.46, 837.15, 660.23, 84.4] 1101.8359918336305
7.81 
 Similarity to the target word: 7.81
Sorry, you did not guess the correct word. 
 extensive
14.2 
 Similarity to the target word: 14.2

Attempt 2: Enter your guess:  found
[14.2]
follows [329.46] 315.26
1.59 
 Similarity to the target word: 1.59

Attempt 3: Enter your guess:  follows
[14.2   1.59]
july [329.46, 184.35] 364.40373927829006
0.2 
 Similarity to the target word: 0.2

Attempt 4: Enter your guess:  july
[14.2   1.59  0.2 ]
ordinary [329.46, 184.35, 196.29] 413.81321063977646
5.62 
 Similarity to the target word: 5.62

Attempt 5: Enter your guess:  ordinary
[14.2   1.59  0.2   5.62]
dutch [329.46, 184.35, 196.29, 387.52] 563.1065470228525
6.1 
 Similarity to the target word: 6.1
Sorry, you did not guess the correct word. 
 dutch
6.61 
 Simila