In [1]:
import io
import numpy as np
import re
import string
import tensorflow as tf
import tqdm
import os

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## Define dataset parameters

In [2]:
# Tunable parameters for the corpus and the skip-gram example generation.
vocab_size = 50000            # maximum number of tokens kept by the vectorization layer
sequence_length = 15          # number of words in a sequence
window_size = 3               # size of the skip-gram window
num_ns = 4                    # negative samples drawn per positive pair
dataset_name = "shakespeare"  # or "wikipedia"

## Create Dataset

In [3]:
# Seed handed to the training-data generator (negative sampling) below.
SEED = 42
# Passed to `prefetch()` when the vectorized dataset is built.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [4]:
# Generate training data
def generate_training_data_word2vec(sequences, window_size, num_ns, vocab_size, seed):
    """Generate skip-gram pairs with negative sampling for a list of sequences.

    Args:
        sequences: iterable of int-encoded sentences.
        window_size: window size forwarded to `skipgrams`.
        num_ns: number of negative samples drawn for each positive pair.
        vocab_size: vocabulary size; sizes the sampling table and is the upper
            bound (`range_max`) of the negative-sample candidate sampler.
        seed: seed forwarded to the negative-sample candidate sampler.

    Returns:
        A `(targets, contexts, labels)` tuple of three parallel lists with one
        entry per positive skip-gram pair:
          * targets:  the target word id,
          * contexts: int64 array of shape (num_ns + 1, 1) holding the positive
            context word followed by the `num_ns` negative samples,
          * labels:   int64 array of shape (num_ns + 1,), 1 followed by zeros.
    """
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # The label vector is identical for every example (one positive followed by
    # num_ns negatives), so the tensor is built once outside the loops.
    label = tf.constant([1] + [0] * num_ns, dtype="int64")

    # Iterate over all sequences (sentences) in dataset.
    for sequence in tqdm.tqdm(sequences):
        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples
        # with positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                # Use the caller-supplied seed; the original silently ignored
                # the `seed` argument and read the notebook-level SEED instead.
                seed=seed,
                name="negative_sampling")

            # Build context and label vectors (for one target word)
            negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

            context = tf.concat([context_class, negative_sampling_candidates], 0)

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context.numpy())
            labels.append(label.numpy())

    return targets, contexts, labels

In [5]:
# Choose between the shakespeare and wikipedia datasets
def choose_dataset(dataset_name):
    """Load the selected corpus and return its text lines as a tensor.

    Args:
        dataset_name: "wikipedia" loads the first 1% of the train split of
            'wikipedia/20201201.en' through TensorFlow Datasets; any other
            value downloads the Shakespeare text file.

    Returns:
        The result of `tf.convert_to_tensor` on the list of corpus lines
        (bytes lines for wikipedia, str lines for shakespeare).
    """
    if dataset_name == "wikipedia":
        # `tfds` was referenced without ever being imported, so this branch
        # raised NameError on a fresh kernel. Imported here so the default
        # shakespeare path does not require tensorflow_datasets.
        import tensorflow_datasets as tfds

        wikipedia = tfds.load('wikipedia/20201201.en', split='train[:1%]', shuffle_files=True)
        assert isinstance(wikipedia, tf.data.Dataset)

        lines = []
        for article in tfds.as_numpy(wikipedia):
            lines.extend(article["text"].splitlines())

        # Drop empty lines.
        lines = [line for line in lines if line]
    else:
        path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
        # Explicit encoding so the result does not depend on the platform default.
        with open(path_to_file, encoding="utf-8") as f:
            lines = f.read().splitlines()

    return tf.convert_to_tensor(lines)

In [6]:
# Load the corpus selected by `dataset_name` as a tensor of text lines.
lines = choose_dataset(dataset_name)

In [7]:
# Vectorize sentences from the corpus
# Custom standardization used by the vectorization layer: lowercase, then strip punctuation.
def custom_standardization(input_data):
    """Return `input_data` lowercased with all `string.punctuation` characters removed."""
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    lowered = tf.strings.lower(input_data)
    stripped = tf.strings.regex_replace(lowered, punctuation_pattern, '')
    return stripped

# Text vectorization layer: normalizes, splits, and maps strings to integers.
# `output_sequence_length` pads all samples to the same length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [8]:
# One dataset element per corpus line; lines whose length is 0 are filtered out.
text_ds = tf.data.Dataset.from_tensor_slices(lines).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [9]:
# Fit the vectorization layer on the corpus, fed in batches of 1024 lines.
vectorize_layer.adapt(text_ds.batch(1024))

2024-03-06 18:43:13.135463: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [10]:
# Save the created vocabulary for reference.
# NOTE(review): presumably the list index is the token id — confirm against the
# TextVectorization documentation before using it for lookups.
inverse_vocab = vectorize_layer.get_vocabulary()

In [11]:
def vectorize_text(text):
    """Vectorize `text` with `vectorize_layer` and squeeze the result.

    A trailing axis is added to `text` before the layer is applied, and all
    size-1 dimensions are removed from the layer output.

    NOTE(review): no visible cell calls this helper; the pipeline below maps
    `vectorize_layer` directly.
    """
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return tf.squeeze(token_ids)

# Vectorize the data in text_ds: batch the lines, apply the layer per batch,
# then unbatch back to one vectorized sequence per element.
text_vector_ds = (
    text_ds
    .batch(1024)
    .prefetch(AUTOTUNE)
    .map(vectorize_layer)
    .unbatch()
)

In [12]:
# Materialize every vectorized sentence as a NumPy value in an in-memory list.
sequences = list(text_vector_ds.as_numpy_iterator())

In [13]:
# Build the parallel (targets, contexts, labels) lists of skip-gram training
# examples from the int-encoded sentences.
targets, contexts, labels = generate_training_data_word2vec(
    sequences=sequences, 
    window_size=window_size, 
    num_ns=num_ns, 
    vocab_size=vocab_size, 
    seed=SEED)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 32777/32777 [00:05<00:00, 6401.01it/s]


In [14]:
BATCH_SIZE = 128
BUFFER_SIZE = 10000

# The first 15% of the generated examples are held out for evaluation.
evaluate_length = int(len(targets) * 0.15)

# Evaluation split: shuffled, then batched with a batch size equal to the
# number of held-out examples.
evaluate_dataset = (
    tf.data.Dataset.from_tensor_slices(
        ((targets[:evaluate_length], contexts[:evaluate_length]), labels[:evaluate_length])
    )
    .shuffle(BUFFER_SIZE, reshuffle_each_iteration=True)
    .batch(evaluate_length, drop_remainder=True)
)

# Training split: the remaining examples, shuffled and cut into fixed-size
# batches (an incomplete last batch is dropped).
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        ((targets[evaluate_length:], contexts[evaluate_length:]), labels[evaluate_length:])
    )
    .shuffle(BUFFER_SIZE, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE, drop_remainder=True)
)


## Save training dataset

In [15]:
# Base directory for the output files (presumably the TensAIR repository root),
# read from the TENSAIR_PATH environment variable.
# NOTE(review): os.environ.get returns None when the variable is unset, in which
# case the path concatenations in the following cells raise a TypeError —
# confirm TENSAIR_PATH is exported before running.
tensair_path = os.environ.get("TENSAIR_PATH")

In [16]:
# Write one training example per line, space-separated:
#   <target> <context ids...> <labels...>
# Loop variables carry their own names so the notebook-level `targets`,
# `contexts` and `labels` lists are not overwritten by batch tensors (which
# made re-running the dataset-split cell produce wrong splits).
train_path = tensair_path + "/data/W2V/" + dataset_name + "_train.txt"
# `with` guarantees the file is closed even if iteration fails midway.
with open(train_path, "w") as train_file:
    for (batch_targets, batch_contexts), batch_labels in train_dataset:
        rows = zip(batch_targets.numpy(), batch_contexts.numpy(), batch_labels.numpy())
        for row_target, row_context, row_label in rows:
            fields = [str(row_target)]
            # Each context row has shape (num_ns + 1, 1); take the single column.
            # Joining every entry (instead of indexing 0..4) keeps the output
            # correct when num_ns is changed from 4.
            fields.extend(str(value) for value in row_context[:, 0])
            fields.extend(str(value) for value in row_label)
            train_file.write(" ".join(fields) + "\n")

## Save evaluation dataset (in the byte format accepted by TensAIR)

In [17]:
# Take the first batch of evaluate_dataset and keep its three components as
# plain Python lists (they stay empty if the dataset yields nothing).
target, contexts, labels = [], [], []
for (t, c), l in evaluate_dataset.take(1):
    target, contexts, labels = list(t.numpy()), list(c.numpy()), list(l.numpy())

In [18]:
size_of_int = 4 # int = 4 bytes in c++
mini_batch_size = 2048


def _write_int32(file, value):
    """Write `value` to `file` as a 4-byte little-endian signed integer."""
    file.write(int(value).to_bytes(size_of_int, byteorder='little', signed=True))


def _write_int32_tensor(file, values):
    """Write the size in bytes of `values`, then every element as an int32.

    `values` is flattened in row-major order first, so nested rows such as the
    (num_ns + 1, 1) context arrays are written element by element. This avoids
    calling int() on 1-element arrays (deprecated in recent NumPy) and removes
    the hard-coded row width of 5.
    """
    flat = np.asarray(values).ravel()
    _write_int32(file, flat.size * size_of_int)
    for value in flat.tolist():
        _write_int32(file, value)


# File layout: three int32 header fields, then for each of the three tensors
# (target, contexts, labels) its size in bytes followed by its int32 elements.
with open(tensair_path + "/data/W2V/"+dataset_name+"_eval.bytes", 'wb') as file:
    # NOTE(review): the original comment called this field "number of
    # minibatchs", but the value written is mini_batch_size — confirm against
    # the TensAIR reader which one it expects.
    _write_int32(file, mini_batch_size)
    _write_int32(file, len(target))  # number of examples
    _write_int32(file, 3)  # number of tensors (target, contexts, labels)
    _write_int32_tensor(file, target)
    _write_int32_tensor(file, contexts)
    _write_int32_tensor(file, labels)
