# Skipgram with Negative Sampling

Mike Holcomb

In [26]:
import random
from tqdm import tqdm
from tensorflow.keras.datasets import reuters
import numpy as np
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)

**Constants**

In [20]:
# --- Corpus / sampling hyperparameters ---
vocab_size = 10000  # Number of distinct word ids kept when loading the corpus
c = 8               # Context window size: up to 8 words on either side of the target (+8 to -8)
K = 5               # Number of negative samples per positive example

# --- Model / training hyperparameters ---
embedding_size = 100    # Dimension of each word embedding vector
batch_size = 16384      # Examples per training batch
train_steps = 5000      # Number of optimizer steps to run
learning_rate = 0.0001  # Step size handed to the optimizer

## 1. Get Corpus

In [2]:
# Get corpus: load the Reuters data set restricted to the vocab_size most
# frequent words. The second element of each split is discarded; only the
# sequences of word indices are kept.
(train, _), (test, _) = reuters.load_data(
    path="reuters.npz",
    num_words=vocab_size,
    test_split=0.2,
    seed=1337,
)

In [3]:
# A dictionary mapping words to an integer index, as supplied by the Keras
# Reuters data set helper (the reserved-symbol offset is applied separately).
word_index = reuters.get_word_index()

In [4]:
# The first indices are reserved, so every real word is shifted up by 3
word_index = {word: rank + 3 for word, rank in word_index.items()}
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,  # unknown
    "<UNUSED>": 3,
})

# Inverse lookup: integer index -> word
reverse_word_index = {index: word for word, index in word_index.items()}

In [5]:
def decode(text):
    """
    Turn a sequence of word indices back into a space-separated string.

    Indices that are missing from reverse_word_index are rendered as '?'.
    :param text: iterable of integer word indices
    :return: the decoded string
    """
    words = (reverse_word_index.get(index, '?') for index in text)
    return ' '.join(words)

In [7]:
# Merge the train and test splits into one corpus; one entry per article
corpus = np.concatenate([train, test])
num_articles = len(corpus)

## 2. Gather Unigram Statistics

**Per section 2.2 Negative sampling**
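1. Generate a frequency table of the words in the corpus
2. Smooth the unigram distribution by raising each count to the power alpha, which damps the highest frequencies
3. Draw k noise words by sampling from the alpha-smoothed distribution
4. Add the k noise words as negative samples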

In [8]:
# Count how often each word index occurs across all articles
freq = np.zeros((vocab_size,))
for article in tqdm(corpus):
    for word in article:
        freq[word] += 1

# Normalise the counts into the raw (unsmoothed) unigram distribution
raw_prob = freq / freq.sum()

100%|██████████| 11228/11228 [00:00<00:00, 17148.45it/s]


In [9]:
# Alpha smooth the unigram distribution: raise counts to the power alpha,
# then renormalise so the result sums to 1
alpha = 0.75
alpha_counts = freq ** alpha
alpha_counts[:4] = 0  # Don't include symbols (indices 0-3) as noise words
alpha_prob = alpha_counts / alpha_counts.sum()

In [10]:
def get_noise_words(k):
    """
    Draw k noise-word indices from the alpha-smoothed unigram distribution
    :param k: number of noise words to draw
    :return: 1-D array of k word indices
    """
    # Each of the k rows is a single multinomial trial, i.e. a one-hot
    # vector; argmax recovers the index that was drawn.
    one_hot_draws = np.random.multinomial(1, alpha_prob, k)
    return np.argmax(one_hot_draws, axis=1)

**Per section 2.3 Subsampling of frequent words**

In [12]:
# Subsampling threshold. Goal-sought so that the smallest discard probability
# is ~0; the paper originally uses 1e-5.
t = 4.2e-6

# Words that never occur get a stand-in probability of 1 to avoid a division
# by zero.
safe_prob = np.where(raw_prob == 0, 1., raw_prob)
discard_prob = 1. - np.sqrt(t * np.reciprocal(safe_prob))

## 3. Generate samples

In [13]:
# Parallel accumulators filled while generating samples:
#   ct_pairs -- [target, context] word-index pairs
#   targets  -- label for the pair at the same position (positive := 1, negative := 0)
ct_pairs, targets = [], []

In [14]:
def add_example(t, c):
    """
    Add a positive example and K negative examples
    :param t: target word
    :param c: context word
    :return: None; the examples are appended to the module-level ct_pairs
             and targets lists. Nothing is added when t == c.
    """
    if t == c:
        return
    ct_pairs.append([t, c])
    targets.append(1)

    for noise in get_noise_words(K):
        # A noise word that collides with the target is redrawn from the
        # noise distribution. Bumping it to noise + 1 instead could step past
        # the last vocabulary index and would distort the sampling
        # distribution.
        # NOTE: assumes the noise distribution has mass on more than one word.
        while noise == t:
            noise = get_noise_words(1)[0]
        ct_pairs.append([t, noise])
        targets.append(0)

In [15]:
# Walk every article and emit training examples for each pair of words that
# lie within c positions of each other (each pair is visited once, looking
# forward, and may be added in both directions).
for i in tqdm(range(num_articles)): # num_articles
    article = corpus[i]
    n_words = len(article)
    for j in range(n_words - 1):
        w = article[j]
        if w < 4:
            continue # Skip symbols

        # Clip the look-ahead window at the end of the article so that pairs
        # among the final words are kept rather than dropped.
        window = min(c, n_words - 1 - j)
        for k in range(1, window + 1):
            v = article[j + k]
            if v < 4: # Skip symbols
                continue

            # Subsampling of frequent words: each direction of the pair is
            # kept only if its target word survives a random discard test.
            if random.random() > discard_prob[w]:
                add_example(w, v)

            if random.random() > discard_prob[v]:
                add_example(v, w)

100%|██████████| 11228/11228 [21:48<00:00,  8.70it/s]


## 4. Build Model

In [16]:
def sgns_fn(features, labels, mode, params):
    """
    Model function implementing Skip Gram with Negative Sampling

    :param features: input features consumed by the two embedding feature columns
    :param labels: per-example labels used as the cross-entropy targets
                   (not used in PREDICT mode)
    :param mode: one of the tf.estimator.ModeKeys values
    :param params: dict holding 'feature_columns' (target column first,
                   context column second) and, optionally, 'optimizer'
                   (an optimizer class, default tf.train.AdamOptimizer) and
                   'learning_rate' (default 1e-3)
    :return: a tf.estimator.EstimatorSpec for the requested mode
    """

    # target word vector
    u = tf.feature_column.input_layer(features, [params['feature_columns'][0]])

    # context word vector
    v = tf.feature_column.input_layer(features, [params['feature_columns'][1]])

    # dot product similarity
    z = tf.reduce_sum(tf.multiply(u, v), axis=1)

    # compute likelihood of being in the same context
    predictions = tf.sigmoid(z)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # EstimatorSpec lives under tf.estimator and takes `predictions=`
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"in_context": predictions}
        )

    # compute the cross entropy between the vectors = negative log likelihood
    total_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=z)
    average_loss = tf.reduce_mean(total_loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # params supplies the optimizer class; instantiate it with the learning rate
        optimizer = params.get("optimizer", tf.train.AdamOptimizer)
        optimizer = optimizer(params.get("learning_rate", 1e-3))
        train_op = optimizer.minimize(
            loss=average_loss, global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(
            mode=mode, loss=average_loss, train_op=train_op)

    assert mode == tf.estimator.ModeKeys.EVAL

    # Debug output: prints the label and prediction tensor objects
    print(labels)
    print(predictions)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        # Mean cross-entropy over the batch
        loss=average_loss)

## 5. Build data pipeline

In [33]:
# Convert the collected samples to arrays: column 0 of each pair is the
# target word, column 1 the context (or noise) word
pairs = np.array(ct_pairs)
words, contexts = pairs[:, 0], pairs[:, 1]
labels = np.asarray(targets, dtype=np.float32)

In [17]:
def make_dataset(batch_sz, w,c , y=None, shuffle=False, shuffle_buffer_size=1000):
    """
    Create an input_fn that slices in-memory word/context arrays into batches

    :param batch_sz: number of examples per batch; a final partial batch is dropped
    :param w: target word indices, exposed under the feature key 'words'
    :param c: context word indices, exposed under the feature key 'contexts'
    :param y: optional labels; when None the dataset yields features only
    :param shuffle: when True, shuffle the examples and repeat the dataset indefinitely
    :param shuffle_buffer_size: buffer size used by the shuffle
    :return: a zero-argument input_fn returning the next-batch tensors
    """
    features = {'words': w, 'contexts': c}

    def input_fn():
        # Features-only data is passed as the bare dict: wrapping it in a
        # 1-tuple makes the iterator yield 1-tuples, which the Estimator
        # rejects (it expects features alone or a (features, labels) pair).
        tensors = features if y is None else (features, y)
        dataset = tf.data.Dataset.from_tensor_slices(tensors)
        if shuffle:
            dataset = dataset.shuffle(shuffle_buffer_size)
        dataset = dataset.batch(batch_sz, drop_remainder=True)
        if shuffle:
            dataset = dataset.repeat()
        return dataset.make_one_shot_iterator().get_next()

    return input_fn

In [35]:
# Training input function over the generated (word, context, label) samples;
# shuffle=True also makes the dataset repeat indefinitely.
train_input_fn = make_dataset(batch_size, words, contexts, labels,shuffle=True)

In [36]:
# The inputs are already integer word indices, so identity categorical
# columns are used for both the target ("words") and context ("contexts") keys.
words_feat = tf.feature_column.categorical_column_with_identity(
    key="words", num_buckets=vocab_size)
context_feat = tf.feature_column.categorical_column_with_identity(
    key="contexts", num_buckets=vocab_size)

# One trainable embedding column per input
words_embed = tf.feature_column.embedding_column(
    words_feat, dimension=embedding_size, trainable=True)
context_embed = tf.feature_column.embedding_column(
    context_feat, dimension=embedding_size, trainable=True)

# Order matters: the model function reads index 0 as the target column and
# index 1 as the context column.
feature_columns = [words_embed, context_embed]

## 6. Run the Model

In [37]:
# Hyperparameters handed to the model function through `params`
p = {
    "feature_columns": feature_columns,
    "learning_rate": learning_rate,
    "optimizer": tf.train.AdamOptimizer,
    "embedding_size": embedding_size,
    "vocab_size": vocab_size,
}

# Checkpoint directory and a fixed TensorFlow random seed for the run
config = tf.estimator.RunConfig(model_dir='./estimator_model', tf_random_seed=42)

In [38]:
# Wrap the SGNS model function in an Estimator using the params and run config
model = tf.estimator.Estimator(model_fn=sgns_fn, params=p, config=config)

INFO:tensorflow:Using config: {'_model_dir': './estimator_model', '_tf_random_seed': 42, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xab5873ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [39]:
# Train for train_steps optimizer steps on batches from train_input_fn
model.train(input_fn=train_input_fn, steps=train_steps)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./estimator_model/model.ckpt.
INFO:tensorflow:loss = 0.69382256, step = 1
INFO:tensorflow:global_step/sec: 1.85376
INFO:tensorflow:loss = 0.6938143, step = 101 (53.938 sec)
INFO:tensorflow:global_step/sec: 10.1581
INFO:tensorflow:loss = 0.69337523, step = 201 (9.840 sec)
INFO:tensorflow:global_step/sec: 10.4696
INFO:tensorflow:loss = 0.69380945, step = 301 (9.551 sec)
INFO:tensorflow:global_step/sec: 10.5643
INFO:tensorflow:loss = 0.6937982, step = 401 (9.466 sec)
INFO:tensorflow:global_step/sec: 10.8998
INFO:tensorflow:loss = 0.69380057, step = 501 (9.175 sec)
INFO:tensorflow:global_step/sec: 9.31897
INFO:tensorflow:loss = 0.6932485, step = 601 (10.730 sec)
INFO:tensorflow:global_step/sec: 10.7126
INFO:tens

<tensorflow.python.estimator.estimator.Estimator at 0xab58737b8>