In [8]:
doc1words = ['cat', 'window', 'defenestrate', 'dog', 'flower', 'banana']
doc2words = ['cat', 'banana', 'banana', 'lamb', 'tree']

# one hot encoding
one_hot1 = []
one_hot2 = []
for word in doc1words:
    one_hot1.append([1 if word == w else 0 for w in doc1words])

for word in doc2words:
    one_hot2.append([1 if word == w else 0 for w in doc2words])

print("one hot 1 encoding: ", one_hot1)
print("one hot 2 encoding: ", one_hot2)

one hot 1 encoding:  [[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]]
one hot 2 encoding:  [[1, 0, 0, 0, 0], [0, 1, 1, 0, 0], [0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]]


In [10]:
# word2vec from scratch
# Vektör Matrislerinde:
#  Bütün kelimelerin vektörleri bulunur.
#  Her kelime vektörü bir sütunda yer alır.  Kelime vektörünün boyutu d dir
#  Sözlükte bulunan kelime sayısı m ise  Vektör Matrisi boyutu (d x m) dir

vector_matrix = []
for word in doc1words:
    vector_matrix.append([1 if word == w else 0 for w in doc1words])

for word in doc2words:
    vector_matrix.append([1 if word == w else 0 for w in doc2words])

vector_matrix

[[1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 0],
 [0, 1, 1, 0, 0],
 [0, 1, 1, 0, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1]]

In [11]:
#  Birer adet girdi, çıktı ve gizli katmandan oluşan bir yapay sinir ağıdır.
#  Kelime vektörlerini oluştururken pencere genişliği, embedding boyutu gibi hiper parametreleri kullanır.
#  Pencere genişliği hedef kelimenin sağında ve solunda kaç kelime olması gerektiğini belirtir.
#  Embedding boyutu ise her bir kelimenin kaç boyutlu vektör olarak tanımlanacağını belirtir.
#  Embedding boyutu aynı zamanda gizli katmandaki nöron sayısına karşılık gelir.
#  İki kelimenin birbirine olan benzerliğini veya iki cümlenin birbirine olan yakınlığını bulmak, özetlemede yararlanmak gibi birçok kullanım alanı vardır.

input_layer = []
for word in doc1words:
    input_layer.append([1 if word == w else 0 for w in doc1words])

for word in doc2words:
    input_layer.append([1 if word == w else 0 for w in doc2words])

input_layer

[[1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 0],
 [0, 1, 1, 0, 0],
 [0, 1, 1, 0, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1]]

In [12]:
sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())
print(len(tokens))

8


In [13]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [14]:
example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [41]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [42]:
# implement skip-gram without using any machine learning library

# define embedding size and context window size
embedding_size = 5
window_size = 2

# generate positive skip-gram pairs
couples = []
for i, center_word in enumerate(example_sequence):
    for j in range(i - window_size, i + window_size + 1):
        if j < 0 or j >= len(example_sequence) or i == j:
            continue
        couples.append([center_word, example_sequence[j]])

# view sample positive skip-gram pairs
for couple in couples:
    print(f"({inverse_vocab[couple[0]]}, {inverse_vocab[couple[1]]})")

# generate negative skip-gram pairs
import numpy as np
def generate_batch(couples, n_positive=50, negative_ratio=1.0):
    batch_size = n_positive * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    batch = batch.astype(int)

    # first column is for center words, second column is for positive context words, third column is for negative samples
    for i, couple in enumerate(couples):
        batch[i, :] = couple[:]
    # generate negative samples
    negative_samples = np.random.choice(list(vocab.values()), size=(batch_size, 1))
    batch[:, 2:] = negative_samples[:]
    return batch

# view a few sample generated batches
batch = generate_batch(couples, n_positive=2, negative_ratio=2)
print(batch)

(the, wide)
(the, road)
(wide, the)
(wide, road)
(wide, shimmered)
(road, the)
(road, wide)
(road, shimmered)
(road, in)
(shimmered, wide)
(shimmered, road)
(shimmered, in)
(shimmered, the)
(in, road)
(in, shimmered)
(in, the)
(in, hot)
(the, shimmered)
(the, in)
(the, hot)
(the, sun)
(hot, in)
(hot, the)
(hot, sun)
(sun, the)
(sun, hot)


ValueError: could not broadcast input array from shape (2,) into shape (3,)

In [46]:
import tensorflow as tf
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
len(positive_skip_grams)

26

In [47]:
for target, context in positive_skip_grams:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(7, 1): (sun, the)
(6, 7): (hot, sun)
(7, 6): (sun, hot)
(5, 4): (in, shimmered)
(5, 3): (in, road)
(5, 1): (in, the)
(1, 7): (the, sun)
(4, 2): (shimmered, wide)
(4, 3): (shimmered, road)
(2, 1): (wide, the)
(6, 5): (hot, in)
(1, 4): (the, shimmered)
(1, 6): (the, hot)
(1, 3): (the, road)
(3, 4): (road, shimmered)
(3, 1): (road, the)
(3, 2): (road, wide)
(5, 6): (in, hot)
(4, 1): (shimmered, the)
(1, 5): (the, in)
(2, 4): (wide, shimmered)
(2, 3): (wide, road)
(3, 5): (road, in)
(6, 1): (hot, the)
(1, 2): (the, wide)
(4, 5): (shimmered, in)


In [48]:
SEED = 42
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['wide', 'the', 'shimmered', 'road']


In [49]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]
