## Torch Embedding layer

In [1]:
import torch
import torch.nn as nn

In [2]:
embedding_layer = nn.Embedding(
    num_embeddings=1000,
    embedding_dim=100
)

In [3]:
input_data = torch.LongTensor(
    [1,2,3,4,5]
)

embedded = embedding_layer(
    input_data
)

In [4]:
embedding_layer.weight.shape

torch.Size([1000, 100])

In [5]:
embedding_layer_2 = nn.Embedding(
    num_embeddings=2,
    embedding_dim=10
)

embedding_layer_2.weight

Parameter containing:
tensor([[ 0.8806, -1.5844,  1.1348, -0.3444,  0.5277,  1.8012, -0.0129, -0.3972,
         -0.2871, -1.0802],
        [ 0.3418,  0.6711,  1.8439, -0.6990, -0.6709,  0.1948, -0.2870,  1.5021,
         -0.9960, -0.3966]], requires_grad=True)

In [6]:
embedding_layer_2.weight.shape

torch.Size([2, 10])

In [7]:
embedding_layer_2(
    torch.LongTensor([1,1])
)

tensor([[ 0.3418,  0.6711,  1.8439, -0.6990, -0.6709,  0.1948, -0.2870,  1.5021,
         -0.9960, -0.3966],
        [ 0.3418,  0.6711,  1.8439, -0.6990, -0.6709,  0.1948, -0.2870,  1.5021,
         -0.9960, -0.3966]], grad_fn=<EmbeddingBackward0>)

In [8]:
embedding_layer_2(
    torch.LongTensor([0,0])
)

tensor([[ 0.8806, -1.5844,  1.1348, -0.3444,  0.5277,  1.8012, -0.0129, -0.3972,
         -0.2871, -1.0802],
        [ 0.8806, -1.5844,  1.1348, -0.3444,  0.5277,  1.8012, -0.0129, -0.3972,
         -0.2871, -1.0802]], grad_fn=<EmbeddingBackward0>)

In [9]:
embedding_layer(torch.tensor([0])) == embedding_layer(torch.tensor([0]))

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True]])

In [10]:
embedding_layer(torch.arange(0, 1000)).shape

torch.Size([1000, 100])

In [11]:
print(embedded.shape)

torch.Size([5, 100])


## Tensorflow Embedding Layer

In [12]:
import tensorflow
from tensorflow.keras.layers import Embedding

In [13]:
import numpy as np

input_scalar = np.array([1])
input_vector = np.array([1,2])

In [14]:
embedding_layer_tf = Embedding(
    input_dim=2,
    output_dim=10
)

embedded_tf = embedding_layer_tf(
    input_scalar
)

In [15]:
embedding_layer_tf.get_weights()[0].shape

(2, 10)

In [16]:
embedded_tf.shape

TensorShape([1, 10])

In [17]:
embedding_layer_tf(
    np.asarray([0,0])
)

<tf.Tensor: shape=(2, 10), dtype=float32, numpy=
array([[ 0.0254395 ,  0.04405124,  0.03798921,  0.01667234, -0.005647  ,
        -0.02411038,  0.01450798,  0.00846   ,  0.02556993, -0.04371597],
       [ 0.0254395 ,  0.04405124,  0.03798921,  0.01667234, -0.005647  ,
        -0.02411038,  0.01450798,  0.00846   ,  0.02556993, -0.04371597]],
      dtype=float32)>

In [18]:
embedding_layer_tf(
    np.asarray([1,1])
)

<tf.Tensor: shape=(2, 10), dtype=float32, numpy=
array([[-0.0483496 ,  0.03880994,  0.03461048,  0.04239995,  0.03572866,
         0.04875128,  0.01053724,  0.03739781, -0.00279566, -0.04985597],
       [-0.0483496 ,  0.03880994,  0.03461048,  0.04239995,  0.03572866,
         0.04875128,  0.01053724,  0.03739781, -0.00279566, -0.04985597]],
      dtype=float32)>

In [19]:
embedding_layer_tf(
    # np.asarray([2]) raised an error: index 2 is out of range for input_dim=2 (valid indices are 0 and 1)
    # np.asarray([0,1,2]) raised an error for the same reason: it contains index 2 (not because it has three inputs)
    np.asarray([0,1])
)

<tf.Tensor: shape=(2, 10), dtype=float32, numpy=
array([[ 0.0254395 ,  0.04405124,  0.03798921,  0.01667234, -0.005647  ,
        -0.02411038,  0.01450798,  0.00846   ,  0.02556993, -0.04371597],
       [-0.0483496 ,  0.03880994,  0.03461048,  0.04239995,  0.03572866,
         0.04875128,  0.01053724,  0.03739781, -0.00279566, -0.04985597]],
      dtype=float32)>

## Skip-gram
- The skip-gram model predicts the surrounding (context) words from the target (center) word.
- By predicting context words, skip-gram efficiently learns meaningful word representations.

In [20]:
text = "The cat sat on the mat"
tokens = text.lower().split(" ")

tokens

['the', 'cat', 'sat', 'on', 'the', 'mat']

In [21]:
window_size = 3
context = []
target = []

for i in range(len(tokens)):
    start = max(0, i - window_size)
    end = min(len(tokens), i + window_size + 1)
    context.append(
        tokens[start:i] + tokens[i+1:end]
    )
    target.append(tokens[i])

for c, t in zip(context, target):
    print(f"Context: {c}, Target: {t}")

Context: ['cat', 'sat', 'on'], Target: the
Context: ['the', 'sat', 'on', 'the'], Target: cat
Context: ['the', 'cat', 'on', 'the', 'mat'], Target: sat
Context: ['the', 'cat', 'sat', 'the', 'mat'], Target: on
Context: ['cat', 'sat', 'on', 'mat'], Target: the
Context: ['sat', 'on', 'the'], Target: mat


In [22]:
generated_pairs = list()

for c, t in zip(context, target):
    for word in c:
        generated_pairs.append((word, t))
        
generated_pairs

[('cat', 'the'),
 ('sat', 'the'),
 ('on', 'the'),
 ('the', 'cat'),
 ('sat', 'cat'),
 ('on', 'cat'),
 ('the', 'cat'),
 ('the', 'sat'),
 ('cat', 'sat'),
 ('on', 'sat'),
 ('the', 'sat'),
 ('mat', 'sat'),
 ('the', 'on'),
 ('cat', 'on'),
 ('sat', 'on'),
 ('the', 'on'),
 ('mat', 'on'),
 ('cat', 'the'),
 ('sat', 'the'),
 ('on', 'the'),
 ('mat', 'the'),
 ('sat', 'mat'),
 ('on', 'mat'),
 ('the', 'mat')]

In [23]:
import torch
import numpy as np
import re

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cpu')

In [25]:
def load_text(file_path):
    """Read a text file and return its whole content as a single string.

    arguments:
        - file_path: path of the text file to read
    returns:
        - text: str with the complete file content
    raises:
        - OSError: if the file cannot be opened
        - UnicodeDecodeError: if the file is not valid UTF-8
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

In [26]:
data = load_text("./word_embedding_text.txt")

In [27]:
re.sub(r"[^a-zA-Z0-9]+", ' ', "Hello there @ 123 , . -\n)()ZZab 123@#")

'Hello there 123 ZZab 123 '

In [28]:
def tokenize(text):
    """Turn raw text into a list of lowercase word tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted (not replaced by a space), the remainder is lowercased and then
    split on runs of whitespace.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return cleaned.lower().split()

In [29]:
from collections import Counter

c = Counter("aaabbccedeeedaffaa")

c.most_common(2)

[('a', 6), ('e', 4)]

In [30]:
c1 = Counter(["cat", "cat", "dog", "mat", "sat", "cat"])
c1

Counter({'cat': 3, 'dog': 1, 'mat': 1, 'sat': 1})

In [31]:
c.keys()

dict_keys(['a', 'b', 'c', 'e', 'd', 'f'])

In [32]:
c.values()

dict_values([6, 2, 2, 4, 2, 2])

In [33]:
def build_vocabulary(words: list) -> tuple:
    """Assign an integer id to every distinct word and build the reverse map.

    Ids are handed out in order of first appearance in ``words``.

    arguments:
        - words: list of words
    returns:
        - vocabulary: dict mapping word -> index
        - idx_word: dict mapping index -> word
    """
    vocabulary = {}
    idx_word = {}
    # dict.fromkeys keeps one entry per distinct word, in first-seen order.
    for position, token in enumerate(dict.fromkeys(words)):
        vocabulary[token] = position
        idx_word[position] = token
    return vocabulary, idx_word

In [34]:
build_vocabulary(["cat", "cat", "dog", "mat", "sat", "cat"])

({'cat': 0, 'dog': 1, 'mat': 2, 'sat': 3},
 {0: 'cat', 1: 'dog', 2: 'mat', 3: 'sat'})

In [35]:
for i in range(-2, 3):
    print(i)

-2
-1
0
1
2


In [36]:
def prepare_data(words: list, vocabulary: dict, window_size=2):
    """Generate skip-gram (center, context) training pairs.

    For each position in ``words``, every other position that lies inside the
    list and at most ``window_size`` steps away contributes one pair.

    arguments:
        - words: list of tokens
        - vocabulary: dict mapping each token to its integer index
        - window_size: maximum distance between center and context word
    returns:
        - data: list of (center_index, context_index) tuples
        - pair: list of (center_word, context_word) tuples, aligned with data
    """
    total = len(words)
    index_pairs = []
    word_pairs = []

    for center_pos, center in enumerate(words):
        # Clip the window to the bounds of the token list.
        first = max(0, center_pos - window_size)
        last = min(total, center_pos + window_size + 1)
        for context_pos in range(first, last):
            if context_pos != center_pos:
                context = words[context_pos]
                index_pairs.append((vocabulary[center], vocabulary[context]))
                word_pairs.append((center, context))

    return index_pairs, word_pairs
                

In [37]:
d, p = prepare_data(["cat", "and", "dog", "sat", "mat"], {"cat": 0, "dog": 1, "mat": 2, "sat": 3, "and": 4}, window_size=2)

In [38]:
p

[('cat', 'and'),
 ('cat', 'dog'),
 ('and', 'cat'),
 ('and', 'dog'),
 ('and', 'sat'),
 ('dog', 'cat'),
 ('dog', 'and'),
 ('dog', 'sat'),
 ('dog', 'mat'),
 ('sat', 'and'),
 ('sat', 'dog'),
 ('sat', 'mat'),
 ('mat', 'dog'),
 ('mat', 'sat')]

In [39]:
class skipgram(torch.nn.Module):
    """Minimal skip-gram network: an embedding lookup followed by a linear layer.

    The forward pass maps center-word indices to one score per vocabulary
    entry, so the last output dimension equals ``vocab_size``.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Creation order (embedding first, then linear) is kept so that
        # seeded parameter initialisation draws happen in the same sequence.
        self.embeddings = torch.nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.output_layer = torch.nn.Linear(
            in_features=embedding_dim, out_features=vocab_size
        )

    def forward(self, center_word):
        """Return the output-layer scores for the given center-word indices."""
        return self.output_layer(self.embeddings(center_word))

In [40]:
# Note: the name test_model is rebound below from the model to its output tensor.
test_model = skipgram(35, 10)
test_model = test_model(
    # torch.arange(0, 36) -> error: it includes index 35, but a 35-word vocabulary only has indices 0..34
    # torch.arange(0, 32) -> output [32, 35]: 32 input words, each with 35 output scores (one per vocabulary word, not a 35-dim embedding)
    # torch.tensor([0])   -> output [1, 35]: one input word, 35 output scores
    # torch.tensor([35])  -> error: index 35 is out of range for a 35-row embedding layer
    torch.tensor([0, 34])
)
test_model.shape

torch.Size([2, 35])

In [41]:
data_ = [1,2,3,4,5,6,7,8,9,0]
[data_[i:i+2] for i in range(0, len(data_), 2)]

[[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]]

In [42]:
def create_batches(data, batch_size):
    """Shuffle ``data`` and split it into consecutive batches.

    The shuffle is applied to a copy, so the caller's sequence keeps its
    original order. (Shuffling the argument in place silently breaks any
    list that is kept index-aligned with it.)

    arguments:
        - data: sequence of training examples, e.g. a list of
          (center, context) tuples
        - batch_size: positive number of examples per batch
    returns:
        - batches: list of batches; the last one may hold fewer than
          batch_size examples
    raises:
        - ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # np.random.shuffle works in place, so shuffle a copy instead of the input.
    shuffled = data.copy() if isinstance(data, np.ndarray) else list(data)
    np.random.shuffle(shuffled)
    return [shuffled[i:i + batch_size] for i in range(0, len(shuffled), batch_size)]

In [43]:
def train(skip_gram_model, data, vocab_size, embedding_dim, batch_size=32, epochs=10, learning_r=0.01):
    """Train the skip-gram model with cross-entropy loss and the Adam optimizer.

    Relies on the notebook-level ``device`` and ``create_batches``.

    arguments:
        - skip_gram_model: module mapping a tensor of center-word indices to
          one score per vocabulary entry
        - data: sequence of (center_index, context_index) pairs
        - vocab_size: not used here; kept so existing calls keep working
        - embedding_dim: not used here; kept so existing calls keep working
        - batch_size: number of pairs per batch
        - epochs: number of passes over the batches
        - learning_r: learning rate passed to Adam
    returns:
        - None; the model is updated in place, and the summed batch loss is
          printed every 100 epochs and after the last epoch
    """
    # Move the model first so the optimizer is constructed over the
    # parameters as they live on the target device.
    skip_gram_model.to(device)
    skip_gram_model.train()

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(skip_gram_model.parameters(), lr=learning_r)

    # Batch a copy so the caller's list is never reordered by the shuffle.
    batches = create_batches(list(data), batch_size)

    # The batches stay fixed for the whole run, so build the index tensors
    # once instead of once per epoch.
    # Each batch contains [(center, context), ...].
    tensor_batches = [
        (
            torch.tensor([pair[0] for pair in batch]).to(device),
            torch.tensor([pair[1] for pair in batch]).to(device),
        )
        for batch in batches
    ]

    for epoch in range(epochs):
        total_loss = 0

        for center_batch, context_batch in tensor_batches:
            optimizer.zero_grad()

            # Scores over the vocabulary for every center word in the batch.
            output = skip_gram_model(center_batch)
            loss = criterion(output, context_batch)

            loss.backward()
            optimizer.step()

            total_loss = total_loss + loss.item()

        if epoch % 100 == 0 or epoch == epochs - 1:
            print(f"epoch {epoch} loss {total_loss}")

In [44]:
print(data)

The cat sat on the mat.
The dog lay on the rug.
The cat chased the rat.
The dog barked at the cat.
The mat was next to the rug.
The dog and the cat slept together on the mat.
The mat and the rug were dirty.
The tree is in the backyard.
The bird flew over the trees.
The bird sang in the tree.
The bird liked to play near the tree.
The tree was tall and the bird liked to sit on it.


In [45]:
words = tokenize(data)

In [46]:
len(words)

85

In [47]:
vocabulary, idx_word = build_vocabulary(words)
vocab_size = len(vocabulary)

In [48]:
v = {
    "a": 0,
    "b": 1
}

len(v)

2

In [49]:
print(f"size of vocabulary: {vocab_size}")

size of vocabulary: 35


In [50]:
vocabulary

{'the': 0,
 'cat': 1,
 'sat': 2,
 'on': 3,
 'mat': 4,
 'dog': 5,
 'lay': 6,
 'rug': 7,
 'chased': 8,
 'rat': 9,
 'barked': 10,
 'at': 11,
 'was': 12,
 'next': 13,
 'to': 14,
 'and': 15,
 'slept': 16,
 'together': 17,
 'were': 18,
 'dirty': 19,
 'tree': 20,
 'is': 21,
 'in': 22,
 'backyard': 23,
 'bird': 24,
 'flew': 25,
 'over': 26,
 'trees': 27,
 'sang': 28,
 'liked': 29,
 'play': 30,
 'near': 31,
 'tall': 32,
 'sit': 33,
 'it': 34}

In [51]:
data, pair = prepare_data(words, vocabulary, window_size=3)

data[:10], pair[:10]

([(0, 1),
  (0, 2),
  (0, 3),
  (1, 0),
  (1, 2),
  (1, 3),
  (1, 0),
  (2, 0),
  (2, 1),
  (2, 3)],
 [('the', 'cat'),
  ('the', 'sat'),
  ('the', 'on'),
  ('cat', 'the'),
  ('cat', 'sat'),
  ('cat', 'on'),
  ('cat', 'the'),
  ('sat', 'the'),
  ('sat', 'cat'),
  ('sat', 'on')])

In [52]:
len(data)

498

In [53]:
embedding_dim = 10
s_g = skipgram(vocab_size, embedding_dim)

In [54]:
s_g

skipgram(
  (embeddings): Embedding(35, 10)
  (output_layer): Linear(in_features=10, out_features=35, bias=True)
)

In [55]:
batch_size = 32
train(s_g, data, vocab_size, embedding_dim, batch_size=batch_size, epochs=1000, learning_r=0.01)

epoch 0 loss 59.02077603340149
epoch 100 loss 37.434876680374146
epoch 200 loss 37.119945764541626
epoch 300 loss 37.02324199676514
epoch 400 loss 36.98068046569824
epoch 500 loss 36.95994019508362
epoch 600 loss 36.947999238967896
epoch 700 loss 36.94011187553406
epoch 800 loss 36.9343466758728
epoch 900 loss 36.92980980873108
epoch 999 loss 36.9260835647583


In [58]:
s_g.cpu()

skipgram(
  (embeddings): Embedding(35, 10)
  (output_layer): Linear(in_features=10, out_features=35, bias=True)
)

In [59]:
word_embeddings = s_g.embeddings.weight.data.numpy()

In [72]:
word_embeddings[0]

array([-0.27078   ,  0.37476772, -0.49283603,  0.24214666, -0.607796  ,
       -0.11571923, -0.25166234,  0.00823486, -0.25608203,  0.46546626],
      dtype=float32)

In [73]:
word_embeddings[1]

array([ 9.3293637e-01,  1.8461654e+00, -8.9689320e-01, -1.9493051e+00,
       -1.5913031e+00,  2.8027799e+00, -3.6414716e-01,  5.9100613e-04,
        5.3977050e-02, -1.1853317e+00], dtype=float32)

In [71]:
for word, idx in vocabulary.items():
    print(f"{word} : embedding :{word_embeddings[idx]}")
    
print(f"The size of embeddings:")
print(word_embeddings.shape)

the : embedding :[-0.27078     0.37476772 -0.49283603  0.24214666 -0.607796   -0.11571923
 -0.25166234  0.00823486 -0.25608203  0.46546626]
cat : embedding :[ 9.3293637e-01  1.8461654e+00 -8.9689320e-01 -1.9493051e+00
 -1.5913031e+00  2.8027799e+00 -3.6414716e-01  5.9100613e-04
  5.3977050e-02 -1.1853317e+00]
sat : embedding :[ 1.9124764  2.4179773 -0.8639681  0.6720566 -0.8368558  0.2094313
  1.6879451 -1.8529346  3.0814295 -2.0283315]
on : embedding :[ 3.288054   -1.228685   -0.10843354  0.97753805 -0.6301892   0.03767175
  0.99255556  0.6451838   1.2749192  -2.084796  ]
mat : embedding :[ 0.9762366   1.854475   -3.2215302   0.75082767 -0.01865999  0.5520588
 -1.1162462  -0.46379152  0.72042286 -0.11134949]
dog : embedding :[-0.45078906  1.1658604  -1.8016922  -0.63098025  0.10263806  1.6203414
  0.18846387 -0.39020047  3.33304    -3.473504  ]
lay : embedding :[ 1.7678895   2.0032725  -2.283633   -1.4169966  -1.0540261   0.5823018
  0.77960765  0.36290362  3.2042189   2.0227702 ]
rug

In [79]:
text = "cat sat"
tokens = tokenize(text)
idxs = [vocabulary[token] for token in tokens]
idxs = torch.tensor(idxs)

In [80]:
embedded_idxs = s_g.embeddings(idxs)

In [93]:
embedded_idxs_2 = word_embeddings[idxs]

In [94]:
embedded_idxs.shape == embedded_idxs_2.shape

True

In [114]:
a = np.tril(np.ones((3,3)))
a

array([[1., 0., 0.],
       [1., 1., 0.],
       [1., 1., 1.]])

In [118]:
a[[0,0,1]]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 1., 0.]])

In [124]:
a[[[0,1,0]]]

array([[[1., 0., 0.],
        [1., 1., 0.],
        [1., 0., 0.]]])

In [150]:
def compute_cosine_similarity(word1, word2, embeddings=word_embeddings):
    """Cosine similarity between the embeddings of two vocabulary words.

    The result is signed: vectors pointing in opposite directions give a
    negative value. (Taking the absolute value of the dot product would
    report opposite vectors as highly similar.)

    arguments:
        - word1, word2: words to compare; both are looked up in the
          notebook-level ``vocabulary``
        - embeddings: array indexed by vocabulary id; the default is the
          ``word_embeddings`` object that existed when this cell was run
    returns:
        - similarity: dot(e1, e2) / (||e1|| * ||e2||)
    raises:
        - KeyError: if either word is missing from ``vocabulary``
    """
    embedding1 = embeddings[vocabulary[word1]]
    embedding2 = embeddings[vocabulary[word2]]

    dot_product = np.dot(embedding1, embedding2)
    norm1 = np.linalg.norm(embedding1)
    norm2 = np.linalg.norm(embedding2)

    return dot_product / (norm1 * norm2)

In [151]:
def compute_cosine_similarity_2(word: str, words: list, embeddings=word_embeddings) -> list:
    """Cosine similarity between one word and each word of a list (loop version).

    The results are signed: opposite vectors give a negative value. (Taking
    the absolute value of the dot product would hide that.)

    arguments:
        - word: reference word; looked up in the notebook-level ``vocabulary``
        - words: list of words to compare against ``word``
        - embeddings: array indexed by vocabulary id; the default is the
          ``word_embeddings`` object that existed when this cell was run
    returns:
        - similarities: list with one similarity per entry of ``words``,
          in the same order
    raises:
        - KeyError: if any word is missing from ``vocabulary``
    """
    idx = vocabulary[word]
    idxs = [vocabulary[w] for w in words]

    embedding = embeddings[idx]  # one row: shape [dim]
    embedding_idxs = embeddings[idxs]  # one row per word: shape [n, dim]

    # The reference norm is the same for every comparison; compute it once.
    n1 = np.linalg.norm(embedding)

    similarities = list()

    for e in embedding_idxs:
        dot_product = np.dot(embedding, e)
        n2 = np.linalg.norm(e)
        similarities.append(dot_product / (n1 * n2))

    return similarities
    

In [163]:
a = np.array([1,2,3])
b = np.array([[1,2,3], [1,2,3],[9,8,1]])

dot_1 = np.dot(a, b[0].T)
dot_2 = np.dot(a, b[1].T)
dot_3 = np.dot(a, b[2].T)

dot_1, dot_2, dot_3

(14, 14, 28)

In [168]:
b, b.T, a

(array([[1, 2, 3],
        [1, 2, 3],
        [9, 8, 1]]),
 array([[1, 1, 9],
        [2, 2, 8],
        [3, 3, 1]]),
 array([1, 2, 3]))

In [164]:
dot_4 = np.dot(a, b.T)

In [165]:
dot_4

array([14, 14, 28])

In [166]:
np.dot([1,2],[[1],[2]]), np.dot([1,2], [1,2])

(array([5]), 5)

In [154]:
pairs = [
    ["cat", "dog"],
    ["dog", "sat"],
    ["tree", "cat"],
    ["sat", "mat"],
    ["cat", "tree"]
]

for word1, word2 in pairs:
    sim = compute_cosine_similarity(word1, word2)
    print(f"similarity between {word1} and {word2} is {sim}")

similarity between cat and dog is 0.5173586010932922
similarity between dog and sat is 0.6939213871955872
similarity between tree and cat is 0.17029790580272675
similarity between sat and mat is 0.4760937988758087
similarity between cat and tree is 0.17029790580272675


In [155]:
compute_cosine_similarity_2("cat", ["dog", "tree"])

[0.5173586, 0.1702979]

In [183]:
def compute_cosine_similar(word: str, words: list, embeddings=word_embeddings):
    """Cosine similarity between one word and each word of a list (vectorized).

    The results are signed: opposite vectors give a negative value. (Taking
    the absolute value of the dot products would hide that.)

    arguments:
        - word: reference word; looked up in the notebook-level ``vocabulary``
        - words: list of words to compare against ``word``
        - embeddings: array indexed by vocabulary id; the default is the
          ``word_embeddings`` object that existed when this cell was run
    returns:
        - similarities: array with one similarity per entry of ``words``,
          in the same order
    raises:
        - KeyError: if any word is missing from ``vocabulary``
    """
    idx = vocabulary[word]
    idxs = [vocabulary[w] for w in words]

    embedding_idx = embeddings[idx]  # [dim]
    embedding_idxs = embeddings[idxs]  # [n, dim]

    dot_product = np.dot(embedding_idx, np.transpose(embedding_idxs))  # [dim] . [dim, n] -> [n]
    norm1 = np.linalg.norm(embedding_idx)  # scalar
    norm2 = np.linalg.norm(embedding_idxs, axis=1)  # [n, dim] -> [n]

    similarities = dot_product / (norm1 * norm2)  # [n] / (scalar * [n]) -> [n]

    return similarities

In [184]:
np.linalg.norm(np.array([[1,2],[3,4]]), axis=-1)

array([2.23606798, 5.        ])

In [185]:
compute_cosine_similar("cat", ["dog", "tree"])

array([0.5173586 , 0.17029792], dtype=float32)

In [186]:
np.array([1]) / (np.array([2]) * np.array([-1,2]))

array([-0.5 ,  0.25])

In [187]:
1 / (2 * -1), 1 / (2 * 2)

(-0.5, 0.25)

In [196]:
np.dot([6,4],[[4,9],[9,8]])

array([60, 86])