<a href="https://colab.research.google.com/github/ngoda/Conversations/blob/master/b2CHAPTER4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
# Toy weight matrix: the integers 0..20 laid out row by row as 7 rows x 3 columns
W = np.arange(7 * 3).reshape((7, 3))
W

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14],
       [15, 16, 17],
       [18, 19, 20]])

In [13]:
# A single integer index on the first axis returns one row of W (here row 2)
W[2]

array([6, 7, 8])

In [14]:
# Same lookup for row 5 of W
W[5]

array([15, 16, 17])

In [15]:
# Indexing with an integer array gathers one row of W per entry, in order;
# an index that appears twice (0 here) yields the same row twice in the result
idx = np.array([1, 0, 3, 0])
W[idx]

array([[ 3,  4,  5],
       [ 0,  1,  2],
       [ 9, 10, 11],
       [ 0,  1,  2]])

In [40]:
class Embedding:
    """Lookup layer that selects rows of a weight matrix by integer index.

    Attributes:
        params: one-element list holding the weight matrix W.
        grads: one-element list holding a zero array with W's shape and dtype.
        idx: indices given to the latest forward() call (None until then).
    """

    def __init__(self, W):
        self.idx = None
        self.params = [W]
        self.grads = [np.zeros_like(W)]

    def forward(self, idx):
        """Return W[idx] and remember idx for the backward pass."""
        (weight,) = self.params
        self.idx = idx
        return weight[idx]

    def backward(self, dout):
        """Write the gradient of W into self.grads[0] and return None.

        The stored gradient is zeroed first, then row i of dout is added to
        the gradient row selected by self.idx[i]. np.add.at is used so that
        an index occurring several times receives the sum of all its rows.
        """
        (grad,) = self.grads
        grad[...] = 0
        np.add.at(grad, self.idx, dout)
        # The input is a set of indices, so nothing is propagated further.
        return None


In [42]:
def backward(self, dout):
    """Loop-based backward pass for the Embedding layer.

    Zeroes the layer's gradient array, then walks over self.idx and adds
    row i of dout onto the gradient row selected by the i-th index, so an
    index that occurs several times accumulates all of its rows.

    Returns None: no gradient is produced for the index input.
    """
    (grad,) = self.grads
    grad[...] = 0

    position = 0
    for word_id in self.idx:
        grad[word_id] += dout[position]
        position += 1

    return None


**negative_sampling_layer**

In [43]:
import sys
sys.path.append('..')  # Adds parent directory to the system path

import collections  # Imports the collections module
from common.np import *  # Imports NumPy functions from a custom module
from common.layers import Embedding, SigmoidWithLoss  # Imports custom layers for neural networks


In [44]:
class EmbeddingDot:
    """Embedding lookup followed by a row-wise dot product with the input.

    Attributes:
        embed: the wrapped Embedding layer built from W.
        params, grads: the very same lists as embed.params / embed.grads.
        cache: (h, target_W) saved by the latest forward() call.
    """

    def __init__(self, W):
        embed = Embedding(W)
        self.embed = embed
        self.params = embed.params
        self.grads = embed.grads
        self.cache = None

    def forward(self, h, idx):
        """Return, for every row, the dot product of h with the looked-up row.

        target_W is whatever self.embed.forward(idx) returns; the result is
        the element-wise product target_W * h summed over axis 1.
        """
        target_W = self.embed.forward(idx)
        scores = (target_W * h).sum(axis=1)
        self.cache = (h, target_W)
        return scores

    def backward(self, dout):
        """Backpropagate dout; returns the gradient with respect to h.

        dout is reshaped to a column so it broadcasts across each row.
        dout * h is handed to the wrapped Embedding layer, and
        dout * target_W is returned.
        """
        h, target_W = self.cache
        n_rows = dout.shape[0]
        dout_col = dout.reshape(n_rows, 1)

        self.embed.backward(dout_col * h)
        return dout_col * target_W


In [45]:
import numpy as np  # Import NumPy library

# Given an int n, np.random.choice draws from the integers 0..n-1.
# No seed is set in the cells shown here, so the displayed value varies between runs.
np.random.choice(10)  # Randomly selects an integer from 0 to 9


5

In [19]:
# Same call again: a fresh draw, so the result need not match the previous cell
np.random.choice(10) # Randomly selects an integer from 0 to 9

2

In [46]:
# np.random.choice also accepts a sequence; it then draws from the elements themselves
words = ['you', 'say', 'goodbye', 'i', 'hello', '.']
np.random.choice(words)  # Randomly selects a word from the list 'words'


'i'

In [48]:
np.random.choice(words, size=5, replace=False)
# Draws 5 of the 6 words without replacement, so no word can appear twice in the result

array(['goodbye', 'hello', 'say', 'you', 'i'], dtype='<U7')

In [49]:
p = [0.7, 0.29, 0.01]  # Example probability distribution
new_p = np.power(p, 0.75)  # Raise every probability to the power 0.75 (an exponent below 1 lifts small values relative to large ones)

new_p /= np.sum(new_p)  # Renormalize so the values sum to 1 again
print(new_p)  # The rarest entry grows from 0.01 to roughly 0.027 (see the output below)


[0.64196878 0.33150408 0.02652714]


In [50]:
import sys
sys.path.append('..')  # Adds parent directory to the system path
from common.np import *  # Import NumPy functions as np
from common.layers import Embedding, SigmoidWithLoss  # Import custom layers
import collections  # Import the collections module

# Implementation of the Negative Sampling class
# Located in chap04/negative_sampling_layer.py
class UnigramSampler:
    """Draws negative-sample word ids from a power-smoothed unigram distribution.

    The distribution is built once in the constructor: the occurrence count
    of every word id in `corpus` is raised to `power`, and the result is
    normalized so that it sums to 1.

    Args:
        corpus: iterable of word ids. Counts are read back for the ids
            0..vocab_size-1, so the ids are presumably contiguous from 0
            (verify against the caller).
        power: exponent applied to the raw counts.
        sample_size: number of negative samples drawn per target.

    Attributes:
        sample_size: as passed in.
        vocab_size: number of distinct word ids seen in corpus.
        word_p: array of length vocab_size holding the sampling probabilities.
    """

    def __init__(self, corpus, power, sample_size):
        self.sample_size = sample_size

        # Tally how often every word id occurs in the corpus.
        counts = collections.Counter(word_id for word_id in corpus)

        n_vocab = len(counts)
        self.vocab_size = n_vocab

        # Copy the counts into a dense array indexed by word id.
        raw = np.zeros(n_vocab)
        for word_id in range(n_vocab):
            raw[word_id] = counts[word_id]

        smoothed = np.power(raw, power)
        self.word_p = smoothed / np.sum(smoothed)

    def get_negative_sample(self, target):
        """Return an array of shape (len(target), sample_size) of sampled word ids.

        When the module-level GPU flag is set, all ids are drawn in one call,
        with replacement and straight from word_p, so a row may contain its
        own target id. Otherwise each row is drawn separately without
        replacement, after zeroing the probability of that row's target and
        renormalizing.
        """
        batch_size = target.shape[0]

        if GPU:
            return np.random.choice(self.vocab_size,
                                    size=(batch_size, self.sample_size),
                                    replace=True, p=self.word_p)

        samples = np.zeros((batch_size, self.sample_size), dtype=np.int32)
        for row in range(batch_size):
            probs = self.word_p.copy()
            probs[target[row]] = 0  # the target must never be its own negative
            probs /= probs.sum()
            samples[row, :] = np.random.choice(self.vocab_size,
                                               size=self.sample_size,
                                               replace=False, p=probs)
        return samples


In [51]:
# Small end-to-end check of UnigramSampler on a hand-made corpus
corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3])  # Word ids; 1, 2 and 3 occur twice, 0 and 4 once
power = 0.75  # Exponent applied to the word counts
sample_size = 2  # Number of negative samples drawn per target

sampler = UnigramSampler(corpus, power, sample_size)  # Builds the sampling distribution from the corpus
target = np.array([1, 3, 0])  # One target word id per batch row
negative_sample = sampler.get_negative_sample(target)  # Sampled ids, one row per target
print(negative_sample)  # Random draw: the printed ids differ between runs


[[3 4]
 [2 0]
 [4 3]]


In [52]:
import sys
sys.path.append('..')  # Add parent directory to the system path
from common.np import *  # Import NumPy functions as np
from common.layers import Embedding, SigmoidWithLoss  # Import custom layers
import collections  # Import the collections module


In [53]:
# chap04/negative_sampling_layer.py
class NegativeSamplingLoss:
    """Loss over one positive target plus `sample_size` sampled negative targets.

    Layer slot 0 scores the true target against label 1; slots
    1..sample_size score the sampled negatives against label 0. Each slot
    has its own EmbeddingDot and SigmoidWithLoss instance, all built from
    the same weight matrix W.

    Args:
        W: weight matrix handed to every EmbeddingDot layer.
        corpus: word-id corpus used to build the UnigramSampler.
        power: exponent forwarded to the UnigramSampler.
        sample_size: number of negative samples per target.
    """

    def __init__(self, W, corpus, power=0.75, sample_size=5):
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)

        n_slots = sample_size + 1  # one positive slot + sample_size negative slots
        self.loss_layers = [SigmoidWithLoss() for _ in range(n_slots)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(n_slots)]

        # Flatten the per-layer parameter and gradient lists, in layer order.
        self.params = [p for layer in self.embed_dot_layers for p in layer.params]
        self.grads = [g for layer in self.embed_dot_layers for g in layer.grads]

    def forward(self, h, target):
        """Return the summed loss of the positive slot and all negative slots."""
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # Positive example: the real target, labelled 1.
        positive_label = np.ones(batch_size, dtype=np.int32)
        positive_score = self.embed_dot_layers[0].forward(h, target)
        loss = self.loss_layers[0].forward(positive_score, positive_label)

        # Negative examples: column `col` of the samples goes to slot col + 1, labelled 0.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for col in range(self.sample_size):
            slot = col + 1
            negative_score = self.embed_dot_layers[slot].forward(h, negative_sample[:, col])
            loss += self.loss_layers[slot].forward(negative_score, negative_label)

        return loss

    def backward(self, dout=1):
        """Run every slot backwards and return the sum of their gradients for h."""
        dh = 0
        for loss_layer, embed_dot in zip(self.loss_layers, self.embed_dot_layers):
            dh += embed_dot.backward(loss_layer.backward(dout))

        return dh


In [54]:
import sys
sys.path.append('..')  # Adds parent directory to the system path
from common.np import *  # Import NumPy functions as np
from common.layers import Embedding  # Import the Embedding layer
from negative_sampling_layer import NegativeSamplingLoss  # Import the NegativeSamplingLoss class


In [56]:
class CBOW:
    """CBOW model: averaged context embeddings scored by a negative-sampling loss.

    Args:
        vocab_size: number of rows of both weight matrices.
        hidden_size: number of columns of both weight matrices.
        window_size: 2 * window_size Embedding layers are created, one per
            context position.
        corpus: word-id corpus forwarded to NegativeSamplingLoss.

    Attributes:
        in_layers: Embedding layers that all share the input weights W_in.
        ns_loss: NegativeSamplingLoss built on the output weights W_out.
        params, grads: concatenated parameter / gradient lists of all layers.
        word_vecs1, word_vecs2: references to W_in and W_out.
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        # Small random float32 initial weights.
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        # One Embedding layer per context position, all backed by W_in.
        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        self.params, self.grads = [], []
        for layer in [*self.in_layers, self.ns_loss]:
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)

        self.word_vecs1 = W_in
        self.word_vecs2 = W_out

    def forward(self, contexts, target):
        """Return the loss for a batch.

        Column i of `contexts` is fed to input layer i; the layer outputs are
        averaged and passed, together with `target`, to the loss layer.
        """
        n_context = len(self.in_layers)
        h = 0
        for column in range(n_context):
            h += self.in_layers[column].forward(contexts[:, column])
        h *= 1 / n_context
        return self.ns_loss.forward(h, target)

    def backward(self, dout=1):
        """Backpropagate through the loss and every input layer; returns None."""
        dh = self.ns_loss.backward(dout)
        dh *= 1 / len(self.in_layers)  # undo the averaging done in forward()
        for layer in self.in_layers:
            layer.backward(dh)
        return None


In [57]:
import sys
sys.path.append('..')  # Adds parent directory to the system path
import numpy as np  # Import NumPy library
from common import config  # Import configuration settings



In [58]:

import pickle  # Import the pickle module for serialization
from common.trainer import Trainer  # Import the Trainer class for model training
from common.optimizer import Adam  # Import the Adam optimizer
from cbow import CBOW  # Import the CBOW model
from skip_gram import SkipGram  # Import the SkipGram model
from common.util import create_contexts_target, to_cpu, to_gpu  # Import utility functions
from dataset import ptb  # Import the Penn Treebank dataset


In [62]:
# NOTE(review): this cell repeats a CBOW definition that already appears in an
# earlier cell with the same behavior, and the name is rebound again by a later
# `from cbow import CBOW` — consider keeping a single definition.
class CBOW:
    """CBOW model: averaged context embeddings scored by a negative-sampling loss.

    Args:
        vocab_size: number of rows of both weight matrices.
        hidden_size: number of columns of both weight matrices.
        window_size: 2 * window_size Embedding layers are created, one per
            context position.
        corpus: word-id corpus forwarded to NegativeSamplingLoss.

    Attributes:
        in_layers: Embedding layers that all share the input weights W_in.
        ns_loss: NegativeSamplingLoss built on the output weights W_out.
        params, grads: concatenated parameter / gradient lists of all layers.
        word_vecs1, word_vecs2: references to W_in and W_out.
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        # Small random float32 initial weights.
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        # One Embedding layer per context position, all backed by W_in.
        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        self.params, self.grads = [], []
        for layer in [*self.in_layers, self.ns_loss]:
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)

        self.word_vecs1 = W_in
        self.word_vecs2 = W_out

    def forward(self, contexts, target):
        """Return the loss for a batch.

        Column i of `contexts` is fed to input layer i; the layer outputs are
        averaged and passed, together with `target`, to the loss layer.
        """
        n_context = len(self.in_layers)
        h = 0
        for column in range(n_context):
            h += self.in_layers[column].forward(contexts[:, column])
        h *= 1 / n_context
        return self.ns_loss.forward(h, target)

    def backward(self, dout=1):
        """Backpropagate through the loss and every input layer; returns None."""
        dh = self.ns_loss.backward(dout)
        dh *= 1 / len(self.in_layers)  # undo the averaging done in forward()
        for layer in self.in_layers:
            layer.backward(dh)
        return None


In [63]:
# chap04/train.py
import sys
sys.path.append('..')  # Add parent directory to the system path
import numpy as np  # Import NumPy library
from common import config  # Import configuration settings


In [64]:
import pickle  # Import the pickle module for serialization
from common.trainer import Trainer  # Import the Trainer class for model training
from common.optimizer import Adam  # Import the Adam optimizer
from cbow import CBOW  # Import the CBOW model
from skip_gram import SkipGram  # Import the SkipGram model
from common.util import create_contexts_target, to_cpu, to_gpu  # Import utility functions
from dataset import ptb  # Import the Penn Treebank dataset


In [65]:
# Training hyperparameters
window_size = 5  # Context window size; the CBOW class in this notebook builds 2 * window_size input layers
hidden_size = 100  # Number of columns of the word-vector weight matrices
batch_size = 100  # Number of samples per training batch
max_epoch = 10  # Maximum number of training epochs

corpus, word_to_id, id_to_word = ptb.load_data('train')  # Load the 'train' split via the project's ptb module
vocab_size = len(word_to_id)  # Vocabulary size = number of entries in word_to_id

# Create context-target pairs from the training corpus
contexts, target = create_contexts_target(corpus, window_size)

# Move data to GPU if configured to use GPU
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)


In [67]:
# NOTE(review): a SkipGram model is trained here, although this notebook defines
# and imports CBOW and the later evaluation cells load './cbow_params.pkl' —
# confirm which model is intended.
model = SkipGram(vocab_size, hidden_size, window_size, corpus)  # Initialize SkipGram model
optimizer = Adam()  # Initialize Adam optimizer
trainer = Trainer(model, optimizer)  # Initialize Trainer with model and optimizer


In [None]:
# NOTE: this cell has no execution count or saved output; it was skipped because
# training takes too long to run.
trainer.fit(contexts, target, max_epoch, batch_size, eval_interval=2000)  # Train the model
trainer.plot()  # Plot training progress


In [68]:
import sys
sys.path.append('..')  # Add parent directory to the system path
import pickle  # Import the pickle module for serialization
from common.util import most_similar, analogy  # Import utility functions for word similarity and analogy tasks


In [69]:
# NOTE(review): no cell shown in this notebook writes this file, so it must already
# exist next to the notebook. pickle.load can execute arbitrary code — only load a
# file from a trusted source.
pkl_file = './cbow_params.pkl'  # Path to the pickle file containing CBOW model parameters
with open(pkl_file, 'rb') as f:
    params = pickle.load(f)  # Load parameters from the pickle file


In [70]:
# The loaded object is indexed like a dict with the three keys below.
# These assignments rebind word_to_id / id_to_word that were set when the PTB data was loaded.
word_vecs = params['word_vecs']  # Extract word vectors from loaded parameters
word_to_id = params['word_to_id']  # Extract word-to-id dictionary from loaded parameters
id_to_word = params['id_to_word']  # Extract id-to-word dictionary from loaded parameters


In [71]:
# For every query word, most_similar prints the 5 highest-scoring words (see the output below)
querys = ['you', 'year', 'car', 'toyota']  # Query words ("querys" is a misspelling of "queries")
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)  # Find most similar words to each query



[query] you
 we: 0.6103515625
 someone: 0.59130859375
 i: 0.55419921875
 something: 0.48974609375
 anyone: 0.47314453125

[query] year
 month: 0.71875
 week: 0.65234375
 spring: 0.62744140625
 summer: 0.6259765625
 decade: 0.603515625

[query] car
 luxury: 0.497314453125
 arabia: 0.47802734375
 auto: 0.47119140625
 disk-drive: 0.450927734375
 travel: 0.4091796875

[query] toyota
 ford: 0.55078125
 instrumentation: 0.509765625
 mazda: 0.49365234375
 bethlehem: 0.47509765625
 nissan: 0.474853515625
