# Word2Vec (Negative Sampling)

Assignment A01 : st125214

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

print(torch.cuda.is_available()) # True if GPU is available
print(torch.cuda.device_count()) # Number of GPUs available
print(torch.cuda.current_device()) # Device index

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Select device
print(device)

True
1
0
cuda


In [2]:
np.__version__, torch.__version__

('1.26.4', '2.5.1+cu118')

In [3]:
import nltk # natural language toolkit
from nltk.corpus import reuters # Reuters dataset
nltk.__version__

'3.9.1'

In [4]:
nltk.download('punkt') # download the punkt tokenizer
nltk.download('reuters') # download the reuters dataset
categiries = reuters.categories() # get the list of categories
print(categiries[:10])  # print the first 10 categories

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mgmgk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\mgmgk\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee']


## 1. Load data

In [5]:
corpus = reuters.sents(categories='coffee') # get the list of sentences in the coffee category
corpus

[['INDONESIAN', 'COMMODITY', 'EXCHANGE', 'MAY', 'EXPAND', 'The', 'Indonesian', 'Commodity', 'Exchange', 'is', 'likely', 'to', 'start', 'trading', 'in', 'at', 'least', 'one', 'new', 'commodity', ',', 'and', 'possibly', 'two', ',', 'during', 'calendar', '1987', ',', 'exchange', 'chairman', 'Paian', 'Nainggolan', 'said', '.'], ['He', 'told', 'Reuters', 'in', 'a', 'telephone', 'interview', 'that', 'trading', 'in', 'palm', 'oil', ',', 'sawn', 'timber', ',', 'pepper', 'or', 'tobacco', 'was', 'being', 'considered', '.'], ...]

In [6]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [7]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['beans']

2944

In [8]:
# adding 'UNK' to handle unknown word tokens
last_vocab_idx = len(vocabs)
last_vocab_idx
vocabs.append('<UNK>')
word2index['<UNK>'] = last_vocab_idx

In [9]:
# reverse mapping from word2index to index2word
index2word = {v:k for k, v in word2index.items()}
index2word[5]

'hardline'

## 2. Prepare train data

In [10]:
# create pairs of center word, and outside word
# function was modified to use dynamic window size, with default 2

def random_batch(batch_size, corpus, window_size = 2):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(window_size, len(doc)-window_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words    
            outside = (word2index[doc[i-1]], word2index[doc[i+1]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus, 2)

In [11]:
print(f"Shape of x is :{x.shape}")  #batch_size, 1
print(f"Shape of y is :{y.shape}")  #batch_size, 1
print(f"Size of x is : {x}") #batch_size

Shape of x is :(2, 1)
Shape of y is :(2, 1)
Size of x is : [[ 704]
 [2421]]


## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [12]:
z = 0.001

In [13]:
# counts the occurrences of each word in a given corpus, 
# calculates the total number of words, 
# and provides the total word count

from collections import Counter

word_count = Counter(flatten(corpus))
word_count

# get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

36710

In [14]:
vocab_size = len(vocabs)
print(f"Total vocabularies are : {vocab_size}")

Total vocabularies are : 4488


$$P(w)=U(w)^{3/4}/Z$$

In [15]:
# construct a unigram table for a word sampling

unigram_table = []
for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({',': 97,
         '.': 96,
         'the': 89,
         'to': 65,
         'of': 57,
         'in': 46,
         'and': 44,
         'said': 44,
         'a': 42,
         'coffee': 32,
         'for': 28,
         "'": 26,
         'on': 25,
         '"': 25,
         'The': 22,
         's': 22,
         '-': 20,
         'is': 20,
         'be': 19,
         'quotas': 19,
         'export': 19,
         'that': 18,
         'will': 18,
         'ICO': 18,
         'Coffee': 17,
         'Brazil': 17,
         'by': 16,
         'not': 16,
         '(': 16,
         'mln': 16,
         'from': 16,
         'at': 16,
         'with': 15,
         'would': 15,
         'year': 15,
         'this': 14,
         'prices': 14,
         'as': 14,
         'was': 14,
         'have': 14,
         'producers': 13,
         'are': 13,
         'an': 13,
         'market': 13,
         ')': 13,
         ',"': 13,
         'bags': 13,
         'which': 13,
         'its': 12,
         

## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [16]:
# converts a sequence of words into a sequence of corresponding indices

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)

In [17]:
# generates negative samples for a batch of target words using a unigram table

import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [18]:
# creates x and y tensors

batch_size = 2
x, y = random_batch(batch_size, corpus, 2)
x_tensor = torch.LongTensor(x).to(device)
y_tensor = torch.LongTensor(y).to(device)

In [19]:
# generates negative samples

k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k).to(device)

In [20]:
y_tensor[1]

tensor([3367], device='cuda:0')

In [21]:
neg_samples[1]

tensor([4237, 2751,  703, 4473, 1024], device='cuda:0')

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [22]:
# implementation of the Skip-gram model for word embeddings
# the model is implemented as a simple feedforward neural network

class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)
    
    def get_embed(self, index):
        # Retrieve the embedding for a specific word index.
        with torch.no_grad():  # Ensure gradients are not tracked
            device = self.embedding_center.weight.device  # Get the device of the embedding weights
            index_tensor = torch.LongTensor([index]).to(device)  # Move index to the same device
            return self.embedding_center(index_tensor).squeeze(0)

    def get_all_embeddings(self):
        # Retrieve all embeddings from the center embedding layer.
        with torch.no_grad():  # Ensure gradients are not tracked
            return self.embedding_center.weight

In [23]:
#test your model
emb_size = 5
voc_size = len(vocabs)
model = SkipgramNeg(voc_size, emb_size).to(device)

In [24]:
print(x_tensor.device)
print(y_tensor.device)
print(neg_samples.device)
print(next(model.parameters()).device)

cuda:0
cuda:0
cuda:0
cuda:0


In [25]:
loss = model(x_tensor, y_tensor, neg_samples)
print(f"Loss is : {loss}")

Loss is : 4.629616737365723


## 5. Training

In [26]:
# creates an optimizer for your model using the Adam optimization algorithm
batch_size = 2
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [27]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [28]:
import time

num_epochs = 1000
start = time.time()

for epoch in range(num_epochs):

    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k).to(device)
    loss = model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()

    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 200 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

# Calculate the elapsed time
end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)
print(f"Training of {num_epochs} epochs were completed in {epoch_mins}m {epoch_secs}s.")

Epoch    200 | Loss: 4.703345
Epoch    400 | Loss: 2.408661
Epoch    600 | Loss: 2.491889
Epoch    800 | Loss: 1.404233
Epoch   1000 | Loss: 1.215279
Training of 1000 epochs were completed in 2m 29s.


5. Model Comparism and Analysis

In [29]:
# function for opening the word anologies text file
def open_file(path_to_file):
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

In [30]:
# 1. Opening the word analogies text file
file_path = "data/word-test.v1.txt"
content = open_file(file_path)

# 2. Parsing the content of the file and create sematic and syntactic analogies 
semantic = []
syntactic = []

current_test = semantic
for sent in content:
    if sent.startswith(':'):
        if 'gram' in sent:
            current_test = syntactic
        else:
            current_test = semantic
        continue
    
    current_test.append(sent.strip().split())

print(f"Number of semantic analogies: {len(semantic)}")
print(f"Number of syntactic analogies: {len(syntactic)}")

Number of semantic analogies: 8869
Number of syntactic analogies: 10675


In [31]:
# 1. Function to find the closest word to a given analogy

def find_analogy(word_a, word_b, word_c, model, word2index, index2word):
    try:
        device = next(model.parameters()).device  # Get the device of the model
        # Get the embeddings for the words
        emb_a = model.get_embed(torch.tensor([word2index[word_a]], device=device))
        emb_b = model.get_embed(torch.tensor([word2index[word_b]], device=device))
        emb_c = model.get_embed(torch.tensor([word2index[word_c]], device=device))
        
        # Perform analogy computation: emb_a - emb_b + emb_c
        analogy_vector = emb_a - emb_b + emb_c

        # Find the closest word to the analogy vector
        all_embeddings = model.get_all_embeddings()  # Ensure this tensor is on the same device
        scores = torch.matmul(analogy_vector, all_embeddings.T)
        predicted_index = scores.argmax().item()

        return index2word[predicted_index]
    except KeyError as e:
        return f"Word not in vocabulary: {e}"

# 2. Function to evaluate the model on the analogies

def evaluate_analogies(analogies, model, word2index, index2word):
    correct = 0
    for analogy in analogies:
        word_a, word_b, word_c, word_d = analogy
        predicted_word = find_analogy(word_a, word_b, word_c, model, word2index, index2word)
        if predicted_word == word_d:
            correct += 1
    return correct / len(analogies)

# Evaluate semantic and syntactic analogies
semantic_accuracy = evaluate_analogies(semantic, model, word2index, index2word)
syntactic_accuracy = evaluate_analogies(syntactic, model, word2index, index2word)

print(f"Semantic analogy accuracy: {semantic_accuracy * 100:.2f}%")
print(f"Syntactic analogy accuracy: {syntactic_accuracy * 100:.2f}%")

Semantic analogy accuracy: 0.00%
Syntactic analogy accuracy: 0.00%


In [32]:
# 1. Opening the similarities data text file
file_path = "data/wordsim_similarity_goldstandard.txt"
content = open_file(file_path)

similarity_dataset = []
for sent in content:
    word1, word2, similarity = sent.split('\t')
    similarity_dataset.append((word1, word2, float(similarity)))

print(similarity_dataset)



In [33]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [34]:
from scipy.stats import spearmanr

# Compute the dot product for each word pair
dot_products = []
similarity_scores = []

for word1, word2, similarity in similarity_dataset:
    if word1 in word2index and word2 in word2index:
        emb1 = model.get_embed(word2index[word1])
        emb2 = model.get_embed(word2index[word2])
        dot_product = torch.dot(emb1, emb2).item()
        dot_products.append(dot_product)
        similarity_scores.append(similarity)

# Calculate the Spearman correlation
correlation, _ = spearmanr(dot_products, similarity_scores)
print(f"Spearman correlation: {correlation:.4f}")

Spearman correlation: 0.0537


7. Save the Model

In [37]:
import pickle

torch.save(model.state_dict(), 'neg.model')

neg_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}

pickle.dump(neg_args, open('neg.args', 'wb'))

In [38]:
torch.save(model.state_dict(), 'neg.model')

neg_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}

pickle.dump(neg_args, open('neg.args', 'wb'))

In [39]:
negg_args = pickle.load(open('neg.args', 'rb'))
print("Loaded Arguments:", negg_args)  # Debug: Check what arguments were saved

negg_args.pop('word2index', None)  # Remove keys not required by the constructor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = SkipgramNeg(**negg_args)

# Load the model weights onto the correct device
load_model.load_state_dict(torch.load('neg.model', map_location=device))
load_model.eval()  # Set the model to evaluation mode

Loaded Arguments: {'voc_size': 4488, 'emb_size': 5, 'word2index': {'13': 0, 'Malaysia': 1, 'teas': 2, 'Rachmat': 3, 'force': 4, 'hardline': 5, '630': 6, 'tied': 7, 'ensue': 8, 'sticks': 9, 'Physical': 10, 'palm': 11, 'expected': 12, '889': 13, 'above': 14, 'IACO': 15, 'seamen': 16, 'involved': 17, 'SEES': 18, 'many': 19, 'non': 20, 'sales': 21, 'central': 22, 'planned': 23, 'Hesse': 24, 'mount': 25, '616': 26, 'charge': 27, 'Producers': 28, 'put': 29, 'catastrophe': 30, 'projects': 31, 'background': 32, 'Antioquia': 33, 'managed': 34, 'prevent': 35, 'contribution': 36, 'cruzado': 37, 'extensive': 38, 'SEASONAL': 39, 'direct': 40, 'enabling': 41, 'REVENUE': 42, 'brings': 43, 'foreseeable': 44, 'release': 45, 'naming': 46, 'examining': 47, 'belonging': 48, 'PROBLEMS': 49, '88': 50, 'american': 51, 'RESETTLE': 52, 'can': 53, ':': 54, 'allocation': 55, 'Executive': 56, 'instrument': 57, 'Caron': 58, 'registration': 59, 'meal': 60, 'Maccia': 61, 'marketing': 62, 'With': 63, 'Kizito': 64, 'r

  load_model.load_state_dict(torch.load('neg.model', map_location=device))


SkipgramNeg(
  (embedding_center): Embedding(4488, 5)
  (embedding_outside): Embedding(4488, 5)
  (logsigmoid): LogSigmoid()
)

In [40]:
sample_input = torch.tensor([[1], [2]])  # Replace with appropriate test data
print("Sample Output:", load_model(sample_input, sample_input, sample_input))

Sample Output: tensor(1.6259, grad_fn=<NegBackward0>)


In [41]:
words = ('import', 'export', 'soccer', 'king', 'rice','war', 'crop' )  # Replace with the word analogy to find

# Convert word to index
for word in words:
    if word in word2index:
        word_index = word2index[word]
        embedding = load_model.get_embed(word_index)  # Pass the index
        print(f"Embedding for '{word}': {embedding}")
    else:
        print(f"Word '{word}' not found in vocabulary.")

Embedding for 'import': tensor([ 0.5670,  1.6556, -1.0482, -1.1416, -0.3555])
Embedding for 'export': tensor([ 0.6615,  0.8694,  0.5574, -0.9598,  1.1436])
Word 'soccer' not found in vocabulary.
Word 'king' not found in vocabulary.
Embedding for 'rice': tensor([-0.6802,  1.3045,  0.5507, -0.1679,  0.2768])
Embedding for 'war': tensor([ 1.5306,  1.8527, -1.4778,  1.5856,  1.3997])
Embedding for 'crop': tensor([-0.1280, -0.0658,  0.9933, -0.4371, -0.0189])


In [42]:
def get_top_similar_contexts(query, model, corpus, word2index, top_n=10):
    # Convert the query to its embedding
    query_embedding = model.get_embed(word2index[query])
    
    # Flatten the corpus to get all words
    flattened_corpus = [word for sentence in corpus for word in sentence]
    
    # Compute the dot product between the query embedding and each word in the corpus
    similarities = []
    for word in flattened_corpus:
        if word in word2index:
            word_embedding = model.get_embed(word2index[word])
            dot_product = torch.dot(query_embedding, word_embedding).item()
            similarities.append((word, dot_product))
    
    # Sort the similarities in descending order and get the top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_similar_contexts = similarities[:top_n]
    
    return top_similar_contexts

# Example usage
query = 'robusta'  # Replace with your query word
if query in word2index:
    top_similar_contexts = get_top_similar_contexts(query, model, corpus, word2index)
    print(top_similar_contexts)
    #word_index = word2index[word]
    #embedding = load_model.get_embed(word_index)  # Pass the index
    #print(f"Embedding for '{word}': {embedding}")
else:
    print(f"Word '{query}' not found in vocabulary.")

[('easing', 3.6220035552978516), ('Charles', 3.3423056602478027), ('Ugandan', 3.256103038787842), ('Ugandan', 3.256103038787842), ('Ugandan', 3.256103038787842), ('Ugandan', 3.256103038787842), ('Ugandan', 3.256103038787842), ('Ugandan', 3.256103038787842), ('Ugandan', 3.256103038787842), ('Ugandan', 3.256103038787842)]
