# Word2Vec (Skipgram )

Assignment A01 : st125214

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt


In [2]:
np.__version__, torch.__version__

('1.26.4', '2.5.1+cu118')

In [3]:
import nltk # natural language toolkit
from nltk.corpus import reuters # Reuters dataset
nltk.__version__

'3.9.1'

In [4]:
nltk.download('punkt') # download the punkt tokenizer
nltk.download('reuters') # download the reuters dataset
categiries = reuters.categories() # get the list of categories
print(categiries[:15])  # print the first 10 categories


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mgmgk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\mgmgk\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil']


## 1. Load data

In [5]:
corpus = reuters.sents(categories='coffee') # get the list of sentences in the coffee category
corpus

[['INDONESIAN', 'COMMODITY', 'EXCHANGE', 'MAY', 'EXPAND', 'The', 'Indonesian', 'Commodity', 'Exchange', 'is', 'likely', 'to', 'start', 'trading', 'in', 'at', 'least', 'one', 'new', 'commodity', ',', 'and', 'possibly', 'two', ',', 'during', 'calendar', '1987', ',', 'exchange', 'chairman', 'Paian', 'Nainggolan', 'said', '.'], ['He', 'told', 'Reuters', 'in', 'a', 'telephone', 'interview', 'that', 'trading', 'in', 'palm', 'oil', ',', 'sawn', 'timber', ',', 'pepper', 'or', 'tobacco', 'was', 'being', 'considered', '.'], ...]

In [6]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [7]:
#create handy mapping between integer and word to create word2index
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['crop']

2136

In [8]:
# adding 'UNK' to handle unknown word tokens
last_vocab_idx = len(vocabs)
last_vocab_idx
vocabs.append('<UNK>')
word2index['<UNK>'] = last_vocab_idx

In [9]:
# reverse mapping from word2index to index2word
index2word = {v:k for k, v in word2index.items()}
index2word[15]

'either'

## 2. Prepare train data

In [10]:
# create pairs of center word, and outside word
# function was modified to use dynamic window size, with default 2

def random_batch(batch_size, corpus, window_size = 2):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(window_size, len(doc)-window_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = (word2index[doc[i-1]], word2index[doc[i+1]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, corpus,2)

In [11]:
print(f"Shape of x is :{x.shape}")  #batch_size, 1
print(f"Shape of y is :{y.shape}")  #batch_size, 1
print(f"Size of x is : {x}") #batch_size

Shape of x is :(2, 1)
Shape of y is :(2, 1)
Size of x is : [[3492]
 [ 429]]


## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [12]:
vocab_size = len(vocabs)
print(f"Total vocabularies are : {vocab_size}")

Total vocabularies are : 4488


In [13]:
embedding = nn.Embedding(vocab_size, 2)

In [14]:
x_tensor = torch.LongTensor(x)
print(f"Torch size is : {embedding(x_tensor).shape}")  #(batch_size, 1, emb_size)

Torch size is : torch.Size([2, 1, 2])


$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

In [15]:
# implementation of the Skip-gram model for word embeddings
# the model is implemented as a simple feedforward neural network

class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss
    
    def get_embed(self, index):
        # Retrieve the embedding for a specific word index.
        with torch.no_grad():  # Ensure gradients are not tracked
            return self.embedding_center(torch.LongTensor([index])).squeeze(0)
    

In [16]:
#prepare all vocabs

batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[   0,    1,    2,  ..., 4485, 4486, 4487],
        [   0,    1,    2,  ..., 4485, 4486, 4487]])

In [17]:
# initialize the model  
model = Skipgram(voc_size, 2)
model

Skipgram(
  (embedding_center): Embedding(4488, 2)
  (embedding_outside): Embedding(4488, 2)
)

In [18]:
# define the optimizer
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [19]:
loss = model(input_tensor, label_tensor, all_vocabs)
loss

tensor(8.8461, grad_fn=<NegBackward0>)

## 4. Training

In [20]:
batch_size = 2
emb_size   = 2
model      = Skipgram(voc_size, emb_size)
optimizer  = optim.Adam(model.parameters(), lr=0.001)

In [21]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [22]:
import time

num_epochs = 600
start = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    loss = model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

# Calculate the elapsed time
end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)
print(f"Training of {num_epochs} epochs were completed in {epoch_mins}m {epoch_secs}s.")


Epoch    100 | Loss: 9.034327
Epoch    200 | Loss: 8.394921
Epoch    300 | Loss: 9.371779
Epoch    400 | Loss: 11.299071
Epoch    500 | Loss: 9.243349
Epoch    600 | Loss: 8.395000
Training of 600 epochs were completed in 1m 34s.


5. Model Comparism and Analysis

In [23]:
# function for opening the word anologies text file
def open_file(path_to_file):
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

In [None]:
# 1. Opening the word analogies text file
file_path = "data/word-test.v1.txt"
content = open_file(file_path)

# 2. Parsing the content of the file and create sematic and syntactic analogies 
semantic = []
syntactic = []

current_test = semantic
for sent in content:
    if sent.startswith(':'):
        if 'gram' in sent:
            current_test = syntactic
        else:
            current_test = semantic
        continue
    
    current_test.append(sent.strip().split())

print(f"Number of semantic analogies: {len(semantic)}")
print(f"Number of syntactic analogies: {len(syntactic)}")

Number of semantic analogies: 8869
Number of syntactic analogies: 10675


In [25]:
# 1. Function to find the closest word to a given analogy

def find_analogy(word_a, word_b, word_c, model, word2index, index2word):
    try:
        # Get the embeddings for the words
        emb_a = model.get_embed(word2index[word_a])
        emb_b = model.get_embed(word2index[word_b])
        emb_c = model.get_embed(word2index[word_c])
    except KeyError as e:
        # Return None if any word is not in the vocabulary
        print(f"Word not in vocabulary: {e}")
        return None

    # Perform vector arithmetic
    predicted_embedding = emb_b - emb_a + emb_c

    # Find the closest word by cosine similarity
    similarities = torch.matmul(model.embedding_center.weight, predicted_embedding)
    predicted_idx = torch.argmax(similarities).item()

    # Return the closest word
    return index2word.get(predicted_idx, None)

# 2. Function to evaluate the model on the analogies

def evaluate_analogies(analogies, model, word2index, index2word):
    correct = 0
    for analogy in analogies:
        word_a, word_b, word_c, word_d = analogy
        predicted_word = find_analogy(word_a, word_b, word_c, model, word2index, index2word)
        if predicted_word == word_d:
            correct += 1
    return correct / len(analogies)

# Evaluate semantic and syntactic analogies
semantic_accuracy = evaluate_analogies(semantic, model, word2index, index2word)
syntactic_accuracy = evaluate_analogies(syntactic, model, word2index, index2word)

print(f"Semantic analogy accuracy: {semantic_accuracy * 100:.2f}%")
print(f"Syntactic analogy accuracy: {syntactic_accuracy * 100:.2f}%")

Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Athens'
Word not in vocabulary: 'Baghdad'
Word not in vocabulary: 'Baghdad'
Word not in vocabulary: 'Baghdad'
Word not in vocabulary: 'Baghdad'
Word not in vocabulary: 'Baghdad'
Word not in vocabulary: 'Baghdad'
Word not in vocabulary: 'Baghdad'
Word not in vocabulary: 'Baghdad'
Wo

In [26]:
# 1. Opening the similarities data text file
file_path = "data/wordsim_similarity_goldstandard.txt"
content = open_file(file_path)

similarity_dataset = []
for sent in content:
    word1, word2, similarity = sent.split('\t')
    similarity_dataset.append((word1, word2, float(similarity)))

print(similarity_dataset)



In [27]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [28]:
from scipy.stats import spearmanr

# Compute the dot product for each word pair
dot_products = []
similarity_scores = []

for word1, word2, similarity in similarity_dataset:
    if word1 in word2index and word2 in word2index:
        emb1 = model.get_embed(word2index[word1])
        emb2 = model.get_embed(word2index[word2])
        dot_product = torch.dot(emb1, emb2).item()
        dot_products.append(dot_product)
        similarity_scores.append(similarity)

# Calculate the Spearman correlation
correlation, _ = spearmanr(dot_products, similarity_scores)
print(f"Spearman correlation: {correlation:.4f}")

Spearman correlation: -0.1402


7. Save the model

In [29]:
import pickle

torch.save(model.state_dict(), 'skipgram.model')

skipgram_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}

pickle.dump(skipgram_args, open('skipgram.args', 'wb'))

In [30]:
skg_args = pickle.load(open('skipgram.args', 'rb'))
print("Loaded Arguments:", skg_args)  # Debug: Check what arguments were saved

skg_args.pop('word2index', None)  # Remove keys not required by the constructor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = Skipgram(**skg_args)

# Load the model weights onto the correct device
load_model.load_state_dict(torch.load('skipgram.model', map_location=device))
load_model.eval()  # Set the model to evaluation mode

Loaded Arguments: {'voc_size': 4488, 'emb_size': 2, 'word2index': {'approaching': 0, 'than': 1, '24': 2, 'and': 3, 'transfer': 4, 'Graham': 5, 'muted': 6, 'Trade': 7, '),': 8, 'lose': 9, 'Malagasy': 10, 'COSTA': 11, 'took': 12, 'purchase': 13, 'Philippine': 14, 'either': 15, 'Livestock': 16, 'begun': 17, 'international': 18, 'considering': 19, 'Administration': 20, 'disguise': 21, 'known': 22, 'Buyers': 23, 'slightly': 24, 'discussion': 25, 'strongly': 26, '913': 27, 'est': 28, 'APRIL': 29, 'Santos': 30, 'community': 31, 'had': 32, 'African': 33, 'achieve': 34, 'Financially': 35, 'tightened': 36, 'lowered': 37, 'attended': 38, 'shipowners': 39, 'date': 40, 'adding': 41, 'developing': 42, 'reintroduce': 43, 'RBI': 44, 'how': 45, 'cnts': 46, '200': 47, 'U': 48, 'Italian': 49, 'determine': 50, 'resolved': 51, 'seem': 52, 'company': 53, 'DISAPPOINTED': 54, 'zones': 55, 'favour': 56, 'OIL': 57, 'then': 58, 'near': 59, 'Anthony': 60, 'SEE': 61, 'Farmers': 62, 'emerge': 63, 'again': 64, 'rigs

  load_model.load_state_dict(torch.load('skipgram.model', map_location=device))


Skipgram(
  (embedding_center): Embedding(4488, 2)
  (embedding_outside): Embedding(4488, 2)
)

In [31]:
sample_input = torch.tensor([[1], [2]])  # Replace with appropriate test data
print("Sample Output:", load_model(sample_input, sample_input, sample_input))

Sample Output: tensor(-0., grad_fn=<NegBackward0>)


In [32]:
words = ('import', 'export', 'soccer', 'king', 'rice','war', 'crop' )  # Replace with the word analogy to find

# Convert word to index
for word in words:
    if word in word2index:
        word_index = word2index[word]
        embedding = load_model.get_embed(word_index)  # Pass the index
        print(f"Embedding for '{word}': {embedding}")
    else:
        print(f"Word '{word}' not found in vocabulary.")

Embedding for 'import': tensor([ 0.0812, -0.2080])
Embedding for 'export': tensor([-0.8013, -0.2833])
Word 'soccer' not found in vocabulary.
Word 'king' not found in vocabulary.
Embedding for 'rice': tensor([ 0.4639, -0.2965])
Embedding for 'war': tensor([0.0535, 0.7407])
Embedding for 'crop': tensor([ 0.4344, -0.0544])


In [33]:
def get_top_similar_contexts(query, model, corpus, word2index, top_n=10):
    # Convert the query to its embedding
    query_embedding = model.get_embed(word2index[query])
    
    # Flatten the corpus to get all words
    flattened_corpus = [word for sentence in corpus for word in sentence]
    
    # Compute the dot product between the query embedding and each word in the corpus
    similarities = []
    for word in flattened_corpus:
        if word in word2index:
            word_embedding = model.get_embed(word2index[word])
            dot_product = torch.dot(query_embedding, word_embedding).item()
            similarities.append((word, dot_product))
    
    # Sort the similarities in descending order and get the top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_similar_contexts = similarities[:top_n]
    
    return top_similar_contexts

# Example usage
query = 'robusta'  # Replace with your query word
if query in word2index:
    top_similar_contexts = get_top_similar_contexts(query, model, corpus, word2index)
    print(top_similar_contexts)
    #word_index = word2index[word]
    #embedding = load_model.get_embed(word_index)  # Pass the index
    #print(f"Embedding for '{word}': {embedding}")
else:
    print(f"Word '{query}' not found in vocabulary.")


[('undertaking', 2.431729316711426), ('cnts', 2.188129186630249), ('slide', 2.0630130767822266), ('slide', 2.0630130767822266), ('slide', 2.0630130767822266), ('slide', 2.0630130767822266), ('slide', 2.0630130767822266), ('slide', 2.0630130767822266), ('slide', 2.0630130767822266), ('300', 2.0296192169189453)]
