# Word2Vec (Skipgram)

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib

In [7]:
np.__version__, torch.__version__, matplotlib.__version__

('1.26.3', '2.5.1+cu121', '3.9.2')

In [8]:
device = torch.device('cuda:0')
device 

device(type='cuda', index=0)

## 1. Load data

In [9]:
import nltk
from nltk.corpus import reuters

In [10]:
nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\st124\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\st124\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\st124\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
print(reuters.categories())


['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [12]:
corpus = list(reuters.sents())
corpus

[['ASIAN',
  'EXPORTERS',
  'FEAR',
  'DAMAGE',
  'FROM',
  'U',
  '.',
  'S',
  '.-',
  'JAPAN',
  'RIFT',
  'Mounting',
  'trade',
  'friction',
  'between',
  'the',
  'U',
  '.',
  'S',
  '.',
  'And',
  'Japan',
  'has',
  'raised',
  'fears',
  'among',
  'many',
  'of',
  'Asia',
  "'",
  's',
  'exporting',
  'nations',
  'that',
  'the',
  'row',
  'could',
  'inflict',
  'far',
  '-',
  'reaching',
  'economic',
  'damage',
  ',',
  'businessmen',
  'and',
  'officials',
  'said',
  '.'],
 ['They',
  'told',
  'Reuter',
  'correspondents',
  'in',
  'Asian',
  'capitals',
  'a',
  'U',
  '.',
  'S',
  '.',
  'Move',
  'against',
  'Japan',
  'might',
  'boost',
  'protectionist',
  'sentiment',
  'in',
  'the',
  'U',
  '.',
  'S',
  '.',
  'And',
  'lead',
  'to',
  'curbs',
  'on',
  'American',
  'imports',
  'of',
  'their',
  'products',
  '.'],
 ['But',
  'some',
  'exporters',
  'said',
  'that',
  'while',
  'the',
  'conflict',
  'would',
  'hurt',
  'them',
  'in',


In [13]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]

#set vocab limit to the first 10,000 words

words = list(set(flatten(corpus))) #all the words we have in the system - <UNK>
vocabs = words[:10000] #limit the vocab size to 10,000

In [14]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}

In [15]:
last_vocab_idx = len(vocabs)
last_vocab_idx

10000

In [16]:
vocabs.append('<UNK>')
word2index['<UNK>'] = last_vocab_idx

In [17]:
index2word = {v:k for k, v in word2index.items()}
index2word[5]

'theoretically'

## 2. Prepare train data

In [18]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus):

    skipgrams = []
    window_size = 2
    unk = word2index['<UNK>'] #if the word is not in the vocab, we will use this
    
    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(window_size, len(doc)-window_size):
            # #center word
            # center = word2index[doc[i]]
            # #outside words = 4 words
            # outside = (word2index[doc[i-2]], word2index[doc[i-1]], word2index[doc[i+1]], word2index[doc[i+2]])
            center = word2index.get(doc[i], unk)
            
            # Outside words
            outside = (
                word2index.get(doc[i - 2], unk),
                word2index.get(doc[i - 1], unk),
                word2index.get(doc[i + 1], unk),
                word2index.get(doc[i + 2], unk)
            )
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2 , center, outside2 center, outside4

    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)

    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])

    return np.array(inputs), np.array(labels)

x, y = random_batch(2, corpus)

In [19]:
x.shape, y.shape

((2, 1), (2, 1))

In [20]:
len(vocabs)

10001

In [21]:
vocab_size = len(vocabs)
embedding = nn.Embedding(vocab_size, 2)
x_tensor = torch.LongTensor(x)
embedding(x_tensor).shape

torch.Size([2, 1, 2])

In [22]:

class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size, word2index):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.word2index = word2index
    
    def forward(self, center, outside, all_vocabs):
        device = self.embedding_center.weight.device
        center = center.to(device)
        outside = outside.to(device)
        all_vocabs = all_vocabs.to(device)

        # print("Center shape:", center.shape)
        # print("Outside shape:", outside.shape)
        # print("All vocabs shape:", all_vocabs.shape)
        
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        # print("Center embedding shape:", center_embedding.shape)
        # print("Outside embedding shape:", outside_embedding.shape)
        # print("All vocabs embedding shape:", all_vocabs_embedding.shape)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss
    
    def get_embed(self, word):
        word2index = self.word2index
        device = next(self.embedding_center.parameters()).device #get the device of the embedding layer
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index]).to(device)
        
        embed_c = self.embedding_center(word)
        embed_o = self.embedding_outside(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()

In [23]:
# prepare all vocabs

batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

# all_vocabs = prepare_sequence(list(vocabs), word2index)
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs = all_vocabs.to(device)
all_vocabs.shape

torch.Size([2, 10001])

In [24]:
model = Skipgram(voc_size, 2, word2index).to(device)
model

Skipgram(
  (embedding_center): Embedding(10001, 2)
  (embedding_outside): Embedding(10001, 2)
)

In [25]:
input_tensor = torch.LongTensor(x).to(device)
label_tensor = torch.LongTensor(y).to(device)

In [26]:
loss = model(input_tensor, label_tensor, all_vocabs)

In [27]:
loss

tensor(8.4016, device='cuda:0', grad_fn=<NegBackward0>)

### 4. Training

In [44]:
batch_size = 2
emb_size   = 2
model      = Skipgram(voc_size, emb_size, word2index)
model      = model.to(device)
optimizer  = optim.Adam(model.parameters(), lr=0.001)

In [26]:
num_epochs = 100

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    loss = model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

Epoch     10 | Loss: 9.127102
Epoch     20 | Loss: 8.897195
Epoch     30 | Loss: 10.259260
Epoch     40 | Loss: 8.634832
Epoch     50 | Loss: 9.961052
Epoch     60 | Loss: 8.601841
Epoch     70 | Loss: 8.934109
Epoch     80 | Loss: 9.597401
Epoch     90 | Loss: 8.539652
Epoch    100 | Loss: 9.648331


### 5. Testing

In [28]:
def open_file(path_to_file):
    # Open the file in read mode
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

In [29]:
# File paths for semantic and syntactic data
semantic_file_path = "test/semantic_capital_country.txt"
syntatic_file_path = "test/syntatic_past_tense.txt"

# Function to read content from a file
def open_file(file_path):
    with open(file_path, "r") as file:
        return file.readlines()

# Load semantic and syntactic data
semantic = [line.strip() for line in open_file(semantic_file_path) if line.strip()]
syntatic = [line.strip() for line in open_file(syntatic_file_path) if line.strip()]

current_test = semantic 
for sent in semantic + syntatic:  
    if sent.startswith(":"):  
        current_test = syntatic
        continue
    current_test.append(sent.strip())
    
    current_test.append(sent.strip())

In [30]:
vector_space = []

device = model.embedding_center.weight.device  # Get device from the model
for word in vocabs:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [31]:
print("Model device:", next(model.parameters()).device)
print("Input tensor device:", input_tensor.device)
print("Label tensor device:", label_tensor.device)
print("All vocabs tensor device:", all_vocabs.device)

Model device: cuda:0
Input tensor device: cuda:0
Label tensor device: cuda:0
All vocabs tensor device: cuda:0


In [32]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        each_vect = tuple(each_vect)
        target_vector=tuple(target_vector)
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

In [33]:
def similarity(model, test_data):
    words = test_data.split(" ")

    embed0 = np.array(model.get_embed(words[0]))
    embed1 = np.array(model.get_embed(words[1]))
    embed2 = np.array(model.get_embed(words[2]))

    similar_vector = embed1 - embed0 + embed2

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    result = False
    if similar_word == words[3]:
        result = True

    return result

## Syntatic Accuracy

In [34]:
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

In [35]:
syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.00


## Semantic Accuracy

In [36]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [37]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


## Similarity Accuracy

In [38]:
file_path = "test/wordsim_similarity_goldstandard.txt"

content = open_file(file_path)

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [39]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [40]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [41]:
from scipy.stats import spearmanr

corr = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between the dataset metrics and model scores is {corr:2.2f}.")

Correlation between the dataset metrics and model scores is 0.11.


## 7. Save the model and data

In [42]:
import pickle
import os

In [45]:
model_dir = './app/models'
os.makedirs(model_dir, exist_ok=True)

torch.save(model.state_dict(), os.path.join(model_dir, 'skipgram.model'))

skipgram_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}
with open(os.path.join(model_dir, 'skipgram.args'), 'wb') as f:
    pickle.dump(skipgram_args, f)


In [46]:
skg_args = pickle.load(open('./app/models/skipgram.args', 'rb'))
load_model = Skipgram(**skg_args)
load_model.load_state_dict(torch.load('./app/models/skipgram.model'))
load_model.to(device)

  load_model.load_state_dict(torch.load('./app/models/skipgram.model'))


Skipgram(
  (embedding_center): Embedding(10001, 2)
  (embedding_outside): Embedding(10001, 2)
)

In [47]:
load_model.get_embed('Heritage')

(0.46847888827323914, -0.4608387053012848)

In [48]:
# Fetch embedding for a word
embedding = load_model.get_embed('succeed')
print(embedding)


(0.46847888827323914, -0.4608387053012848)


In [49]:
print(vocabs[:50])

['downsizings', 'GVMF', 'MESA', '6400', 'Annual', 'theoretically', 'reins', 'endangering', 'NEWHALL', 'Suthee', 'Net', 'infinitesimal', 'PRUDENTIAL', 'CHITTAGONG', 'pledging', 'RBAN', 'knocked', 'Lawton', 'fillets', 'purge', 'PLOTS', 'Further', 'ANNOUNCE', 'VENN', 'EARLY', 'buoyed', 'BILL', 'SHELDAUL', 'GRAIN', 'CSBA', 'Delegates', 'MOG', 'Expected', 'inroads', 'Cream', 'ALRN', 'IVORY', 'PROPOSE', '80', 'Labrador', 'explore', 'dlr', 'jeapordy', 'SCMS', 'proportionately', 'Humphrey', 'PENNY', 'Jovanovich', 'contractors', 'unquantifiable']


In [50]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

vec_heritage = [1.0088, -0.2803]
vec_succeed = [0.3600, -0.3778]

similarity = cosine_similarity(vec_heritage, vec_succeed)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.8584784902698835
