# Word2Vec (Negative Sampling)

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
np.__version__, torch.__version__

('1.26.3', '2.1.2')

In [3]:
import matplotlib
matplotlib.__version__

'3.8.2'

## 1. Load data

In [4]:
from nltk.corpus import brown

brown.categories()
corpus = brown.sents(categories="news")
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [5]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [6]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}

In [7]:
vocabs.append('<UNK>')
word2index['<UNK>'] = 6

In [8]:
index2word = {v:k for k, v in word2index.items()}

## 2. Prepare train data

In [9]:
window_size = 2

skipgrams = []

#loop each corpus
for doc in corpus:
    #look from the 2nd word until second last word
    for i in range(window_size, len(doc)-window_size):
        #center word
        center = word2index[doc[i]]
        #outside words = 2 words

        outside = []
        for j in range(window_size):
            outside.append(word2index[doc[i+(j+1)]])
            outside.append(word2index[doc[i-(j+1)]])

        #for each of these two outside words, we gonna append to a list
        for each_out in outside:
            skipgrams.append([center, each_out])
            #center, outside1;   center, outside2

In [10]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, skipgrams):
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)

In [11]:
x, y = random_batch(2, corpus, skipgrams)
x, y

(array([[2435],
        [2785]]),
 array([[5800],
        [2487]]))

In [12]:
x.shape  #batch_size, 1

(2, 1)

In [13]:
y.shape  #batch_size 1

(2, 1)

## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [14]:
z = 0.001

In [15]:
#count
from collections import Counter

word_count = Counter(flatten(corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

100554

In [16]:
vocabs

['Clifford',
 'everyday',
 '869',
 'authorizing',
 'erratic',
 'Garson',
 'debuts',
 'Hiring',
 'Uncas',
 "club's",
 'augment',
 'lows',
 'forth',
 'saluted',
 'thousands',
 'widely',
 'seemed',
 'vicious',
 'newsman',
 'increasing',
 'lasting',
 'Fund',
 'tons',
 'Russians',
 'liquor',
 'sheriff',
 'Asian',
 'bonding',
 'Story',
 'intervals',
 'ladies',
 'viewers',
 'confrontation',
 'crippled',
 'disarmament',
 'Vienna',
 'telephoned',
 'warfare',
 'seashore',
 'honor',
 'Eldon',
 'mantlepiece',
 '55',
 'domain',
 'proper',
 'front',
 'bulwark',
 'invests',
 'Tyson',
 '$7,500,000',
 'Pitcher',
 'surplus',
 'casuals',
 'resistant',
 'Confronted',
 'human',
 'full-scale',
 'fund-raising',
 'thwarted',
 'pulse-jet',
 'Charitable',
 'Tom',
 'George',
 'outfit',
 'Bester',
 'minutes',
 '39',
 'baton',
 'Brig.',
 "Anne's",
 'enigma',
 'quest',
 'negative',
 'identity',
 'provincial',
 'Without',
 'accommodated',
 'semester',
 'rapt',
 'McGehee',
 'snow',
 'tip',
 'Judy',
 "grandmother's",


$$P(w)=U(w)^{3/4}/Z$$

In [17]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'seemed': 1,
         'honor': 1,
         'front': 1,
         'George': 1,
         'minutes': 1,
         'measure': 1,
         'made': 5,
         'war': 1,
         'Communist': 2,
         'players': 1,
         'Americans': 1,
         'additional': 1,
         'news': 1,
         'known': 1,
         'without': 3,
         'Army': 1,
         'room': 1,
         'Monday': 3,
         'San': 1,
         'medical': 1,
         'brought': 1,
         'headed': 1,
         'continue': 1,
         'a.m.': 1,
         'opportunity': 1,
         'Henry': 1,
         'bill': 3,
         'Federal': 1,
         'friends': 1,
         'pay': 2,
         'those': 3,
         'since': 3,
         'service': 2,
         'lines': 1,
         'away': 1,
         'effective': 1,
         'National': 2,
         'pressure': 1,
         'became': 1,
         'plan': 3,
         'David': 1,
         'Senate': 2,
         'gave': 1,
         'later': 2,
         'life': 1,
         "don't

## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [18]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)

In [19]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [20]:
batch_size = 2
x, y = random_batch(batch_size, corpus, skipgrams)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [21]:
k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [22]:
y_tensor[1]

tensor([909])

In [23]:
neg_samples[1]

tensor([11604, 12950,  7412,  2640,  5325])

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [24]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size, word2index):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()

        self.word2index = word2index
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)
    
    def get_embed(self, word):
        word2index = self.word2index
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index])
        
        embed_c = self.embedding_center(word)
        embed_o = self.embedding_outside(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()

In [25]:
#test your model
emb_size = 2
voc_size = len(vocabs)
model = SkipgramNeg(voc_size, emb_size, word2index)

In [26]:
loss = model(x_tensor, y_tensor, neg_samples)

In [27]:
loss

tensor(2.3573, grad_fn=<NegBackward0>)

## 5. Training

In [28]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [29]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
import time

num_epochs = 100

total_start = time.time()
for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus, skipgrams)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:5.0f} | Loss: {loss:5.2f}")

total_end = time.time()
training_time_total = total_end - total_start
min, sec = epoch_time(total_start, total_end)

print(f"\nComplete: \nTotal Loss: {loss:2.2f} | Time Taken: {min} minutes and {sec} seconds")

Epoch    10 | Loss:  2.91
Epoch    20 | Loss:  2.10
Epoch    30 | Loss:  3.29
Epoch    40 | Loss:  1.60
Epoch    50 | Loss:  1.33
Epoch    60 | Loss:  1.03
Epoch    70 | Loss:  1.09
Epoch    80 | Loss:  2.69
Epoch    90 | Loss:  1.91
Epoch   100 | Loss:  2.61

Complete: 
Total Loss: 2.61 | Time Taken: 0 minutes and 4 seconds


# 6. Testing

In [31]:
def open_file(path_to_file):
    # Open the file in read mode
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

In [32]:
file_path = "test/word-test.v1.min.txt"

content = open_file(file_path)

semantic = []
syntatic = []

current_test = semantic
for sent in content:
    if sent[0] == ':':
        current_test = syntatic
        continue
    
    current_test.append(sent.strip())

In [33]:
vector_space = []

for word in vocabs:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [34]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        each_vect = tuple(each_vect)
        target_vector=tuple(target_vector)
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

In [35]:
def similarity(model, test_data):
    words = test_data.split(" ")

    embed0 = np.array(model.get_embed(words[0]))
    embed1 = np.array(model.get_embed(words[1]))
    embed2 = np.array(model.get_embed(words[2]))

    similar_vector = embed1 - embed0 + embed2

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    result = False
    if similar_word == words[3]:
        result = True

    return result

## Semantic Accuracy

In [36]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [37]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


## Syntatic Accuracy

In [38]:
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

In [39]:
syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.00


## Similarity Accuracy

In [40]:
file_path = "test/wordsim_similarity_goldstandard.txt"

content = open_file(file_path)

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [41]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [42]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [43]:
from scipy.stats import spearmanr

corr = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between the dataset metrics and model scores is {corr:2.2f}.")

Correlation between the dataset metrics and model scores is 0.22.


# 7. Save the model and meta data

In [44]:
import pickle

torch.save(model.state_dict(), '../app/models/neg.model')

neg_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}

pickle.dump(neg_args, open('../app/models/neg.args', 'wb'))

In [45]:
neg_args = pickle.load(open('../app/models/neg.args', 'rb'))
model_neg = SkipgramNeg(**neg_args)
model_neg.load_state_dict(torch.load('../app/models/neg.model'))

# Test the model
model_neg.get_embed('sad')

(0.004879907704889774, 0.11010897159576416)