In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# Load the corpus into a DataFrame whose single named column is 'sentence'
# (passing `names` also tells pandas the file has no header row).
# NOTE(review): read_csv splits on commas, so a sentence containing ',' would
# not land intact in 'sentence' — confirm the corpus is comma-free.
df = pd.read_csv('text_corpus.txt',names = ['sentence'])
df

Unnamed: 0,sentence
0,the morning sky was pale and quiet.
1,she brewed coffee while the cat watched the wi...
2,i left my keys on the kitchen table again.
3,the bus arrived late because of heavy rain.
4,students lined up outside the library before o...
...,...
56,we rolled back and wrote a postmortem.
57,the cafe offered quiet seats with strong wifi.
58,rain tapped lightly on the window glass.
59,a street musician played jazz under the bridge.


In [3]:
# Sanity check: pull the entry at row 1 of the first column.
value = df.iloc[1,0]

In [4]:
# Display the entry selected for the sanity check.
value

'she brewed coffee while the cat watched the window.'

In [5]:
# Collect every entry of the first column as a plain Python list.
# One vectorised call replaces the row-by-row `iloc` loop, which also
# overwrote the notebook-level `value` variable on every iteration.
sentences = df.iloc[:, 0].tolist()
sentences[:10]

['the morning sky was pale and quiet.',
 'she brewed coffee while the cat watched the window.',
 'i left my keys on the kitchen table again.',
 'the bus arrived late because of heavy rain.',
 'students lined up outside the library before opening.',
 'the teacher wrote three formulas on the board.',
 'we measured the distance with a cheap ruler.',
 'a small error slipped into the final report.',
 'the startup pitched their idea to five investors.',
 'users complained about the app crashing at midnight.']

In [6]:
def preprocessing(corpus):
    """Build the vocabulary: unique whitespace-separated tokens in first-seen order.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        list[str]: every distinct token exactly once, ordered by first
        appearance. Tokens are not normalised, so punctuation stays attached
        (e.g. 'quiet.' and 'quiet' are separate entries).
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # time linear in the number of tokens instead of rescanning a growing list
    # for every token.
    return list(dict.fromkeys(token for sentence in corpus for token in sentence.split()))
    

In [7]:
# Build the vocabulary from all sentences and preview the first ten tokens.
words = preprocessing(sentences)
words[:10]

['the',
 'morning',
 'sky',
 'was',
 'pale',
 'and',
 'quiet.',
 'she',
 'brewed',
 'coffee']

In [8]:
def indexing_word(words):
    """Create the two vocabulary lookup tables.

    Args:
        words: sequence of tokens; a token's position becomes its id.

    Returns:
        (word_to_idx, idx_to_word): dict token -> position and dict
        position -> token.
    """
    word_to_idx = {}
    for position, token in enumerate(words):
        word_to_idx[token] = position
    idx_to_word = dict(enumerate(words))
    return word_to_idx, idx_to_word

# Build both lookup tables for the vocabulary (token -> id and id -> token).
word_to_idx, idx_to_word = indexing_word(words)

In [9]:
# Hyperparameters.
# context_size: how many words on each side of a target word compile_data pairs with it.
context_size = 2
# embedding_dim: embedding width passed to NegativeSamplingModel.
embedding_dim = 120

In [10]:
def compile_data(sentences, words, word_to_idx, idx_to_word, window=None):
    """Build the unique (target, context) word pairs used for training.

    Each sentence is split on whitespace. Only positions that have a full
    window on both sides become targets, so the first and last ``window``
    tokens of a sentence are never targets and sentences shorter than
    ``2 * window + 1`` tokens contribute nothing.

    Args:
        sentences: iterable of sentence strings.
        words, word_to_idx, idx_to_word: accepted for call compatibility;
            not used by this function.
        window: number of context words taken on each side of the target.
            Defaults to the notebook-level ``context_size``.

    Returns:
        list[tuple[str, str]]: each (target_word, context_word) pair once, in
        first-seen order. For one target the contexts are visited nearest
        first: left 1, right 1, left 2, right 2, ...
    """
    if window is None:
        window = context_size

    data = []
    # Track emitted pairs in a set. The previous `con_w not in data` test
    # compared a string against a list of tuples, so it was always true: it
    # never removed a duplicate and still scanned the whole list every time.
    seen = set()
    for s in sentences:
        sen = s.split()
        for i in range(window, len(sen) - window):
            target_word = sen[i]
            for j in range(1, window + 1):
                for con_w in (sen[i - j], sen[i + j]):
                    pair = (target_word, con_w)
                    if pair not in seen:
                        seen.add(pair)
                        data.append(pair)
    return data


In [11]:
# Generate the (target, context) training pairs and preview the first ten.
data = compile_data(sentences, words, word_to_idx, idx_to_word)
data[:10]

[('sky', 'morning'),
 ('sky', 'was'),
 ('sky', 'the'),
 ('sky', 'pale'),
 ('was', 'sky'),
 ('was', 'pale'),
 ('was', 'morning'),
 ('was', 'and'),
 ('pale', 'was'),
 ('pale', 'and')]

In [12]:
from torch.utils.data import Dataset, DataLoader


In [13]:
# Convert each (target, context) word pair into the matching pair of vocabulary ids.
indexed_data = [(word_to_idx[w1], word_to_idx[w2]) for (w1, w2) in data]


In [14]:
# Preview the first ten id pairs.
indexed_data[:10]

[(2, 1),
 (2, 3),
 (2, 0),
 (2, 4),
 (3, 2),
 (3, 4),
 (3, 1),
 (3, 5),
 (4, 3),
 (4, 5)]

In [15]:
from torch.utils.data import Dataset

class Word2VecDataset(Dataset):
    """Map-style dataset that wraps a sequence of training pairs."""

    def __init__(self, pairs):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.pairs = pairs

    def __len__(self):
        """Return how many pairs are stored."""
        n_pairs = len(self.pairs)
        return n_pairs

    def __getitem__(self, idx):
        """Return the pair stored at position ``idx``, unchanged."""
        pair = self.pairs[idx]
        return pair


In [16]:
# Wrap the id pairs in a Dataset and serve them in shuffled mini-batches of 32.
dataset = Word2VecDataset(indexed_data)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [17]:
# Prints only the default object repr; len(dataset) would show the number of pairs.
print(dataset)

<__main__.Word2VecDataset object at 0x00000193B4B76050>


In [18]:
# Vocabulary size: used as the embedding row count and as the upper bound when
# drawing negative-sample ids.
word_size = len(words)
word_size

341

In [19]:
# Number of negative samples drawn for each positive pair.
negative_nums = 8
# Observed (center, context) id pairs as a set, for fast membership tests while sampling.
indexed_set = set(indexed_data)

In [20]:
def create_negative_sampling(center_w, context_w, word_size, negative_nums, indexed_set):
    """Draw negative context ids for one positive (center, context) pair.

    Ids are drawn uniformly from ``[0, word_size)`` with replacement, using
    NumPy's global random state. An id is kept only if it differs from
    ``context_w`` and ``(center_w, id)`` is not in ``indexed_set``.

    Args:
        center_w: id of the center word.
        context_w: id of the true context word (never returned).
        word_size: vocabulary size; exclusive upper bound of the drawn ids.
        negative_nums: number of negative ids to return.
        indexed_set: set of observed (center_id, context_id) pairs.

    Returns:
        list of ``negative_nums`` ids; repeats are possible.

    Raises:
        ValueError: if no id in ``[0, word_size)`` passes the acceptance rule,
            which would otherwise make the sampling loop run forever.
    """
    def is_valid(candidate):
        return candidate != context_w and (center_w, candidate) not in indexed_set

    neg_samples = []
    rejected_in_a_row = 0
    feasibility_checked = False
    while len(neg_samples) < negative_nums:
        neg = np.random.randint(0, word_size)
        if is_valid(neg):
            neg_samples.append(neg)
            rejected_in_a_row = 0
            continue

        rejected_in_a_row += 1
        # A long rejection streak may mean that nothing can ever be accepted.
        # Scan the vocabulary once to find out, instead of spinning forever.
        if not feasibility_checked and rejected_in_a_row >= word_size:
            if not any(is_valid(candidate) for candidate in range(word_size)):
                raise ValueError(
                    f"no valid negative sample exists for center word {center_w}"
                )
            feasibility_checked = True
    return neg_samples

        

In [21]:
import torch.nn as nn
class NegativeSamplingModel(nn.Module):
    """Two-table embedding model trained with the negative-sampling loss.

    ``Wv_embedding`` is looked up for center words; ``Wu_embedding`` is looked
    up for context words and for negative samples. Both tables have
    ``word_size`` rows and ``embedding_dim`` columns.
    """

    def __init__(self, word_size, embedding_dim):
        super().__init__()
        self.word_size = word_size
        self.embedding_dim = embedding_dim
        # Wv is a V x dim matrix holding the center-word vectors v_w.
        self.Wv_embedding = nn.Embedding(word_size, embedding_dim)
        # Wu is a V x dim matrix holding the context-word vectors u_w.
        self.Wu_embedding = nn.Embedding(word_size, embedding_dim)
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, center_word, context_word, negative_samples):
        """Return the batch-mean negative-sampling loss as a scalar tensor.

        Args:
            center_word: center-word ids, one per example (B,).
            context_word: true context-word ids, one per example (B,).
            negative_samples: negative ids, K per example (B, K).
        """
        v_center = self.Wv_embedding(center_word)          # B x dim
        u_context = self.Wu_embedding(context_word)        # B x dim
        # Looking up K negatives per example yields B x K x dim.
        u_negative = self.Wu_embedding(negative_samples)

        # Positive term: log-sigmoid of the center/context dot product.
        pos_loss = self.log_sigmoid((v_center * u_context).sum(dim=1))   # B

        # (B x K x dim) * (B x 1 x dim) broadcasts to B x K x dim;
        # summing over dim=2 leaves one score per negative: B x K.
        neg_score = (u_negative * v_center.unsqueeze(1)).sum(dim=2)
        # Negatives are pushed apart, hence the sign flip; sum over K -> B.
        neg_loss = self.log_sigmoid(-neg_score).sum(dim=1)

        return -(pos_loss + neg_loss).mean()

In [22]:
import torch.optim as optim
# Instantiate the model and an Adam optimizer over all of its parameters.
model = NegativeSamplingModel(word_size, embedding_dim)
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [23]:
# Number of full passes over the training pairs.
epochs = 1000

In [24]:
# Training: one optimizer step per mini-batch. Negative samples are drawn
# afresh for every positive pair each time that pair is visited.
loss_lst = []   # summed loss of every 20th epoch
for epoch in range(epochs):
    total_loss = 0
    for center, context in loader:
        center, context = center.long(), context.long()

        # One list of negative ids per (center, context) pair in the batch.
        negative_samples = torch.LongTensor([
            create_negative_sampling(
                center_w=center_id.item(),
                context_w=context_id.item(),
                word_size=word_size,
                negative_nums=negative_nums,
                indexed_set=indexed_set,
            )
            for center_id, context_id in zip(center, context)
        ])

        loss = model(center, context, negative_samples)

        # Clear stale gradients, backpropagate, then update the embeddings.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Record and report the summed batch losses on every 20th epoch only.
    if epoch % 20 == 0:
        loss_lst.append(total_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")



Epoch 1/1000, Loss: 1171.0338
Epoch 21/1000, Loss: 842.5060
Epoch 41/1000, Loss: 677.7638
Epoch 61/1000, Loss: 522.7814
Epoch 81/1000, Loss: 363.4334
Epoch 101/1000, Loss: 228.1483
Epoch 121/1000, Loss: 136.7229
Epoch 141/1000, Loss: 79.1784
Epoch 161/1000, Loss: 43.3326
Epoch 181/1000, Loss: 21.6457
Epoch 201/1000, Loss: 11.0696
Epoch 221/1000, Loss: 8.6528
Epoch 241/1000, Loss: 3.2399
Epoch 261/1000, Loss: 2.0430
Epoch 281/1000, Loss: 1.1421
Epoch 301/1000, Loss: 1.5472
Epoch 321/1000, Loss: 0.6752
Epoch 341/1000, Loss: 0.3619
Epoch 361/1000, Loss: 0.2605
Epoch 381/1000, Loss: 0.2012
Epoch 401/1000, Loss: 0.1462
Epoch 421/1000, Loss: 0.1031
Epoch 441/1000, Loss: 0.0815
Epoch 461/1000, Loss: 0.0588
Epoch 481/1000, Loss: 0.0472
Epoch 501/1000, Loss: 0.0343
Epoch 521/1000, Loss: 0.0247
Epoch 541/1000, Loss: 0.0180
Epoch 561/1000, Loss: 0.0138
Epoch 581/1000, Loss: 0.0106
Epoch 601/1000, Loss: 0.0076
Epoch 621/1000, Loss: 0.0057
Epoch 641/1000, Loss: 0.0043
Epoch 661/1000, Loss: 0.0029
E

In [25]:
import torch
import torch.nn.functional as F

@torch.no_grad()
def get_similar_words(word: str, word_to_idx: dict, idx_to_word: dict, model, top_n: int = 5):
    """Return the words closest to ``word`` by cosine similarity.

    Each word is represented by the average of its rows in
    ``model.Wv_embedding`` and ``model.Wu_embedding``.

    Args:
        word: query word; must be a key of ``word_to_idx``.
        word_to_idx: mapping word -> row index.
        idx_to_word: mapping row index -> word.
        model: object exposing ``Wv_embedding`` and ``Wu_embedding`` modules
            whose ``weight`` tensors have the same [V, D] shape.
        top_n: number of neighbours requested.

    Returns:
        list[str]: neighbours ordered from most to least similar. The query
        word is never included, so at most ``V - 1`` words are returned even
        when ``top_n`` is larger.

    Raises:
        KeyError: if ``word`` is not in ``word_to_idx``.
    """
    # Validate first so an unknown word fails before any tensor work.
    if word not in word_to_idx:
        raise KeyError(f"'{word}' does not exist")
    idx = word_to_idx[word]

    E = (model.Wv_embedding.weight + model.Wu_embedding.weight) / 2    # [V, D]
    E_norm = F.normalize(E, p=2, dim=1)
    sims = E_norm @ E_norm[idx]     # [V] cosine similarity to the query

    # -inf (rather than -1.0) guarantees the query can never tie with a real
    # neighbour whose cosine similarity happens to be exactly -1.
    sims[idx] = float("-inf")

    # topk raises when k exceeds the number of elements; clamp to the number
    # of candidates that remain once the query itself is excluded.
    k = max(0, min(top_n, sims.numel() - 1))
    ids = torch.topk(sims, k=k).indices
    return [idx_to_word[i] for i in ids.tolist()]


In [26]:
# Query the five nearest neighbours of "sky" using the trained embeddings.
similar = get_similar_words("sky", word_to_idx, idx_to_word, model, top_n=5)
print(similar)


['pale', 'was', 'my', 'chance.', 'size']


In [27]:
# Persist the model's state_dict (its parameters) rather than the whole model object.
torch.save(model.state_dict(), "sgns_model.pt")
