# 1. Skip-gram with naive softmax

I recommend taking a look at these materials first.

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture2.pdf
* https://arxiv.org/abs/1301.3781
* http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

In [1]:
import mindspore
from mindspore import nn, Tensor, ops
import nltk
import random
import numpy as np
from collections import Counter
from mindnlp.modules import Accumulator
def flatten(l):
    """Concatenate the items of every sub-sequence in ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


# Seed Python's RNG (used below for batch shuffling and for sampling a test word).
random.seed(1024)

  from tqdm.autonotebook import tqdm


In [2]:
# Record the library versions this notebook was executed with.
for library in (mindspore, nltk):
    print(library.__version__)

2.0.0.20230623
3.7


In [4]:
def getBatch(batch_size, train_data):
    """Yield shuffled mini-batches of ``train_data``.

    ``train_data`` is shuffled in place, then yielded as consecutive slices of
    at most ``batch_size`` items; only the last batch may be shorter.
    Nothing is yielded when ``train_data`` is empty.

    Args:
        batch_size: Maximum number of examples per batch; must be positive.
        train_data: Mutable sequence of training examples (shuffled in place).

    Yields:
        Non-empty slices of ``train_data``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    random.shuffle(train_data)
    for sindex in range(0, len(train_data), batch_size):
        yield train_data[sindex: sindex + batch_size]

In [47]:
def prepare_sequence(seq, word2index):
    """Convert a sequence of words into an int64 ``Tensor`` of vocabulary indices.

    Words that have no entry in ``word2index`` are mapped to the index of
    ``"<UNK>"``.
    """
    idxs = []
    for w in seq:
        if word2index.get(w) is not None:
            idxs.append(word2index[w])
        else:
            idxs.append(word2index["<UNK>"])
    return Tensor(idxs, dtype=mindspore.int64)


def prepare_word(word, word2index):
    """Return a one-element int64 ``Tensor`` holding the index of ``word``.

    Falls back to the ``"<UNK>"`` index when ``word`` has no entry in
    ``word2index``.
    """
    if word2index.get(word) is not None:
        index = word2index[word]
    else:
        index = word2index["<UNK>"]
    return Tensor([index], dtype=mindspore.int64)

## Data load and Preprocessing 

### Load corpus : Gutenberg corpus

If you don't have the Gutenberg corpus yet, download it first with `nltk.download('gutenberg')`.

In [6]:
# List the file ids available in NLTK's Gutenberg corpus.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [7]:
# Keep only the first 100 sentences of Moby Dick (a small sample for a quick
# test run) and lower-case every token.
corpus = [
    [word.lower() for word in sent]
    for sent in list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
]

### Extract stopwords from the tails of the unigram distribution

In [8]:
# Unigram frequencies over every token of the corpus.
word_count = Counter(word for sent in corpus for word in sent)
# Number of word types to cut from each end of the frequency ranking (1% of the types).
border = int(0.01 * len(word_count))

In [9]:
# The `border` most frequent plus the `border` least frequent (word, count) pairs.
ranked = word_count.most_common()
stopwords = ranked[:border] + ranked[::-1][:border]

In [10]:
# Drop the counts: keep only the first element (the word) of every entry.
stopwords = [entry[0] for entry in stopwords]

In [11]:
# Inspect the words that will be excluded from the vocabulary.
stopwords

[',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

### Build vocab

In [12]:
# Sort the surviving word types so the vocabulary order -- and with it every
# word index and the randomly sampled test word -- is identical on each run.
# Plain set iteration order changes between kernel sessions because string
# hashing is randomized per process.
vocab = sorted(set(flatten(corpus)) - set(stopwords))
vocab.append('<UNK>')

In [13]:
# Distinct word types in the corpus vs. vocabulary size (stopwords removed, '<UNK>' added).
print(len(set(flatten(corpus))), len(vocab))

592 583


In [14]:
# '<UNK>' is pinned to index 0; every other vocabulary word takes the next
# free index the first time it is seen.
word2index = {'<UNK>': 0}
for vo in vocab:
    word2index.setdefault(vo, len(word2index))

# Reverse lookup: index -> word.
index2word = dict((index, word) for word, index in word2index.items())

### Prepare train data 

Example of the sliding-window data:

<img src="../images/01.skipgram-prepare-data.png">
<center>borrowed image from http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/</center>

In [15]:
WINDOW_SIZE = 3  # number of context positions on each side of the center word

# Pad every sentence with WINDOW_SIZE dummy tokens on both ends, then slide a
# window of 2 * WINDOW_SIZE + 1 tokens over it.
padding = ['<DUMMY>'] * WINDOW_SIZE
windows = []
for c in corpus:
    windows.extend(nltk.ngrams(padding + c + padding, WINDOW_SIZE * 2 + 1))

In [16]:
# Inspect the first window: left padding followed by the start of the first sentence.
windows[0]

('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by')

In [17]:
# Pair each window's center word with every real (non-padding) context word
# found at the other positions of that window.
train_data = [
    (window[WINDOW_SIZE], context)
    for window in windows
    for position, context in enumerate(window)
    if position != WINDOW_SIZE and context != '<DUMMY>'
]

print(train_data[:WINDOW_SIZE * 2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


In [18]:
# Containers for the center-word (X_p) and context-word (y_p) index tensors.
X_p, y_p = [], []

In [19]:
# Inspect the first (center word, context word) pair.
train_data[0]

('[', 'moby')

In [20]:
# Rebuild both lists from scratch instead of appending to lists created in an
# earlier cell, so re-running this cell cannot duplicate the training pairs.
# Each word index tensor is reshaped with .view(1, -1).
X_p = [prepare_word(tr[0], word2index).view(1, -1) for tr in train_data]
y_p = [prepare_word(tr[1], word2index).view(1, -1) for tr in train_data]

In [21]:
# Re-bind train_data: from here on it holds (center, context) index-tensor pairs
# instead of word pairs.
train_data = [(center, context) for center, context in zip(X_p, y_p)]

In [22]:
# Number of (center, context) training pairs.
len(train_data)

7606

## Modeling

<img src="../images/01.skipgram-objective.png">
<center>borrowed image from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture2.pdf</center>

In [60]:
class Skipgram(nn.Cell):
    """Skip-gram word-embedding model trained with a full-vocabulary softmax.

    Holds two embedding tables of shape (vocab_size, projection_dim):
    ``embedding_v`` for center words and ``embedding_u`` for context/outer
    words.
    """

    def __init__(self, vocab_size, projection_dim):
        """Create both embedding tables and initialise them.

        Args:
            vocab_size: Number of rows in each embedding table.
            projection_dim: Dimensionality D of the word vectors.
        """
        super().__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)
        # Create the primitive once instead of on every forward pass.
        self.batch_matmul = ops.BatchMatMul()

        # Center embeddings: uniform between -1 and 1; outer embeddings: all zeros.
        minval = Tensor(-1, mindspore.float32)
        maxval = Tensor(1, mindspore.float32)
        self.embedding_v.embedding_table.set_data(ops.uniform(self.embedding_v.embedding_table.shape, minval, maxval))  # init
        self.embedding_u.embedding_table.set_data(ops.zeros(self.embedding_u.embedding_table.shape, mindspore.float32))  # init

    def construct(self, center_words, target_words, outer_words):
        """Return the mean negative log-likelihood of the targets given the centers.

        Args:
            center_words: Index tensor of center words, B x 1.
            target_words: Index tensor of the observed context words, B x 1.
            outer_words: Index tensor of the words to normalise over, B x V.

        Returns:
            Scalar tensor: the negative mean log-softmax of the target scores.
        """
        center_embeds = self.embedding_v(center_words)  # B x 1 x D
        target_embeds = self.embedding_u(target_words)  # B x 1 x D
        outer_embeds = self.embedding_u(outer_words)  # B x V x D

        center_t = ops.transpose(center_embeds, (0, 2, 1))  # B x D x 1
        scores = self.batch_matmul(target_embeds, center_t).squeeze(2)  # Bx1xD * BxDx1 => Bx1
        norm_scores = self.batch_matmul(outer_embeds, center_t).squeeze(2)  # BxVxD * BxDx1 => BxV

        # log(exp(s) / sum(exp(all))) == s - logsumexp(all). Using log-sum-exp
        # avoids overflowing exp() when the scores grow large.
        log_norm = ops.logsumexp(norm_scores, 1, True)  # B x 1 (keep_dims=True)
        nll = -ops.mean(scores - log_norm)  # log-softmax

        return nll  # negative log likelihood

    def prediction(self, inputs):
        """Look up and return the center-word embeddings for ``inputs``."""
        embeds = self.embedding_v(inputs)

        return embeds

## Train  

In [24]:
# Training hyper-parameters.
EPOCH = 101  # number of passes over the training pairs
BATCH_SIZE = 256  # training pairs per mini-batch
EMBEDDING_SIZE = 30  # dimensionality of the word vectors

In [61]:
# Fresh loss history, a newly initialised model and its Adam optimizer.
losses = []
model = Skipgram(vocab_size=len(word2index), projection_dim=EMBEDDING_SIZE)
optimizer = nn.Adam(params=model.trainable_params(), learning_rate=0.01)

In [62]:
# Number of training steps whose gradients are combined by the accumulator.
accumulate_step = 2
# NOTE(review): presumably applies an optimizer update once every
# `accumulate_step` calls -- confirm against mindnlp's Accumulator.
accumulator = Accumulator(optimizer, accumulate_step)


def forward_fn(inputs, targets, vocabs):
    """Run the model on one batch and return its loss divided by ``accumulate_step``."""
    loss = model(inputs, targets, vocabs)
    # Scale the loss down so that the combined gradients of `accumulate_step`
    # batches match those of a single averaged loss.
    return loss / accumulate_step


# Gradient function: returns (loss, grads) where the gradients are taken with
# respect to the model's trainable parameters (no input positions: None).
grad_fn = mindspore.value_and_grad(forward_fn, None, model.trainable_params())


# One training step: compute loss and gradients, then hand the gradients to
# the accumulator.
def train_step(inputs, targets, vocabs):
    """Process one batch and return its (scaled) loss."""
    loss, grads = grad_fn(inputs, targets, vocabs)
    # ops.depend ties the accumulator call to the returned loss so that it is
    # executed before the loss is handed back.
    loss = ops.depend(loss, accumulator(grads))
    return loss

In [63]:
# The vocabulary index tensor is identical for every batch, so build it once
# instead of once per training step.
vocab_indices = prepare_sequence(list(vocab), word2index)  # V

for epoch in range(EPOCH):
    for batch in getBatch(BATCH_SIZE, train_data):

        inputs, targets = zip(*batch)

        inputs = ops.cat(inputs)  # B x 1
        targets = ops.cat(targets)  # B x 1
        # The last batch of an epoch can be smaller, so broadcast per batch.
        vocabs = vocab_indices.broadcast_to((inputs.shape[0], len(vocab)))  # B x V

        loss = train_step(inputs, targets, vocabs)

        # forward_fn divides the loss by accumulate_step; undo that for logging.
        losses.append(loss.asnumpy().item(0) * accumulate_step)

    # Report the mean loss since the previous report, then start a new window.
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 6.27
Epoch : 10, mean_loss : 4.69
Epoch : 20, mean_loss : 3.60
Epoch : 30, mean_loss : 3.32
Epoch : 40, mean_loss : 3.23
Epoch : 50, mean_loss : 3.19
Epoch : 60, mean_loss : 3.17
Epoch : 70, mean_loss : 3.15
Epoch : 80, mean_loss : 3.14
Epoch : 90, mean_loss : 3.14


## Test

In [64]:
def word_similarity(target, vocab, top_k=10):
    """Rank the words of ``vocab`` by cosine similarity to ``target``.

    Embeddings come from the notebook-level ``model`` and indices from the
    notebook-level ``word2index``.

    Args:
        target: Query word; it is excluded from the result.
        vocab: Iterable of candidate words.
        top_k: Number of most similar words to return (default 10).

    Returns:
        Up to ``top_k`` ``[word, cosine_similarity]`` lists, most similar first.
    """
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    # Iterate over the words directly; copying the vocabulary on every
    # iteration just to index into it made the loop quadratic.
    for word in vocab:
        if word == target:
            continue

        vector = model.prediction(prepare_word(word, word2index))
        cosine_sim = ops.cosine_similarity(target_V, vector).asnumpy().tolist()[0]
        similarities.append([word, cosine_sim])
    similarities.sort(key=lambda pair: pair[1], reverse=True)  # sort by similarity
    return similarities[:top_k]

In [65]:
# Sample one vocabulary word at random to use as the similarity query.
test = random.choice(list(vocab))
test

'hopeless'

In [67]:
# Show the words most similar to the sampled test word.
word_similarity(test, vocab)

[['no', 0.6545044779777527],
 ['belongest', 0.6357013583183289],
 ['thou', 0.6291839480400085],
 ['sallow', 0.6139289736747742],
 ['seas', 0.5946469306945801],
 ['brought', 0.58446204662323],
 ['incredible', 0.5820481181144714],
 [':', 0.5575752258300781],
 ['biggest', 0.5565429925918579],
 ['wine', 0.5476349592208862]]