# 3. GloVe: Global Vectors for Word Representation

I recommend you take a look at these material first.

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture3.pdf
* https://nlp.stanford.edu/pubs/glove.pdf

In [1]:
import os
import mindspore
from mindspore import nn, ops, Tensor, Parameter
import nltk
import random
import numpy as np
from collections import Counter
from mindnlp.modules import Accumulator
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

  from tqdm.autonotebook import tqdm


In [2]:
print(mindspore.__version__)
print(nltk.__version__)

2.0.0.20230623
3.7


In [3]:
gpu = '0'
# 设置使用哪些显卡进行训练
os.environ["CUDA_VISIBLE_DEVICES"] = gpu

In [4]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch

    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [5]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w]
                    if word2index.get(w) is not None
                    else word2index["<UNK>"], seq))
    sequence = Tensor(idxs, dtype=mindspore.int64)
    return sequence


def prepare_word(word, word2index):
    return Tensor([word2index[word]], dtype=mindspore.int64) \
        if word2index.get(word) is not None \
        else Tensor([word2index["<UNK>"]], dtype=mindspore.int64)

## Data load and Preprocessing 

In [6]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]

### Build vocab

In [7]:
vocab = list(set(flatten(corpus)))

In [8]:
word2index = {}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)

index2word = {v: k for k, v in word2index.items()}

In [9]:
WINDOW_SIZE = 5
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])

window_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>':
            continue
        window_data.append((window[WINDOW_SIZE], window[i]))

### Weighting Function 

<img src="../images/03.glove-weighting-function.png">
<center>borrowed image from https://nlp.stanford.edu/pubs/glove.pdf</center>

In [10]:
def weighting(w_i, w_j):
    try:
        x_ij = X_ik[(w_i, w_j)]
    except Exception:
        x_ij = 1

    x_max = 100  # 100 fixed in paper
    alpha = 0.75

    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    else:
        result = 1

    return result

### Build Co-occurence Matrix X

Because of model complexity, It is important to determine whether a tighter bound can be placed on the number of nonzero elements of X.

In [11]:
X_i = Counter(flatten(corpus))  # X_i

In [12]:
X_ik_window_5 = Counter(window_data)  # Co-occurece in window size 5

In [13]:
X_ik = {}
weighting_dic = {}

In [14]:
from itertools import combinations_with_replacement

In [15]:
for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_window_5.get(bigram) is not None:  # nonzero elements
        co_occer = X_ik_window_5[bigram]
        X_ik[bigram] = co_occer + 1  # log(Xik) -> log(Xik+1) to prevent divergence
        X_ik[(bigram[1], bigram[0])] = co_occer + 1
    else:
        pass

    weighting_dic[bigram] = weighting(bigram[0], bigram[1])
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0])

In [16]:
test = random.choice(window_data)
print(test)
try:
    print(X_ik[(test[0], test[1])] == X_ik[(test[1], test[0])])
except Exception:
    1

('sacred', 'any')
True


### Prepare train data

In [17]:
u_p = []  # center vec
v_p = []  # context vec
co_p = []  # log(x_ij)
weight_p = []  # f(x_ij)

for pair in window_data:
    u_p.append(prepare_word(pair[0], word2index).view(1, -1))
    v_p.append(prepare_word(pair[1], word2index).view(1, -1))

    try:
        cooc = X_ik[pair]
    except Exception:
        cooc = 1

    co_p.append(ops.log(Parameter(Tensor([cooc], dtype=mindspore.float32))).view(1, -1))
    weight_p.append(Parameter(Tensor([weighting_dic[pair]], dtype=mindspore.float32)).view(1, -1))

train_data = list(zip(u_p, v_p, co_p, weight_p))
del u_p
del v_p
del co_p
del weight_p
print(train_data[0])  # tuple (center vec i, context vec j log(x_ij), weight f(w_ij))

(Tensor(shape=[1, 1], dtype=Int64, value=
[[144]]), Tensor(shape=[1, 1], dtype=Int64, value=
[[2169]]), Tensor(shape=[1, 1], dtype=Float32, value=
[[ 6.93147182e-01]]), Tensor(shape=[1, 1], dtype=Float32, value=
[[ 5.31829596e-02]]))


## Modeling 

<img src="../images/03.glove-objective.png">
<center>borrowed image from https://nlp.stanford.edu/pubs/glove.pdf</center>

In [18]:
class GloVe(nn.Cell):

    def __init__(self, vocab_size, projection_dim):
        super(GloVe, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)  # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)  # out embedding

        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)

        initrange = (2.0 / (vocab_size + projection_dim))**0.5  # Xavier init
        minval = Tensor(-initrange, mindspore.float32)
        maxval = Tensor(initrange, mindspore.float32)
        self.embedding_v.embedding_table.set_data(ops.uniform(self.embedding_v.embedding_table.shape, minval, maxval))  # init
        self.embedding_u.embedding_table.set_data(ops.uniform(self.embedding_u.embedding_table.shape, minval, maxval))  # init
        self.v_bias.embedding_table.set_data(ops.uniform(self.v_bias.embedding_table.shape, minval, maxval))  # init
        self.u_bias.embedding_table.set_data(ops.uniform(self.u_bias.embedding_table.shape, minval, maxval))  # init

    def construct(self, center_words, target_words, coocs, weights):
        center_embeds = self.embedding_v(center_words)  # B x 1 x D
        target_embeds = self.embedding_u(target_words)  # B x 1 x D

        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)

        inner_product = ops.BatchMatMul()(target_embeds, ops.transpose(center_embeds, (0, 2, 1))).squeeze(2)  # Bx1

        loss = weights * ops.pow(inner_product + center_bias + target_bias - coocs, 2)

        return ops.sum(loss)

    def prediction(self, inputs):
        v_embeds = self.embedding_v(inputs)  # B x 1 x D
        u_embeds = self.embedding_u(inputs)  # B x 1 x D

        return v_embeds + u_embeds  # final embed

## Train 

In [19]:
EMBEDDING_SIZE = 50
BATCH_SIZE = 256
EPOCH = 51

In [20]:
losses = []
model = GloVe(len(word2index), EMBEDDING_SIZE)
optimizer = nn.Adam(model.trainable_params(), learning_rate=0.001)

In [21]:
accumulate_step = 2
accumulator = Accumulator(optimizer, accumulate_step)


def forward_fn(inputs, targets, coocs, weights):
    """Forward function"""
    loss = model(inputs, targets, coocs, weights)
    return loss / accumulate_step


# Get gradient function
grad_fn = mindspore.value_and_grad(forward_fn, None, model.trainable_params())


# Define function of one-step training
@mindspore.jit
def train_step(inputs, targets, coocs, weights):
    """Training steps"""
    loss, grads = grad_fn(inputs, targets, coocs, weights)
    loss = ops.depend(loss, accumulator(grads))
    return loss

In [22]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):

        inputs, targets, coocs, weights = zip(*batch)
        inputs = ops.cat(inputs)  # B x 1
        targets = ops.cat(targets)  # B x 1
        coocs = ops.cat(coocs)
        weights = ops.cat(weights)

        loss = train_step(inputs, targets, coocs, weights)
        losses.append(loss.asnumpy().item() * accumulate_step)

    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 379.39
Epoch : 10, mean_loss : 5.88
Epoch : 20, mean_loss : 1.22
Epoch : 30, mean_loss : 0.51
Epoch : 40, mean_loss : 0.20
Epoch : 50, mean_loss : 0.08


## Test 

In [23]:
def word_similarity(target, vocab):
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target:
            continue

        vector = model.prediction(prepare_word(list(vocab)[i], word2index))

        cosine_sim = ops.cosine_similarity(target_V, vector).asnumpy().tolist()[0]
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [24]:
test = random.choice(list(vocab))
test

'surely'

In [25]:
word_similarity(test, vocab)

[['rev', 0.9097740650177002],
 ['nick', 0.8757970929145813],
 ['trod', 0.8692667484283447],
 [',', 0.8691204786300659],
 ['breaches', 0.8660611510276794],
 ['henry', 0.8610873222351074],
 ['flail', 0.8497377038002014],
 ['sharks', 0.8434864282608032],
 ['and', 0.8433796763420105],
 ['artificial', 0.8418195843696594]]

## TODO

* Use <a href="https://docs.scipy.org/doc/scipy/reference/sparse.html">sparse-matrix</a> to build co-occurence matrix for memory efficiency

## Suggested Readings

* <a href="http://ruder.io/word-embeddings-2017/?utm_campaign=Revue%20newsletter&utm_medium=Newsletter&utm_source=The%20Wild%20Week%20in%20AI">Word embeddings in 2017: Trends and future directions</a>