# Glove - 126161

his notebook implements the GloVe model from scratch using a weighted least-squares objective over global word co-occurrence statistics, trained on a subset of the Reuters-21578 corpus with a dynamic context window.

In [2]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import random
import re
from collections import Counter, defaultdict

import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize

In [3]:
for res in ["reuters", "punkt", "punkt_tab"]:
    try:
        nltk.data.find(res)
    except LookupError:
        nltk.download(res)

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/michaellacar/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaellacar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/michaellacar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
def load_reuters():
    corpus = []
    for fid in reuters.fileids():
        text = reuters.raw(fid).lower()
        text = re.sub(r"[^a-z\s]", "", text)
        tokens = word_tokenize(text)
        if len(tokens) > 2:
            corpus.append(tokens)
    return corpus

In [5]:
corpus = load_reuters()

In [6]:
MAX_DOCS = 300   # recommended: 200â€“300
corpus = corpus[:MAX_DOCS]


In [7]:
MIN_COUNT = 5

all_tokens = [word for doc in corpus for word in doc]
word_counts = Counter(all_tokens)

vocabs = [word for word, count in word_counts.items() if count >= MIN_COUNT]

word2index = {word: idx for idx, word in enumerate(vocabs)}
index2word = {idx: word for word, idx in word2index.items()}

vocab_size = len(vocabs)
print("Vocabulary size:", vocab_size)


Vocabulary size: 1190


# Build Co-occurence

In [8]:
def build_cooccurrence(corpus, window_size=2):
    cooc = defaultdict(float)

    for doc in corpus:
        for i, word in enumerate(doc):
            if word not in word2index:
                continue

            center_id = word2index[word]

            start = max(0, i - window_size)
            end = min(len(doc), i + window_size + 1)

            for j in range(start, end):
                if i != j and doc[j] in word2index:
                    context_id = word2index[doc[j]]
                    distance = abs(i - j)
                    cooc[(center_id, context_id)] += 1.0 / distance

    return cooc

window_size = 2
cooc_matrix = build_cooccurrence(corpus, window_size)
print("Number of co-occurrence pairs:", len(cooc_matrix))


Number of co-occurrence pairs: 46351


# Glove Model

In [9]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.wi = nn.Embedding(vocab_size, emb_size)
        self.wj = nn.Embedding(vocab_size, emb_size)
        self.bi = nn.Embedding(vocab_size, 1)
        self.bj = nn.Embedding(vocab_size, 1)

    def forward(self, i_idx, j_idx, x_ij):
        wi = self.wi(i_idx)
        wj = self.wj(j_idx)
        bi = self.bi(i_idx).squeeze()
        bj = self.bj(j_idx).squeeze()

        dot = torch.sum(wi * wj, dim=1)
        loss = (dot + bi + bj - torch.log(x_ij)) ** 2

        return loss


In [10]:
def weighting_function(x, x_max=100, alpha=0.75):
    return torch.where(
        x < x_max,
        (x / x_max) ** alpha,
        torch.ones_like(x)
    )


In [11]:
pairs = list(cooc_matrix.keys())
values = list(cooc_matrix.values())

pairs = torch.LongTensor(pairs)
values = torch.FloatTensor(values)


# Training configuration

In [12]:
embedding_dim = 2
learning_rate = 0.05
num_epochs = 30
batch_size = 512


# Train Glove Model

In [13]:
model = GloVe(vocab_size, embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_batches = len(values) // batch_size

import time

start_time = time.time()

for epoch in range(num_epochs):
    total_loss = 0

    perm = torch.randperm(len(values))

    for i in range(num_batches):
        idx = perm[i * batch_size:(i + 1) * batch_size]

        i_idx = pairs[idx][:, 0]
        j_idx = pairs[idx][:, 1]
        x_ij = values[idx]

        weights = weighting_function(x_ij)

        optimizer.zero_grad()
        loss = model(i_idx, j_idx, x_ij)
        loss = torch.mean(weights * loss)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {total_loss:.4f}")

end_time = time.time()

glove_loss = total_loss / num_batches
glove_time = end_time - start_time

print(f"GloVe Loss: {glove_loss:.4f}")
print(f"GloVe Training Time: {glove_time:.2f} seconds")

Epoch [1/30] Loss: 10.4356
Epoch [2/30] Loss: 3.8168
Epoch [3/30] Loss: 3.1627
Epoch [4/30] Loss: 2.9171
Epoch [5/30] Loss: 2.7187
Epoch [6/30] Loss: 2.5554
Epoch [7/30] Loss: 2.4565
Epoch [8/30] Loss: 2.4008
Epoch [9/30] Loss: 2.3724
Epoch [10/30] Loss: 2.3503
Epoch [11/30] Loss: 2.3428
Epoch [12/30] Loss: 2.3254
Epoch [13/30] Loss: 2.3229
Epoch [14/30] Loss: 2.3522
Epoch [15/30] Loss: 2.3466
Epoch [16/30] Loss: 2.3359
Epoch [17/30] Loss: 2.3550
Epoch [18/30] Loss: 2.3463
Epoch [19/30] Loss: 2.3211
Epoch [20/30] Loss: 2.3397
Epoch [21/30] Loss: 2.3418
Epoch [22/30] Loss: 2.3564
Epoch [23/30] Loss: 2.3263
Epoch [24/30] Loss: 2.3479
Epoch [25/30] Loss: 2.3559
Epoch [26/30] Loss: 2.3414
Epoch [27/30] Loss: 2.3591
Epoch [28/30] Loss: 2.3317
Epoch [29/30] Loss: 2.3829
Epoch [30/30] Loss: 2.3523
GloVe Loss: 0.0261
GloVe Training Time: 0.77 seconds


In [None]:
embeddings = (
    model.wi.weight.detach().cpu().numpy() +
    model.wj.weight.detach().cpu().numpy()
)


# Plot Embeddings

In [None]:
TOP_K = 50
top_words = [w for w, _ in word_counts.most_common(TOP_K) if w in word2index]

plt.figure(figsize=(10, 10))

for word in top_words:
    idx = word2index[word]
    x, y = embeddings[idx]
    plt.scatter(x, y)
    plt.text(x, y, word, fontsize=9)

plt.title("GloVe Word Embeddings (Reuters Corpus)")
plt.grid(True)
plt.show()
