In [5]:
# If needed, install the dependencies into the kernel's environment:
# %pip install numpy torch tqdm

import re, math, collections, random
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#!pip install torch

In [6]:
# Toy training corpus: five short, single-sentence "documents".
# Each string is tokenized and windowed on its own further down, so words
# only ever co-occur with other words from the same sentence.
corpus = [
    "king queen man woman prince princess royal palace throne crown",
    "the quick brown fox jumps over the lazy dog",
    "paris is the capital of france and rome is the capital of italy",
    "ice is solid at low temperature but steam is gas at high temperature",
    "deep learning uses neural networks and gradient descent optimization",
]

In [7]:
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of ``\\w`` characters; punctuation and whitespace
    act purely as separators and are dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+\b", lowered)]

# One token list per document.
tokens = [tokenize(doc) for doc in corpus]
min_count = 1

# Corpus-wide occurrence count of every token.
freq = collections.Counter()
for doc in tokens:
    freq.update(doc)

# Vocabulary = words seen at least `min_count` times, in alphabetical order,
# with lookup tables in both directions (string -> index, index -> string).
vocab = sorted(word for word, count in freq.items() if count >= min_count)
stoi = dict(zip(vocab, range(len(vocab))))
itos = dict(enumerate(vocab))
V = len(vocab)
print("Vocab size:", V)


Vocab size: 43


In [26]:
#vocab

In [12]:
#vocab

In [13]:
win = 5
cooc = collections.defaultdict(float)

# Accumulate distance-weighted co-occurrence counts: every (center, context)
# pair that falls within `win` positions of each other inside one document
# contributes 1/distance, so adjacent words count more than distant ones.
for doc in tokens:
    # Resolve each token to its vocabulary index once; out-of-vocabulary
    # tokens become None and are skipped both as center and as context.
    ids = [stoi.get(word) for word in doc]
    n_tokens = len(ids)
    for center, center_id in enumerate(ids):
        if center_id is None:
            continue
        lo = max(0, center - win)
        hi = min(n_tokens, center + win + 1)
        for pos in range(lo, hi):
            context_id = ids[pos]
            if pos == center or context_id is None:
                continue
            cooc[(center_id, context_id)] += 1.0 / abs(pos - center)

# Flatten to rows of (word index, context index, weighted count).
pairs = np.array(
    [(row, col, value) for (row, col), value in cooc.items() if value > 0.0],
    dtype=object,
)
len(pairs), pairs[:5]


(318,
 array([[18, 32, 1.0],
        [18, 22, 0.5],
        [18, 42, 0.3333333333333333],
        [18, 30, 0.25],
        [18, 31, 0.2]], dtype=object))

In [14]:
class CoocDataset(Dataset):
    """Map-style dataset over ``(i, j, x)`` co-occurrence triples.

    ``pairs`` is any sequence of indexable 3-item records. The three fields
    are split into separate typed arrays: ``i`` and ``j`` as int64 indices,
    ``x`` as float32 values.
    """

    def __init__(self, pairs):
        rows = list(pairs)
        self.i = np.array([row[0] for row in rows], dtype=np.int64)
        self.j = np.array([row[1] for row in rows], dtype=np.int64)
        self.x = np.array([row[2] for row in rows], dtype=np.float32)

    def __len__(self):
        """Number of stored triples."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(i, j, x)`` triple stored at position ``idx``."""
        return self.i[idx], self.j[idx], self.x[idx]

# Wrap the co-occurrence triples in a dataset and serve them in shuffled
# mini-batches of up to `batch_size` triples.
batch_size = 1024
ds = CoocDataset(pairs)
dl = DataLoader(dataset=ds, batch_size=batch_size, shuffle=True)


In [15]:
class GloVe(nn.Module):
    """GloVe scorer: two embedding tables plus a scalar bias per table entry.

    ``W``/``b`` hold the vector and bias used for the first index (the word),
    ``C``/``bt`` those used for the second index (the context). Vectors are
    Xavier-uniform initialised; biases start at zero.
    """

    def __init__(self, vocab_size, dim=50):
        super().__init__()
        self.dim = dim
        self.W = nn.Embedding(vocab_size, dim)    # word vectors
        self.C = nn.Embedding(vocab_size, dim)    # context vectors
        self.b = nn.Embedding(vocab_size, 1)      # word biases
        self.bt = nn.Embedding(vocab_size, 1)     # context biases

        for table in (self.W, self.C):
            nn.init.xavier_uniform_(table.weight)
        for bias in (self.b, self.bt):
            nn.init.zeros_(bias.weight)

    def forward(self, i, j):
        """Return the predicted ``log(X_ij)`` for each ``(i, j)`` index pair.

        For 1-D index tensors of length B the result has shape ``(B,)``:
        the dot product of word and context vectors plus both biases.
        """
        word_vecs = self.W(i)                      # (B, d)
        ctx_vecs = self.C(j)                       # (B, d)
        dot = word_vecs.mul(ctx_vecs).sum(dim=1)   # (B,)
        word_bias = self.b(i).squeeze(-1)          # (B,)
        ctx_bias = self.bt(j).squeeze(-1)          # (B,)
        return dot + word_bias + ctx_bias


In [18]:
# Hyperparams (paper defaults work well)
dim = 50          # embedding dimensionality
x_max = 100.0     # co-occurrence value at which the loss weight saturates at 1
alpha = 0.75      # exponent of the loss weighting function
epochs = 30       # full passes over the co-occurrence pairs
lr = 0.05         # Adam learning rate
seed = 42         # fixed so a Restart & Run All reproduces the same embeddings

# Seed every RNG before the model is built: the embedding tables are randomly
# initialised on construction and the training DataLoader shuffles each epoch,
# so without this each run would yield different vectors and neighbours.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = GloVe(V, dim=dim).to(device)
opt = torch.optim.Adam(model.parameters(), lr=lr)


In [19]:
def weight_fn(x_ij):
    """GloVe loss weight for a tensor of co-occurrence values.

    f(x) = (x / x_max) ** alpha  for x < x_max, and 1 otherwise.
    Reads the module-level ``x_max`` and ``alpha``; the result has the same
    shape as ``x_ij``.
    """
    below_cap = x_ij < x_max
    scaled = torch.pow(x_ij / x_max, alpha)
    saturated = torch.ones_like(x_ij)
    return torch.where(below_cap, scaled, saturated)

model.train()
for ep in range(1, epochs+1):
    total_loss = 0.0
    for i, j, x in dl:
        # Move the whole batch to the device the model lives on.
        i, j, x = i.to(device), j.to(device), x.to(device)

        # Weighted squared error between predicted and observed log co-occurrence.
        residual = model(i, j) - torch.log(x)
        loss = torch.mean(weight_fn(x) * residual.pow(2))

        opt.zero_grad()
        loss.backward()
        opt.step()

        # `loss` is a per-batch mean; scale by batch size so the epoch
        # figure below is a true per-pair average even with a ragged last batch.
        total_loss += loss.item() * x.size(0)

    print(f"Epoch {ep:02d} | Loss: {total_loss/len(ds):.4f}")


Epoch 01 | Loss: 0.0103
Epoch 02 | Loss: 0.0049
Epoch 03 | Loss: 0.0019
Epoch 04 | Loss: 0.0010
Epoch 05 | Loss: 0.0012
Epoch 06 | Loss: 0.0014
Epoch 07 | Loss: 0.0012
Epoch 08 | Loss: 0.0007
Epoch 09 | Loss: 0.0005
Epoch 10 | Loss: 0.0006
Epoch 11 | Loss: 0.0007
Epoch 12 | Loss: 0.0007
Epoch 13 | Loss: 0.0006
Epoch 14 | Loss: 0.0004
Epoch 15 | Loss: 0.0003
Epoch 16 | Loss: 0.0003
Epoch 17 | Loss: 0.0003
Epoch 18 | Loss: 0.0003
Epoch 19 | Loss: 0.0002
Epoch 20 | Loss: 0.0002
Epoch 21 | Loss: 0.0002
Epoch 22 | Loss: 0.0002
Epoch 23 | Loss: 0.0002
Epoch 24 | Loss: 0.0001
Epoch 25 | Loss: 0.0001
Epoch 26 | Loss: 0.0001
Epoch 27 | Loss: 0.0001
Epoch 28 | Loss: 0.0001
Epoch 29 | Loss: 0.0001
Epoch 30 | Loss: 0.0001


In [20]:
@torch.no_grad()
def get_embeddings(model):
    """Return the final embeddings as a NumPy array on the CPU.

    Each row is the sum of the word vector and the context vector for the
    same index (v_i = w_i + w̃_i). Runs without gradient tracking.
    """
    word_vectors = model.W.weight
    context_vectors = model.C.weight
    combined = word_vectors + context_vectors
    return combined.cpu().numpy()

# Materialise the trained embedding matrix once; `vec` and the similarity
# helpers in this cell read it as a module-level global.
E = get_embeddings(model)  # shape: (V, dim)

# Cosine similarity utilities
def vec(w):
    """Return the embedding row of ``E`` for word ``w``.

    Raises ``KeyError`` when ``w`` is not in the vocabulary (``stoi``).
    """
    row = stoi[w]
    return E[row]

def _rank_by_cosine(query, exclude, topn):
    """Rank vocabulary words by cosine similarity to ``query``.

    Shared implementation behind ``most_similar`` and ``analogy``.

    Args:
        query: 1-D vector with the same dimensionality as the rows of ``E``.
        exclude: collection of words that must not appear in the result.
        topn: maximum number of results; ``topn <= 0`` yields an empty list.

    Returns:
        Up to ``topn`` ``(word, similarity)`` tuples, most similar first.
    """
    if topn <= 0:
        return []

    # A zero-length vector has no direction. Leave it unscaled so that its
    # similarity comes out as 0 instead of NaN from a 0/0 division.
    query_norm = np.linalg.norm(query)
    if query_norm > 0:
        query = query / query_norm
    row_norms = np.linalg.norm(E, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    sims = (E / row_norms) @ query

    out = []
    for idx in np.argsort(-sims):
        w = itos[int(idx)]
        if w in exclude:
            continue
        out.append((w, float(sims[idx])))
        if len(out) >= topn:
            break
    return out


def most_similar(word, topn=5):
    """Return the ``topn`` words closest to ``word`` by cosine similarity.

    ``word`` itself is never part of the result. Raises ``KeyError`` when
    ``word`` is not in the vocabulary.
    """
    return _rank_by_cosine(vec(word), {word}, topn)


def analogy(a, b, c, topn=5):
    """Solve ``a : b :: c : ?`` via the vector offset ``b - a + c``.

    Returns the ``topn`` words closest to that offset vector by cosine
    similarity, never including ``a``, ``b`` or ``c`` themselves. Raises
    ``KeyError`` when any of the three words is not in the vocabulary.
    """
    return _rank_by_cosine(vec(b) - vec(a) + vec(c), {a, b, c}, topn)


In [21]:
# Qualitative sanity checks: nearest neighbours and one analogy query.
# analogy("man", "king", "woman") ranks words against king - man + woman.
demo_queries = (
    ("Similar to 'king':", most_similar("king")),
    ("Analogy king:man :: ? : woman ->", analogy("man", "king", "woman")),
    ("Similar to 'paris':", most_similar("paris")),
    ("Similar to 'temperature':", most_similar("temperature")),
)
for label, result in demo_queries:
    print(label, result)


Similar to 'king': [('queen', 0.7101271152496338), ('crown', 0.6386337280273438), ('throne', 0.482085257768631), ('dog', 0.4398803114891052), ('lazy', 0.36637669801712036)]
Analogy king:man :: ? : woman -> [('palace', 0.39574524760246277), ('lazy', 0.3455282151699066), ('queen', 0.2928714454174042), ('royal', 0.2756752371788025), ('crown', 0.26887965202331543)]
Similar to 'paris': [('rome', 0.5738002061843872), ('over', 0.3151131570339203), ('low', 0.2945200204849243), ('descent', 0.25710663199424744), ('at', 0.19899845123291016)]
Similar to 'temperature': [('but', 0.5507892370223999), ('low', 0.5312858819961548), ('steam', 0.4997710883617401), ('at', 0.31528133153915405), ('the', 0.28273504972457886)]


In [22]:
out_path = "mini_glove.txt"

# Text export: a "<rows> <dims>" header line followed by one
# "<word> <v1> ... <vd>" line per vocabulary index.
# The header is derived from the matrix actually being written rather than
# from the `V` / `dim` globals, which can drift from `E` if the hyperparameter
# cell is re-run without retraining; the header then always matches the rows.
n_words, n_dims = E.shape
with open(out_path, "w", encoding="utf-8") as f:
    f.write(f"{n_words} {n_dims}\n")
    for i in range(n_words):
        w = itos[i]
        vec_str = " ".join(f"{x:.6f}" for x in E[i])
        f.write(f"{w} {vec_str}\n")
print("Saved to", out_path)


Saved to mini_glove.txt
