# Word2Vec Exploration

Interpreting and analyzing the trained embeddings. The testing here is mostly
exploratory, so I only tried a few different hyperparameter configurations.

In [None]:
import os

import numpy as np
from word2vec.data import TEXT_PATH, load_tokens, build_vocab

## Load embeddings and rebuild vocabulary

In [78]:
# Rebuild the vocabulary from the raw text, then load the baseline embedding
# matrix (dim=100, neg=5, epochs=5 according to the file name).
tokens = load_tokens(TEXT_PATH)
word_to_id, id_to_word, word_counts = build_vocab(tokens, min_count=5)

W = np.load("embeddings/dim100_neg5_ep5.npy")

# The vocabulary is rebuilt here rather than stored next to the weights, so
# make sure row i of W can still be looked up through word_to_id / id_to_word.
# A different corpus or min_count would otherwise misalign every result below
# without raising any error.
assert W.shape[0] == len(word_to_id), (
    f"Embedding rows ({W.shape[0]}) do not match vocabulary size "
    f"({len(word_to_id)}); was the model trained with a different min_count?"
)

# Scale each row to unit length so a dot product between rows is a cosine
# similarity; the 1e-12 floor avoids dividing by zero for an all-zero row.
norms = np.maximum(np.linalg.norm(W, axis=1, keepdims=True), 1e-12)
W_norm = W / norms
print(f"Embeddings: {W.shape}")

Vocabulary: 71290 words (min_count=5)
Embeddings: (71290, 100)


In [79]:
def nearest(word, top_n):
    """Return the `top_n` vocabulary words most similar to `word`.

    Similarity is the dot product between rows of the module-level `W_norm`
    matrix, with `word` resolved to a row through `word_to_id`.

    Returns a list of `(neighbour, similarity)` tuples, most similar first,
    with similarities rounded to three decimals. If `word` is not in
    `word_to_id`, a message string is returned instead of a list.
    """
    if word not in word_to_id:
        return f"'{word}' not in vocabulary"

    target = word_to_id[word]
    scores = W_norm @ W_norm[target]
    # Overwrite the query word's own score so it is not reported as its own
    # nearest neighbour.
    scores[target] = -1

    ranked = np.argsort(-scores)[:top_n]
    hits = []
    for idx in ranked:
        hits.append((id_to_word[idx], round(float(scores[idx]), 3)))
    return hits


def analogy(a, b, c, top_n):
    """Answer "a is to b as c is to ?" using the module-level `W_norm`.

    The query vector is `vec(b) - vec(a) + vec(c)`, rescaled to unit length,
    and every row of `W_norm` is scored against it by dot product. The three
    input words have their scores overwritten with -1 so they are not
    returned as the answer.

    Returns a list of `(word, similarity)` tuples, best first, with
    similarities rounded to three decimals. If any input word is missing
    from `word_to_id`, a message string naming the first missing word (in
    a, b, c order) is returned instead.
    """
    terms = (a, b, c)
    missing = [w for w in terms if w not in word_to_id]
    if missing:
        return f"'{missing[0]}' not in vocabulary"

    id_a, id_b, id_c = (word_to_id[w] for w in terms)
    target = W_norm[id_b] - W_norm[id_a] + W_norm[id_c]
    # Floor the norm so a zero-length query does not divide by zero.
    target /= np.maximum(np.linalg.norm(target), 1e-12)

    scores = W_norm @ target
    scores[[id_a, id_b, id_c]] = -1

    ranked = np.argsort(-scores)[:top_n]
    hits = []
    for idx in ranked:
        hits.append((id_to_word[idx], round(float(scores[idx]), 3)))
    return hits

## Nearest neighbors

The model only learns from which words tend to appear near each other. I wanted to
check whether that's enough to pick up on different kinds of similarity.

In [80]:
# Probe a handful of nouns from unrelated domains and print each one's five
# nearest neighbours.
probe_words = ["university", "hospital", "war", "guitar", "ocean"]
for word in probe_words:
    print(f"\n{word}: {nearest(word, 5)}")


university: [('polytechnic', 0.801), ('tsinghua', 0.778), ('gyeonggi', 0.768), ('throop', 0.765), ('ume', 0.764)]

hospital: [('clinic', 0.82), ('nurse', 0.777), ('nursing', 0.757), ('hospitalized', 0.753), ('hospitals', 0.746)]

war: [('wartime', 0.729), ('escalated', 0.714), ('bloodiest', 0.704), ('raged', 0.7), ('surrender', 0.7)]

guitar: [('guitars', 0.89), ('bass', 0.887), ('drums', 0.869), ('bassists', 0.856), ('acoustic', 0.843)]

ocean: [('atlantic', 0.832), ('strait', 0.805), ('volcanoes', 0.793), ('reefs', 0.789), ('atolls', 0.785)]


In [81]:
# Same check for number words, to see whether they cluster with other
# number words.
number_words = ["three", "hundred", "million"]
for word in number_words:
    print(f"{word}: {nearest(word, 5)} \n")

three: [('four', 0.785), ('seven', 0.712), ('six', 0.696), ('two', 0.672), ('five', 0.664)] 

hundred: [('thousand', 0.835), ('forty', 0.749), ('fifty', 0.744), ('twenty', 0.734), ('sixty', 0.73)] 

million: [('estimated', 0.77), ('billion', 0.741), ('totaled', 0.706), ('approximately', 0.703), ('rmb', 0.693)] 



## Analogy tests

a : b :: c : ? solved via vec(b) - vec(a) + vec(c).

If the model learns consistent directions for relationships (e.g. a "gender" direction,
a "capital-of" direction) then vector arithmetic should recover them.

In [82]:
# Each entry is (a, b, c, expected): "a is to b as c is to expected".
tests = [
    ("king",          "man",       "queen",     "woman"),
    ("madrid",        "spain",     "berlin",    "germany"),
    ("france",        "latin",     "poland",    "slavic"),
    ("man",           "woman",     "uncle",     "aunt"),
    ("going",         "went",      "playing",   "played"),
    ("brother",       "sister",    "father",    "mother"),
    ("car",           "cars",      "dog",       "dogs"),
    ("doctor",        "medicine",  "professor", "science"),
    ("christianity",  "jesus",     "islam",     "muhammad"),
    ("italy",         "europe",    "japan",     "asia"),
]

# Show the top three candidates for every analogy. The expected answer is
# unpacked into a named variable rather than `_`: at notebook top level,
# assigning to `_` shadows IPython's last-output variable.
for a, b, c, expected in tests:
    results = analogy(a, b, c, 3)
    print(f"{a} : {b} :: {c} : ? -> {results} \n")

king : man :: queen : ? -> [('sally', 0.512), ('woman', 0.51), ('cobbler', 0.506)] 

madrid : spain :: berlin : ? -> [('germany', 0.639), ('austria', 0.606), ('vienna', 0.558)] 

france : latin :: poland : ? -> [('slavic', 0.645), ('alphabet', 0.579), ('runic', 0.579)] 

man : woman :: uncle : ? -> [('aunt', 0.685), ('grandmother', 0.683), ('mother', 0.674)] 

going : went :: playing : ? -> [('played', 0.611), ('virtuoso', 0.599), ('toured', 0.592)] 

brother : sister :: father : ? -> [('daughter', 0.644), ('mother', 0.592), ('sisters', 0.578)] 

car : cars :: dog : ? -> [('dogs', 0.714), ('breeds', 0.666), ('keeshond', 0.664)] 

doctor : medicine :: professor : ? -> [('phd', 0.699), ('sciences', 0.686), ('integrative', 0.657)] 

christianity : jesus :: islam : ? -> [('allah', 0.796), ('muhammad', 0.749), ('prophet', 0.748)] 

italy : europe :: japan : ? -> [('asia', 0.617), ('china', 0.565), ('thailand', 0.558)] 



From what I could tell the dataset is heavy on history and geography (makes sense
since it's from Wikipedia), so those are the kinds of relationships I was mostly after
when picking the test cases. 6 out of 10 got the expected word as the top-1 result
(berlin → germany, poland → slavic, uncle → aunt, playing → played, dog → dogs,
japan → asia). Three were near misses with the expected word in second place:
king : man :: queen gives "sally" first and "woman" second,
brother : sister :: father gives "daughter" first and "mother" second, and
christianity : jesus :: islam gives "allah" first and "muhammad" second.
The remaining case, doctor : medicine :: professor, is a miss: it gives "phd", and the
expected "science" is not in the top three (although "sciences" is second).

## Hyperparameter comparison

I tested four hyperparameter configurations, because I was interested in how their
results would compare on these kinds of tests: the baseline, plus three variants that
each change a single parameter relative to it.

| Config | dim | neg | epochs | train time |
|--------|-----|-----|--------|------------|
| baseline | 100 | 5 | 5 | ~25 min |
| more negatives | 100 | 15 | 5 | ~73 min |
| higher dim | 200 | 5 | 5 | ~55 min |
| fewer epochs | 100 | 5 | 1 | ~5 min |

In [83]:
def normalise_rows(M):
    """Return a copy of the 2-D array `M` with every row scaled to unit L2 norm.

    Row norms are floored at 1e-12 before dividing, so an all-zero row stays
    all-zero instead of producing NaNs.
    """
    row_norms = np.linalg.norm(M, axis=1, keepdims=True)
    safe_norms = np.maximum(row_norms, 1e-12)
    return M / safe_norms

EMBED_DIR = "embeddings"

# Maps the file stem of each saved embedding matrix to a display label.
configs = {
    "dim100_neg5_ep5":  "baseline (d=100, neg=5, ep=5)",
    "dim100_neg15_ep5": "more negatives (neg=15)",
    "dim200_neg5_ep5":  "higher dim (d=200)",
    "dim100_neg5_ep1":  "fewer epochs (ep=1)",
}

# Load and row-normalise every configuration that is available on disk.
# The comparison cells below skip any key missing from `embeddings`, so a
# missing file is reported and skipped here instead of aborting the cell.
embeddings = {}
for key in configs:
    path = os.path.join(EMBED_DIR, f"{key}.npy")
    if not os.path.exists(path):
        print(f"Skipping {key}: {path} not found")
        continue
    E = np.load(path)
    embeddings[key] = normalise_rows(E)

In [84]:
def analogy_with(normed, a, b, c):
    """Return the single best answer to "a is to b as c is to ?".

    `normed` is the embedding matrix to search; callers in this notebook pass
    the output of `normalise_rows`. Words are resolved to rows through the
    module-level `word_to_id`, so every matrix is assumed to share that
    vocabulary ordering (TODO confirm against the training code).

    Returns the top-ranked word with a, b and c excluded, or None if any of
    the three input words is not in the vocabulary.
    """
    terms = (a, b, c)
    if any(w not in word_to_id for w in terms):
        return None

    ids = [word_to_id[w] for w in terms]
    id_a, id_b, id_c = ids
    direction = normed[id_b] - normed[id_a] + normed[id_c]
    # Floor the norm so a zero-length query does not divide by zero.
    direction /= np.maximum(np.linalg.norm(direction), 1e-12)

    scores = normed @ direction
    scores[ids] = -1  # keep the input words from being returned as the answer
    best = np.argmax(scores)
    return id_to_word[best]


# Training time per configuration in minutes, hard-coded here; these are the
# unrounded versions of the "train time" column in the table above.
train_times = {
    "dim100_neg5_ep5":  25.5,
    "dim100_neg15_ep5": 72.8,
    "dim200_neg5_ep5":  55.0,
    "dim100_neg5_ep1":   5.4,
}

print(f"{'Config':<30} {'Correct':>7} {'Accuracy':>8} {'Time':>10}")
print("-" * 58)
for key in configs:
    if key not in embeddings:
        continue
    normed = embeddings[key]
    # Run every analogy once and reuse the predictions for both counts.
    # A prediction of None means one of the input words was out of vocabulary.
    predictions = [(analogy_with(normed, a, b, c), exp) for a, b, c, exp in tests]
    correct = sum(1 for pred, exp in predictions if pred == exp)
    total = sum(1 for pred, exp in predictions if pred is not None)
    acc = correct / total if total > 0 else 0
    t = train_times.get(key, 0)
    print(f"{configs[key]:<30} {correct:>3}/{total:<3} {acc:>8.1%} {t:>7.1f} min")

Config                         Correct Accuracy       Time
----------------------------------------------------------
baseline (d=100, neg=5, ep=5)    6/10     60.0%    25.5 min
more negatives (neg=15)          6/10     60.0%    72.8 min
higher dim (d=200)               7/10     70.0%    55.0 min
fewer epochs (ep=1)              2/10     20.0%     5.4 min


### Side-by-side nearest neighbors

In [86]:
def nearest_with(normed, word, top_n):
    """Return the `top_n` words closest to `word` in the matrix `normed`.

    Rows of `normed` are scored against the row for `word` by dot product;
    callers in this notebook pass row-normalised matrices. Words are resolved
    through the module-level `word_to_id` / `id_to_word`.

    Returns a list of words only (no scores), most similar first, or an
    empty list if `word` is not in the vocabulary.
    """
    if word in word_to_id:
        target = word_to_id[word]
        scores = normed @ normed[target]
        scores[target] = -1  # keep the query word out of its own neighbour list
        ranked = np.argsort(-scores)[:top_n]
        return [id_to_word[idx] for idx in ranked]
    return []


# For each probe word, list its five nearest neighbours under every loaded
# configuration so the embeddings can be compared line by line.
for word in ["university", "hospital", "war", "guitar", "ocean"]:
    print(f"\n'{word}':")
    for key, label in configs.items():
        if key in embeddings:
            neighbors = nearest_with(embeddings[key], word, 5)
            print(f"  {label}: {', '.join(neighbors)}")


'university':
  baseline (d=100, neg=5, ep=5): polytechnic, tsinghua, gyeonggi, throop, ume
  more negatives (neg=15): polytechnic, tsinghua, gyeonggi, ume, throop
  higher dim (d=200): polytechnic, gyeonggi, throop, abet, ume
  fewer epochs (ep=1): college, alumni, oxford, indiana, angeles

'hospital':
  baseline (d=100, neg=5, ep=5): clinic, nurse, nursing, hospitalized, hospitals
  more negatives (neg=15): clinic, hospitalized, hospitals, nurse, pneumonia
  higher dim (d=200): clinic, nurse, nursing, hospitals, hospitalized
  fewer epochs (ep=1): fellow, illegitimate, commissioned, murdered, daughters

'war':
  baseline (d=100, neg=5, ep=5): wartime, escalated, bloodiest, raged, surrender
  more negatives (neg=15): allied, escalated, sepoys, bloodiest, manchukuo
  higher dim (d=200): escalated, bloodiest, invasion, wartime, manchukuo
  fewer epochs (ep=1): communist, soviet, democratic, civil, britain

'guitar':
  baseline (d=100, neg=5, ep=5): guitars, bass, drums, bassists, acous