### Word representation
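The simplest way to represent a word is a one-hot vector: a column vector with one entry per vocabulary word, holding a 1 at the word's own index and 0 everywhere else.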

$$
\mathbf x =
\begin{bmatrix}
0 \\
0 \\
\vdots \\
1 \\
\vdots \\
0 
\end{bmatrix}_{|V| \times 1}
$$

#### Analogy of two vectors

$$
\begin{align*}
e_{\text{man}} - e_{\text{woman}} &\approx e_{\text{king}} - e \\
e &\approx e_{\text{king}} - e_{\text{man}} + e_{\text{woman}} \\
\end{align*}
$$

The similarity of the two vectors is calculated with cosine similarity.

$$
-1 \leq\mathrm{cossim}(e, e_{\text{king}} - e_{\text{man}} + e_{\text{woman}}) \leq 1
$$

### Embedding matrix

In [169]:
import numpy as np
import matplotlib.pyplot as plt

# Embedding matrix: each of the 100 columns holds the 10-dimensional
# embedding of one vocabulary entry.
E = np.random.randn(10, 100)

# One-hot column vector selecting vocabulary entry 42.
x = np.zeros((100, 1))
x[42, 0] = 1.0

# Multiplying by a one-hot vector picks out column 42 of E.
e = E @ x

### Word2Vec

#### Skip-gram

In [170]:
text = """John quickly realized that the fox was jumping over a brown fence. Meanwhile, the lazy dog slept under the warm sun, dreaming of chasing squirrels in the park. A wizard in a distant land cast spells to levitate objects and summon mystical creatures. The gym was full of athletes lifting weights, running on treadmills, and practicing yoga poses. Buzzing bees were collecting nectar from vibrant flowers, while a group of birds sang harmoniously from the treetops. In the city, cars zoomed by as people hurried to work, their minds filled with tasks and deadlines. The library was a sanctuary of knowledge, where students pored over books and researchers delved into ancient manuscripts. A chef in a bustling kitchen prepared exquisite dishes, skillfully chopping vegetables and grilling meats. At the beach, waves crashed against the shore as children built sandcastles and surfers rode the swells. In the forest, a lumberjack wielded his axe, cutting down trees for timber. The night sky was a tapestry of stars, constellations, and planets, inspiring wonder and awe in all who gazed upon it."""
text = text.lower()

# Character-level vocabulary; sorting keeps the index assignment stable.
chars = sorted(set(text))


def c2i(c):
    """Return the vocabulary index of character ``c``."""
    return chars.index(c)


def i2c(i):
    """Return the character stored at vocabulary index ``i``."""
    return chars[i]


# Every (centre, context) index pair within ``context_size`` characters on
# either side of the centre; positions too close to the edges are skipped.
context_size = 2
neighbour_offsets = [d for d in range(-context_size, context_size + 1) if d != 0]
X = [
    [c2i(text[pos]), c2i(text[pos + d])]
    for pos in range(context_size, len(text) - context_size)
    for d in neighbour_offsets
]

print(f"{len(text) = }")
print(f"{len(X)    = }")

len(text) = 1093
len(X)    = 4356


In [171]:
import torch
import torch.nn as nn

class SkipGram(nn.Module):
    """Skip-gram model: embed a centre token, then score every vocabulary entry.

    Args:
        vocab_size: Number of distinct tokens. When omitted it falls back to
            ``len(chars)``, looked up when the model is constructed rather
            than when the class is defined, so the class can be defined
            before (or without) the global vocabulary and never captures a
            stale size.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size=None, embedding_dim=5):
        super().__init__()
        if vocab_size is None:
            # Lazy fallback to the notebook-level character vocabulary.
            vocab_size = len(chars)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        # Token index -> dense vector.
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,
                                      embedding_dim=self.embedding_dim)
        # Dense vector -> one unnormalised score per vocabulary entry.
        self.linear = nn.Linear(in_features=self.embedding_dim,
                                out_features=self.vocab_size)

        # Only the weight matrices are re-initialised; the linear bias keeps
        # its default initialisation.
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, x):
        """Return vocabulary scores for the token indices in ``x``.

        Args:
            x: Tensor of token indices accepted by ``nn.Embedding``.

        Returns:
            Tensor with a trailing dimension of size ``vocab_size`` holding
            raw (pre-softmax) scores.
        """
        x = self.embedding(x)
        x = self.linear(x)
        return x

KeyboardInterrupt: 

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader

skipgram = SkipGram()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(skipgram.parameters(), lr=0.1)

skipgram.train()
for epoch in range(1):
    running_loss = 0
    count = 0
    batches = DataLoader(X, batch_size=32, shuffle=True)
    for count, (inputs, targets) in enumerate(batches, start=1):
        optimizer.zero_grad()

        # Full softmax cross-entropy scores every vocabulary entry for every
        # skip-gram pair, which becomes expensive as the vocabulary and the
        # context size grow.
        scores = skipgram(inputs)
        loss = criterion(scores, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # The carriage return lets the next report overwrite this line.
    print(f"[epoch {epoch+1:0>3}] {running_loss/count:7.5f}", end='\r')

#### Skip-gram (with negative sampling)

In [None]:
import random

text = """John quickly realized that the fox was jumping over a brown fence. Meanwhile, the lazy dog slept under the warm sun, dreaming of chasing squirrels in the park. A wizard in a distant land cast spells to levitate objects and summon mystical creatures. The gym was full of athletes lifting weights, running on treadmills, and practicing yoga poses. Buzzing bees were collecting nectar from vibrant flowers, while a group of birds sang harmoniously from the treetops. In the city, cars zoomed by as people hurried to work, their minds filled with tasks and deadlines. The library was a sanctuary of knowledge, where students pored over books and researchers delved into ancient manuscripts. A chef in a bustling kitchen prepared exquisite dishes, skillfully chopping vegetables and grilling meats. At the beach, waves crashed against the shore as children built sandcastles and surfers rode the swells. In the forest, a lumberjack wielded his axe, cutting down trees for timber. The night sky was a tapestry of stars, constellations, and planets, inspiring wonder and awe in all who gazed upon it."""
text = text.lower()

# Character-level vocabulary; sorting keeps the index assignment stable.
chars = sorted(list(set(text)))


def c2i(c):
    """Return the vocabulary index of character ``c``."""
    return chars.index(c)


def i2c(i):
    """Return the character stored at vocabulary index ``i``."""
    return chars[i]


# Each row of X is [target index, sample index, label], with label 1 for a
# character that really occurs in the target's window and 0 for a sampled
# negative.
X = []
context_size = 2
for center in range(context_size, len(text) - context_size):
    target = c2i(text[center])

    # The position index must stay untouched while the window is scanned:
    # overwriting it with a vocabulary index would make every offset after
    # the first read from an unrelated position in the text.
    positive_samples = []
    for offset in range(-context_size, context_size + 1):
        if offset == 0:
            continue
        positive_samples.append(c2i(text[center + offset]))

    negative_sample_candidates = list(set(range(len(chars))) - set(positive_samples))
    # 1:5 ratio of positives to negatives, capped by the number of available
    # candidates so a small vocabulary cannot make random.sample raise.
    num_negatives = min(len(negative_sample_candidates), context_size * 2 * 5)
    negative_samples = random.sample(negative_sample_candidates, num_negatives)

    for s in positive_samples:
        X.append([target, s, 1])

    for s in negative_samples:
        X.append([target, s, 0])

print(f"{len(text) = }")
print(f"{len(X)    = }")

<img src="https://wikidocs.net/images/page/69141/그림7.PNG" height=300/>

### GloVe (Global vectors for word representation)

Co-occurrence of words within a sentence.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Three short sentences make up the whole corpus.
corpus = [
    "I like machine learning",
    "I like coding in Python",
    "I enjoy learning new things",
]

# Vocabulary: every distinct lower-cased token, in sorted order.
tokenized = [sentence.lower().split() for sentence in corpus]
vocab = sorted({token for words in tokenized for token in words})
vocab_size = len(vocab)
word_to_row = {word: row for row, word in enumerate(vocab)}

# For each sentence, count every ordered pair of different words once.
co_occurrence_matrix = np.zeros((vocab_size, vocab_size))
for words in tokenized:
    for word1 in words:
        for word2 in words:
            if word1 == word2:
                continue
            co_occurrence_matrix[word_to_row[word1], word_to_row[word2]] += 1

# Four panels side by side: the co-occurrence matrix and its SVD factors.
fig, ax = plt.subplots(1,4,figsize=(20,4))
ax[0].set_title('co-occurrence matrix')
ax[0].imshow(co_occurrence_matrix, cmap='binary_r')
ax[0].xaxis.set_ticks_position('top')
ax[0].set_xticks(range(len(vocab)))
ax[0].set_xticklabels(vocab, rotation=90)
ax[0].set_yticks(range(len(vocab)))
ax[0].set_yticklabels(vocab)

# Factorise the matrix as U @ diag(S) @ V_T.
U, S, V_T = np.linalg.svd(co_occurrence_matrix)
# NOTE(review): the rows of U line up with the rows of the co-occurrence
# matrix (the vocabulary), yet the word labels are attached to the x axis
# here and to the y axis of the S and V^T panels - confirm this is intended.
ax[1].set_title('$U$ (word vector)')
ax[1].imshow(U, cmap='binary_r')
ax[1].xaxis.set_ticks_position('top')
ax[1].set_xticks(range(len(vocab)))
ax[1].set_xticklabels(vocab, rotation=90)
ax[1].set_yticks([])

# S is returned as a 1-D array of singular values; show it as a diagonal matrix.
ax[2].set_title('$S$')
ax[2].imshow(np.diag(S), cmap='binary_r')
ax[2].set_xticks([])
ax[2].set_yticks(range(len(vocab)))
ax[2].set_yticklabels(vocab)

# Raw string: "\i" is not a valid escape sequence in a normal string literal.
ax[3].set_title(r'$V^\intercal$')
ax[3].imshow(V_T, cmap='binary_r')
ax[3].set_xticks([])
ax[3].set_yticks(range(len(vocab)))
ax[3].set_yticklabels(vocab)

plt.show()


### Debiasing word embeddings

One way to debias word embeddings (e.g. doctor : man = nurse : woman) is to find the bias-free subspace that is perpendicular to the gender axis and project the embeddings onto it.

<img src="src/debiasing.png" height=400 />

In [None]:
from pathlib import Path
import pandas as pd

def _to_numeric_if_possible(col):
    """Return ``col`` converted to a numeric dtype, or unchanged if that fails."""
    # Explicit replacement for ``pd.to_numeric(col, errors='ignore')``, which
    # recent pandas releases deprecate.
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


# Each line holds a token followed by its vector components, separated by
# whitespace. The encoding is given explicitly so the result does not depend
# on the platform default.
with open('src/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    rows = [line.split() for line in f]

# Column 0 keeps the tokens as strings; the remaining columns become numeric.
df = pd.DataFrame(rows)
df = df.apply(_to_numeric_if_possible, axis=0)
df.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
229985,e.t.c.,0.30635,-1.0473,-0.30217,-0.19257,-1.0385,-0.71002,0.27873,0.33858,-0.14785,...,-0.38414,-1.2382,0.37545,-1.3216,0.15848,0.04765,-0.023359,-0.90518,1.3467,1.2792
289528,2137,-0.27732,-0.15675,-0.028893,-0.11471,-0.57858,-0.68888,0.50591,-0.034682,0.069475,...,0.27508,-0.50924,-0.27562,0.42645,-0.075152,0.50506,0.27285,0.098014,0.27318,0.18083
235861,discoursing,-0.44085,-0.093508,-0.74241,-0.39799,-0.14382,-0.23926,0.092085,-0.087855,-0.098514,...,0.12397,-0.36853,0.024875,0.78534,-0.6821,-0.34576,1.0824,0.38669,0.29955,-1.2247
284668,actualizing,0.10355,0.079821,-0.31639,-0.97387,-0.45961,-0.30537,1.0857,-0.11077,0.59911,...,0.27357,-0.45699,-0.047865,0.42122,0.14269,-0.42483,0.4414,0.3275,0.1086,-0.11897
237748,cubensis,0.2942,-0.56971,-0.4795,0.010294,0.50231,0.19395,0.99063,-0.21224,-0.36277,...,-0.9121,-0.30648,-0.3715,-0.77941,0.20265,0.079314,-0.61854,0.30088,0.30027,1.6014
303066,kezi,-0.12759,-0.22709,0.23436,0.49923,-0.7396,0.31408,-0.008921,0.59597,0.12743,...,0.21097,-0.051466,-0.22572,-0.33439,-0.24956,0.26571,-0.32147,-0.29567,0.44369,0.38995
78013,fraph,1.0388,-0.24066,0.55636,-0.24025,0.71568,-0.97994,0.6426,-0.6505,-0.089445,...,-0.15353,-0.58518,-0.5163,-0.78217,0.30869,-0.21708,-0.12758,0.13397,-0.25089,-0.71658
16695,mcdowell,-0.37532,-0.34333,-0.33512,0.98491,0.25654,0.25304,-0.54693,0.3204,-0.12831,...,-0.060907,0.68877,-0.50853,0.12962,-0.15329,0.29597,0.17177,-0.23018,-0.4034,0.4592
326348,nedelciu,-0.41663,-0.55227,-0.20812,0.15578,-0.50325,0.75483,-0.069079,0.05815,-0.10665,...,0.38774,-0.31589,-1.055,0.063344,-0.1672,-0.46235,0.20665,0.27444,0.33568,-0.44969
116742,sli,0.27939,-0.70668,0.55555,1.1022,-0.46336,0.33452,1.1241,-0.63299,-0.4405,...,0.20182,-0.39273,-0.47327,0.13977,-0.38912,-0.85162,0.77209,0.20051,0.53654,0.56097


In [None]:
import numpy as np
from typing import *

def _vectorize(word):
    """Look up the embedding of a single word in the global ``df`` table.

    Args:
        word: The word to look up; it is lower-cased before matching against
            column 0 of ``df``.

    Returns:
        A 1-D array with the values of the matching row's remaining columns,
        or an all-zero vector of length ``df.shape[1] - 1`` when the word is
        not in the table.
    """
    word = word.lower()
    # One boolean mask serves both the membership test and the row selection,
    # instead of building a Python list of the whole column on every call and
    # then scanning the column a second time.
    matches = df[0] == word
    if matches.any():
        vector = df[matches].iloc[:, 1:]
        return vector.to_numpy().flatten()
    return np.zeros(df.shape[1] - 1)

def vectorize(words: Union[List[str], str]):
    """Return embeddings for one word or for a list of words.

    Args:
        words: A single word, or a list of words.

    Returns:
        For a list, the per-word vectors stacked along axis 0 (one row per
        word); for a single word, that word's vector.
    """
    # isinstance also accepts list subclasses, unlike an exact type comparison.
    if isinstance(words, list):
        return np.stack([_vectorize(word) for word in words], axis=0)
    return _vectorize(words)

def cossim(u, v):
    """Return the cosine similarity of vectors ``u`` and ``v``.

    Args:
        u: First vector.
        v: Second vector, with the same length as ``u``.

    Returns:
        ``u @ v`` divided by the product of the two norms, or ``0.0`` when
        either vector has zero norm (for example the all-zero vector used for
        words missing from the embedding table), instead of a 0/0 ``nan``.
    """
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        return 0.0
    return u @ v / denominator

In [None]:
# A gap between how similar "game" is to "man" and to "woman" points to
# gender bias in the embeddings.
similarity_to_man = cossim(*vectorize(['man', 'game']))
similarity_to_woman = cossim(*vectorize(['woman', 'game']))
similarity_to_man, similarity_to_woman

(0.4837982694403796, 0.29868826318084823)

In [None]:
# Gender direction, pointing from "man" towards "woman".
g = vectorize('woman') - vectorize('man')

words = [
    'game', 'lipstick', 'guns', 'science', 'arts', 'literature',
    'warrior', 'doctor', 'tree', 'receptionist', 'technology',
    'fashion', 'teacher', 'engineer', 'pilot', 'computer', 'singer',
]

# A negative similarity leans towards "man", anything else towards "woman".
for word in words:
    sim = cossim(vectorize(word), g)
    print(f"{word:15} {sim:>8.5f} {'♂' if sim < 0 else '♀'}")

game            -0.33507 ♂
lipstick         0.27692 ♀
guns            -0.18885 ♂
science         -0.06083 ♂
arts             0.00819 ♀
literature       0.06473 ♀
warrior         -0.20920 ♂
doctor           0.11895 ♀
tree            -0.07089 ♂
receptionist     0.33078 ♀
technology      -0.13194 ♂
fashion          0.03564 ♀
teacher          0.17921 ♀
engineer        -0.08039 ♂
pilot            0.00108 ♀
computer        -0.10330 ♂
singer           0.18501 ♀


### Debiasing technique

#### Neutralize
<img src="src/neutralize.png" width=800 />

In [None]:
def neutralize(e, g): # debiasing non-gender-related words
    """Remove from ``e`` its component along the bias direction ``g``.

    Args:
        e: One embedding vector, or several embeddings stacked along the
            leading axes with the embedding dimension last.
        g: Bias direction with the same length as an embedding; it must be
            non-zero because it is normalised to unit length.

    Returns:
        An array shaped like ``e`` whose vectors are orthogonal to ``g``.
    """
    g_unit = g / np.linalg.norm(g)
    # Projection coefficient per embedding; the added trailing axis lets it
    # broadcast against g_unit for a single vector as well as for a stack.
    coefficients = np.asarray(np.dot(e, g_unit))[..., None]
    e_biased_factor = coefficients * g_unit
    e_debiased = e - e_biased_factor
    return e_debiased

# Gender direction and the word whose gender component gets removed.
g = vectorize('woman') - vectorize('man')
e = vectorize('game')
e_neutralized = neutralize(e, g)

# Similarity to the gender direction before and after neutralizing;
# the second value should be close to 0.
similarity_before = cossim(e, g)
similarity_after = cossim(e_neutralized, g)
similarity_before, similarity_after

(-0.3350731148943847, 1.15603655367518e-16)

#### Equalize
<img src="src/equalize.png" width=800 />

In [None]:
def equalize(e1, e2, g): # equalizing genderness of grammatical gender words
    """Give a gendered word pair the same component orthogonal to ``g``.

    Each word keeps its own component along the bias direction ``g``; the
    remaining part of both words is replaced by the orthogonal part of their
    midpoint.

    Args:
        e1: Embedding of the first word of the pair.
        e2: Embedding of the second word of the pair.
        g: Bias direction; it is normalised to unit length.

    Returns:
        The tuple ``(e1_corrected, e2_corrected)``.
    """
    g_unit = g / np.linalg.norm(g)

    def along_g(vector):
        """Return the component of ``vector`` lying on the g axis."""
        return np.dot(vector, g_unit) * g_unit

    # The midpoint with its g component removed lies on the g-orthogonal axis.
    midpoint = (e1 + e2) / 2
    shared_orthogonal = midpoint - along_g(midpoint)

    return along_g(e1) + shared_orthogonal, along_g(e2) + shared_orthogonal

# Gender direction and a word pair that differs only by grammatical gender.
g = vectorize('woman') - vectorize('man')
e1 = vectorize('actor')
e2 = vectorize('actress')

e1_corrected, e2_corrected = equalize(e1, e2, g)

# Alignment of the pair's difference with the gender direction before and
# after equalizing; the second value should be close to 1.
alignment_before = cossim(e2 - e1, g)
alignment_after = cossim(e2_corrected - e1_corrected, g)
alignment_before, alignment_after

(0.7787298355385984, 0.9999999999999999)