In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import collections
import random
import os
import requests
import zipfile
from torch.utils.data import Dataset, DataLoader

# 1. Download and Preprocess Data
def download_data():
    """Return the text8 corpus as a list of whitespace-separated tokens.

    The archive is fetched into the current working directory as
    ``text8.zip`` when no file of that name exists yet; otherwise the
    existing file is reused.

    Returns:
        list[str]: the tokens of the first member of the zip archive.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    url = "https://mattmahoney.net/dc/text8.zip"
    filename = "text8.zip"
    if not os.path.exists(filename):
        print("Downloading text8 dataset...")
        # Download to a side file first: the existence check above would
        # otherwise trust a truncated or error-page file on the next run.
        partial = filename + ".part"
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(partial, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        f.write(chunk)
            os.replace(partial, filename)
        finally:
            # Only still present when the download did not complete.
            if os.path.exists(partial):
                os.remove(partial)

    with zipfile.ZipFile(filename) as f:
        data = f.read(f.namelist()[0]).decode('utf-8').split()
    return data

def build_vocab(words, vocab_size=10000):
    """Map a token sequence onto integer ids over a frequency-capped vocabulary.

    Slot 0 is reserved for 'UNK'; the remaining ``vocab_size - 1`` slots go
    to the most frequent tokens in ``words``.

    Args:
        words: sequence of tokens (iterated once for counting and once for
            encoding).
        vocab_size: total number of vocabulary slots, including 'UNK'.

    Returns:
        tuple: ``(data, count, word2idx, idx2word)`` where ``data`` is the
        encoded corpus, ``count`` is ``[['UNK', n_unk], (word, freq), ...]``
        and the two dicts translate between tokens and ids.
    """
    frequencies = collections.Counter(words)
    count = [['UNK', -1]] + frequencies.most_common(vocab_size - 1)

    word2idx = {}
    for position, entry in enumerate(count):
        word2idx[entry[0]] = position

    data = []
    unk_count = 0
    for token in words:
        if token not in word2idx:
            # Out-of-vocabulary tokens all collapse onto the UNK slot.
            unk_count += 1
            data.append(0)
        else:
            data.append(word2idx[token])

    # Replace the -1 placeholder now that the real UNK total is known.
    count[0][1] = unk_count
    idx2word = dict((position, token) for token, position in word2idx.items())
    return data, count, word2idx, idx2word

# 2. Dataset and Negative Sampling
class SkipGramDataset(Dataset):
    """Yields (center, context, negatives) training triples for skip-gram.

    Each corpus position is one item. The context word is drawn at random
    from the surrounding window on every access, and negatives are drawn
    from the unigram distribution raised to the power 0.75.
    """

    def __init__(self, data, word2idx, count, window_size=5, num_neg_samples=5):
        """Store the encoded corpus and precompute negative-sampling weights.

        Args:
            data: encoded corpus (sequence of integer word ids).
            word2idx: accepted for call compatibility; not used here.
            count: sequence whose entries carry a frequency at index 1,
                ordered by word id.
            window_size: number of positions considered on each side.
            num_neg_samples: negatives drawn per item.
        """
        self.data = torch.LongTensor(data)
        self.window_size = window_size
        self.num_neg_samples = num_neg_samples

        # Unigram frequencies, smoothed with the 0.75 exponent (P(w)^0.75).
        occurrences = np.array([entry[1] for entry in count])
        unigram = occurrences / occurrences.sum()
        self.neg_weights = torch.from_numpy(np.power(unigram, 0.75))

    def __len__(self):
        """One item per corpus position."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(center_word, context_word, neg_samples)`` for position ``idx``."""
        # Window bounds, clipped to the corpus.
        lo = max(0, idx - self.window_size)
        hi = min(len(self.data), idx + self.window_size + 1)

        # Every position in the window except the center itself.
        neighbours = [pos for pos in range(lo, hi) if pos != idx]
        picked = random.choice(neighbours)

        # Negatives are sampled with replacement from the smoothed unigram weights.
        negatives = torch.multinomial(
            self.neg_weights, self.num_neg_samples, replacement=True
        )

        return self.data[idx], self.data[picked], negatives

# 3. The SGNS Model
class SkipGramNeg(nn.Module):
    """Skip-gram with negative sampling: two embedding tables and the SGNS loss."""

    def __init__(self, vocab_size, emb_dimension):
        """Create the center (u) and context (v) embedding tables.

        Args:
            vocab_size: number of rows in each table.
            emb_dimension: width of each embedding vector.
        """
        super(SkipGramNeg, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dimension = emb_dimension

        # Target word embeddings (u)
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension)
        # Context word embeddings (v)
        self.v_embeddings = nn.Embedding(vocab_size, emb_dimension)

        # Initialization: small uniform values for u, zeros for v.
        initrange = 0.5 / emb_dimension
        nn.init.uniform_(self.u_embeddings.weight, -initrange, initrange)
        nn.init.zeros_(self.v_embeddings.weight)

    def forward(self, center_words, context_words, negative_samples):
        """Return the mean negative-sampling loss over the batch.

        Args:
            center_words: ids, shape [batch_size].
            context_words: ids, shape [batch_size].
            negative_samples: ids, shape [batch_size, num_neg_samples].

        Returns:
            Scalar tensor: ``-(log s(u.v) + sum_k log s(-u.v_k))`` averaged
            over the batch, where ``s`` is the sigmoid.
        """
        emb_u = self.u_embeddings(center_words)        # [batch_size, emb_dim]
        emb_v = self.v_embeddings(context_words)       # [batch_size, emb_dim]
        emb_neg = self.v_embeddings(negative_samples)  # [batch_size, num_neg, emb_dim]

        # logsigmoid is used instead of log(sigmoid(x)): the latter underflows
        # to log(0) = -inf for strongly negative x and poisons the gradients.
        log_sigmoid = nn.functional.logsigmoid

        # Positive term: log(sigmoid(u dot v))
        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        pos_loss = log_sigmoid(score)

        # Negative term: sum(log(sigmoid(-u dot v_neg)))
        # bmm = Batch Matrix Multiplication
        neg_score = torch.bmm(emb_neg, emb_u.unsqueeze(2)).squeeze(2)  # [batch_size, num_neg]
        neg_loss = torch.sum(log_sigmoid(-neg_score), dim=1)

        return -(pos_loss + neg_loss).mean()

# 4. Training Loop
def train(vocab_size=20000, embed_dim=100, window_size=5, neg_samples=5,
          batch_size=1024, epochs=1, lr=0.001, num_workers=4, seed=None):
    """Train skip-gram embeddings with negative sampling on text8.

    Called with no arguments this uses the same hyperparameters the function
    previously hard-coded.

    Args:
        vocab_size: vocabulary slots requested from ``build_vocab`` (incl. UNK).
        embed_dim: embedding width.
        window_size: context window radius.
        neg_samples: negatives per training pair.
        batch_size: DataLoader batch size.
        epochs: passes over the corpus.
        lr: Adam learning rate.
        num_workers: DataLoader worker processes.
        seed: when not None, seeds Python's ``random``, NumPy and torch
            before any data or model is created.

    Returns:
        tuple: ``(embeddings, word2idx, idx2word)`` where ``embeddings`` is
        the center-word table as a NumPy array.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Prepare data
    raw_words = download_data()
    data, count, word2idx, idx2word = build_vocab(raw_words, vocab_size)
    dataset = SkipGramDataset(data, word2idx, count, window_size, neg_samples)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Size the tables from the vocabulary actually built: when the corpus has
    # fewer distinct words than requested, extra rows would have no idx2word
    # entry and break nearest-neighbour lookups over the returned matrix.
    model = SkipGramNeg(len(count), embed_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print(f"Starting training on {device}...")
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for i, (center, context, negs) in enumerate(dataloader):
            center, context, negs = center.to(device), context.to(device), negs.to(device)

            optimizer.zero_grad()
            loss = model(center, context, negs)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            if i % 1000 == 0:
                print(f"Epoch {epoch}, Batch {i}, Loss: {batch_loss:.4f}")

        # Single-batch losses are noisy; also report the mean over the epoch.
        if num_batches:
            print(f"Epoch {epoch}, Average Loss: {total_loss / num_batches:.4f}")

    # Save embeddings
    embeddings = model.u_embeddings.weight.data.cpu().numpy()
    return embeddings, word2idx, idx2word

if __name__ == "__main__":
    embeddings, word2idx, idx2word = train()

    # Test Similarity
    def get_similarity(word, embeddings, word2idx, idx2word, top_k=5):
        """Return the ``top_k`` words ranked just below the best cosine match.

        The top-ranked row is skipped (normally the query word itself).
        Returns the string "Word not in vocab" when ``word`` is unknown.
        """
        if word not in word2idx:
            return "Word not in vocab"

        query = embeddings[word2idx[word]]

        # Cosine similarity of every row against the query vector.
        row_norms = np.linalg.norm(embeddings, axis=1)
        sim = np.dot(embeddings, query) / (row_norms * np.linalg.norm(query))

        # Descending order; drop rank 0 and keep the next top_k.
        ranked = np.argsort(-sim)
        return [idx2word[i] for i in ranked[1:top_k + 1]]

    test_word = "queen"
    print(f"Words nearest to '{test_word}': {get_similarity(test_word, embeddings, word2idx, idx2word)}")



Starting training on cpu...




Epoch 0, Batch 0, Loss: 4.1589
Epoch 0, Batch 1000, Loss: 2.5113
Epoch 0, Batch 2000, Loss: 2.4690
Epoch 0, Batch 3000, Loss: 2.4006
Epoch 0, Batch 4000, Loss: 2.4069
Epoch 0, Batch 5000, Loss: 2.4166
Epoch 0, Batch 6000, Loss: 2.4115
Epoch 0, Batch 7000, Loss: 2.3935
Epoch 0, Batch 8000, Loss: 2.4176
Epoch 0, Batch 9000, Loss: 2.3763
Epoch 0, Batch 10000, Loss: 2.3969
Epoch 0, Batch 11000, Loss: 2.3967
Epoch 0, Batch 12000, Loss: 2.3943
Epoch 0, Batch 13000, Loss: 2.3734
Epoch 0, Batch 14000, Loss: 2.3536
Epoch 0, Batch 15000, Loss: 2.3633
Epoch 0, Batch 16000, Loss: 2.3333
Words nearest to 'queen': ['constantine', 'patriarch', 'grandson', 'vii', 'sigismund']


In [9]:
import torch
import numpy as np
from gensim.models import Word2Vec
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# 1. SETUP: We assume 'embeddings' and 'word2idx' are from the PyTorch model
# trained in the previous step. For this script, we'll wrap them for easy access.
class MyModelWrapper:
    """Thin lookup layer over an embedding matrix and its vocabulary maps."""

    def __init__(self, embeddings, word2idx, idx2word):
        """Keep references to the matrix and both vocabulary mappings."""
        self.embeddings = embeddings
        self.word2idx = word2idx
        self.idx2word = idx2word

    def get_vector(self, word):
        """Return the embedding row for ``word``, or None if it is unknown."""
        if word not in self.word2idx:
            return None
        return self.embeddings[self.word2idx[word]]

    def similarity(self, w1, w2):
        """Cosine similarity between two words; 0 if either word is unknown."""
        first, second = self.get_vector(w1), self.get_vector(w2)
        if first is None or second is None:
            return 0
        scale = np.linalg.norm(first) * np.linalg.norm(second)
        return np.dot(first, second) / scale

# 2. TRAIN GENSIM MODEL (on the same text8 dataset)
print("Loading text8 and training Gensim model...")
dataset = api.load('text8')
# sg=1 (Skip-gram), vector_size=100 to match our manual implementation
gensim_model = Word2Vec(sentences=dataset, vector_size=100, window=5, min_count=5, sg=1, workers=4)
gensim_wv = gensim_model.wv

# 3. COMPARE MODELS
my_model = MyModelWrapper(embeddings, word2idx, idx2word) # 'embeddings' from previous step

test_pairs = [
    ("king", "queen"),
    ("man", "woman"),
    ("france", "paris"),
    ("car", "engine"),
    ("dog", "cat"),
    ("computer", "keyboard")
]

print(f"\n{'Word Pair':<20} | {'Manual Sim':<12} | {'Gensim Sim':<12} | {'Diff'}")
print("-" * 65)

for w1, w2 in test_pairs:
    # MyModelWrapper.similarity returns 0 for unknown words instead of raising,
    # so check the manual vocabulary explicitly; otherwise a missing word would
    # be printed as a genuine similarity of 0.0000.
    if my_model.get_vector(w1) is None or my_model.get_vector(w2) is None:
        print(f"{w1}-{w2} not in vocab")
        continue
    try:
        sim_mine = my_model.similarity(w1, w2)
        sim_gensim = gensim_wv.similarity(w1, w2)
        print(f"{w1 + '-' + w2:<20} | {sim_mine:12.4f} | {sim_gensim:12.4f} | {abs(sim_mine-sim_gensim):.4f}")
    except KeyError:
        # Raised by the gensim lookup when a word is missing from its vocabulary.
        print(f"{w1}-{w2} not in vocab")

# 4. COMPARE TOP-K NEIGHBORS
target_word = "queen"
print(f"\nTop 5 Nearest Neighbors for '{target_word}':")

# Manual Neighbors
if target_word in word2idx:
    idx = word2idx[target_word]
    vec = embeddings[idx]
    sims = np.dot(embeddings, vec) / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(vec))
    # Rank 0 is skipped on the assumption that it is the target word itself.
    manual_neighbors = [idx2word[i] for i in sims.argsort()[::-1][1:6]]
else:
    manual_neighbors = []

# Gensim Neighbors
try:
    gensim_neighbors = [w for w, s in gensim_wv.most_similar(target_word, topn=5)]
except KeyError:
    gensim_neighbors = []

print(f"Manual implementation: {manual_neighbors}")
print(f"Gensim implementation: {gensim_neighbors}")

Loading text8 and training Gensim model...

Word Pair            | Manual Sim   | Gensim Sim   | Diff
-----------------------------------------------------------------
king-queen           |       0.9050 |       0.7220 | 0.1830
man-woman            |       0.8049 |       0.7242 | 0.0807
france-paris         |       0.7530 |       0.7221 | 0.0309
car-engine           |       0.6786 |       0.6024 | 0.0762
dog-cat              |       0.8873 |       0.6729 | 0.2144
computer-keyboard    |       0.7813 |       0.5267 | 0.2546

Top 5 Nearest Neighbors for 'queen':
Manual implementation: ['constantine', 'patriarch', 'grandson', 'vii', 'sigismund']
Gensim implementation: ['elizabeth', 'consort', 'regnant', 'prince', 'highness']


In [10]:
def solve_analogy(a, b, c, embeddings, word2idx, idx2word, top_k=1):
    """Answer the analogy ``a : b :: c : ?`` with vector arithmetic.

    The query vector ``v_b - v_a + v_c`` is compared by cosine similarity
    against every row of ``embeddings``; the three input words are excluded
    from the answer.

    Args:
        a, b, c: the analogy words.
        embeddings: 2-D array with one row per word id.
        word2idx: word -> row index.
        idx2word: row index -> word.
        top_k: number of answers to return.

    Returns:
        list[tuple]: up to ``top_k`` ``(word, similarity)`` pairs, best first
        (empty when ``top_k <= 0``). If any input word is unknown, a message
        string naming it is returned instead.
    """
    for word in [a, b, c]:
        if word not in word2idx:
            return f"'{word}' not in vocabulary."

    if top_k <= 0:
        return []

    # Vector arithmetic: v_b - v_a + v_c
    vec_a = embeddings[word2idx[a]]
    vec_b = embeddings[word2idx[b]]
    vec_c = embeddings[word2idx[c]]
    target_vec = vec_b - vec_a + vec_c

    # Cosine similarity with all vectors. Zero-length rows are divided by 1
    # so they score 0; dividing by 0 would give NaN, and NaN lands at the
    # front of the reversed argsort below, i.e. it would be the top answer.
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    safe_norms = row_norms.copy()
    safe_norms[safe_norms == 0] = 1
    norm_embeddings = embeddings / safe_norms

    target_norm = np.linalg.norm(target_vec)
    norm_target = target_vec / target_norm if target_norm > 0 else target_vec

    similarities = np.dot(norm_embeddings, norm_target)

    # Walk candidates from most to least similar, skipping the input words.
    excluded = {a, b, c}
    results = []
    for idx in np.argsort(similarities)[::-1]:
        word = idx2word[idx]
        if word in excluded:
            continue
        results.append((word, similarities[idx]))
        if len(results) >= top_k:
            break

    return results

# Test the analogy
print(
    "man : king :: woman : ?",
    solve_analogy(a="man", b="king", c="woman",
                  embeddings=embeddings, word2idx=word2idx, idx2word=idx2word),
)
print(
    "france : paris :: germany : ?",
    solve_analogy(a="france", b="paris", c="germany",
                  embeddings=embeddings, word2idx=word2idx, idx2word=idx2word),
)

man : king :: woman : ? [('afonso', np.float32(0.8578553))]
france : paris :: germany : ? [('zurich', np.float32(0.8630364))]


In [11]:
def detect_gender_bias(neutral_words, embeddings, word2idx):
    """Score words by their projection onto a man/woman, he/she direction.

    Args:
        neutral_words: words to score; those missing from ``word2idx`` are
            skipped.
        embeddings: 2-D array with one row per word id. It is not modified.
        word2idx: word -> row index. Must contain 'man', 'woman', 'he', 'she'.

    Returns:
        list[tuple]: ``(word, score)`` pairs sorted from highest to lowest
        score. Positive scores point towards the 'man'/'he' side of the axis,
        negative scores towards 'woman'/'she'.

    Raises:
        KeyError: if one of the four anchor words is not in ``word2idx``.
    """
    # 1. Define the gender direction
    def get_diff(w1, w2):
        return embeddings[word2idx[w1]] - embeddings[word2idx[w2]]

    # Average several pairs to get a more stable "gender axis"
    gender_axis = (get_diff('man', 'woman') + get_diff('he', 'she')) / 2.0
    gender_axis = gender_axis / np.linalg.norm(gender_axis)  # Normalize

    bias_scores = []
    for word in neutral_words:
        if word in word2idx:
            vec = embeddings[word2idx[word]]
            # Row indexing returns a view into `embeddings`; an in-place
            # `vec /= ...` would rescale the caller's matrix. Build a new
            # normalized array instead.
            unit_vec = vec / np.linalg.norm(vec)
            # Projection onto gender axis
            score = np.dot(unit_vec, gender_axis)
            bias_scores.append((word, score))

    # Sort by bias score (most masculine to most feminine)
    return sorted(bias_scores, key=lambda x: x[1], reverse=True)

occupations = [
    "doctor", "nurse", "engineer", "teacher",
    "homemaker", "scientist", "boss", "secretary",
]
results = detect_gender_bias(occupations, embeddings, word2idx)

# Two-column table: word (left-aligned, width 12) and its score to 4 decimals.
print("{:<12} | {}".format("Word", "Gender Bias Score"))
print("-" * 30)
for word, score in results:
    print("{:<12} | {:.4f}".format(word, score))

Word         | Gender Bias Score
------------------------------
scientist    | 0.0995
secretary    | 0.0746
engineer     | 0.0539
doctor       | 0.0354
teacher      | -0.0030
nurse        | -0.0990
boss         | -0.1281
