In [1]:
import numpy as np

# Pre-processing: Create a corpus

In [None]:
# Toy corpus: every entry is a short whitespace-separated phrase.
my_corpus = ["python code", "c code", "hi there", "hi all"]
# Vocabulary = the distinct words across all phrases, in alphabetical order so
# that the word -> index mapping built from it is stable between runs.
tokens = sorted({word for sentence in my_corpus for word in sentence.split()})
tokens

In [None]:
# Map each vocabulary word to its position in the sorted token list; that
# position is the word's row in the embedding matrix and its one-hot slot.
word_to_idx = dict(zip(tokens, range(len(tokens))))
word_to_idx

In [None]:
# Number of distinct words in the corpus. It fixes the length of the one-hot
# input vectors and one dimension of each weight matrix (W and W_prime).
vocab_size = len(tokens)
vocab_size

# Hyperparameters

In [5]:
# Size of the hidden layer, i.e. the dimensionality of each learned word vector
# (W has shape vocab_size x N). Two dimensions keep the vectors plottable in 2D.
N = 2
# Step size for the per-sample gradient-descent updates in the training loop.
learning_rate = 1e-2
# Number of full passes over the corpus.
nof_epochs = 10_000

# Initialize weights

In [6]:
# Draw the initial weights from a dedicated, seeded generator so that a fresh
# "Restart Kernel -> Run All" starts from the same weights every time; training
# itself is deterministic, so the loss curve and embeddings become reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

# Both matrices are initialised uniformly in [0, 1), like np.random.rand.
W = rng.random((vocab_size, N))  # input->hidden layer weights (one row per word)
W_prime = rng.random((N, vocab_size))  # hidden->output layer weights

In [7]:
def one_hot_encode(word_idx, vocab_size):
    """Return a one-hot vector of length ``vocab_size`` for ``word_idx``.

    Args:
        word_idx: Position to set to 1 (index of the word in the vocabulary).
        vocab_size: Length of the returned vector.

    Returns:
        A 1-D float ``numpy`` array that is 0 everywhere except at ``word_idx``.

    Raises:
        IndexError: If ``word_idx`` is out of bounds for ``vocab_size``.
    """
    encoded = np.zeros(vocab_size, dtype=float)
    encoded[word_idx] = 1.0
    return encoded

In [8]:
def softmax(x, axis=0):
    """Numerically stable softmax along ``axis``.

    The maximum is subtracted along the same axis that is normalised, so every
    slice contains at least one ``exp(0) == 1`` term and the denominator can
    never underflow to zero. (Subtracting the global maximum instead yields
    0/0 = NaN for any slice lying far below it.)

    Args:
        x: Array-like of scores (logits). For a 1-D input the result is the
            usual probability vector.
        axis: Axis along which the probabilities sum to 1. Defaults to 0.

    Returns:
        A float array with the same shape as ``x`` whose entries along
        ``axis`` are non-negative and sum to 1.
    """
    x = np.asarray(x, dtype=float)
    # Shift by the per-slice maximum: leaves the result mathematically
    # unchanged but keeps np.exp from overflowing or fully underflowing.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

# Training loop

In [None]:
# Train on (context -> target) pairs with plain per-sample gradient descent.
# W and W_prime are updated in place, so re-running this cell continues
# training from the current weights rather than starting over.
loss_all = []  # summed negative log-likelihood of each epoch
for epoch in range(nof_epochs):  # Run for a fixed number of epochs
    loss = 0
    for sentence in my_corpus:
        words = sentence.split()
        for i, target_word in enumerate(words):
            # Only the first two words of a sentence form training pairs:
            # word 0 is predicted from word 1 and word 1 from word 0.
            # A one-word sentence has no partner and is skipped (the length
            # check avoids an IndexError on words[1]); words beyond the
            # second are ignored.
            if i == 1:
                context_word = words[0]
            elif i == 0 and len(words) > 1:
                context_word = words[1]
            else:
                context_word = None

            if context_word is None:
                continue

            # One-hot encode the context word (the network input)
            context_idx = word_to_idx[context_word]
            target_idx = word_to_idx[target_word]
            x = one_hot_encode(context_idx, vocab_size)

            # Forward pass
            h = W.T @ x  # hidden activation: the context word's row of W
            u = W_prime.T @ h  # one score per vocabulary word
            y_pred = softmax(u)

            # Backpropagation: prediction error y_pred - y_true, where y_true
            # is the one-hot vector of the target word
            e = y_pred.copy()
            e[target_idx] -= 1

            # Loss (negative log likelihood); the epsilon guards against log(0)
            loss += -np.log(y_pred[target_idx] + 1e-8)

            # Gradients for W_prime and W. Both are computed before either
            # matrix is updated, so dW uses the pre-update W_prime.
            dW_prime = np.outer(h, e)
            dW = np.outer(x, W_prime @ e)

            # Update weights
            W_prime -= learning_rate * dW_prime
            W -= learning_rate * dW

    loss_all.append(loss)

    if epoch % 1000 == 0:
        print(f'Epoch: {epoch}, Loss: {loss:.4f}')

# Display learned word vectors

In [None]:
# Print the learned vector (row of W) for each vocabulary word.
for word in word_to_idx:
    idx = word_to_idx[word]
    print(f"Word: {word}, Vector: {W[idx]}")

In [None]:
import matplotlib.pyplot as plt
# Plot the summed per-epoch loss recorded during training.
fig, ax = plt.subplots(figsize=(4, 3))
ax.plot(loss_all)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss Over Epochs')
plt.show()

In [None]:
# List every vocabulary word next to its index.
for word, position in word_to_idx.items():
    print(word, position)

In [None]:
# Stack the learned vectors into one array, one row per word, in the same
# order in which word_to_idx iterates over its words.
word_vectors = np.array([W[row] for row in word_to_idx.values()])
word_vectors

# Compute all-by-all similarities for the learned word vectors

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between every pair of rows of word_vectors. Entry [i, j]
# compares the vectors of the i-th and j-th words, so the matrix is square
# with one row and one column per word.
similarities = cosine_similarity(word_vectors)
similarities

In [None]:
# Plot the matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Label both axes with the words, in the same order as the rows of word_vectors.
labels = list(word_to_idx)

plt.figure(figsize=(10, 8))
sns.heatmap(
    similarities,
    annot=True,
    cmap='binary',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title('Word Vector Similarities')
plt.show()

In [17]:
# !pip install adjustText

In [None]:
# 2D map of word_vectors
import matplotlib.pyplot as plt
from adjustText import adjust_text  # You'll need to: pip install adjustText

plt.figure(figsize=(5, 4))

# One scatter call per word (so each point gets its own colour from the
# colour cycle) plus a text label placed at the same coordinates.
texts = []
for label, point in zip(word_to_idx, word_vectors):
    px, py = point
    plt.scatter(px, py)
    texts.append(plt.text(px, py, label, fontsize=12))

# Let adjustText reposition the labels, drawing a thin red arrow for each.
arrow_style = dict(arrowstyle='->', color='red', lw=0.5)
adjust_text(texts, arrowprops=arrow_style)

plt.title('2D Map of Word Vectors')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)
plt.show()