In [None]:
import spacy
# Probe words: three animals followed by three vehicles.
words = ["dog", "cat", "tiger", "car", "bus", "bicycle"]
# spaCy pipeline (large English model) that supplies the word vectors.
nlp  = spacy.load("en_core_web_lg")
# Run every word through the pipeline on its own and keep the vector of the
# result, so vecs[i] lines up with words[i].
vecs = [nlp(word).vector for word in words]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Compare selected word pairs by the cosine similarity of their vectors.
# Each tuple holds two positions into `words` / `vecs`.
pairs = [(0, 1), (0, 2), (3, 4), (3, 5)]

print("Cosine Similarity between word vectors:")
for left, right in pairs:
    # Each vector is wrapped in a list so the call sees two one-row inputs;
    # the single score is then read from position [0][0] of the result.
    score = cosine_similarity([vecs[left]], [vecs[right]])[0][0]
    print(f"Similarity between {words[left]} and {words[right]} : {score}")

In [None]:
import torch
import torch.nn as nn

class w2v(nn.Module):
    """Small word2vec-style model: summed context embeddings feed a linear classifier.

    The embedding vectors of the given context word ids are added together and
    passed through a linear layer that produces one score (logit) per
    vocabulary entry.

    Args:
        vocab_size: Number of distinct word ids (rows of the embedding table).
        embedding_dim: Length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Stored as a raw Parameter, so the table is reachable directly as
        # `self.embeddings` and is registered under that name.
        self.embeddings = nn.Parameter(torch.randn(vocab_size, embedding_dim), requires_grad=True)
        self.classifier = nn.Linear(embedding_dim, vocab_size)

    def forward(self, ids):
        """Score every vocabulary entry for the given context ids.

        Args:
            ids: Integer tensor of word ids, either a single context of shape
                ``(context,)`` or a batch of shape ``(batch, context)``.

        Returns:
            Logits of shape ``(vocab_size,)`` for a single context, or
            ``(batch, vocab_size)`` for a batch.
        """
        words = self.embeddings[ids]
        # Sum over the context axis, which is always second to last
        # ((context, dim) or (batch, context, dim)). Using dim=0 here would
        # collapse the batch axis instead when a batch is passed.
        classifier_input = torch.sum(words, dim=-2)
        return self.classifier(classifier_input)

    def getEmbeddings(self, id):
        """Return the embedding of word id ``id`` as a tensor of shape ``(1, embedding_dim)``."""
        # Build the index on the same device as the table.
        id = torch.tensor([id], dtype=torch.long, device=self.embeddings.device)
        return self.embeddings[id]

In [None]:
# Toy corpus: one short sentence per entry.
corpus = [
    "the dog is nice",
    "the cat is lovely",
    "the tiger is wild",
    "I drive a car",
    "I ride a bus",
    "I like to cycle on a bicycle",
]

def toId(texts):
    """Map every distinct whitespace-separated word in ``texts`` to an integer id.

    Ids start at 0 and are handed out in order of first appearance. Words are
    compared exactly as written (case-sensitive, no normalisation).

    Args:
        texts: Iterable of strings.

    Returns:
        dict mapping each word to its id.
    """
    idDict = {}
    for sentence in texts:
        for token in sentence.split():
            # setdefault leaves known words alone; for a new word the current
            # dictionary size is the next unused id.
            idDict.setdefault(token, len(idDict))
    return idDict

# Build the vocabulary from the corpus and print the resulting word -> id mapping.
idDict = toId(corpus)
print("Vocabulary:", idDict)

In [None]:
def applyAndTransform(idDict, corpus):
    """Encode each text in ``corpus`` as a list of word ids.

    Args:
        idDict: Mapping from word to integer id.
        corpus: Iterable of strings; each is split on whitespace.

    Returns:
        List with one list of ids per text, in the original order.

    Raises:
        KeyError: If a word is not present in ``idDict``.
    """
    return [[idDict[token] for token in sentence.split()] for sentence in corpus]


# Bare last expression: the notebook displays the id-encoded corpus; the result is not stored.
applyAndTransform(idDict, corpus)

In [None]:
# Hand-written training pairs: each row of `inputs` holds two word ids and the
# matching row of `targets` holds the single word id to predict from them.
# NOTE(review): with the ids assigned by toId(corpus), 4 is "cat" and 3 is
# "nice", which never share a sentence; [4, 5] ("cat", "lovely") may have been
# intended for the last pair - confirm.
inputs = torch.tensor([[0, 2], [1, 3], [0, 2], [4, 3]], dtype=torch.long)
targets = torch.tensor([[1], [2], [4], [2]], dtype=torch.long)

In [None]:
# Seed torch's global RNG so the randomly initialised weights, and therefore
# the printed losses, are the same on every Restart & Run All.
torch.manual_seed(0)

model = w2v(vocab_size=len(idDict), embedding_dim=8)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Training loop: one optimizer step per (sample, target) pair.
for epoch in range(100):
    lossAbs = 0.0  # loss summed (not averaged) over all samples of this epoch
    # Iterate over each input-target pair in the dataset
    for sample, target in zip(inputs, targets):
        optimizer.zero_grad()  # Clear gradients left over from the previous step
        output = model(sample)  # Forward pass: one logit per vocabulary entry
        # The criterion is given a leading batch axis of size 1 so the logits
        # line up with the one-element target row.
        loss = criterion(output.unsqueeze(0), target)
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        lossAbs += loss.item()  # Accumulate as a plain Python float
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {lossAbs}")

print("Training complete.")