### Text Representation using Word2Vec and LSTM

In [1]:
from torch import nn

In [2]:
class Word2Vec(nn.Module):
    """Two-layer word2vec-style network: embedding lookup, then a linear projection.

    Attributes:
        embed: ``nn.Embedding`` table with one ``embedding_size``-dimensional
            row per vocabulary id.
        expand: bias-free ``nn.Linear`` layer that maps an embedding back to
            ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the two layers.

        Args:
            vocab_size: number of distinct token ids the model can look up and score.
            embedding_size: dimensionality of the hidden (embedding) representation.
        """
        # Subclassing nn.Module gives us parameter registration, .to(device), etc.
        super().__init__()
        # Integer ids index directly into this table; this replaces multiplying
        # a one-hot vector by the input weight matrix.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        # Purely linear output layer (no non-linearity, no bias term).
        self.expand = nn.Linear(in_features=embedding_size, out_features=vocab_size, bias=False)

    def forward(self, input):
        """Return unnormalised scores over the vocabulary for each id in ``input``.

        No softmax is applied here; the caller's loss function is expected to
        handle normalisation.
        """
        return self.expand(self.embed(input))

In [3]:
import preprocess

# Load the three data splits returned by the project's preprocess module
# (named train / test / val here)
df_train, df_test, df_val = preprocess.load()

# Collect every value of the `headline` field of the training split as a list
# (all rows, not a single line)
sentence = df_train.headline.to_list()

# Join all headlines into one string, separated by ". ", to form the
# text that is tokenized later
text = ". ".join(sentence)

In [4]:
# Show only the beginning of the corpus: `text` is every training headline
# joined together, and printing all of it floods the cell output.
print(text[:500])



In [5]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Split the joined headline text into a flat list of tokens.
tokens = word_tokenize(text)

# The vocabulary is the set of distinct tokens.
vocab = set(tokens)

# Skip-gram training pairs: each token is paired with every neighbour that lies
# at most `window_size` positions to its left or right, giving
# (center, context) tuples. Neighbours are emitted left-to-right.
data = []
window_size = 2
for i, word in enumerate(tokens):
    left_context = tokens[max(0, i - window_size):i]
    right_context = tokens[i + 1:i + 1 + window_size]
    data.extend((word, neighbour) for neighbour in left_context + right_context)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Vocabulary size, followed by a short sample of the token stream
# (printing every token would flood the cell output).
print(len(vocab))
print(tokens[:20])

21139


In [7]:
import torch
from torch.utils.data import DataLoader

# Mapping from integer ids to tokens. The vocabulary is sorted first so that
# every token receives the same id on every run; iterating a set of strings
# directly yields an order that can change between interpreter sessions.
id2tok = dict(enumerate(sorted(vocab)))

# Inverse mapping from tokens to integer ids
word2int = {word: i for i, word in id2tok.items()}

# Encode the (center, context) token pairs as id pairs. The result is stored
# under its own name so `data` keeps the original string pairs and this cell
# can be re-run without a KeyError.
data_ids = [(word2int[center], word2int[context]) for center, context in data]

# Batches of 32 (center, context) id pairs, reshuffled every epoch
dataloader = DataLoader(data_ids, batch_size=32, shuffle=True)

In [8]:
# Hyperparameters
feature_size = 100   # dimensionality of the learned word vectors
learning = 3e-4      # optimizer learning rate
epochs = 200         # number of passes over the training pairs

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network and move its parameters to the chosen device
model = Word2Vec(vocab_size=len(vocab), embedding_size=feature_size).to(device)

# Cross-entropy over the vocabulary scores, optimized with AdamW
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning)

In [9]:
# Mean training loss of every completed epoch
running_loss = []
for epoch in range(epochs):
    batch_losses = []
    for center, context in dataloader:
        # Move the batch to the same device as the model
        center = center.to(device)
        context = context.to(device)

        optimizer.zero_grad()
        # Score every vocabulary entry as a possible context of each center word
        loss = loss_fn(model(input=center), context)
        batch_losses.append(loss.item())

        loss.backward()   # compute gradients of the loss w.r.t. the parameters
        optimizer.step()  # apply the parameter update

    epoch_loss = sum(batch_losses) / len(dataloader)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}')
    running_loss.append(epoch_loss)

print(running_loss)

Epoch 1/200, Loss: 7.5803
Epoch 2/200, Loss: 6.7073
Epoch 3/200, Loss: 6.5207
Epoch 4/200, Loss: 6.4148
Epoch 5/200, Loss: 6.3426
Epoch 6/200, Loss: 6.2894


KeyboardInterrupt: 

In [None]:
# Take the weight matrix of the output projection (`expand`) off the autograd
# graph and onto the CPU so it can be used as a plain NumPy array; show its
# first row.
wordvecs = model.expand.weight.detach().cpu().numpy()
print(wordvecs[0])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np
import random

# Assuming the model and other components are already defined and the model is trained
# Let's define a function to compute the sentence embedding

def get_sentence_embedding(sentence, word_to_ix, model):
    """Return the mean word embedding of ``sentence``.

    The sentence is lower-cased and split on whitespace. Each token found in
    ``word_to_ix`` contributes its row of ``model.embed``; every other token
    contributes a zero vector. The result is the average over *all* tokens,
    so unknown tokens pull the mean towards zero.

    NOTE: lookups use the lower-cased, whitespace-split form of each token, so
    ``word_to_ix`` keys that contain upper-case characters can never match.

    Args:
        sentence: text to embed.
        word_to_ix: mapping from token string to integer id in ``model.embed``.
        model: object exposing an ``nn.Embedding`` as ``model.embed``.

    Returns:
        Tensor of shape ``(embedding_dim,)`` (squeezed, so a scalar when
        ``embedding_dim == 1``) on the same device as the embedding weights.
        An empty or whitespace-only sentence yields an all-zero vector.
    """
    embedding = model.embed
    weight = embedding.weight
    tokens = sentence.lower().split()

    # Look all known tokens up in one batched call instead of concatenating
    # one row at a time.
    known_ids = [word_to_ix[token] for token in tokens if token in word_to_ix]
    if known_ids:
        # Build the index on the embedding's own device so the function does
        # not depend on a notebook-level `device` variable.
        idx = torch.tensor(known_ids, dtype=torch.long, device=weight.device)
        total = embedding(idx).sum(dim=0)
    else:
        total = torch.zeros(embedding.embedding_dim, dtype=weight.dtype, device=weight.device)

    # Unknown tokens count as zero vectors: they add nothing to the sum but
    # still count in the denominator. max(..., 1) keeps the empty-sentence
    # case a zero vector instead of NaN.
    return (total / max(len(tokens), 1)).squeeze()

# Example usage: embed one sentence with the model and the word2int mapping.
# Note that this rebinds `sentence`, which earlier held the list of training headlines.
sentence = "Ehsan is not a donkey"
sentence_embedding = get_sentence_embedding(sentence, word2int, model)

print(f"Sentence Embedding for '{sentence}':\n{sentence_embedding}")

In [None]:
# from scipy.spatial import distance
# import numpy as np

# words_of_interest = ['Napoleon', 'horses']

# def get_distance_matrix(wordvecs, metric):
#     dist_matrix = distance.squareform(distance.pdist(wordvecs, metric))
#     return dist_matrix

# def get_k_similar_words(word, dist_matrix, k=5):
#     idx = word2int[word]
#     dists = dist_matrix[idx]
#     ind = np.argpartition(dists, k)[:k+1]
#     ind = ind[np.argsort(dists[ind])][1:]
#     out = [(i, id2tok[i], dists[i]) for i in ind]
#     return out

# dmat = get_distance_matrix(wordvecs, 'cosine')

# for word in words_of_interest:
#     print(word, [t[1] for t in get_k_similar_words(word, dmat)], "\n")