In [1]:
!pip install nltk
!pip install torch



In [3]:
import random
import os

# Initial Corpus Dataset (randomization)

# Three themed word groups (pets, African wildlife, royalty). Each generated
# "sentence" is one shuffled group, so words of a group always co-occur.
# Note that 'animal' appears in both the first and the second group.
dictionary = [
    ['dog', 'cat', 'animal', 'goldfish', 'pet', 'house', 'sleep', 'play'],
    ['lion', 'zebra', 'mammal', 'africa', 'elephant', 'cheetah', 'hunt', 'animal'],
    ['crown', 'queen', 'king', 'kingdom', 'peasant', 'royal', 'country', 'rule', 'castle']
]
# Accumulator for the generated corpus; generate_corpus() appends to it.
text_corpus = ""
# Output path for the generated corpus (relative to the working directory).
file_location = "Datasets/text_corpus.txt"
# Number of shuffled sentences generated for each word group.
iteration_count = 10000

def shuffle_corpus(pos):
    """Shuffle word group ``pos`` in place and return it as one sentence.

    Args:
        pos: Index into the module-level ``dictionary`` list.

    Returns:
        The shuffled words joined by single spaces, with one leading space so
        consecutive results can be concatenated directly.
    """
    group = dictionary[pos]
    # The shuffle mutates the group stored in `dictionary`, not a copy.
    random.shuffle(group)
    sentence = ' '.join(group)
    return ' ' + sentence

def generate_corpus(iterations, pos):
    """Append ``iterations`` shuffled sentences of word group ``pos`` to the corpus.

    Args:
        iterations: How many shuffled sentences to generate.
        pos: Index of the word group, forwarded to ``shuffle_corpus``.

    Side effects:
        Extends the module-level ``text_corpus`` string.
    """
    global text_corpus
    # Build all sentences first and concatenate once. Growing the global string
    # with `+=` inside the loop can re-copy the whole corpus on every iteration,
    # which becomes quadratic in the corpus length.
    text_corpus += ''.join(shuffle_corpus(pos) for _ in range(iterations))

# Generate `iteration_count` shuffled sentences for every word group.
for i in range(len(dictionary)):
    generate_corpus(iteration_count, i)

# Make sure the target directory exists so the write below also works on a
# fresh checkout where it has not been created yet.
output_dir = os.path.dirname(file_location)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Mode 'w' truncates an existing file, so no explicit removal is needed first.
with open(file_location, 'w') as file:
    file.write(text_corpus)

print("Text Corpus Uploaded")

Text Corpus Uploaded


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

# Fetch the NLTK stop-word list that generate_cbows() filters with.
nltk.download('stopwords')

# Read the generated corpus back from disk so this cell does not depend on the
# in-memory string built by the generation cell.
file_location = "Datasets/text_corpus.txt"
text_corpus = ""
with open(file_location, 'r') as file:
    text_corpus = file.read()
    
# Text Normalization + Tokenization
def generate_cbows(text, window_size):
    """Turn raw text into CBOW training pairs.

    The text is lower-cased and tokenized with the Treebank tokenizer; tokens
    that are not purely alphabetic or that are English stop words are dropped.

    Args:
        text: The raw corpus string.
        window_size: Number of context words taken on each side of the target.

    Returns:
        A list of ``(context_words, target_word)`` tuples. Only targets with a
        full window on both sides (``2 * window_size`` context words) are kept.
    """
    english_stop_words = set(stopwords.words('english'))
    tokens = TreebankWordTokenizer().tokenize(text.lower())
    words = [
        token for token in tokens
        if token.isalpha() and token not in english_stop_words
    ]

    required_context = window_size * 2
    cbows = []
    for i in range(len(words)):
        left = words[max(0, i - window_size):i]
        right = words[i + 1:i + window_size + 1]
        context_words = left + right
        # Targets too close to either end of the corpus get a short window; skip them.
        if len(context_words) != required_context:
            continue
        cbows.append((context_words, words[i]))
    return cbows

# Build (context, target) pairs using three words on each side of the target.
cbows = generate_cbows(text_corpus, window_size=3)

# Preview the first few pairs as a sanity check.
for context_words, target_word in cbows[:5]:
    print(f'Context Words: {context_words}, Target Word: {target_word}'  )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Context Words: ['pet', 'cat', 'play', 'house', 'animal', 'goldfish'], Target Word: dog
Context Words: ['cat', 'play', 'dog', 'animal', 'goldfish', 'sleep'], Target Word: house
Context Words: ['play', 'dog', 'house', 'goldfish', 'sleep', 'animal'], Target Word: animal
Context Words: ['dog', 'house', 'animal', 'sleep', 'animal', 'sleep'], Target Word: goldfish
Context Words: ['house', 'animal', 'goldfish', 'animal', 'sleep', 'house'], Target Word: sleep


In [3]:
import torch

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

def find_unique_words(file_location):
    """Return the sorted vocabulary of the corpus stored at ``file_location``.

    The file is read in full, lower-cased and tokenized with the Treebank
    tokenizer; only purely alphabetic tokens are kept. Stop words are not
    removed here.

    Args:
        file_location: Path of the text corpus file.

    Returns:
        A sorted list of the distinct alphabetic tokens.
    """
    with open(file_location, 'r') as corpus_file:
        raw_text = corpus_file.read()
    tokens = TreebankWordTokenizer().tokenize(raw_text.lower())
    vocabulary = {token for token in tokens if token.isalpha()}
    return sorted(vocabulary)
    
# Sorted vocabulary of the corpus; a word's position in this list is the index
# of its 1 in the one-hot vectors built below.
unique_words = find_unique_words("Datasets/text_corpus.txt")

# Converting context words and target word into one-hot encodings, then into torch tensors.

def one_hot_encoding(word, unique_words):
    """Encode ``word`` as a one-hot float vector over ``unique_words``.

    Args:
        word: The word to encode.
        unique_words: Iterable of vocabulary words; its order fixes the positions.

    Returns:
        A 1-D ``torch.float32`` tensor with 1.0 at every position whose
        vocabulary entry equals ``word`` and 0.0 elsewhere (all zeros when the
        word is not in the vocabulary).
    """
    indicator = []
    for vocab_word in unique_words:
        indicator.append(1 if word == vocab_word else 0)
    return torch.tensor(indicator, dtype=torch.float32)
    
# Pre-compute one one-hot vector per vocabulary word so each lookup is a dict access.
encodings = {word: one_hot_encoding(word, unique_words) for word in unique_words}

# For every CBOW pair, sum the one-hot vectors of the context words into a
# single bag-of-words vector and pair it with the target's one-hot vector.
cbow_vector_pairs = [
    (
        torch.stack([encodings[context_word] for context_word in context_words]).sum(dim=0),
        encodings[target_word],
    )
    for context_words, target_word in cbows
]
print(cbow_vector_pairs[:1])

[(tensor([0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 0., 0., 0.]), tensor([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]))]


In [None]:
import os
import torch
from torch import nn

# Pick the compute backend: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

class NaiveWord2Vec(nn.Module):
    """Minimal word2vec-style model: two bias-free matrix products, no non-linearity.

    ``W1`` (vocab_size x vector_dim) projects a vocabulary-sized input down to
    the embedding space; ``W2`` (vector_dim x vocab_size) projects it back to
    one score per vocabulary word.
    """

    def __init__(self, VOCAB_SIZE, VECTOR_DIM) -> None:
        """Create the two randomly initialised weight matrices.

        Args:
            VOCAB_SIZE: Number of words in the vocabulary (input/output width).
            VECTOR_DIM: Width of the embedding space.
        """
        super().__init__()
        self.vocab_size = VOCAB_SIZE
        self.vector_dim = VECTOR_DIM
        self.W1 = nn.Parameter(data=torch.randn(self.vocab_size, self.vector_dim), requires_grad=True)
        self.W2 = nn.Parameter(data=torch.randn(self.vector_dim, self.vocab_size), requires_grad=True)

    def forward(self, x) -> torch.Tensor:
        """Map vocabulary-sized input vectors to one raw score per vocabulary word.

        Args:
            x: Tensor whose last dimension is ``vocab_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``vocab_size``; no softmax is applied.
        """
        # The parameter is lower-case `x`; the original body read an undefined
        # upper-case `X` and failed on every call.
        hidden = x @ self.W1
        return hidden @ self.W2
    
# One input/output unit per distinct word in the corpus vocabulary.
VOCAB_SIZE = len(unique_words)
# Width of the learned embeddings.
# NOTE(review): presumably 2 so the vectors can be plotted directly — confirm.
VECTOR_DIM = 2

# NOTE(review): the model is not moved to `device` in this cell — confirm
# whether that happens later before training.
model = NaiveWord2Vec(VOCAB_SIZE, VECTOR_DIM)

# Define train model function and convert tensors into PyTorch datasets

def train_model():
    loss_fn = nn.CrossEntropyLoss()
    