In [3]:
import tqdm
import collections
import more_itertools
import wandb
import pandas as pd
import torch
import random
import string
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.manual_seed(42)
random.seed(52)

# Generate a list of random letters (a-z)
letters = random.choices(string.ascii_lowercase, k=100)

# Join the letters with whitespace
text8 = ' '.join(letters)
text8

'z b s m y k x e z k e i q a k a b e f h u k w x x y o r w o d n n e c e s v r l a a r n l e d e d p h o l o t u d y w v l y v g r w t x n r b r d s x j b b n e t p n r d l b o u e r z i f g v c u g t'

In [5]:
def preprocess(text: str) -> list[str]:
  text = text.lower()
  words = text.split()
  stats = collections.Counter(words)
  words = [word for word in words if stats[word] > 0]
  return words

In [6]:
corpus: list[str] = preprocess(text8)

In [4]:
def create_lookup_tables(words: list[str]) -> tuple[dict[str, int], dict[int, str]]:
  word_counts = collections.Counter(words)
  vocab = sorted(word_counts, key=lambda k: word_counts.get(k), reverse=True)
  int_to_vocab = {ii+1: word for ii, word in enumerate(vocab)}
  int_to_vocab[0] = '<PAD>'
  vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
  return vocab_to_int, int_to_vocab

In [None]:
#creating dictionary 
words_to_ids, ids_to_words = create_lookup_tables(corpus)

In [9]:
#creating token for the list we have from our dictionary 
tokens = [words_to_ids[word] for word in corpus]

In [34]:
class SkipGramFoo(torch.nn.Module):
  def __init__(self, voc, emb, ctx):
    super().__init__()
    self.ctx = ctx
    self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
    self.ffw = torch.nn.Linear(in_features=emb, out_features=voc, bias=False)
    self.sig = torch.nn.Sigmoid()


#new forwarding for batch size 
  def forward(self, inpt, trgs):
    
    emb = self.emb(inpt)
    batch_size = inpt.size(0)  
    
    ctx = self.ffw.weight[trgs.to(inpt.device)]  
    
    assert ctx.size(0) == emb.size(0), f"Context batch size {ctx.size(0)} doesn't match embeddings batch size {emb.size(0)}"
    # Perform batch matrix multiplication
    print(emb.shape)
    out = torch.bmm(ctx, emb.unsqueeze(2)).squeeze(2)  # Shape: (batch_size, 2)
    
    print(out.shape)
    # Apply sigmoid and clamp to prevent NaNs
    out = self.sig(out).clamp(min=1e-7, max=1 - 1e-7)
    
    return out 

In [99]:
import torch

# class SkipGramFoo(torch.nn.Module):
#     def __init__(self, voc, emb, ctx):
#         super().__init__()
#         self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
#         self.ctx_emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)  # Additional embedding for context
#         self.ffw = torch.nn.Linear(in_features=emb, out_features=voc, bias=False)
#         self.sig = torch.nn.Sigmoid()

#     def forward(self, inpt, trgs):
#         # Get embeddings for input and context
#         emb = self.emb(inpt)
#         print(emb.shape)
#         ctx = self.ctx_emb(trgs)
#         print(ctx.shape)  # Get context embeddings
        
#         # Ensure dimensions match for operations
#         assert ctx.size(0) == emb.size(0), f"Context batch size {ctx.size(0)} doesn't match embeddings batch size {emb.size(0)}"
        
#         # Perform batch matrix multiplication
#         # (batch_size, emb_dim) * (batch_size, vocab_size) -> (batch_size, vocab_size)
#         context_matrix = torch.bmm(ctx, emb) # Change this if necessary
#         print('context matrix is', context_matrix.shape)
#         similarity_matrix = torch.matmul(context_matrix, context_matrix.T) 
#         print('similarity matrix is', similarity_matrix.shape)
#         soft_matrix = torch.nn.functional.softmax(similarity_matrix,dim=1)
    

#         return soft_matrix
import torch

class SkipGramFoo(torch.nn.Module):
    def __init__(self, voc, emb, ctx):
        super().__init__()
        self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
        self.ctx_emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)  # Additional embedding for context
        self.ffw = torch.nn.Linear(in_features=emb, out_features=voc, bias=False)
        self.sig = torch.nn.Sigmoid()

    def forward(self, inpt, trgs):
        # Get embeddings for input and context
        emb = self.emb(inpt)  # Shape: (batch_size, emb_dim) -> e.g., (1, 64)
        print('shape of emb',emb.shape)
        ctx = self.ctx_emb(trgs)  # Shape: (batch_size, num_context_tokens, emb_dim) -> e.g., (1, 4, 64)
        print('shape of target words',ctx.shape)
        
        # Ensure dimensions match for operations
        assert ctx.size(2) == emb.size(1), f"Context batch size {ctx.size(0)} doesn't match embeddings batch size {emb.size(0)}"
        
        # Reshape emb to (batch_size, emb_dim, 1) to make it compatible for bmm
        ctx = ctx.squeeze(0)  # Shape: (1, 64, 1)

        assert ctx.size(1) == emb.size(1), f"Context batch size {ctx.size(0)} doesn't match embeddings batch size {emb.size(0)}"

        # Perform batch matrix multiplication
        # (batch_size, num_context_tokens, emb_dim) * (batch_size, emb_dim, 1) -> (batch_size, num_context_tokens, 1)
        context_matrix = torch.cat((emb,ctx) ,dim=0) # Resulting shape: (1, 4, 1)
        print('context matrix is', context_matrix.shape)

        # Compute similarity matrix for the context matrix
        similarity_matrix = torch.matmul(context_matrix, context_matrix.T) 
        print('similarity matrix is', similarity_matrix.shape)

        # Apply softmax over similarity matrix
        soft_matrix = torch.nn.functional.softmax(similarity_matrix, dim=1)
    
        return soft_matrix



In [100]:
args = (len(words_to_ids), 64,2)
mFoo = SkipGramFoo(*args)
print('mFoo', sum(p.numel() for p in mFoo.parameters()))

mFoo 5184


In [67]:
#learning rate thing 
opFoo = torch.optim.Adam(mFoo.parameters(), lr=0.003)

In [101]:
import torch
import more_itertools
import tqdm
import wandb

# Initialize W&B
wandb.init(project="word2vec_attention", entity="omareweis123", name='no_batching, tokens1000000000, 30epochs, titlesadded')

# Set parameters
learning_rate = 0.001  # Define your learning rate
mFoo = mFoo.to(device)

# Set context size
context_size = 2  # Example context size
window_size = 2 * context_size + 1  # Total tokens in the window

# Initialize the optimizer
opFoo = torch.optim.Adam(mFoo.parameters(), lr=learning_rate)

# Training loop
for epoch in range(1):
    wins = list(more_itertools.windowed(tokens[:1000000000], window_size))  # Convert to list for easier iteration
    prgs = tqdm.tqdm(wins, total=len(wins), desc=f"Epoch {epoch + 1}", leave=False)

    total_loss = 0.0  # Initialize total loss for the epoch

    for win in prgs:
        # Prepare input and target tensors for a single window
        inpt = torch.LongTensor([win[context_size]]).to(device)  # Central token for the window
        trgs = torch.LongTensor([win[:context_size] + win[context_size + 1:]]).to(device)  # Context tokens (left and right)

        print(inpt.shape)
        print(trgs.shape)

        # Zero gradients
        opFoo.zero_grad()
        
        # Forward pass
        loss = mFoo(inpt, trgs)

        # Backward pass and optimization
        loss.backward()
        opFoo.step()

        # Accumulate loss
        total_loss += loss.item()

        # Log the loss
        wandb.log({'loss': loss.item(), 'learning_rate': learning_rate})

    # Calculate and log average loss for the epoch
    average_loss = total_loss / len(wins) if len(wins) > 0 else 0
    wandb.log({'average_loss': average_loss})

# Finish the W&B logging
wandb.finish()


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

                                               

torch.Size([1])
torch.Size([1, 4])
shape of emb torch.Size([1, 64])
shape of target words torch.Size([1, 4, 64])
context matrix is torch.Size([5, 64])
similarity matrix is torch.Size([5, 5])




RuntimeError: grad can be implicitly created only for scalar outputs