In [2]:
#
#
#

import tqdm
import collections
import more_itertools
import wandb
import pandas as pd
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#
#
#
torch.manual_seed(42)


#
#
#
with open('text8') as f: text8: str = f.read()

# Load the CSV file and extract the 'title_cleaned' column
df = pd.read_csv('titles_20_scores')
titles_cleaned = df['title_cleaned'].tolist()

# Convert the list of titles to a single string (you can choose a delimiter)
titles_string = ' '.join(titles_cleaned)  # Joining with a space

# Concatenate the titles string to the text8 variable
text8 += ' ' + titles_string  # Add a space for separation

#
#
#
def preprocess(text: str) -> list[str]:
  text = text.lower()
  text = text.replace('.',  ' <PERIOD> ')
  text = text.replace(',',  ' <COMMA> ')
  text = text.replace('"',  ' <QUOTATION_MARK> ')
  text = text.replace(';',  ' <SEMICOLON> ')
  text = text.replace('!',  ' <EXCLAMATION_MARK> ')
  text = text.replace('?',  ' <QUESTION_MARK> ')
  text = text.replace('(',  ' <LEFT_PAREN> ')
  text = text.replace(')',  ' <RIGHT_PAREN> ')
  text = text.replace('--', ' <HYPHENS> ')
  text = text.replace('?',  ' <QUESTION_MARK> ')
  text = text.replace(':',  ' <COLON> ')
  words = text.split()
  stats = collections.Counter(words)
  words = [word for word in words if stats[word] > 5]
  return words


#
#
#
corpus: list[str] = preprocess(text8)
print(type(corpus)) # <class 'list'>
print(len(corpus))  # 16,680,599
print(corpus[:7])   # ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']


#
#
#
def create_lookup_tables(words: list[str]) -> tuple[dict[str, int], dict[int, str]]:
  word_counts = collections.Counter(words)
  vocab = sorted(word_counts, key=lambda k: word_counts.get(k), reverse=True)
  int_to_vocab = {ii+1: word for ii, word in enumerate(vocab)}
  int_to_vocab[0] = '<PAD>'
  vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
  return vocab_to_int, int_to_vocab


#
#
#
words_to_ids, ids_to_words = create_lookup_tables(corpus)
tokens = [words_to_ids[word] for word in corpus]
print(type(tokens)) # <class 'list'>
print(len(tokens))  # 16,680,599
print(tokens[:7])   # [5234, 3081, 12, 6, 195, 2, 3134]


#
#
#
print(ids_to_words[5234])        # anarchism
print(words_to_ids['anarchism']) # 5234
print(words_to_ids['have'])      # 3081
print(len(words_to_ids))         # 63,642


#
#
#
class SkipGramOne(torch.nn.Module):
  def __init__(self, voc, emb, _):
    super().__init__()
    self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
    self.ffw = torch.nn.Linear(in_features=emb, out_features=voc)
    self.max = torch.nn.Softmax(dim=1)

  def forward(self, inpt, trgs):
    emb = self.emb(inpt)
    out = self.ffw(emb)
    sft = self.max(out)
    return -(sft[0, trgs]).log().mean()


#
#
#
class SkipGramTwo(torch.nn.Module):
  def __init__(self, voc, emb, ctx):
    super().__init__()
    self.ctx = ctx
    self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
    self.ffw = torch.nn.Linear(in_features=emb, out_features=ctx*voc)
    self.max = torch.nn.Softmax(dim=1)

  def forward(self, inpt, trgs):
    emb = self.emb(inpt)
    hid = self.ffw(emb)
    lgt = hid.view(self.ctx, -1)
    sft = self.max(lgt)
    arg = torch.arange(sft.size(0))
    foo = sft[arg, trgs]
    return -foo.log().mean()


#
#
#
class SkipGramTre(torch.nn.Module):
  def __init__(self, voc, emb, ctx):
    super().__init__()
    self.ctx = ctx
    self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
    self.ffw = torch.nn.Linear(in_features=emb, out_features=voc, bias=False)
    self.sig = torch.nn.Sigmoid()

  def forward(self, inpt, trgs):
    emb = self.emb(inpt)
    ctx = self.ffw.weight[trgs]
    lgt = torch.mm(ctx, emb.T)
    sig = self.sig(lgt)
    return -sig.log().mean()


#
#
#
class SkipGramFoo(torch.nn.Module):
  def __init__(self, voc, emb, ctx):
    super().__init__()
    self.ctx = ctx
    self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
    self.ffw = torch.nn.Linear(in_features=emb, out_features=voc, bias=False)
    self.sig = torch.nn.Sigmoid()

  # def forward(self, inpt, trgs, rand):
  #   emb = self.emb(inpt)
  #   ctx = self.ffw.weight[trgs]
  #   rnd = self.ffw.weight[rand]
  #   out = torch.mm(ctx, emb.T)
  #   rnd = torch.mm(rnd, emb.T)
  #   out = self.sig(out).clamp(min=1e-7, max=1 - 1e-7)
  #   rnd = self.sig(rnd).clamp(min=1e-7, max=1 - 1e-7)
  #   pst = -out.log().mean()
  #   ngt = -(1 - rnd).log().mean()
  #   return pst + ngt

#new forwarding for batch size 
  def forward(self, inpt, trgs, rand):
    # Embedding lookup for input (shape: [batch_size, embedding_dim])
    emb = self.emb(inpt)
    
    # Ensure context (trgs) and random samples (rand) have the same batch size as inpt
    batch_size = inpt.size(0)  # Get the current batch size

    # Slice or generate the random tensor according to the input batch size
    rand = rand[:batch_size]  # Adjust random tensor to match current batch size
    
    ctx = self.ffw.weight[trgs.to(inpt.device)]  # Shape: [batch_size, 2, embedding_dim]
    rnd = self.ffw.weight[rand.to(inpt.device)]  # Shape: [batch_size, 2, embedding_dim]

    # Ensure the batch size matches before performing batch matrix multiplication
    assert ctx.size(0) == emb.size(0), f"Context batch size {ctx.size(0)} doesn't match embeddings batch size {emb.size(0)}"
    assert rnd.size(0) == emb.size(0), f"Random batch size {rnd.size(0)} doesn't match embeddings batch size {emb.size(0)}"
    
    # Perform batch matrix multiplication
    out = torch.bmm(ctx, emb.unsqueeze(2)).squeeze(2)  # Shape: (batch_size, 2)
    rnd = torch.bmm(rnd, emb.unsqueeze(2)).squeeze(2)  # Shape: (batch_size, 2)
    
    # Apply sigmoid and clamp to prevent NaNs
    out = self.sig(out).clamp(min=1e-7, max=1 - 1e-7)
    rnd = self.sig(rnd).clamp(min=1e-7, max=1 - 1e-7)

    # Calculate loss
    pst = -out.log().mean()   # Positive sample log-likelihood
    ngt = -(1 - rnd).log().mean()  # Negative sample log-likelihood
    
    return pst + ngt





#
#
#
args = (len(words_to_ids), 64, 2)
mOne = SkipGramOne(*args)
mTwo = SkipGramTwo(*args)
mTre = SkipGramTre(*args)
mFoo = SkipGramFoo(*args)


#
#
#
print('mOne', sum(p.numel() for p in mOne.parameters()))
print('mTwo', sum(p.numel() for p in mTwo.parameters()))
print('mTre', sum(p.numel() for p in mTre.parameters()))
print('mFoo', sum(p.numel() for p in mFoo.parameters()))


#
#
#
opOne = torch.optim.Adam(mOne.parameters(), lr=0.003)
opTwo = torch.optim.Adam(mTwo.parameters(), lr=0.003)
opTre = torch.optim.Adam(mTre.parameters(), lr=0.003)
opFoo = torch.optim.Adam(mFoo.parameters(), lr=0.003)


# #
# #
# #
# wandb.init(project='skip-gram', name='mOne')
# for epoch in range(10):
#   wins = more_itertools.windowed(tokens[:10000], 3)
#   prgs = tqdm.tqdm(enumerate(wins), total=len(tokens[:10000]), desc=f"Epoch {epoch+1}", leave=False)
#   for i, tks in prgs:
#     opOne.zero_grad()
#     inpt = torch.LongTensor([tks[1]])
#     trgs = torch.LongTensor([tks[0], tks[2]])
#     loss = mOne(inpt, trgs)
#     loss.backward()
#     opOne.step()
#     wandb.log({'loss': loss.item()})
# wandb.finish()


# #
# #
# #
# wandb.init(project='skip-gram', name='mTwo')
# for epoch in range(10):
#   wins = more_itertools.windowed(tokens[:10000], 3)
#   prgs = tqdm.tqdm(wins, desc=f"Epoch {epoch+1}", leave=False)
#   for i, tks in prgs:
#     inpt = torch.LongTensor([tks[1]])
#     trgs = torch.LongTensor([tks[0], tks[2]])
#     opTwo.zero_grad()
#     loss = mTwo(inpt, trgs)
#     loss.backward()
#     opTwo.step()
#     wandb.log({'loss': loss.item()})
# wandb.finish()


# #
# #
# #
# wandb.init(project='skip-gram', name='mTre')
# for epoch in range(10):
#   wins = more_itertools.windowed(tokens[:10000], 3)
#   prgs = tqdm.tqdm(enumerate(wins), total=len(tokens[:10000]), desc=f"Epoch {epoch+1}", leave=False)
#   for i, tks in prgs:
#     inpt = torch.LongTensor([tks[1]])
#     trgs = torch.LongTensor([tks[0], tks[2]])
#     opTre.zero_grad()
#     loss = mTre(inpt, trgs)
#     loss.backward()
#     opTre.step()
#     wandb.log({'loss': loss.item()})
# wandb.finish()
import torch
import more_itertools
import tqdm
import wandb

# Initialize W&B
wandb.init(project="cbow_training", entity="omareweis123", name='batch_size(4096),tokens1000000000,30epochs,titlesadded')

# Set parameters
batch_size = 4096
learning_rate = 0.001  # Define your learning rate
mFoo = mFoo.to(device)

# Set context size
context_size = 2  # Example context size
window_size = 2 * context_size + 1  # Total tokens in the window

# Initialize the optimizer
opFoo = torch.optim.Adam(mFoo.parameters(), lr=learning_rate)

# Training loop
for epoch in range(30):
    wins = list(more_itertools.windowed(tokens[:1000000000], window_size))  # Convert to list for easier batching
    prgs = tqdm.tqdm(range(0, len(wins), batch_size), total=len(wins) // batch_size, desc=f"Epoch {epoch + 1}", leave=False)

    total_loss = 0.0  # Initialize total loss for the epoch
    num_batches = 0   # Counter for batches

    for batch_idx in prgs:
        batch_wins = wins[batch_idx:batch_idx + batch_size]

        # Prepare batch inputs and targets, ensuring they're on the correct device
        inpts = torch.LongTensor([win[context_size] for win in batch_wins]).to(device)  # Central token for each window
        trgs = torch.LongTensor([[win[i] for i in range(context_size)] + [win[i] for i in range(context_size + 1, window_size)]
                                  for win in batch_wins]).to(device)  # Context tokens (left and right)
        rand = torch.randint(0, len(words_to_ids), (batch_size, 2)).to(device)  # Random negative samples on the same device

        # Zero gradients
        opFoo.zero_grad()

        # Forward pass
        loss = mFoo(inpts, trgs, rand)

        # Backward pass and optimization
        loss.backward()
        opFoo.step()

        # Accumulate loss
        total_loss += loss.item()
        num_batches += 1

        # Log the loss
        wandb.log({'loss': loss.item(), 'learning_rate': learning_rate})

    # Calculate and log average loss for the epoch
    average_loss = total_loss / num_batches if num_batches > 0 else 0
    wandb.log({'average_loss': average_loss})

# Finish the W&B logging
wandb.finish()





## this is without batch size 
# # Initialize Weights and Biases
# wandb.init(project="cbow_training", entity="omareweis123",name='clamped_logs')

# # Move the model to the GPU if available
# mFoo = mFoo.to(device)

# # Training loop
# for epoch in range(10):
#     wins = more_itertools.windowed(tokens[:10000], 3)
#     prgs = tqdm.tqdm(enumerate(wins), total=len(tokens[:10000]), desc=f"Epoch {epoch+1}", leave=False)
#     for i, tks in prgs:
#         # Move input tensors to the same device (GPU or CPU)
#         inpt = torch.LongTensor([tks[1]]).to(device)
#         trgs = torch.LongTensor([tks[0], tks[2]]).to(device)
#         rand = torch.randint(0, len(words_to_ids), (2,)).to(device)

#         # Zero gradients
#         opFoo.zero_grad()

#         # Forward pass
#         loss = mFoo(inpt, trgs, rand)

#         # Backward pass and optimization
#         loss.backward()
#         opFoo.step()

#         # Log the loss
#         wandb.log({'loss': loss.item()})

# # Finish the W&B logging
# wandb.finish()


#
#
#
# wandb.init(project="cbow_training", entity="omareweis123")

# # Define your token limit here (e.g., 10,000 tokens)
# token_limit = 10000

# # Split data into 80% train, 20% validation from the limited set
# train_tokens = tokens[:int(0.8 * token_limit)]
# val_tokens = tokens[int(0.8 * token_limit):token_limit]

# # Move the model to the correct device
# mFoo = mFoo.to(device)

# for epoch in range(10):
#     # Training loop
#     mFoo.train()
#     train_wins = list(more_itertools.windowed(train_tokens, 3))  # Create training windows
#     train_prgs = tqdm.tqdm(enumerate(train_wins), total=len(train_wins), desc=f"Epoch {epoch+1} [Train]", leave=False)
#     train_loss_total = 0
#     for i, tks in train_prgs:
#         if None in tks:  # Skip invalid windows (e.g., at the end)
#             continue
#         inpt = torch.LongTensor([tks[1]]).to(device)
#         trgs = torch.LongTensor([tks[0], tks[2]]).to(device)
#         rand = torch.randint(0, len(words_to_ids), (2,)).to(device)  # Move rand to device
#         opFoo.zero_grad()
#         loss = mFoo(inpt, trgs, rand)
#         loss.backward()
#         opFoo.step()
#         train_loss_total += loss.item()
#         wandb.log({'train_loss': loss.item()})

#     avg_train_loss = train_loss_total / len(train_wins)
#     wandb.log({'avg_train_loss': avg_train_loss})

#     # Validation loop
#     mFoo.eval()
#     val_wins = list(more_itertools.windowed(val_tokens, 3))  # Create validation windows
#     val_prgs = tqdm.tqdm(enumerate(val_wins), total=len(val_wins), desc=f"Epoch {epoch+1} [Validation]", leave=False)
#     val_loss_total = 0
#     with torch.no_grad():
#         for i, tks in val_prgs:
#             if None in tks:  # Skip invalid windows
#                 continue
#             inpt = torch.LongTensor([tks[1]]).to(device)
#             trgs = torch.LongTensor([tks[0], tks[2]]).to(device)
#             rand = torch.randint(0, len(words_to_ids), (2,)).to(device)  # Move rand to device
#             val_loss = mFoo(inpt, trgs, rand)
#             val_loss_total += val_loss.item()

#     avg_val_loss = val_loss_total / len(val_wins)
#     wandb.log({'avg_val_loss': avg_val_loss})

# wandb.finish()


<class 'list'>
19214318
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']
<class 'list'>
19214318
[6078, 3612, 13, 7, 226, 2, 2811]
eggs
6078
39
70572
mOne 9103788
mTwo 13690968
mTre 9033216
mFoo 9033216


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33momareweis123[0m. Use [1m`wandb login --relogin`[0m to force relogin


                                                              

0,1
average_loss,█▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▄▂▂▃▂▂▂▃▂▃▂▃▂▂▂▂▃▂▁▂▁▃▃▂▂▂▂▃▃▂▂▃▃▃▂▂▃▂▂

0,1
average_loss,0.42727
learning_rate,0.001
loss,0.49783


In [3]:
size_of_ids_to_word = len(ids_to_words)
print(f"Size of ids_to_word dictionary: {size_of_ids_to_word}")

Size of ids_to_word dictionary: 70572


In [4]:
len(text8)

116327976

In [5]:
# Assuming text8 is your string
tokens = te.split()  # Tokenize by splitting on whitespace
number_of_tokens = len(tokens)  # Count the number of tokens

print(f"Number of tokens in text8: {number_of_tokens}")


NameError: name 'te' is not defined

In [6]:
#saving dictionary of words 
import pickle


# save dictionary to person_data.pkl file
with open('vocab_dict.pkl', 'wb') as fp:
    pickle.dump(ids_to_words, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


In [38]:
#verigying model 
# Print all weights and biases of the model
print(mFoo.state_dict())


OrderedDict({'emb.weight': tensor([[ 1.8787, -0.1666, -1.8597,  ..., -1.0010, -0.9638, -0.6151],
        [-0.1616,  0.0884, -0.0934,  ...,  0.0709,  0.4607,  0.6704],
        [-0.0627, -0.1363,  0.0163,  ...,  0.0908, -0.1663,  0.4475],
        ...,
        [-3.3083, -1.5824,  1.5640,  ...,  0.6715,  0.5412, -0.2053],
        [-0.5803, -1.9302, -0.1088,  ..., -0.6040, -0.7218,  1.2112],
        [-0.4179,  0.4456, -0.5507,  ...,  1.5949,  1.3342, -0.4006]],
       device='cuda:0'), 'ffw.weight': tensor([[-0.0559, -3.1326,  3.4805,  ..., -2.4313,  0.8235,  1.0211],
        [-0.0750,  4.2574, -4.5288,  ...,  2.9554, -1.2074, -0.9694],
        [-0.1841,  3.7869, -4.1561,  ...,  2.5711, -1.0139, -1.0283],
        ...,
        [-0.1226, -3.0458,  3.6798,  ..., -2.2324,  0.8989,  1.1990],
        [-0.1888, -3.4186,  3.6209,  ..., -2.3780,  1.0202,  0.9866],
        [-0.1028, -3.1571,  3.4493,  ..., -2.2879,  1.0355,  0.8634]],
       device='cuda:0')})


In [7]:
model_save_path = "skipgram_model_titles.pth"  # You can name the file as you like
torch.save(mFoo.state_dict(), model_save_path)

# Fine tuning model 

In [52]:
import torch
import pickle
import pandas as pd
import tqdm
import more_itertools

# Step 1: Load the updated dictionary (vocabulary)
with open('updated_dict_20.pkl', 'rb') as fp:
    updated_vocab = pickle.load(fp)

# Step 2: Load the CSV file and preprocess the 'title_cleaned' column
df = pd.read_csv('titles_20')
titles = df['title_cleaned'].tolist()

# Step 3: Preprocess and tokenize the 'title_cleaned' column
def tokenize_titles(titles, vocab):
    tokens = []
    for title in titles:
        words = title.split()
        tokenized = [vocab.get(word, vocab.get('<UNK>')) for word in words]  # Use <UNK> for unknown words
        tokens.append(tokenized)
    return tokens

# Tokenize the new data
tokenized_titles = tokenize_titles(titles, updated_vocab)

# Step 4: Load the trained model
model_path = "skipgram_model.pth"  # Update with your actual saved model path
mFoo = SkipGramFoo(len(updated_vocab), 64, 2).to(device)

# Step 5: Load the state_dict and modify keys
state_dict = torch.load(model_path)

# Load the modified state_dict into the model
mFoo.load_state_dict(state_dict)

## Set parameters
batch_size = 512
learning_rate = 0.001  # Define your learning rate
mFoo = mFoo.to(device)

# Set context size
context_size = 2  # Example context size
window_size = 2 * context_size + 1  # Total tokens in the window

# Initialize the optimizer
opFoo = torch.optim.Adam(mFoo.parameters(), lr=learning_rate)

# Training loop
for epoch in range(30):
    wins = list(more_itertools.windowed(tokenized_titles[:1000000000], window_size))  # Convert to list for easier batching
    prgs = tqdm.tqdm(range(0, len(wins), batch_size), total=len(wins) // batch_size, desc=f"Epoch {epoch + 1}", leave=False)

    total_loss = 0.0  # Initialize total loss for the epoch
    num_batches = 0   # Counter for batches

    for batch_idx in prgs:
        batch_wins = wins[batch_idx:batch_idx + batch_size]

        # Prepare batch inputs and targets, ensuring they're on the correct device
        inpts = torch.LongTensor([win[context_size] for win in batch_wins]).to(device)  # Central token for each window
        trgs = torch.LongTensor([[win[i] for i in range(context_size)] + [win[i] for i in range(context_size + 1, window_size)]
                                  for win in batch_wins]).to(device)  # Context tokens (left and right)
        rand = torch.randint(0, len(updated_vocab), (batch_size, 2)).to(device)  # Random negative samples on the same device

        # Zero gradients
        opFoo.zero_grad()

        # Forward pass
        loss = mFoo(inpts, trgs, rand)

        # Backward pass and optimization
        loss.backward()
        opFoo.step()

        # Accumulate loss
        total_loss += loss.item()
        num_batches += 1

        # Log the loss
        wandb.log({'loss': loss.item(), 'learning_rate': learning_rate})

    # Calculate and log average loss for the epoch
    average_loss = total_loss / num_batches if num_batches > 0 else 0
    wandb.log({'average_loss': average_loss})

# Finish the W&B logging
wandb.finish()


  state_dict = torch.load(model_path)


RuntimeError: Error(s) in loading state_dict for SkipGramFoo:
	size mismatch for emb.weight: copying a param with shape torch.Size([63642, 64]) from checkpoint, the shape in current model is torch.Size([110812, 64]).
	size mismatch for ffw.weight: copying a param with shape torch.Size([63642, 64]) from checkpoint, the shape in current model is torch.Size([110812, 64]).

In [73]:
import torch

# Assuming you have the model and vocab already set up
word_to_check = 'anarchism'  # The word you want to check similarity for
word_index = words_to_ids[word_to_check]  # Get the index of the word
embedding_dim = 64  # Set the embedding dimension used in your model
cosine_similarity = torch.nn.CosineSimilarity(dim=1, eps=1e-08)

# Get the embedding for the word you want to check
with torch.no_grad():
    target_embedding = mFoo.emb(torch.LongTensor([word_index]).to(device))  # Shape: [1, embedding_dim]

# Initialize a list to store similarities
similarities = []

# Loop through all words in your vocabulary to calculate cosine similarity
for idx in range(len(words_to_ids)):
    # Get the embedding for each word
    current_embedding = mFoo.emb(torch.LongTensor([idx]).to(device))  # Shape: [1, embedding_dim]

    # Compute cosine similarity
    similarity = cosine_similarity(target_embedding, current_embedding)  # Shape: [1]
    similarities.append((ids_to_words[idx], similarity.item()))  # Store the word and its similarity

# Sort by similarity score
similarities.sort(key=lambda x: x[1], reverse=True)

# Get top 10 most similar words
top_similar_words = similarities[:10]

# Display the results
print("Top 10 words similar to '{}':".format(word_to_check))
for word, sim in top_similar_words:
    print(f"{word}: {sim:.4f}")


Top 10 words similar to 'anarchism':
anarchism: 1.0000
anarcho: 0.6730
libertarianism: 0.6398
capitalism: 0.6392
conservatism: 0.6252
individualist: 0.6215
ideologies: 0.6089
sombart: 0.5846
darwinism: 0.5809
critique: 0.5808


In [9]:
import torch

# Assuming you have the model and vocab already set up
word_to_check = 'car'  # The word you want to check similarity for
word_index = words_to_ids[word_to_check]  # Get the index of the word
embedding_dim = 64  # Set the embedding dimension used in your model
cosine_similarity = torch.nn.CosineSimilarity(dim=1, eps=1e-08)

# Get the embedding for the target word you want to check
with torch.no_grad():
    target_embedding = mFoo.emb(torch.LongTensor([word_index]).to(device))  # Shape: [1, embedding_dim]

# Get all embeddings at once and move to GPU
with torch.no_grad():
    all_embeddings = mFoo.emb.weight.to(device)  # Shape: [vocab_size, embedding_dim]

# Repeat the target embedding to match the shape of all embeddings
target_embedding = target_embedding.expand_as(all_embeddings)  # Shape: [vocab_size, embedding_dim]

# Compute cosine similarity in one batch
with torch.no_grad():
    similarities = cosine_similarity(target_embedding, all_embeddings)  # Shape: [vocab_size]

# Convert similarities to a list of tuples with word and similarity score
similarities_list = [(ids_to_words[idx], similarities[idx].item()) for idx in range(len(words_to_ids))]

# Sort by similarity score
similarities_list.sort(key=lambda x: x[1], reverse=True)

# Get top 10 most similar words
top_similar_words = similarities_list[:10]

# Display the results
print(f"Top 10 words similar to '{word_to_check}':")
for word, sim in top_similar_words:
    print(f"{word}: {sim:.4f}")


Top 10 words similar to 'car':
car: 1.0000
cars: 0.7031
passenger: 0.5895
airliners: 0.5852
canaveral: 0.5777
diesel: 0.5452
automobile: 0.5444
boat: 0.5436
transatlantic: 0.5403
supersonic: 0.5341


: 

In [80]:
import torch

# Assuming you have the model and vocab already set up
word_to_check = 'france'  # The word you want to check similarity for
word_index = words_to_ids[word_to_check]  # Get the index of the word
embedding_dim = 64  # Set the embedding dimension used in your model
cosine_similarity = torch.nn.CosineSimilarity(dim=1, eps=1e-08)

# Get the embedding for the word you want to check
with torch.no_grad():
    target_embedding = mFoo.emb(torch.LongTensor([word_index]).to(device))  # Shape: [1, embedding_dim]

# Initialize a list to store similarities
similarities = []

# Loop through all words in your vocabulary to calculate cosine similarity
for idx in range(len(words_to_ids)):
    # Get the embedding for each word
    current_embedding = mFoo.emb(torch.LongTensor([idx]).to(device))  # Shape: [1, embedding_dim]

    # Compute cosine similarity
    similarity = cosine_similarity(target_embedding, current_embedding)  # Shape: [1]
    similarities.append((ids_to_words[idx], similarity.item()))  # Store the word and its similarity

# Sort by similarity score
similarities.sort(key=lambda x: x[1], reverse=True)

# Get top 10 most similar words
top_similar_words = similarities[:10]

# Display the results
print("Top 10 words similar to '{}':".format(word_to_check))
for word, sim in top_similar_words:
    print(f"{word}: {sim:.4f}")

Top 10 words similar to 'france':
france: 1.0000
italy: 0.7116
french: 0.6191
spain: 0.6022
luxembourg: 0.5642
maggiore: 0.5628
paris: 0.5480
normandy: 0.5472
belgium: 0.5318
monaco: 0.5258


In [79]:
import torch

# Assuming you have the model and vocab already set up
word_to_check = 'fruit'  # The word you want to check similarity for
word_index = words_to_ids[word_to_check]  # Get the index of the word
embedding_dim = 64  # Set the embedding dimension used in your model
cosine_similarity = torch.nn.CosineSimilarity(dim=1, eps=1e-08)

# Get the embedding for the word you want to check
with torch.no_grad():
    target_embedding = mFoo.emb(torch.LongTensor([word_index]).to(device))  # Shape: [1, embedding_dim]

# Initialize a list to store similarities
similarities = []

# Loop through all words in your vocabulary to calculate cosine similarity
for idx in range(len(words_to_ids)):
    # Get the embedding for each word
    current_embedding = mFoo.emb(torch.LongTensor([idx]).to(device))  # Shape: [1, embedding_dim]

    # Compute cosine similarity
    similarity = cosine_similarity(target_embedding, current_embedding)  # Shape: [1]
    similarities.append((ids_to_words[idx], similarity.item()))  # Store the word and its similarity

# Sort by similarity score
similarities.sort(key=lambda x: x[1], reverse=True)

# Get top 10 most similar words
top_similar_words = similarities[:10]

# Display the results
print("Top 10 words similar to '{}':".format(word_to_check))
for word, sim in top_similar_words:
    print(f"{word}: {sim:.4f}")

Top 10 words similar to 'fruit':
fruit: 1.0000
meat: 0.7175
potatoes: 0.6929
maize: 0.6673
vegetables: 0.6496
tomatoes: 0.6483
goats: 0.6483
foods: 0.6424
boiled: 0.6423
chocolate: 0.6352
