# model based on queries 

In [7]:
#loading in dictionary 
# Loading the vocab_dict again
import pickle
with open("updated_vocab_dict.pkl", "rb") as f:
    updated_vocab = pickle.load(f)

In [8]:
len(updated_vocab)

86996

In [9]:
import pandas as pd
import torch
import random
from tqdm import tqdm
import numpy as np  # Import the tqdm library for progress bars

# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to create a reverse vocabulary from the updated vocabulary
def create_reverse_vocab(vocab):
    return {word: index for index, word in vocab.items()}

# Load the reverse vocabulary
reverse_vocab = create_reverse_vocab(updated_vocab)

# Tokenize the titles using the reverse vocabulary
def tokenize_titles(titles, reverse_vocab):
    tokens = []
    
    for title in titles:
        words = title.lower().split()  # Convert the title to lowercase to match training preprocessing
        
        tokenized = []
        for word in words:
            if word in reverse_vocab:
                tokenized.append(reverse_vocab[word])  # Get the index from reverse_vocab
            # No else clause needed; unknown words are simply skipped
        
        tokens.append(tokenized)
    
    return tokens


In [21]:

# Step 3: Define the SkipGramFoo model
class SkipGramFoo(torch.nn.Module):
    def __init__(self, voc, emb, ctx):
        super().__init__()
        self.ctx = ctx
        self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
        self.ffw = torch.nn.Linear(in_features=emb, out_features=voc, bias=False)
        self.sig = torch.nn.Sigmoid()

    def forward(self, inpt, trgs, rand):
        emb = self.emb(inpt)
        batch_size = inpt.size(0)
        rand = rand[:batch_size]
        
        ctx = self.ffw.weight[trgs.to(inpt.device)]
        rnd = self.ffw.weight[rand.to(inpt.device)]
        
        out = torch.bmm(ctx.view(batch_size, 1, -1), emb.unsqueeze(2)).squeeze(2)
        rnd = torch.bmm(rnd.view(batch_size, 1, -1), emb.unsqueeze(2)).squeeze(2)

        out = self.sig(out).clamp(min=1e-7, max=1 - 1e-7)
        rnd = self.sig(rnd).clamp(min=1e-7, max=1 - 1e-7)

        pst = -out.log().mean()
        ngt = -(1 - rnd).log().mean()
        
        return pst + ngt

# Load the model
embedding_dim = 64
model_path = "finetuned_skipgram_model.pth"
mFoo = SkipGramFoo(len(updated_vocab), embedding_dim, 2).to(device)
mFoo.load_state_dict(torch.load(model_path), strict=False)
mFoo.eval()

# Step 4: Generate embeddings for tokenized texts (query, passage, negative samples) with a progress bar
def get_embeddings_for_tokens(tokenized_list, model):
    embeddings_list = []
    
    with torch.no_grad():
        for tokens in tqdm(tokenized_list, desc="Generating embeddings", unit="text"):  # Add progress bar
            if len(tokens) > 0:
                token_tensor = torch.LongTensor(tokens).to(device)
                token_embeddings = model.emb(token_tensor)  # Shape: [num_tokens, embedding_dim]
                embeddings_list.append(token_embeddings.cpu().numpy())  # Keep the full sequence embeddings
            else:
                embeddings_list.append(torch.zeros((1, embedding_dim)).cpu().numpy())  # Zero vector for empty sequences
    
    return embeddings_list


  mFoo.load_state_dict(torch.load(model_path), strict=False)


In [11]:
# Step 5: Generate embeddings for each title
def get_embeddings_for_titles(tokenized_titles, model):
    embeddings_list = []
    
    with torch.no_grad():  # Disable gradient calculations for faster performance
        for tokens in tokenized_titles:
            if len(tokens) > 0:
                # Move the tokens to the GPU
                token_tensor = torch.LongTensor(tokens).to(device)
                
                # Get the embeddings for each token in the title
                token_embeddings = model.emb(token_tensor)  # Shape: [num_tokens, embedding_dim]
                
                # Average the token embeddings to get a single vector for the entire title
                title_embedding = token_embeddings.mean(dim=0)  # Shape: [embedding_dim]
                
                embeddings_list.append(title_embedding.cpu().numpy())  # Store the embedding as a NumPy array
            else:
                # Handle empty titles (if any)
                embeddings_list.append(torch.zeros(embedding_dim).cpu().numpy())  # Zero vector for empty titles
    
    return embeddings_list

In [12]:
results = pd.read_csv('results_negative.csv')

In [70]:
results = results.iloc[1:30000]

In [71]:
results

Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,query,passage_text,negative_sample
3,3,3,3,3,3,3,what is rba,the inner of atomizer are surprisingly simple ...,or are herbal that promote greater density and...
4,4,4,4,4,4,4,what is rba,based accountability also known as rba is way ...,find by star from the big dipper if you re fam...
5,5,5,5,5,5,5,what is rba,based accountability also known as rba is way ...,how to convert to one degree is equal rad the ...
6,6,6,6,6,6,6,what is rba,rba data driven decision making process to hel...,the hydrogen bond length of water with tempera...
7,7,7,7,7,7,7,what is rba,identity manager risk based authentication rba...,pancreatitis is inflammation of the pancreas t...
...,...,...,...,...,...,...,...,...,...
495,495,495,495,495,495,495,history of microchip timeline,illustration from jack inventor journal circui...,pathologist the under microscope to see if the...
496,496,496,496,496,496,496,history of microchip timeline,home computer history description are made up ...,solution there are in one inch set up the conv...
497,497,497,497,497,497,497,history of microchip timeline,the history of rfid shown steady development i...,if you it approximately four to process refund...
498,498,498,498,498,498,498,history of microchip timeline,history of circuit the circuit otherwise known...,of the maximum and minimum the maximum and min...


In [72]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence  # For padding sequences
from tqdm import tqdm  # For progress bar
import wandb  # Import Weights and Biases


class TowerOneRNN(nn.Module):
    def __init__(self):
        super(TowerOneRNN, self).__init__()
        self.rnn = nn.RNN(input_size=64, hidden_size=32, batch_first=True)
        self.fc = nn.Linear(32, 3)

    def forward(self, x, lengths):
        # Pack the padded sequences
        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        
        # Pass input through the RNN layer
        x, _ = self.rnn(x)
        
        # Unpack the output and get the last time step
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        x = x[torch.arange(x.size(0)), lengths - 1]  # Get the last valid time step
        
        # Pass through the fully connected layer
        x = self.fc(x)
        
        return x

class TowerTwoRNN(nn.Module):
    def __init__(self):
        super(TowerTwoRNN, self).__init__()
        self.rnn = nn.RNN(input_size=64, hidden_size=32, batch_first=True)
        self.fc = nn.Linear(32, 3)

    def forward(self, x, lengths):
        # Pack the padded sequences
        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        
        # Pass input through the RNN layer
        x, _ = self.rnn(x)
        
        # Unpack the output and get the last time step
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        x = x[torch.arange(x.size(0)), lengths - 1]  # Get the last valid time step
        
        # Pass through the fully connected layer
        x = self.fc(x)
        
        return x


In [73]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence  # For padding sequences
from tqdm import tqdm  # For progress bar
import wandb  # Import Weights and Biases

# Initialize W&B project
wandb.init(project="twotower_training", entity="omareweis123", name='fine_tuning_skipgram/packed_embedding')

# Load the saved SkipGram model
model_save_path = "finetuned_skipgram_model.pth"
checkpoint = torch.load(model_save_path)

# Initialize the SkipGram model
skipgram_model = SkipGramFoo(86996, 64, 2).to(device)  # Ensure to send the model to the correct device
skipgram_model.load_state_dict(checkpoint['model_state_dict'])
skipgram_model.eval()  # Set the model to evaluation mode if you're not training it again

# Tower RNN Models
tower_one = TowerOneRNN().to(device)
tower_two = TowerTwoRNN().to(device)

# Triplet margin loss with cosine distance
triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1.0 - nn.functional.cosine_similarity(x, y),
    margin=1.0,
    reduction='mean'
)

# Define optimizer for the tower models
optimizer = optim.Adam(list(tower_one.parameters()) + list(tower_two.parameters()), lr=0.001)

# Define query batch size
query_batch_size = 128  # Number of queries to process in a batch

# Define number of epochs
num_epochs = 30  # Example

# Track model and hyperparameters in W&B
wandb.watch([tower_one, tower_two], log="all")  # Log gradients and model weights

# Training loop with batched queries
for epoch in range(num_epochs):
    # Group by 'query' to handle variable number of positives and negatives per query
    query_groups = list(results.groupby('query'))

    # Iterate through the dataset in batches of 'query_batch_size' queries
    for q_batch_start in tqdm(range(0, len(query_groups), query_batch_size), desc=f"Epoch {epoch + 1}/{num_epochs}"):
        query_batch = query_groups[q_batch_start:q_batch_start + query_batch_size]

        all_anchor_embeddings = []
        all_positive_embeddings = []
        all_negative_embeddings = []

        # Process each query group in the current batch
        for query, group in query_batch:
            # Tokenize the queries, passage_text, and negative_sample using your updated_vocab or reverse_vocab
            query_tokens = tokenize_titles(group['query'].tolist(), reverse_vocab)
            positive_tokens = tokenize_titles(group['passage_text'].tolist(), reverse_vocab)
            negative_tokens = tokenize_titles(group['negative_sample'].tolist(), reverse_vocab)

            # Get embeddings for each group
            anchor_embeddings = get_embeddings_for_titles(query_tokens, skipgram_model)
            positive_embeddings = get_embeddings_for_titles(positive_tokens, skipgram_model)
            negative_embeddings = get_embeddings_for_titles(negative_tokens, skipgram_model)

            # Append the embeddings to the lists
            all_anchor_embeddings.append(torch.tensor(anchor_embeddings))
            all_positive_embeddings.append(torch.tensor(positive_embeddings))
            all_negative_embeddings.append(torch.tensor(negative_embeddings))

        # Pad sequences to the same length
            anchor_batch = pad_sequence(all_anchor_embeddings, batch_first=True).to(device)
            positive_batch = pad_sequence(all_positive_embeddings, batch_first=True).to(device)
            negative_batch = pad_sequence(all_negative_embeddings, batch_first=True).to(device)

            # Get lengths of the original sequences and move them to CPU
            anchor_lengths = torch.tensor([len(seq) for seq in all_anchor_embeddings]).cpu()  # Move to CPU
            positive_lengths = torch.tensor([len(seq) for seq in all_positive_embeddings]).cpu()  # Move to CPU
            negative_lengths = torch.tensor([len(seq) for seq in all_negative_embeddings]).cpu()  # Move to CPU

            # Forward pass through the two towers
            anchor_output = tower_one(anchor_batch, anchor_lengths)
            positive_output = tower_two(positive_batch, positive_lengths)
            negative_output = tower_two(negative_batch, negative_lengths)



        # Calculate triplet loss
        triplet_loss = triplet_loss_fn(anchor_output, positive_output, negative_output)

        # Backpropagation
        optimizer.zero_grad()
        triplet_loss.backward()
        optimizer.step()

        # Log the loss to W&B
        wandb.log({"epoch": epoch + 1, "triplet_loss": triplet_loss.item()})

    print(f"Epoch {epoch + 1}/{num_epochs} completed. Loss: {triplet_loss.item()}")

    # Optionally, save a model checkpoint after every epoch and log it to W&B
    torch.save({
        'model_state_dict': skipgram_model.state_dict(),
        'tower_one_state_dict': tower_one.state_dict(),
        'tower_two_state_dict': tower_two.state_dict(),
    }, f"model_1_checkpoint_epoch_{epoch+1}.pth")

    wandb.finish

print("Training complete.")


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
triplet_loss,█▇▇▇▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,30.0
triplet_loss,0.11852


  checkpoint = torch.load(model_save_path)
Epoch 1/30: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


Epoch 1/30 completed. Loss: 0.9984116554260254


Epoch 2/30: 100%|██████████| 1/1 [00:00<00:00,  1.48it/s]


Epoch 2/30 completed. Loss: 0.9477606415748596


Epoch 3/30: 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]


Epoch 3/30 completed. Loss: 0.9107734560966492


Epoch 4/30: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]


Epoch 4/30 completed. Loss: 0.8764935731887817


Epoch 5/30: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]


Epoch 5/30 completed. Loss: 0.8411028385162354


Epoch 6/30: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]


Epoch 6/30 completed. Loss: 0.8026745319366455


Epoch 7/30: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


Epoch 7/30 completed. Loss: 0.7607420086860657


Epoch 8/30: 100%|██████████| 1/1 [00:00<00:00,  1.34it/s]


Epoch 8/30 completed. Loss: 0.7134956121444702


Epoch 9/30: 100%|██████████| 1/1 [00:00<00:00,  1.42it/s]


Epoch 9/30 completed. Loss: 0.664115846157074


Epoch 10/30: 100%|██████████| 1/1 [00:00<00:00,  1.49it/s]


Epoch 10/30 completed. Loss: 0.6160687804222107


Epoch 11/30: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]


Epoch 11/30 completed. Loss: 0.5746490955352783


Epoch 12/30: 100%|██████████| 1/1 [00:00<00:00,  1.39it/s]


Epoch 12/30 completed. Loss: 0.5393111109733582


Epoch 13/30: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


Epoch 13/30 completed. Loss: 0.5077862739562988


Epoch 14/30: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]


Epoch 14/30 completed. Loss: 0.4739944636821747


Epoch 15/30: 100%|██████████| 1/1 [00:00<00:00,  1.49it/s]


Epoch 15/30 completed. Loss: 0.4429342746734619


Epoch 16/30: 100%|██████████| 1/1 [00:00<00:00,  1.42it/s]


Epoch 16/30 completed. Loss: 0.4128074645996094


Epoch 17/30: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


Epoch 17/30 completed. Loss: 0.38203954696655273


Epoch 18/30: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]


Epoch 18/30 completed. Loss: 0.35317283868789673


Epoch 19/30: 100%|██████████| 1/1 [00:00<00:00,  1.42it/s]


Epoch 19/30 completed. Loss: 0.3216959834098816


Epoch 20/30: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]


Epoch 20/30 completed. Loss: 0.2921810448169708


Epoch 21/30: 100%|██████████| 1/1 [00:00<00:00,  1.39it/s]


Epoch 21/30 completed. Loss: 0.27183815836906433


Epoch 22/30: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]


Epoch 22/30 completed. Loss: 0.248849019408226


Epoch 23/30: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]


Epoch 23/30 completed. Loss: 0.2261599749326706


Epoch 24/30: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]


Epoch 24/30 completed. Loss: 0.20688356459140778


Epoch 25/30: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


Epoch 25/30 completed. Loss: 0.18950271606445312


Epoch 26/30: 100%|██████████| 1/1 [00:00<00:00,  1.42it/s]


Epoch 26/30 completed. Loss: 0.17165708541870117


Epoch 27/30: 100%|██████████| 1/1 [00:00<00:00,  1.38it/s]


Epoch 27/30 completed. Loss: 0.15961307287216187


Epoch 28/30: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]


Epoch 28/30 completed. Loss: 0.14617334306240082


Epoch 29/30: 100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


Epoch 29/30 completed. Loss: 0.13972172141075134


Epoch 30/30: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]

Epoch 30/30 completed. Loss: 0.13416405022144318
Training complete.





# doing query analysis 

In [74]:
#loading in tower two to create database of text_passages 

tower_two = TowerTwoRNN()

# Load the saved TowerTwo model weights
checkpoint = torch.load("model_1_checkpoint_epoch_30.pth")
tower_two.load_state_dict(checkpoint['tower_two_state_dict'])

# Set TowerTwo to evaluation mode
tower_two.eval()

  checkpoint = torch.load("model_1_checkpoint_epoch_30.pth")


TowerTwoRNN(
  (rnn): RNN(64, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=3, bias=True)
)

In [75]:
len(updated_vocab)

86996

In [76]:
import pandas as pd
import torch
import numpy as np

# Assuming the necessary models and device are already set up
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved SkipGram model
model_save_path = "finetuned_skipgram_model.pth"
checkpoint = torch.load(model_save_path)

# Initialize the SkipGram model
skipgram_model = SkipGramFoo(86996, 64, 2).to(device)  # Ensure to send the model to the correct device
skipgram_model.load_state_dict(checkpoint['model_state_dict'])
skipgram_model.eval()  


  checkpoint = torch.load(model_save_path)


SkipGramFoo(
  (emb): Embedding(86996, 64)
  (ffw): Linear(in_features=64, out_features=86996, bias=False)
  (sig): Sigmoid()
)

In [77]:
import pandas as pd
import torch
import random
from tqdm import tqdm
import numpy as np  # Import the tqdm library for progress bars

# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to create a reverse vocabulary from the updated vocabulary
def create_reverse_vocab(vocab):
    return {word: index for index, word in vocab.items()}

# Load the reverse vocabulary
reverse_vocab = create_reverse_vocab(updated_vocab)

# Function to tokenize a list of texts using the reverse vocabulary
def tokenize_texts(texts, reverse_vocab):
    tokens_list = []
    
    for text in texts:
        words = text.lower().split()  # Convert the text to lowercase to match training preprocessing
        
        tokenized = []
        for word in words:
            if word in reverse_vocab:
                tokenized.append(reverse_vocab[word])  # Get the index from reverse_vocab
            # No else clause needed; unknown words are simply skipped
        
        tokens_list.append(tokenized)
    
    return tokens_list


# Step 2: Tokenize the query, passage_text, and negative_samples columns
tokenized_queries = tokenize_texts(results['passage_text'].tolist(), reverse_vocab)



In [78]:

# Step 3: Define the SkipGramFoo model
class SkipGramFoo(torch.nn.Module):
    def __init__(self, voc, emb, ctx):
        super().__init__()
        self.ctx = ctx
        self.emb = torch.nn.Embedding(num_embeddings=voc, embedding_dim=emb)
        self.ffw = torch.nn.Linear(in_features=emb, out_features=voc, bias=False)
        self.sig = torch.nn.Sigmoid()

    def forward(self, inpt, trgs, rand):
        emb = self.emb(inpt)
        batch_size = inpt.size(0)
        rand = rand[:batch_size]
        
        ctx = self.ffw.weight[trgs.to(inpt.device)]
        rnd = self.ffw.weight[rand.to(inpt.device)]
        
        out = torch.bmm(ctx.view(batch_size, 1, -1), emb.unsqueeze(2)).squeeze(2)
        rnd = torch.bmm(rnd.view(batch_size, 1, -1), emb.unsqueeze(2)).squeeze(2)

        out = self.sig(out).clamp(min=1e-7, max=1 - 1e-7)
        rnd = self.sig(rnd).clamp(min=1e-7, max=1 - 1e-7)

        pst = -out.log().mean()
        ngt = -(1 - rnd).log().mean()
        
        return pst + ngt

# Load the model
embedding_dim = 64
model_path = "finetuned_skipgram_model.pth"
mFoo = SkipGramFoo(len(updated_vocab), embedding_dim, 2).to(device)
mFoo.load_state_dict(torch.load(model_path), strict=False)
mFoo.eval()

# Step 4: Generate embeddings for tokenized texts (query, passage, negative samples) with a progress bar
def get_embeddings_for_tokens(tokenized_list, model):
    embeddings_list = []
    
    with torch.no_grad():
        for tokens in tqdm(tokenized_list, desc="Generating embeddings", unit="text"):  # Add progress bar
            if len(tokens) > 0:
                token_tensor = torch.LongTensor(tokens).to(device)
                token_embeddings = model.emb(token_tensor)  # Shape: [num_tokens, embedding_dim]
                embeddings_list.append(token_embeddings.cpu().numpy())  # Keep the full sequence embeddings
            else:
                embeddings_list.append(torch.zeros((1, embedding_dim)).cpu().numpy())  # Zero vector for empty sequences
    
    return embeddings_list

query_embeddings = get_embeddings_for_tokens(tokenized_queries, mFoo)

# Save embeddings as a pickle file
with open('passage_text.pkl', 'wb') as f:
    pickle.dump(query_embeddings, f)

print("Query embeddings saved to 'query_embeddings.pkl'")

  mFoo.load_state_dict(torch.load(model_path), strict=False)
Generating embeddings: 100%|██████████| 497/497 [00:00<00:00, 6867.62text/s]

Query embeddings saved to 'query_embeddings.pkl'





In [79]:
# Ensure your model is on the GPU
tower_two = tower_two.to(device)  # Move the model to the specified device
BATCH_SIZE = 64 
tower_two_outputs = []
# Process embeddings in batches
for i in tqdm(range(0, len(query_embeddings), BATCH_SIZE), desc="Processing Batches"):
    # Get the current batch of embeddings
    batch_embeddings = query_embeddings[i:i + BATCH_SIZE]

    # Convert the batch to tensors
    embeddings_tensors = [torch.tensor(emb) for emb in batch_embeddings]

    # Pad the current batch of tensors
    padded_embeddings = pad_sequence(embeddings_tensors, batch_first=True).to(device)
    
    # Get lengths of the original sequences and move to CPU
    lengths = torch.tensor([len(seq) for seq in embeddings_tensors]).cpu()  # Move lengths tensor to CPU

    # Run the model
    with torch.no_grad():
        batch_outputs = tower_two(padded_embeddings, lengths)

    # Append results
    tower_two_outputs.append(batch_outputs)

# Combine results if needed
tower_two_outputs = torch.cat(tower_two_outputs, dim=0)



Processing Batches: 100%|██████████| 8/8 [00:00<00:00, 103.27it/s]


In [80]:
tower_two_outputs.shape

torch.Size([497, 3])

In [81]:
#loading in tower two to create database of text_passages 

tower_one = TowerOneRNN()

# Load the saved TowerTwo model weights
checkpoint = torch.load("model_checkpoint_epoch_5.pth")
tower_one.load_state_dict(checkpoint['tower_one_state_dict'])

# Set TowerTwo to evaluation mode
tower_one.eval()

  checkpoint = torch.load("model_checkpoint_epoch_5.pth")


TowerOneRNN(
  (rnn): RNN(64, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=3, bias=True)
)

In [82]:
# Define the TowerOne and TowerTwo RNN models
class TowerOneRNN(nn.Module):
    def __init__(self):
        super(TowerOneRNN, self).__init__()
        self.rnn = nn.RNN(input_size=64, hidden_size=32, batch_first=True)
        self.fc = nn.Linear(32, 3)

    def forward(self, x, lengths):
        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.rnn(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        x = x[torch.arange(x.size(0)), lengths - 1]
        x = self.fc(x)
        return x

In [104]:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
import pickle

# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to create a reverse vocabulary from the updated vocabulary
def create_reverse_vocab(vocab):
    return {word: index for index, word in vocab.items()}

# Load the reverse vocabulary
reverse_vocab = create_reverse_vocab(updated_vocab)

# Function to tokenize a single text using the reverse vocabulary
def tokenize_text(text, reverse_vocab):
    words = text.lower().split()  # Convert the text to lowercase to match training preprocessing
    tokenized = []
    for word in words:
        if word in reverse_vocab:
            tokenized.append(reverse_vocab[word])  # Get the index from reverse_vocab
    return tokenized

# Input your query
single_query = "what is rba"  # Repl with your actual query text
tokenized_query = tokenize_text(single_query, reverse_vocab)

# Load the saved TowerTwo model weights
checkpoint = torch.load("model_checkpoint_epoch_5.pth")
tower_one.load_state_dict(checkpoint['tower_one_state_dict'])

# Set TowerTwo to evaluation mode
tower_one.eval()

def get_embedding_for_single_token(tokens, model):
    with torch.no_grad():
        if len(tokens) > 0:
            token_tensor = torch.LongTensor(tokens).to(device)
            token_embedding = model.emb(token_tensor)  # Shape: [num_tokens, embedding_dim]
            return token_embedding.cpu().numpy()  # Return the full sequence embeddings
        else:
            return torch.zeros((1, embedding_dim)).cpu().numpy()  # Zero vector for empty sequences


query_embeddings = get_embedding_for_single_token(tokenized_query, mFoo)




  checkpoint = torch.load("model_checkpoint_epoch_5.pth")


In [105]:
# Ensure your model is on the GPU
tower_one = tower_one.to(device)  # Move the model to the specified device

# Assuming 'query_embedding' is the embedding for the single query
query_embedding_tensor = torch.tensor(query_embeddings).unsqueeze(0).to(device)  # Add a batch dimension

# Get the length of the original sequence
length = torch.tensor([len(query_embeddings)]).cpu()  # Move lengths tensor to CPU

# Run the model
with torch.no_grad():
    tower_one_output = tower_one(query_embedding_tensor, length)

# Process the output as needed
# You can now use 'tower_one_output' for further processing
print("Output from tower_one:", tower_one_output)


Output from tower_one: tensor([[-0.3574, -1.3020, -0.2408]], device='cuda:0')


In [106]:
tower_two_outputs.shape

torch.Size([497, 3])

In [107]:
tower_one_output.shape

torch.Size([1, 3])

In [108]:
import torch
import torch.nn.functional as F

# Ensure your outputs are on the same device (GPU/CPU)
tower_one_output = tower_one_output.to(device)
tower_two_output = tower_two_outputs.to(device)

# Calculate cosine similarity
# tower_one_output is (1, 3), so we need to repeat it for each of the 60000 rows
similarities = F.cosine_similarity(tower_one_output, tower_two_output, dim=1)

# Find the index of the maximum similarity
most_similar_index = torch.argmax(similarities)

# Print the result
print("Most similar index:", most_similar_index.item())  # Convert to Python integer if needed
print("Cosine similarity value:", similarities[most_similar_index].item())  # Convert to Python float if needed


Most similar index: 187
Cosine similarity value: 0.9925868511199951


In [109]:
# Assuming results_df is your DataFrame containing the 'passage_text' column

# Get the most similar index from the cosine similarity results
most_similar_index = most_similar_index.item()  # Ensure it's a Python integer

# Retrieve the passage_text from results_df using the most similar index
similar_passage_text = results['passage_text'].iloc[most_similar_index]

# Print the passage text
print("Most similar passage text:", similar_passage_text)


Most similar passage text: never seen pound cake take less than an hour most take min and have one recipe that for those long times tho are for slow oven about till toothpick inserted in the center of the cake comes out clean on the heat of your oven but start at about and when the cake is golden brown ago up


In [111]:
import torch
import torch.nn.functional as F

# Ensure your outputs are on the same device (GPU/CPU)
tower_one_output = tower_one_output.to(device)
tower_two_output = tower_two_outputs.to(device)

# Calculate cosine similarity
similarities = F.cosine_similarity(tower_one_output, tower_two_output, dim=1)

# Get the top 5 most similar indices
top_k = 10
top_k_indices = torch.topk(similarities, top_k).indices.cpu()  # Move to CPU

# Retrieve the top 5 passages from results_df
top_passages = results['passage_text'].iloc[top_k_indices.numpy()]  # Convert to NumPy array

# Print the results
for i in range(top_k):
    print(f"Most similar index {top_k_indices[i].item()}:")
    print("Cosine similarity value:", similarities[top_k_indices[i]].item())
    print("Passage text:", top_passages.iloc[i])
    print()  # Add a newline for better readability


Most similar index 187:
Cosine similarity value: 0.9925868511199951
Passage text: never seen pound cake take less than an hour most take min and have one recipe that for those long times tho are for slow oven about till toothpick inserted in the center of the cake comes out clean on the heat of your oven but start at about and when the cake is golden brown ago up

Most similar index 342:
Cosine similarity value: 0.9874629378318787
Passage text: as baby girl name is currently not popular baby name in the usa the following chart the popularity of the name in the usa over the past usa baby name statistics you need the adobe flash player and browser with to see all baby name popularity the following are baby related to and are suitable for and of ona both

Most similar index 341:
Cosine similarity value: 0.9874410033226013
Passage text: the name the following meaning my father joy an form of in turn variant of abigail also variant of the name gala meaning merry maker the following are baby

: 