In [4]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import sentencepiece as spm
import matplotlib.pyplot as plt
import multiprocessing
import time
from gensim.models import Word2Vec
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

if torch.backends.mps.is_available():  # Check for Apple Silicon GPU availability (requires PyTorch 1.12 or later)
    device = torch.device("mps")
elif torch.cuda.is_available():  # Check for NVIDIA GPU availability
    device = torch.device("cuda")
else:
    device = torch.device("cpu")  # Fall back to CPU

print(f"Using device: {device}")

Using device: mps


## Load data

In [57]:
easy = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')
validate = pd.read_parquet('validate.parquet')

### collect all texts to one dataset

In [None]:

# with open('AllTexts.txt', 'w') as f:
#     pass  # This just creates the file, immediately closing it

# with open('AllTexts.txt', 'a') as f:  # Open file in append mode
#     for _, row in train.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')
#     for _, row in test.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')
#     for _, row in validate.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')

### train tokanizer

In [None]:
# spm.SentencePieceTrainer.train(
#     input = 'AllTexts.txt',
#     model_prefix='spm_AllTexts', 
#     vocab_size=30000,
# )

## Load Tokanizer

In [58]:
sp = spm.SentencePieceProcessor()
sp.load('spm_AllTexts.model')

True

In [59]:
def tokenize_file(file_path, sp_processor):
    tokenized_sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Assuming each line in the file is a separate sentence or paragraph
            # Tokenize the line and add the list of tokens to the tokenized_sentences list
            tokenized_sentences.append(sp_processor.encode_as_pieces(line.strip()))
    return tokenized_sentences

### Tokinize all the Dataset

In [None]:
# tokinized_sentences = tokenize_file("AllTexts.txt", sp)

### Export tokens to JSON

In [None]:
# import json
# with open("Tokens_AllText.json", 'w', encoding='utf-8') as file:
#     json.dump(tokinized_sentences, file, ensure_ascii=False, indent=4)

### Define W2V

In [60]:
vector_size = 128

In [61]:
w2v_model = Word2Vec(
    min_count  =20,
    window     =10,
    vector_size=vector_size,
    sample     =6e-5, 
    alpha      = 0.03, 
    min_alpha  = 0.0007, 
    negative   = 20,
    workers    = multiprocessing.cpu_count() - 1
)

In [None]:
# print(len(tokinized_sentences))
# w2v_model.build_vocab(tokinized_sentences)
# w2v_model.save("word2vec.model")

In [None]:
# with open("word2vec_vocab.txt", 'w') as vocab_file:
#     for word in w2v_model.wv.key_to_index.keys():
#         vocab_file.write(word + '\n')

In [None]:
# w2v_model.train(tokinized_sentences, total_examples=w2v_model.corpus_count, epochs=20, report_delay=1)
# w2v_model.save("word2vec.model")

# Load Word2Vec Model

In [62]:
w2v_model = Word2Vec.load("word2vec.model")

In [63]:
similar_words = w2v_model.wv.most_similar('▁hacker', topn=4)
print(similar_words)
print(w2v_model.wv.most_similar(sp.encode_as_pieces('Hacker'.lower())))

[('▁cyber', 0.5855585336685181), ('▁hack', 0.5675662159919739), ('▁malicious', 0.5665103197097778), ('▁malware', 0.5581293106079102)]
[('▁cyber', 0.5855585336685181), ('▁hack', 0.5675662159919739), ('▁malicious', 0.5665103197097778), ('▁malware', 0.5581293106079102), ('▁scam', 0.5384910702705383), ('▁spyware', 0.5197509527206421), ('▁legitimate', 0.5031879544258118), ('▁adware', 0.4846465289592743), ('▁pretend', 0.469952255487442), ('▁insider', 0.46461066603660583)]


In [23]:
# def to_w2v_embedding(sp, text):
#     tokens = sp.encode_as_pieces(text.lower())

#     embeddings = []
#     for token in tokens:
#         if (token in w2v_model.wv): 
#             embeddings.append(w2v_model.wv[token])

#     return np.stack(embeddings)

### Triples for training

In [64]:
def prepareTriplesTokens(dataframe):
    triples = []
    for index, row in dataframe.iterrows():
        available_indices = list(dataframe.index)
        available_indices.remove(index)
        
        for relevant in row['passages']['passage_text']:
            random_index = np.random.choice(available_indices)
            random_doc_index = np.random.choice(
                list(
                    range(
                        len(dataframe.iloc[random_index]['passages']['passage_text'])
                    )
                )
            )

            irrelevant = dataframe.iloc[random_index]['passages']['passage_text'][random_doc_index]

            triples.append([
                row['query'],
                relevant,
                irrelevant,
            ])

    return triples

# train_triplets = prepareTriplesTokens(train)
# test_triplets = prepareTriplesTokens(test)
# validate_triplets = prepareTriplesTokens(validate)

train_triplets = pd.read_parquet('train_triplets.parquet')
test_triplets = pd.read_parquet('test_triplets.parquet')
validate_triplets = pd.read_parquet('validate_triplets.parquet')

In [20]:
print(train_triplets[:3])

         query                                           relevant  \
0  what is rba  Since 2007, the RBA's outstanding reputation h...   
1  what is rba  The Reserve Bank of Australia (RBA) came into ...   
2  what is rba  RBA Recognized with the 2014 Microsoft US Regi...   

                                          irrelevant  
0  This report describes the typical weather at t...  
1  1. district, community the vicar of a small pa...  
2  They have tried to make Panda Express prices c...  


In [20]:
# # Convert the list of triples to a DataFrame
# columns = ['query', 'relevant', 'irrelevant']
# train_triplets = pd.DataFrame(train_triplets, columns=columns)
# test_triplets = pd.DataFrame(test_triplets, columns=columns)
# validate_triplets = pd.DataFrame(validate_triplets, columns=columns)


In [62]:
# test_triplets_embeddings = pd.DataFrame()

# test_triplets_embeddings['query_embeddings'] = test_triplets['query'].apply(lambda x: to_w2v_embedding(sp, x))
# test_triplets_embeddings['relevant_embeddings'] = test_triplets['relevant'].apply(lambda x: to_w2v_embedding(sp, x))
# test_triplets_embeddings['irrelevant_embeddings'] = test_triplets['irrelevant'].apply(lambda x: to_w2v_embedding(sp, x))

In [63]:
# Calculate the memory usage of each column in bytes, then sum them up, and convert to megabytes
# total_memory_mb = test_triplets_embeddings.memory_usage(deep=True).sum() / (1024 ** 2)

# print(f'Total DataFrame size: {total_memory_mb:.2f} MB')

Total DataFrame size: 134.55 MB


In [64]:
# train_triplets_embeddings = pd.DataFrame()
# train_triplets_embeddings['query_embeddings'] = train_triplets['query'].apply(lambda x: to_w2v_embedding(sp, x))
# train_triplets_embeddings['relevant_embeddings'] = train_triplets['relevant'].apply(lambda x: to_w2v_embedding(sp, x))
# train_triplets_embeddings['irrelevant_embeddings'] = train_triplets['irrelevant'].apply(lambda x: to_w2v_embedding(sp, x))

In [65]:
# Calculate the memory usage of each column in bytes, then sum them up, and convert to megabytes
# total_memory_mb = train_triplets_embeddings.memory_usage(deep=True).sum() / (1024 ** 2)

# print(f'Total DataFrame size: {total_memory_mb:.2f} MB')

Total DataFrame size: 1150.24 MB


In [66]:
# print(test_triplets_embeddings['query_embeddings'].head(1))

# train_triplets_embeddings.to_parquet('train_triplets_with_embedings.parquet', engine='pyarrow') 
# test_triplets_embeddings.to_parquet('test_triplets_with_embedings.parquet', engine='pyarrow') 
# validate_triplets.to_parquet('validate_triplets_with_embedings.parquet', engine='pyarrow') 

0    [[-0.38117662, -1.751613, 0.118526, -4.4650145...
Name: query_embeddings, dtype: object


# Dataset

In [65]:
class QueryDocsDataset(Dataset):
    def __init__(self, sp, queries, relevant_docs, irrelevant_docs, device):
        self.queries = queries
        self.relevant_docs = relevant_docs
        self.irrelevant_docs = irrelevant_docs
        self.device = device

    def __len__(self):
        return len(self.queries)
    
    def to_w2v_embedding(self, sp, text):
        tokens = sp.encode_as_pieces(text.lower())

        embeddings = []
        for token in tokens:
            if (token in w2v_model.wv): 
                embeddings.append(w2v_model.wv[token])

        return np.stack(embeddings)

    def __getitem__(self, idx):
        return {
            'query': torch.tensor(self.to_w2v_embedding(sp, self.queries[idx]), dtype=torch.float, device=self.device),
            'relevant': torch.tensor(self.to_w2v_embedding(sp, self.relevant_docs[idx]), dtype=torch.float, device=self.device),
            'irrelevant': torch.tensor(self.to_w2v_embedding(sp, self.irrelevant_docs[idx]), dtype=torch.float, device=self.device),
        }

### Fill Datasets

In [66]:
TrainingDataset = QueryDocsDataset(sp, train_triplets['query'], train_triplets['relevant'], train_triplets['irrelevant'], device)
TestingDataset = QueryDocsDataset(sp, test_triplets['query'], test_triplets['relevant'], test_triplets['irrelevant'], device)
ValidationDataset = QueryDocsDataset(sp, validate_triplets['query'], validate_triplets['relevant'], validate_triplets['irrelevant'], device)

# TrainingDataset = QueryDocsDataset(sp, train_triplets_embeddings['query_embeddings'], train_triplets_embeddings['relevant_embeddings'], train_triplets_embeddings['irrelevant_embeddings'])
# TestingDataset = QueryDocsDataset(sp, test_triplets_embeddings['query_embeddings'], test_triplets_embeddings['relevant_embeddings'], test_triplets_embeddings['irrelevant_embeddings'])
# ValidationDataset = QueryDocsDataset(sp, validate_triplets_embeddings['query'], validate_triplets_embeddings['relevant'], validate_triplets_embeddings['irrelevant'])

# Models

In [67]:
class QueryRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size, device):
        super(QueryRNNCell, self).__init__()
        self.device = device
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.weight_ih = nn.Parameter(torch.randn(input_size, hidden_size, device=self.device))  # Input to hidden weights
        self.weight_hh = nn.Parameter(torch.randn(hidden_size, hidden_size, device=self.device))  # Hidden to hidden weights
        
        self.bias_hh = nn.Parameter(torch.randn(hidden_size, device=self.device))  # Bias

    def forward(self, input, hidden):
        return torch.tanh(
            torch.mm(input, self.weight_ih) + torch.mm(hidden, self.weight_hh) + self.bias_hh
        )
    
class QueryRNN(nn.Module):
    def __init__(self, input_size, hidden_size, device):
        super(QueryRNN, self).__init__()
        self.device = device
        self.hidden_size = hidden_size
        self.rnn_cell = QueryRNNCell(input_size, hidden_size, device)
        self.rnn_cell.to(device)

    def forward(self, input):
        # Assuming input is of shape (batch, seq_len, input_size)
        batch_size, seq_len, _ = input.shape
        hidden = torch.zeros(batch_size, self.hidden_size, device=self.device)  # Initial hidden state
        for i in range(seq_len):
            
            mask = torch.any(input[:, i, :] != 0, dim=1).float().unsqueeze(1)  # Shape: (batch_size, 1)
            current_input = input[:, i, :]  
            
            current_hidden = self.rnn_cell(current_input, hidden)
            
            # Apply mask: Only update hidden state for non-padded inputs
            hidden = mask * current_hidden + (1 - mask) * hidden
            
        return hidden

# Two Towers

In [68]:
class TwoTowerModel(nn.Module):
    def __init__(self, embedding_size, hidden_size, device):
        super(TwoTowerModel, self).__init__()
        self.device = device
        self.queryEncoder = QueryRNN(embedding_size, hidden_size, device)
        self.queryEncoder.to(device)
        self.docEncoder = QueryRNN(embedding_size, hidden_size, device)
        self.docEncoder.to(device)

    def forward(self, query, relevant, irrelevant):
        query_embedding = self.queryEncoder(query)
        relevant_embedding = self.docEncoder(relevant)
        irrelevant_embedding = self.docEncoder(irrelevant)
        return query_embedding, relevant_embedding, irrelevant_embedding

## Lose Function

In [69]:
import torch.nn.functional as F

def triplet_loss_function_cosine(query, relevant_doc, irrelevant_doc, margin):
    # Compute cosine similarity (the output ranges from -1 to 1)
    relevant_similarity = F.cosine_similarity(query, relevant_doc)
    irrelevant_similarity = F.cosine_similarity(query, irrelevant_doc)
    
    # Convert similarities to distances (ranges from 0 to 2)
    relevant_distance = 1 - relevant_similarity
    irrelevant_distance = 1 - irrelevant_similarity
    
    # Compute the triplet loss
    triplet_loss = torch.clamp(margin + relevant_distance - irrelevant_distance, min=0)
    return triplet_loss.mean()

### Padding Function

In [70]:
def collate_fn(batch):
    # Extract lists of tensors for 'query', 'relevant', and 'irrelevant' from the batch
    query_tensors = [item['query'] for item in batch]
    relevant_tensors = [item['relevant'] for item in batch]
    irrelevant_tensors = [item['irrelevant'] for item in batch]
    
    # Pad sequences within each list to the same length
    query_padded = pad_sequence(query_tensors, batch_first=True, padding_value=0)
    relevant_padded = pad_sequence(relevant_tensors, batch_first=True, padding_value=0)
    irrelevant_padded = pad_sequence(irrelevant_tensors, batch_first=True, padding_value=0)
    
    # Return a dictionary with padded sequences
    return {
        'query': query_padded,
        'relevant': relevant_padded,
        'irrelevant': irrelevant_padded
    }

## padding test

In [39]:
# Convert the numpy array to a PyTorch tensor
tensors =[ 
    torch.tensor([
        [51, 92, 14, 71],
        [60, 20, 82, 86],
        [74, 74, 87, 99],
        [23,  2, 21, 52],
        [ 1, 87, 29, 37],
    ]),
    torch.tensor([
        [51, 92, 14, 71],
        [60, 20, 82, 86],
        [74, 74, 87, 99],
        [23,  2, 21, 52],
        [ 1, 87, 29, 37],
        [60, 20, 82, 86],
    ]),
]
dump = pad_sequence(tensors, batch_first=True)

print(dump)

tensor([[[51, 92, 14, 71],
         [60, 20, 82, 86],
         [74, 74, 87, 99],
         [23,  2, 21, 52],
         [ 1, 87, 29, 37],
         [ 0,  0,  0,  0]],

        [[51, 92, 14, 71],
         [60, 20, 82, 86],
         [74, 74, 87, 99],
         [23,  2, 21, 52],
         [ 1, 87, 29, 37],
         [60, 20, 82, 86]]])


# Initialize Model and Dataloader

In [71]:
embedding_size = 128
hidden_size = 16
margin = 1.0
batch_size = 100
num_epochs = 10

# TrainingDataloader = DataLoader(TrainingDataset, batch_size, shuffle=False)
# TestingDataloader = DataLoader(TestingDataset, batch_size, shuffle=False)

TrainingDataloader = DataLoader(TrainingDataset, batch_size, shuffle=False, collate_fn=collate_fn)
TestingDataloader = DataLoader(TestingDataset, batch_size, shuffle=False, collate_fn=collate_fn)
ValidatingDataloader = DataLoader(ValidationDataset, batch_size, shuffle=False, collate_fn=collate_fn)

# Instantiate the model
Towers = TwoTowerModel(embedding_size, hidden_size, device)
Towers.to(device)
optimizer = torch.optim.Adam(Towers.parameters(), lr=0.001)


## Recheck the padding results

In [41]:
for batch in TestingDataloader:
    print(batch['query'].shape)        # Shape: (batch_size, max_seq_length_query, feature_dim)
    print(batch['relevant'].shape)     # Shape: (batch_size, max_seq_length_relevant, feature_dim)
    print(batch['irrelevant'].shape) 

    for i in range(batch['query'].size(0)):
        print(f"Element {i+1}, last row of 7:\n{batch['query'][i, -1, :].numpy()}\n")

    break

torch.Size([100, 10, 128])
torch.Size([100, 137, 128])
torch.Size([100, 183, 128])


TypeError: can't convert mps:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

# Train Towers

In [72]:
for epoch in range(num_epochs):
    Towers.train() 
    train_loss = 0.0

    i = 0
    for batch in TrainingDataloader:
        i +=1
        optimizer.zero_grad()

        # Forward pass through the model to get embeddings
        query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings = Towers(
            batch['query'], 
            batch['relevant'], 
            batch['irrelevant']
        )

        # Compute the loss
        loss = triplet_loss_function_cosine(query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings, margin)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        if (i) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch: {i}, Train Loss: {loss.item():.4f}')
        
        # break
    
# Testing phase
Towers.eval()  # Set model to evaluation mode
test_loss = 0.0
with torch.no_grad():  # No need to track gradients for testing
    for batch in TestingDataloader:
        query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings = Towers(
            batch['query'], 
            batch['relevant'], 
            batch['irrelevant']
        )
        
        loss = triplet_loss_function_cosine(query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings, margin)
        test_loss += loss.item()
        
avg_train_loss = train_loss / len(TestingDataloader)
avg_test_loss = test_loss / len(TestingDataloader)
    
print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')

Epoch [1/10], Batch: 10, Train Loss: 0.9766
Epoch [1/10], Batch: 20, Train Loss: 1.0061
Epoch [1/10], Batch: 30, Train Loss: 0.9594
Epoch [1/10], Batch: 40, Train Loss: 1.0156
Epoch [1/10], Batch: 50, Train Loss: 1.0344
Epoch [1/10], Batch: 60, Train Loss: 0.9772
Epoch [1/10], Batch: 70, Train Loss: 1.0357
Epoch [1/10], Batch: 80, Train Loss: 1.0218
Epoch [1/10], Batch: 90, Train Loss: 0.9434
Epoch [1/10], Batch: 100, Train Loss: 1.0047
Epoch [1/10], Batch: 110, Train Loss: 1.0164
Epoch [1/10], Batch: 120, Train Loss: 0.9788
Epoch [1/10], Batch: 130, Train Loss: 0.9703
Epoch [1/10], Batch: 140, Train Loss: 0.9520
Epoch [1/10], Batch: 150, Train Loss: 0.9897
Epoch [1/10], Batch: 160, Train Loss: 1.0113
Epoch [1/10], Batch: 170, Train Loss: 0.9122
Epoch [1/10], Batch: 180, Train Loss: 1.0138
Epoch [1/10], Batch: 190, Train Loss: 1.0163
Epoch [1/10], Batch: 200, Train Loss: 0.9997
Epoch [1/10], Batch: 210, Train Loss: 0.9810
Epoch [1/10], Batch: 220, Train Loss: 1.0054
Epoch [1/10], Batch

## Export Model


In [74]:
model_path = 'AlexTwoTowersDictLocal.pth'
torch.save(Towers.state_dict(), 'AlexTwoTowersDictLocal.pth')
torch.save(Towers, 'AlexTwoTowersLocal.pth')

# Lib Collection Function

In [75]:
def LibDocsCollection(databases):
    documents = []
    for database in databases:
        for index, row in database.iterrows():
            for doc in row['passages']['passage_text']:
               documents.append(doc)
    documents = pd.DataFrame(documents, columns=['document'])
    unique_documents = documents.drop_duplicates(subset=['document']).reset_index(drop=True)
    print(documents.shape)
    print(unique_documents.shape)
    return unique_documents

## Build Libriary

In [76]:
train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')
validate = pd.read_parquet('validate.parquet')

libriary = LibDocsCollection([train, test, validate])

(837729, 1)
(767675, 1)


## Solo Dataset

In [77]:
class SoloDataset(Dataset):
    def __init__(self, sp, texts, device):
        self.texts = texts
        self.device = device

    def __len__(self):
        return len(self.texts)
    
    def to_w2v_embedding(self, sp, text):
        tokens = sp.encode_as_pieces(text.lower())

        embeddings = []
        for token in tokens:
            if (token in w2v_model.wv): 
                embeddings.append(w2v_model.wv[token])

        return np.stack(embeddings)

    def __getitem__(self, idx):
        return torch.tensor(self.to_w2v_embedding(sp, self.texts.iloc[idx]['document']), dtype=torch.float, device=self.device)
    
def solo_collate_fn(batch):
    padded_batch = pad_sequence(batch, batch_first=True, padding_value=0)
    return padded_batch

### Docs Embeddings

In [78]:
DocsDataset = SoloDataset(sp, libriary, device)

batch_size = 100
docsEmbeddingsDataloader = DataLoader(DocsDataset, batch_size, shuffle=False, collate_fn=solo_collate_fn)

### Initialize Encoders

In [80]:
Towers = torch.load('AlexTwoTowersLocal.pth', map_location=device)
# OR Load the model's state dictionary (Testing the same)
Towers.load_state_dict(torch.load('AlexTwoTowersDictLocal.pth', map_location=device))

<All keys matched successfully>

### Encode All Documents in Lib

In [81]:
Towers.eval()  # Set model to evaluation mode
encodedDouments = []
with torch.no_grad():  # No need to track gradients for testing
    i=0
    for batch in docsEmbeddingsDataloader:
        i+=1
        encodedDoumentsBatch = Towers.docEncoder(batch).cpu().numpy()
        encodedDouments.extend(encodedDoumentsBatch)
        print(f'Batch: {i}')
        # print(batch)
        # print(encodedDoumentsBatch)
        # if(i >= 2):
        #     break
encodedDouments = pd.DataFrame({'encoders': [array for array in encodedDouments]})

Batch: 1
Batch: 2
Batch: 3
Batch: 4
Batch: 5
Batch: 6
Batch: 7
Batch: 8
Batch: 9
Batch: 10
Batch: 11
Batch: 12
Batch: 13
Batch: 14
Batch: 15
Batch: 16
Batch: 17
Batch: 18
Batch: 19
Batch: 20
Batch: 21
Batch: 22
Batch: 23
Batch: 24
Batch: 25
Batch: 26
Batch: 27
Batch: 28
Batch: 29
Batch: 30
Batch: 31
Batch: 32
Batch: 33
Batch: 34
Batch: 35
Batch: 36
Batch: 37
Batch: 38
Batch: 39
Batch: 40
Batch: 41
Batch: 42
Batch: 43
Batch: 44
Batch: 45
Batch: 46
Batch: 47
Batch: 48
Batch: 49
Batch: 50
Batch: 51
Batch: 52
Batch: 53
Batch: 54
Batch: 55
Batch: 56
Batch: 57
Batch: 58
Batch: 59
Batch: 60
Batch: 61
Batch: 62
Batch: 63
Batch: 64
Batch: 65
Batch: 66
Batch: 67
Batch: 68
Batch: 69
Batch: 70
Batch: 71
Batch: 72
Batch: 73
Batch: 74
Batch: 75
Batch: 76
Batch: 77
Batch: 78
Batch: 79
Batch: 80
Batch: 81
Batch: 82
Batch: 83
Batch: 84
Batch: 85
Batch: 86
Batch: 87
Batch: 88
Batch: 89
Batch: 90
Batch: 91
Batch: 92
Batch: 93
Batch: 94
Batch: 95
Batch: 96
Batch: 97
Batch: 98
Batch: 99
Batch: 100
Batch: 1

In [82]:

print(libriary.shape)
print(encodedDouments.shape)

libWithEncoders = pd.DataFrame({
    'document': libriary['document'],
    'encoders': encodedDouments['encoders'],
})

print(libWithEncoders.shape)



(767675, 1)
(767675, 1)
(767675, 2)


### Get Random element from Validation Set

In [120]:
def getRandomQuery(dataframe):
    available_indices = list(dataframe.index)
    random_index = np.random.choice(available_indices)
    query = dataframe.iloc[random_index]['query']
    return query
query = getRandomQuery(validate)

print(query)

def to_w2v_embedding(sp, text):
    tokens = sp.encode_as_pieces(text.lower())

    embeddings = []
    for token in tokens:
        if (token in w2v_model.wv): 
            embeddings.append(w2v_model.wv[token])

    return np.stack(embeddings)

what does a wolf spider look like


### Encode Query

In [121]:
Towers.eval()  # Set model to evaluation mode
with torch.no_grad():  # No need to track gradients for testing
    # query_embedding = torch.tensor(to_w2v_embedding(sp, ))
    query_embedding = torch.tensor([to_w2v_embedding(sp, query)], dtype=torch.float, device=device)
    encodedQuery = Towers.docEncoder(query_embedding).cpu().numpy()
print(encodedQuery[0])

[-0.99999493  0.99999994 -1.          0.99805367  0.99999994 -0.99435025
 -1.         -1.          0.99999994 -1.         -1.          0.99999994
  0.99999994  0.9009006   0.99999994  0.99995136]


### Search Closest documents

In [122]:
def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

In [123]:
libWithEncoders['distance'] = libWithEncoders['encoders'].apply(lambda x: (1- cosine_similarity(encodedQuery[0], x)))

In [126]:
libWithEncodersSorted = libWithEncoders.sort_values(by='distance', ascending=True)
results = libWithEncodersSorted.head(5)
print(query)
print(results)

csv_file_path = 'results.csv'
results.to_csv(csv_file_path, index=False)

with open(csv_file_path, 'a') as f:
    f.write('\n\nQuery:\n')
    f.write(query + '\n')

what does a wolf spider look like
                                                 document  \
742498  There are many varieties of sweet potatoes, wh...   
714125  Click here to get exclusive, professional tips...   
186734  (BPM). Processes of the same nature are classi...   
205084  Wolf Spiders live in both coastal and inland h...   
476959  There are some people who have seen good resul...   

                                                 encoders  distance  
742498  [-0.99999356, 0.99999994, -1.0, 0.9864725, 0.9...  0.012050  
714125  [-1.0, 0.99999994, -0.9999061, 0.99999994, 0.9...  0.102515  
186734  [-0.99971503, 0.99999994, -0.99999994, 0.79861...  0.107043  
205084  [-1.0, 0.99999994, -1.0, 0.9999977, 0.99999994...  0.113631  
476959  [-1.0, 0.99999994, -0.9526948, 0.99965733, 0.9...  0.119708  
