In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import sentencepiece as spm
import matplotlib.pyplot as plt
import multiprocessing
import time
from gensim.models import Word2Vec
from torch.utils.data import Dataset, DataLoader

## Load data

In [3]:
train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')
validate = pd.read_parquet('validate.parquet')

### collect all texts to one dataset

In [None]:

# with open('AllTexts.txt', 'w') as f:
#     pass  # This just creates the file, immediately closing it

# with open('AllTexts.txt', 'a') as f:  # Open file in append mode
#     for _, row in train.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')
#     for _, row in test.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')
#     for _, row in validate.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')

### train tokanizer

In [None]:
# spm.SentencePieceTrainer.train(
#     input = 'AllTexts.txt',
#     model_prefix='spm_AllTexts', 
#     vocab_size=30000,
# )

## Load Tokanizer

In [3]:
sp = spm.SentencePieceProcessor()
sp.load('spm_AllTexts.model')

True

In [3]:
def tokenize_file(file_path, sp_processor):
    tokenized_sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Assuming each line in the file is a separate sentence or paragraph
            # Tokenize the line and add the list of tokens to the tokenized_sentences list
            tokenized_sentences.append(sp_processor.encode_as_pieces(line.strip()))
    return tokenized_sentences

### Tokinize all the Dataset

In [None]:
# tokinized_sentences = tokenize_file("AllTexts.txt", sp)

### Export tokens to JSON

In [None]:
# import json
# with open("Tokens_AllText.json", 'w', encoding='utf-8') as file:
#     json.dump(tokinized_sentences, file, ensure_ascii=False, indent=4)

### Define W2V

In [4]:
vector_size = 128

In [5]:
w2v_model = Word2Vec(
    min_count  =20,
    window     =10,
    vector_size=vector_size,
    sample     =6e-5, 
    alpha      = 0.03, 
    min_alpha  = 0.0007, 
    negative   = 20,
    workers    = multiprocessing.cpu_count() - 1
)

In [None]:
# print(len(tokinized_sentences))
# w2v_model.build_vocab(tokinized_sentences)
# w2v_model.save("word2vec.model")

In [None]:
# with open("word2vec_vocab.txt", 'w') as vocab_file:
#     for word in w2v_model.wv.key_to_index.keys():
#         vocab_file.write(word + '\n')

In [None]:
# w2v_model.train(tokinized_sentences, total_examples=w2v_model.corpus_count, epochs=20, report_delay=1)
# w2v_model.save("word2vec.model")

# Load Word2Vec Model

In [6]:
w2v_model = Word2Vec.load("word2vec.model")

In [7]:
similar_words = w2v_model.wv.most_similar('▁hacker', topn=4)
print(similar_words)
print(w2v_model.wv.most_similar(sp.encode_as_pieces('Hacker'.lower())))

[('▁cyber', 0.5855585336685181), ('▁hack', 0.5675662159919739), ('▁malicious', 0.5665103197097778), ('▁malware', 0.5581293106079102)]
[('▁cyber', 0.5855585336685181), ('▁hack', 0.5675662159919739), ('▁malicious', 0.5665103197097778), ('▁malware', 0.5581293106079102), ('▁scam', 0.5384910702705383), ('▁spyware', 0.5197509527206421), ('▁legitimate', 0.5031879544258118), ('▁adware', 0.4846465289592743), ('▁pretend', 0.469952255487442), ('▁insider', 0.46461066603660583)]


In [23]:
# def to_w2v_embedding(sp, text):
#     tokens = sp.encode_as_pieces(text.lower())

#     embeddings = []
#     for token in tokens:
#         if (token in w2v_model.wv): 
#             embeddings.append(w2v_model.wv[token])

#     return np.stack(embeddings)

### Triples for training

In [9]:
def prepareTriplesTokens(dataframe):
    triples = []
    for index, row in dataframe.iterrows():
        available_indices = list(dataframe.index)
        available_indices.remove(index)
        
        for relevant in row['passages']['passage_text']:
            random_index = np.random.choice(available_indices)
            random_doc_index = np.random.choice(
                list(
                    range(
                        len(dataframe.iloc[random_index]['passages']['passage_text'])
                    )
                )
            )

            irrelevant = dataframe.iloc[random_index]['passages']['passage_text'][random_doc_index]

            triples.append([
                row['query'],
                relevant,
                irrelevant,
            ])

    return triples

# train_triplets = prepareTriplesTokens(train)
# test_triplets = prepareTriplesTokens(test)
# validate_triplets = prepareTriplesTokens(validate)

train_triplets = pd.read_parquet('train_triplets.parquet')
test_triplets = pd.read_parquet('test_triplets.parquet')
validate_triplets = pd.read_parquet('validate_triplets.parquet')

In [12]:
print(train_triplets[:3])

         query                                           relevant  \
0  what is rba  Since 2007, the RBA's outstanding reputation h...   
1  what is rba  The Reserve Bank of Australia (RBA) came into ...   
2  what is rba  RBA Recognized with the 2014 Microsoft US Regi...   

                                          irrelevant  
0  This report describes the typical weather at t...  
1  1. district, community the vicar of a small pa...  
2  They have tried to make Panda Express prices c...  


In [20]:
# # Convert the list of triples to a DataFrame
# columns = ['query', 'relevant', 'irrelevant']
# train_triplets = pd.DataFrame(train_triplets, columns=columns)
# test_triplets = pd.DataFrame(test_triplets, columns=columns)
# validate_triplets = pd.DataFrame(validate_triplets, columns=columns)


In [62]:
# test_triplets_embeddings = pd.DataFrame()

# test_triplets_embeddings['query_embeddings'] = test_triplets['query'].apply(lambda x: to_w2v_embedding(sp, x))
# test_triplets_embeddings['relevant_embeddings'] = test_triplets['relevant'].apply(lambda x: to_w2v_embedding(sp, x))
# test_triplets_embeddings['irrelevant_embeddings'] = test_triplets['irrelevant'].apply(lambda x: to_w2v_embedding(sp, x))

In [63]:
# Calculate the memory usage of each column in bytes, then sum them up, and convert to megabytes
# total_memory_mb = test_triplets_embeddings.memory_usage(deep=True).sum() / (1024 ** 2)

# print(f'Total DataFrame size: {total_memory_mb:.2f} MB')

Total DataFrame size: 134.55 MB


In [64]:
# train_triplets_embeddings = pd.DataFrame()
# train_triplets_embeddings['query_embeddings'] = train_triplets['query'].apply(lambda x: to_w2v_embedding(sp, x))
# train_triplets_embeddings['relevant_embeddings'] = train_triplets['relevant'].apply(lambda x: to_w2v_embedding(sp, x))
# train_triplets_embeddings['irrelevant_embeddings'] = train_triplets['irrelevant'].apply(lambda x: to_w2v_embedding(sp, x))

In [65]:
# Calculate the memory usage of each column in bytes, then sum them up, and convert to megabytes
# total_memory_mb = train_triplets_embeddings.memory_usage(deep=True).sum() / (1024 ** 2)

# print(f'Total DataFrame size: {total_memory_mb:.2f} MB')

Total DataFrame size: 1150.24 MB


In [66]:
# print(test_triplets_embeddings['query_embeddings'].head(1))

# train_triplets_embeddings.to_parquet('train_triplets_with_embedings.parquet', engine='pyarrow') 
# test_triplets_embeddings.to_parquet('test_triplets_with_embedings.parquet', engine='pyarrow') 
# validate_triplets.to_parquet('validate_triplets_with_embedings.parquet', engine='pyarrow') 

0    [[-0.38117662, -1.751613, 0.118526, -4.4650145...
Name: query_embeddings, dtype: object


In [13]:
device = torch.device('mps')
# device = torch.device('cpu')

# Dataset

In [33]:
class QueryDocsDataset(Dataset):
    def __init__(self, sp, queries, relevant_docs, irrelevant_docs, device):
        self.queries = queries
        self.relevant_docs = relevant_docs
        self.irrelevant_docs = irrelevant_docs
        self.device = device

    def __len__(self):
        return len(self.queries)
    
    def to_w2v_embedding(self, sp, text):
        tokens = sp.encode_as_pieces(text.lower())

        embeddings = []
        for token in tokens:
            if (token in w2v_model.wv): 
                embeddings.append(w2v_model.wv[token])

        return np.stack(embeddings)

    def __getitem__(self, idx):
        return {
            'query': torch.tensor(self.to_w2v_embedding(sp, self.queries[idx].lower()), dtype=torch.float, device=self.device),
            'relevant': torch.tensor(self.to_w2v_embedding(sp, self.relevant_docs[idx].lower()), dtype=torch.float, device=self.device),
            'irrelevant': torch.tensor(self.to_w2v_embedding(sp, self.irrelevant_docs[idx].lower()), dtype=torch.float, device=self.device),
        }

### Fill Datasets

In [34]:
TrainingDataset = QueryDocsDataset(sp, train_triplets['query'], train_triplets['relevant'], train_triplets['irrelevant'], device)
TestingDataset = QueryDocsDataset(sp, test_triplets['query'], test_triplets['relevant'], test_triplets['irrelevant'], device)
ValidationDataset = QueryDocsDataset(sp, validate_triplets['query'], validate_triplets['relevant'], validate_triplets['irrelevant'], device)

# TrainingDataset = QueryDocsDataset(sp, train_triplets_embeddings['query_embeddings'], train_triplets_embeddings['relevant_embeddings'], train_triplets_embeddings['irrelevant_embeddings'])
# TestingDataset = QueryDocsDataset(sp, test_triplets_embeddings['query_embeddings'], test_triplets_embeddings['relevant_embeddings'], test_triplets_embeddings['irrelevant_embeddings'])
# ValidationDataset = QueryDocsDataset(sp, validate_triplets_embeddings['query'], validate_triplets_embeddings['relevant'], validate_triplets_embeddings['irrelevant'])

# Models

In [35]:
class QueryRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size, device):
        super(QueryRNNCell, self).__init__()
        self.device = device
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.weight_ih = nn.Parameter(torch.randn(input_size, hidden_size, device=self.device))  # Input to hidden weights
        self.weight_hh = nn.Parameter(torch.randn(hidden_size, hidden_size, device=self.device))  # Hidden to hidden weights
        
        self.bias_hh = nn.Parameter(torch.randn(hidden_size, device=self.device))  # Bias

    def forward(self, input, hidden):
        return torch.tanh(
            torch.mm(input, self.weight_ih) + torch.mm(hidden, self.weight_hh) + self.bias_hh
        )
    
class QueryRNN(nn.Module):
    def __init__(self, input_size, hidden_size, device):
        super(QueryRNN, self).__init__()
        self.device = device
        self.hidden_size = hidden_size
        self.rnn_cell = QueryRNNCell(input_size, hidden_size, device)
        self.rnn_cell.to(device)

    def forward(self, input):
        # Assuming input is of shape (batch, seq_len, input_size)
        batch_size, seq_len, _ = input.shape
        hidden = torch.zeros(batch_size, self.hidden_size, device=self.device)  # Initial hidden state
        for i in range(seq_len):
            
            mask = torch.any(input[:, i, :] != 0, dim=1).float().unsqueeze(1)  # Shape: (batch_size, 1)
            current_input = input[:, i, :]  
            
            current_hidden = self.rnn_cell(current_input, hidden)
            
            # Apply mask: Only update hidden state for non-padded inputs
            hidden = mask * current_hidden + (1 - mask) * hidden
            
        return hidden

# Two Towers

In [36]:
class TwoTowerModel(nn.Module):
    def __init__(self, embedding_size, hidden_size, device):
        super(TwoTowerModel, self).__init__()
        self.device = device
        self.queryEncoder = QueryRNN(embedding_size, hidden_size, device)
        self.queryEncoder.to(device)
        self.docEncoder = QueryRNN(embedding_size, hidden_size, device)
        self.docEncoder.to(device)

    def forward(self, query, relevant, irrelevant):
        query_embedding = self.queryEncoder(query)
        relevant_embedding = self.docEncoder(relevant)
        irrelevant_embedding = self.docEncoder(irrelevant)
        return query_embedding, relevant_embedding, irrelevant_embedding

## Lose Function

In [37]:
import torch.nn.functional as F

def triplet_loss_function_cosine(query, relevant_doc, irrelevant_doc, margin):
    # Compute cosine similarity (the output ranges from -1 to 1)
    relevant_similarity = F.cosine_similarity(query, relevant_doc)
    irrelevant_similarity = F.cosine_similarity(query, irrelevant_doc)
    
    # Convert similarities to distances (ranges from 0 to 2)
    relevant_distance = 1 - relevant_similarity
    irrelevant_distance = 1 - irrelevant_similarity
    
    # Compute the triplet loss
    triplet_loss = torch.clamp(margin + relevant_distance - irrelevant_distance, min=0)
    return triplet_loss.mean()

### Padding Function

In [38]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Extract lists of tensors for 'query', 'relevant', and 'irrelevant' from the batch
    query_tensors = [item['query'] for item in batch]
    relevant_tensors = [item['relevant'] for item in batch]
    irrelevant_tensors = [item['irrelevant'] for item in batch]
    
    # Pad sequences within each list to the same length
    query_padded = pad_sequence(query_tensors, batch_first=True, padding_value=0)
    relevant_padded = pad_sequence(relevant_tensors, batch_first=True, padding_value=0)
    irrelevant_padded = pad_sequence(irrelevant_tensors, batch_first=True, padding_value=0)
    
    # Return a dictionary with padded sequences
    return {
        'query': query_padded,
        'relevant': relevant_padded,
        'irrelevant': irrelevant_padded
    }

## padding test

In [39]:
# Convert the numpy array to a PyTorch tensor
tensors =[ 
    torch.tensor([
        [51, 92, 14, 71],
        [60, 20, 82, 86],
        [74, 74, 87, 99],
        [23,  2, 21, 52],
        [ 1, 87, 29, 37],
    ]),
    torch.tensor([
        [51, 92, 14, 71],
        [60, 20, 82, 86],
        [74, 74, 87, 99],
        [23,  2, 21, 52],
        [ 1, 87, 29, 37],
        [60, 20, 82, 86],
    ]),
]
dump = pad_sequence(tensors, batch_first=True)

print(dump)

tensor([[[51, 92, 14, 71],
         [60, 20, 82, 86],
         [74, 74, 87, 99],
         [23,  2, 21, 52],
         [ 1, 87, 29, 37],
         [ 0,  0,  0,  0]],

        [[51, 92, 14, 71],
         [60, 20, 82, 86],
         [74, 74, 87, 99],
         [23,  2, 21, 52],
         [ 1, 87, 29, 37],
         [60, 20, 82, 86]]])


# Initialize Model and Dataloader

In [40]:
embedding_size = 128
hidden_size = 16
margin = 1.0
batch_size = 100
num_epochs = 3

# TrainingDataloader = DataLoader(TrainingDataset, batch_size, shuffle=False)
# TestingDataloader = DataLoader(TestingDataset, batch_size, shuffle=False)

TrainingDataloader = DataLoader(TrainingDataset, batch_size, shuffle=False, collate_fn=collate_fn)
TestingDataloader = DataLoader(TestingDataset, batch_size, shuffle=False, collate_fn=collate_fn)
ValidatingDataloader = DataLoader(ValidationDataset, batch_size, shuffle=False, collate_fn=collate_fn)

# Instantiate the model
Towers = TwoTowerModel(embedding_size, hidden_size, device)
Towers.to(device)
optimizer = torch.optim.Adam(Towers.parameters(), lr=0.001)


## Recheck the padding results

In [41]:
for batch in TestingDataloader:
    print(batch['query'].shape)        # Shape: (batch_size, max_seq_length_query, feature_dim)
    print(batch['relevant'].shape)     # Shape: (batch_size, max_seq_length_relevant, feature_dim)
    print(batch['irrelevant'].shape) 

    for i in range(batch['query'].size(0)):
        print(f"Element {i+1}, last row of 7:\n{batch['query'][i, -1, :].numpy()}\n")

    break

torch.Size([100, 10, 128])
torch.Size([100, 137, 128])
torch.Size([100, 183, 128])


TypeError: can't convert mps:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [43]:
for epoch in range(num_epochs):
    Towers.train() 
    train_loss = 0.0

    i = 0
    for batch in TrainingDataloader:
        i +=1
        optimizer.zero_grad()

        # Forward pass through the model to get embeddings
        query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings = Towers(
            batch['query'], 
            batch['relevant'], 
            batch['irrelevant']
        )

        # Compute the loss
        loss = triplet_loss_function_cosine(query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings, margin)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        if (i) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch: {i}, Train Loss: {loss.item():.4f}')
        
        # break
    
    # Testing phase
    Towers.eval()  # Set model to evaluation mode
    test_loss = 0.0
    with torch.no_grad():  # No need to track gradients for testing
        for batch in TestingDataloader:
            query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings = Towers(
                batch['query'], 
                batch['relevant'], 
                batch['irrelevant']
            )
            
            loss = triplet_loss_function_cosine(query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings, margin)
            test_loss += loss.item()
            
    avg_train_loss = train_loss / len(TestingDataloader)
    avg_test_loss = test_loss / len(TestingDataloader)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')

Epoch [1/3], Batch: 10, Train Loss: 0.9265
Epoch [1/3], Batch: 20, Train Loss: 0.8214
Epoch [1/3], Batch: 30, Train Loss: 0.8913
Epoch [1/3], Batch: 40, Train Loss: 0.8827
Epoch [1/3], Batch: 50, Train Loss: 0.9135
Epoch [1/3], Batch: 60, Train Loss: 0.9418
Epoch [1/3], Batch: 70, Train Loss: 0.9422
Epoch [1/3], Batch: 80, Train Loss: 0.8371
Epoch [1/3], Batch: 90, Train Loss: 0.9466
Epoch [1/3], Batch: 100, Train Loss: 0.9242
Epoch [1/3], Batch: 110, Train Loss: 0.9173
Epoch [1/3], Batch: 120, Train Loss: 0.9461
Epoch [1/3], Batch: 130, Train Loss: 0.9503
Epoch [1/3], Batch: 140, Train Loss: 0.9498
Epoch [1/3], Batch: 150, Train Loss: 0.8839
Epoch [1/3], Batch: 160, Train Loss: 0.9794
Epoch [1/3], Batch: 170, Train Loss: 0.9707
Epoch [1/3], Batch: 180, Train Loss: 0.9383
Epoch [1/3], Batch: 190, Train Loss: 0.9301
Epoch [1/3], Batch: 200, Train Loss: 0.8938
Epoch [1/3], Batch: 210, Train Loss: 0.9320
Epoch [1/3], Batch: 220, Train Loss: 0.9364
Epoch [1/3], Batch: 230, Train Loss: 0.96