In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import sentencepiece as spm
import matplotlib.pyplot as plt
import multiprocessing
import time
from gensim.models import Word2Vec
from torch.utils.data import Dataset, DataLoader

## Load data

In [3]:
# Read the three dataset splits (train, test, validate) from their parquet files.
train, test, validate = (
    pd.read_parquet('train.parquet'),
    pd.read_parquet('test.parquet'),
    pd.read_parquet('validate.parquet'),
)

### collect all texts to one dataset

In [None]:

# with open('AllTexts.txt', 'w') as f:
#     pass  # This just creates the file, immediately closing it

# with open('AllTexts.txt', 'a') as f:  # Open file in append mode
#     for _, row in train.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')
#     for _, row in test.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')
#     for _, row in validate.iterrows():
#         concatenated = '\n'.join(row['passages']['passage_text'])
#         concatenated = '\n'.join([concatenated, '\n'.join(row['answers'])])
#         concatenated = '\n'.join([concatenated, row['query']])
#         f.write(concatenated.lower() + '\n')

### Train tokenizer

In [None]:
# spm.SentencePieceTrainer.train(
#     input = 'AllTexts.txt',
#     model_prefix='spm_AllTexts', 
#     vocab_size=30000,
# )

## Load Tokenizer

In [9]:
# Create a SentencePiece processor and load the tokenizer model file from disk.
sp = spm.SentencePieceProcessor()
# Last expression of the cell, so the value returned by load() is shown as the cell output.
sp.load('spm_AllTexts.model')

True

In [None]:
def tokenize_file(file_path, sp_processor):
    """Tokenize a UTF-8 text file line by line.

    Every line is stripped of surrounding whitespace and handed to
    ``sp_processor.encode_as_pieces``; the results are collected in file order.

    Args:
        file_path: Path of the text file to read.
        sp_processor: Object exposing ``encode_as_pieces(text)``.

    Returns:
        A list with one entry per line of the file, each entry being whatever
        ``encode_as_pieces`` returned for that stripped line.
    """
    with open(file_path, 'r', encoding='utf-8') as corpus:
        return [sp_processor.encode_as_pieces(raw_line.strip()) for raw_line in corpus]

### Tokenize the whole dataset

In [None]:
# Tokenize every line of the corpus file with the loaded SentencePiece processor.
# NOTE(review): AllTexts.txt is only written by the commented-out corpus cell above,
# so the file must already exist on disk for this cell to run on a fresh kernel.
tokinized_sentences = tokenize_file("AllTexts.txt", sp)

### Export tokens to JSON

In [None]:
# import json
# with open("Tokens_AllText.json", 'w', encoding='utf-8') as file:
#     json.dump(tokinized_sentences, file, ensure_ascii=False, indent=4)

### Define W2V

In [None]:
# Embedding dimensionality; passed to Word2Vec as its vector_size argument.
vector_size = 128

In [None]:
# Untrained Word2Vec model; vocabulary building and training happen in later cells.
w2v_model = Word2Vec(
    min_count  =20,           # ignore tokens that occur fewer than 20 times
    window     =10,           # context window size
    vector_size=vector_size,  # dimensionality of the learned vectors
    sample     =6e-5,         # down-sampling threshold for very frequent tokens
    alpha      = 0.03,        # initial learning rate
    min_alpha  = 0.0007,      # learning rate at the end of training
    negative   = 20,          # noise tokens drawn per positive example
    # cpu_count() - 1 evaluates to 0 on a single-core machine, which would leave
    # gensim with no worker threads; always keep at least one.
    workers    = max(1, multiprocessing.cpu_count() - 1)
)

In [None]:
# print(len(tokinized_sentences))
# w2v_model.build_vocab(tokinized_sentences)
# w2v_model.save("word2vec.model")

In [None]:
# with open("word2vec_vocab.txt", 'w') as vocab_file:
#     for word in w2v_model.wv.key_to_index.keys():
#         vocab_file.write(word + '\n')

In [None]:
# w2v_model.train(tokinized_sentences, total_examples=w2v_model.corpus_count, epochs=20, report_delay=1)
# w2v_model.save("word2vec.model")

In [None]:
# Load the saved Word2Vec model from disk. This rebinds w2v_model, replacing the
# instance constructed in the earlier cell.
w2v_model = Word2Vec.load("word2vec.model")

In [None]:
# Sanity check: nearest neighbours of a single, already lower-case piece.
similar_words = w2v_model.wv.most_similar('▁hacker', topn=4)
print(similar_words)

# Same check starting from free text. The text is lower-cased first because the corpus
# cell writes AllTexts.txt lower-cased, so the tokenizer and Word2Vec vocabulary
# presumably contain no upper-case pieces. Pieces missing from the vocabulary are
# dropped because most_similar raises KeyError for unknown keys.
probe_text = 'Hacker'
probe_pieces = [
    piece
    for piece in sp.encode_as_pieces(probe_text.lower())
    if piece in w2v_model.wv
]
# most_similar cannot be called without any positive key, so fall back to an empty result.
print(w2v_model.wv.most_similar(positive=probe_pieces) if probe_pieces else [])

In [None]:
def to_embedding(sp, text, vector_size):
    """Embed a text as the mean of the Word2Vec vectors of its pieces.

    Args:
        sp: Tokenizer exposing ``encode_as_pieces(text)``.
        text: Raw text to embed.
        vector_size: Length of the zero vector returned when no piece of the
            text is present in the Word2Vec vocabulary.

    Returns:
        A 1-D NumPy array: the element-wise mean of the vectors of all pieces
        found in the module-level ``w2v_model``, or zeros if none were found.
    """
    # The corpus cell writes AllTexts.txt lower-cased, so lower-case here as well;
    # otherwise upper-case characters would presumably yield pieces that are
    # missing from the Word2Vec vocabulary and be silently dropped below.
    tokens = sp.encode_as_pieces(text.lower())

    # Pieces unknown to Word2Vec are skipped rather than raising KeyError.
    embeddings = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]

    if not embeddings:
        # float32 to match the dtype gensim uses for its word vectors, so both
        # branches return the same dtype (np.zeros defaults to float64).
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(embeddings, axis=0)

### Triples for training

In [11]:
def prepareTriplesTokens(dataframe, rng=None):
    """Build (query, relevant passage, irrelevant passage) training triples.

    For every passage attached to a row's query, one negative passage is drawn
    uniformly at random from the passages of a *different* row.

    Args:
        dataframe: DataFrame with a ``'query'`` column and a ``'passages'``
            column whose values can be indexed with ``['passage_text']`` to
            obtain a sequence of passages.
        rng: Optional ``numpy.random.Generator`` for reproducible sampling.
            When omitted, NumPy's global random state is used.

    Returns:
        A list of ``[query, relevant, irrelevant]`` lists, in row order and,
        within a row, passage order. The list is empty when fewer than two
        rows have passages, because no negative can be drawn in that case.
    """
    draw = np.random.randint if rng is None else rng.integers

    # Pull the columns out once and work with row *positions* from here on.
    # Mixing index labels with positional .iloc lookups is only correct for a
    # default 0..n-1 index, and rebuilding the candidate list for every row
    # made the original quadratic in the number of rows.
    queries = dataframe['query'].tolist()
    passage_lists = [list(entry['passage_text']) for entry in dataframe['passages']]

    # Only rows that actually have passages can act as a source of negatives.
    candidates = [pos for pos, passages in enumerate(passage_lists) if passages]

    triples = []
    if len(candidates) < 2:
        return triples

    for rank, pos in enumerate(candidates):
        for relevant in passage_lists[pos]:
            # Draw from the other len(candidates) - 1 rows: sample one slot fewer
            # and shift past the current row's own slot.
            pick = int(draw(len(candidates) - 1))
            if pick >= rank:
                pick += 1
            negatives = passage_lists[candidates[pick]]
            irrelevant = negatives[int(draw(len(negatives)))]

            triples.append([
                queries[pos],
                relevant,
                irrelevant,
            ])

    return triples

# Generate the triples for each split, in the order train, test, validate.
train_triplets, test_triplets, validate_triplets = (
    prepareTriplesTokens(split) for split in (train, test, validate)
)

In [23]:
# Report how many training triples were generated.
print(len(train_triplets))

676193


In [20]:
# Wrap each split's triples in a DataFrame with named columns.
columns = ['query', 'relevant', 'irrelevant']
train_triplets, test_triplets, validate_triplets = (
    pd.DataFrame(rows, columns=columns)
    for rows in (train_triplets, test_triplets, validate_triplets)
)

# Persist each split's triples as a parquet file (pyarrow engine).
train_triplets.to_parquet('train_triplets.parquet', engine='pyarrow')
test_triplets.to_parquet('test_triplets.parquet', engine='pyarrow')
validate_triplets.to_parquet('validate_triplets.parquet', engine='pyarrow')

# Dataset

In [21]:
class QueryDocsDataset(Dataset):
    """Dataset of (query, relevant document, irrelevant document) triples.

    The three inputs are parallel sequences: item ``i`` of the dataset is made
    of element ``i`` of each of them.
    """

    def __init__(self, queries, relevant_docs, irrelevant_docs):
        """Store the three parallel sequences.

        Args:
            queries: Iterable of queries.
            relevant_docs: Iterable of relevant documents, aligned with ``queries``.
            irrelevant_docs: Iterable of irrelevant documents, aligned with ``queries``.

        Raises:
            ValueError: If the three sequences do not have the same length.
        """
        # Materialise as plain lists so __getitem__ is purely positional. A
        # pandas Series indexed with an int performs a *label* lookup, which
        # fails (or returns the wrong row) for any Series whose index is not
        # 0..n-1, e.g. one taken from a filtered or sliced frame.
        self.queries = list(queries)
        self.relevant_docs = list(relevant_docs)
        self.irrelevant_docs = list(irrelevant_docs)

        if not (len(self.queries) == len(self.relevant_docs) == len(self.irrelevant_docs)):
            raise ValueError(
                'queries, relevant_docs and irrelevant_docs must have the same length, got '
                f'{len(self.queries)}, {len(self.relevant_docs)} and {len(self.irrelevant_docs)}'
            )

    def __len__(self):
        """Return the number of triples."""
        return len(self.queries)

    def __getitem__(self, idx):
        """Return the triple at position ``idx`` as a dict."""
        return {
            'query': self.queries[idx],
            'relevant': self.relevant_docs[idx],
            'irrelevant': self.irrelevant_docs[idx]
        }

In [None]:
# Wrap the test triples in a dataset and serve them in shuffled batches of 1000.
TestingDataset = QueryDocsDataset(
    queries=test_triplets['query'],
    relevant_docs=test_triplets['relevant'],
    irrelevant_docs=test_triplets['irrelevant'],
)
TestingDataloader = DataLoader(dataset=TestingDataset, batch_size=1000, shuffle=True)

# Models

In [None]:
class QueryRNNCell(nn.Module):
    """Single step of a tanh recurrent cell.

    Computes ``tanh(input @ W_ih + b_ih + hidden @ W_hh + b_hh)``.
    """

    def __init__(self, input_size, hidden_size):
        """Create the cell's parameters.

        Args:
            input_size: Number of features of each input vector.
            hidden_size: Number of features of the hidden state.
        """
        super(QueryRNNCell, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.weight_ih = nn.Parameter(torch.empty(input_size, hidden_size))  # Input to hidden weights
        self.weight_hh = nn.Parameter(torch.empty(hidden_size, hidden_size))  # Hidden to hidden weights
        # Both biases are added to a (batch, hidden_size) tensor, so both must have
        # hidden_size entries. Sizing bias_ih by input_size raised a broadcasting
        # error whenever input_size != hidden_size.
        self.bias_ih = nn.Parameter(torch.empty(hidden_size))  # Bias
        self.bias_hh = nn.Parameter(torch.empty(hidden_size))  # Bias

        # Same scheme as torch.nn.RNNCell: U(-1/sqrt(hidden_size), 1/sqrt(hidden_size)).
        # Unscaled standard-normal weights give large pre-activations that saturate
        # tanh from the first step on, leaving almost no gradient.
        bound = hidden_size ** -0.5 if hidden_size > 0 else 0.0
        for parameter in self.parameters():
            nn.init.uniform_(parameter, -bound, bound)

    def forward(self, input, hidden):
        """Advance the hidden state by one step.

        Args:
            input: Tensor of shape (batch, input_size).
            hidden: Tensor of shape (batch, hidden_size).

        Returns:
            The new hidden state, shape (batch, hidden_size).
        """
        return torch.tanh(
            torch.mm(input, self.weight_ih) + self.bias_ih + torch.mm(hidden, self.weight_hh) + self.bias_hh
        )
    
class QueryRNN(nn.Module):
    """Runs a QueryRNNCell over a sequence and returns the final hidden state."""

    def __init__(self, input_size, hidden_size):
        """Create the underlying recurrent cell.

        Args:
            input_size: Number of features of each sequence element.
            hidden_size: Number of features of the hidden state.
        """
        super(QueryRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn_cell = QueryRNNCell(input_size, hidden_size)

    def forward(self, input):
        """Encode a batch of sequences.

        Args:
            input: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Hidden state after the last step, shape (batch, hidden_size). For
            ``seq_len == 0`` this is the all-zero initial state.
        """
        batch_size, seq_len, _ = input.shape
        # Allocate the initial state with the input's dtype and device.
        # torch.zeros(...) always produced a default-dtype CPU tensor, which
        # breaks as soon as the model or the data live on another device.
        hidden = input.new_zeros(batch_size, self.hidden_size)
        for step in range(seq_len):
            hidden = self.rnn_cell(input[:, step, :], hidden)
        return hidden

# Two Towers

In [None]:
class TwoTowerModel(nn.Module):
    """Two-tower encoder: one QueryRNN for queries, one QueryRNN shared by all documents."""

    def __init__(self, embedding_size, hidden_size):
        """Build the query tower and the document tower.

        Args:
            embedding_size: Feature size handed to both encoders as their input size.
            hidden_size: Hidden size handed to both encoders.
        """
        super().__init__()
        self.queryEncoder = QueryRNN(embedding_size, hidden_size)
        self.docEncoder = QueryRNN(embedding_size, hidden_size)

    def forward(self, query, relevant, irrelevant):
        """Encode a query with the query tower and both documents with the document tower.

        Returns:
            Tuple ``(query_embedding, relevant_embedding, irrelevant_embedding)``.
        """
        encode_doc = self.docEncoder
        return self.queryEncoder(query), encode_doc(relevant), encode_doc(irrelevant)

## Loss Function

In [None]:
# def distance_function(query, relevant_document):
#     return 0

# def triplet_loss_function(query, relevant_document, irrelevant_document, distance_function, margin):
#     relevant_distance = distance_function(query, relevant_document)
#     irrelevant_distance = distance_function(query, irrelevant_document)
#     tripletLoss = max(0, relevant_distance - irrelevant_distance + margin)
#     return tripletLoss

def triplet_loss_function(query, relevant_doc, irrelevant_doc, margin):
    """Mean triplet margin loss using Euclidean distance.

    For each row the loss is ``max(0, d(query, relevant) - d(query, irrelevant) + margin)``,
    where ``d`` is the L2 norm of the difference taken along dimension 1. The
    per-row values are averaged into a scalar.
    """
    def _l2_distance(left, right):
        return torch.norm(left - right, p=2, dim=1)

    hinge = _l2_distance(query, relevant_doc) - _l2_distance(query, irrelevant_doc) + margin
    return hinge.clamp(min=0).mean()

# Training Loop

In [None]:
# Hyper-parameters.
embedding_size = 128  # must equal the Word2Vec vector size: token vectors are fed straight into the towers
hidden_size = 256
batch_size = 1000
margin = 1.0
num_epochs = 1


def embed_text_batch(texts):
    """Turn a batch of raw strings into a (batch, seq_len, embedding_size) float32 tensor.

    Each text is lower-cased (the corpus cell writes AllTexts.txt lower-cased),
    tokenized with ``sp``, and every piece known to ``w2v_model`` is replaced by
    its vector. Sequences are left-padded with zero vectors to the longest
    sequence of the batch, so the last RNN step always sees a real token. A
    batch in which no text has a known piece gets ``seq_len == 1`` of zeros.
    """
    sequences = []
    for text in texts:
        pieces = sp.encode_as_pieces(text.lower())
        sequences.append([w2v_model.wv[piece] for piece in pieces if piece in w2v_model.wv])

    seq_len = max(max((len(vectors) for vectors in sequences), default=0), 1)
    padded = np.zeros((len(sequences), seq_len, embedding_size), dtype=np.float32)
    for row, vectors in enumerate(sequences):
        if vectors:
            padded[row, seq_len - len(vectors):] = np.stack(vectors)
    return torch.from_numpy(padded)


# Train on the training triples. The loop previously iterated TestingDataloader for
# both phases, so the reported "test" loss was measured on the data just fitted.
TrainingDataset = QueryDocsDataset(
    train_triplets['query'], train_triplets['relevant'], train_triplets['irrelevant']
)
TrainingDataloader = DataLoader(TrainingDataset, batch_size=batch_size, shuffle=True)

# Instantiate the model
Towers = TwoTowerModel(embedding_size, hidden_size)

optimizer = torch.optim.Adam(Towers.parameters(), lr=0.001)

for epoch in range(num_epochs):
    Towers.train()
    train_loss = 0.0

    for batch in TrainingDataloader:
        optimizer.zero_grad()

        # The loader yields lists of raw strings; embed them before the forward pass.
        query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings = Towers(
            embed_text_batch(batch['query']),
            embed_text_batch(batch['relevant']),
            embed_text_batch(batch['irrelevant'])
        )

        # Compute the loss
        loss = triplet_loss_function(query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings, margin)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Testing phase
    Towers.eval()  # Set model to evaluation mode
    test_loss = 0.0
    with torch.no_grad():  # No need to track gradients for testing
        for batch in TestingDataloader:
            # Keys must match what QueryDocsDataset.__getitem__ returns
            # ('relevant' / 'irrelevant', not 'relevant_doc' / 'irrelevant_doc').
            query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings = Towers(
                embed_text_batch(batch['query']),
                embed_text_batch(batch['relevant']),
                embed_text_batch(batch['irrelevant'])
            )

            loss = triplet_loss_function(query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings, margin)
            test_loss += loss.item()

    # Average over the number of batches of each loader; guard against an empty loader.
    avg_train_loss = train_loss / max(len(TrainingDataloader), 1)
    avg_test_loss = test_loss / max(len(TestingDataloader), 1)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')