Imports

In [190]:
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset

import random

import numpy as np

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from collections import defaultdict

import numpy as np
import pandas as pd
# from tqdm.auto import tqdm
from multiprocessing import Pool, cpu_count


Download Data

In [191]:
dataset = load_dataset("ms_marco", "v1.1")

# train_data = pd.DataFrame(dataset['train'])
test_data = pd.DataFrame(dataset['test'])
# validation_data = pd.DataFrame(dataset['validation'])

In [192]:
def unravel_passages(dataframe):
    unraveled_rows = []

    for index, row in dataframe.iterrows():
        # query_id = row['query_id']
        query = row['query']
        # answers = row['answers']
        count = len(row['passages']['passage_text'])
        
        for passage, url in zip(row['passages']['passage_text'], row['passages']['url']):
            unraveled_rows.append({
                # 'query_id': query_id,
                'query': query,
                # 'answers': answers,
                'passage': passage,
                'url': url,
                'count': count,
            })

    return pd.DataFrame(unraveled_rows)

In [193]:
# unravel_train_data = unravel_passages(train_data)
unravel_test_data = unravel_passages(test_data)
# unravel_val_data = unravel_passages(validation_data)

In [194]:
unique_queries = pd.DataFrame(unravel_test_data['query'].unique(), columns=['query'])
unique_passages = pd.DataFrame(unravel_test_data['passage'].unique(), columns=['passage'])

In [195]:
unique_queries['query_id'] = range(len(unique_queries))
unique_passages['passage_id'] = range(len(unique_passages))

query_to_id = {row['query']: row['query_id'] for index, row in unique_queries.iterrows()}
passage_to_id = {row['passage']: row['passage_id'] for index, row in unique_passages.iterrows()}

unravel_test_data['query_id'] = unravel_test_data['query'].map(query_to_id)
unravel_test_data['passage_id'] = unravel_test_data['passage'].map(passage_to_id)

relevant_passages = unravel_test_data.groupby('query_id')['passage_id'].apply(list).reset_index(name='relevant')

In [196]:
print(relevant_passages.head())

   query_id                                  relevant
0         0                     [0, 1, 2, 3, 4, 5, 6]
1         1         [7, 8, 9, 10, 11, 12, 13, 14, 15]
2         2      [16, 17, 18, 19, 20, 21, 22, 23, 24]
3         3      [25, 26, 27, 28, 29, 30, 31, 32, 33]
4         4  [34, 35, 36, 37, 38, 39, 40, 41, 42, 43]


In [204]:
def sample_irrelevant(relevant_list, all_passages, num_samples):
    possible_irrelevant = list(set(all_passages) - set(relevant_list))
    return random.sample(possible_irrelevant, num_samples)


def sample_irrelevant_optimized(relevant_list, all_passages_ids, irrelevant_cache, num_samples):
    relevant_set = frozenset(relevant_list)  # Convert to frozenset for hashing
    if relevant_set not in irrelevant_cache:
        possible_irrelevant = list(all_passages_ids - relevant_set)
        irrelevant_cache[relevant_set] = possible_irrelevant
    else:
        possible_irrelevant = irrelevant_cache[relevant_set]
    
    return random.sample(possible_irrelevant, min(len(possible_irrelevant), num_samples))

In [None]:
all_passages_ids = set(unique_passages['passage_id'])

In [202]:

def worker(args):
    relevant_list, all_passages_ids, num_samples = args
    return sample_irrelevant_optimized(relevant_list, all_passages_ids, {}, num_samples)

def apply_in_parallel(df, all_passages_ids, num_processes=None):
    with Pool(processes=num_processes) as pool:
        # Prepare arguments for parallel processing
        args = [(row['relevant'], all_passages_ids, len(row['relevant'])) for index, row in df.iterrows()]
        
        # Execute the function in parallel
        results = pool.map(worker, args)
        
    # Update the DataFrame with results
    df['irrelevant'] = results
    return df


In [205]:
irrelevant_cache = {}  # Initialize cache outside the function for reuse
relevant_passages['irrelevant'] = relevant_passages['relevant'].apply(
    lambda x: sample_irrelevant_optimized(x, all_passages_ids, irrelevant_cache, len(x))
)

In [198]:
relevant_passages['irrelevant'] = relevant_passages['relevant'].apply(lambda x: sample_irrelevant(x, all_passages_ids, len(x)))

In [200]:
relevant_passages

Unnamed: 0,query_id,relevant,irrelevant
0,0,"[0, 1, 2, 3, 4, 5, 6]","[23177, 70775, 35696, 53719, 48244, 9643, 47114]"
1,1,"[7, 8, 9, 10, 11, 12, 13, 14, 15]","[74064, 16659, 7625, 24907, 69326, 15534, 2459..."
2,2,"[16, 17, 18, 19, 20, 21, 22, 23, 24]","[24129, 7719, 28653, 7868, 76809, 58615, 49393..."
3,3,"[25, 26, 27, 28, 29, 30, 31, 32, 33]","[40729, 14303, 57113, 54914, 19875, 30572, 581..."
4,4,"[34, 35, 36, 37, 38, 39, 40, 41, 42, 43]","[45232, 75588, 17959, 36800, 40898, 70599, 729..."
...,...,...,...
9645,9645,"[77856, 77857, 77858, 77859, 77860, 77861, 77862]","[55471, 30431, 42547, 52805, 9398, 72762, 49530]"
9646,9646,"[47682, 77863, 77864, 77865, 77866, 77867, 778...","[60553, 67130, 36098, 55344, 52781, 68109, 312..."
9647,9647,"[77871, 77872, 77873, 77874, 77875, 77876, 778...","[60575, 31463, 18748, 73243, 28724, 55078, 771..."
9648,9648,"[77880, 77881, 77882, 77883, 54246, 77884, 778...","[66949, 20480, 57526, 55277, 7823, 42425, 4208..."


In [199]:
corpus = api.load('text8')
model = Word2Vec(corpus, vector_size=300, window=5, min_count=1, workers=4)

KeyboardInterrupt: 

In [None]:
def text_to_padded_embeddings(text, word2vec_model, max_length):
    # Tokenize the text
    tokens = text.split()

    # Initialize an empty matrix for embeddings with shape (max_length, vector_size)
    embeddings_matrix = np.zeros((max_length, word2vec_model.vector_size))

    # Iterate over the tokens to retrieve embeddings, up to the max_length
    for i, token in enumerate(tokens[:max_length]):
        if token in word2vec_model.wv:
            embeddings_matrix[i] = word2vec_model.wv[token]

    return embeddings_matrix

def create_sequence_embeddings_dataframe(triplets_dataframe, word2vec_model, query_length=15, doc_length=50):
    embeddings_data = []

    for idx, row in triplets_dataframe.iterrows():
        # Generate embeddings for the query, relevant document, and irrelevant document
        query_embeddings = text_to_padded_embeddings(row['query'], word2vec_model, query_length)
        relevant_doc_embeddings = text_to_padded_embeddings(row['relevant_doc'], word2vec_model, doc_length)
        irrelevant_doc_embeddings = text_to_padded_embeddings(row['irrelevant_doc'], word2vec_model, doc_length)
        
        # Append the embeddings as matrices to the list
        embeddings_data.append({
            'query_embeddings': query_embeddings,
            'relevant_doc_embeddings': relevant_doc_embeddings,
            'irrelevant_doc_embeddings': irrelevant_doc_embeddings
        })

    return pd.DataFrame(embeddings_data)


In [None]:
embeddings_df = create_sequence_embeddings_dataframe(triplets_train, model)

NameError: name 'triplets_train' is not defined

In [None]:
class SequenceEmbeddingTripletsDataset(Dataset):
    def __init__(self, embeddings_dataframe):
        self.embeddings_df = embeddings_dataframe
    
    def __len__(self):
        return len(self.embeddings_df)
    
    def __getitem__(self, idx):
        row = self.embeddings_df.iloc[idx]
        query_embeddings = torch.tensor(row['query_embeddings'], dtype=torch.float)
        relevant_doc_embeddings = torch.tensor(row['relevant_doc_embeddings'], dtype=torch.float)
        irrelevant_doc_embeddings = torch.tensor(row['irrelevant_doc_embeddings'], dtype=torch.float)
        return query_embeddings, relevant_doc_embeddings, irrelevant_doc_embeddings

# Create the DataLoader
sequence_embedding_dataset = SequenceEmbeddingTripletsDataset(embeddings_df)
sequence_embedding_dataloader = DataLoader(sequence_embedding_dataset, batch_size=4, shuffle=True)


In [None]:
class TripletsDataset(Dataset):
    def __init__(self, triplets_dataframe):
        self.dataframe = triplets_dataframe
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        query = row['query']
        relevant_doc = row['relevant_doc']
        irrelevant_doc = row['irrelevant_doc']
        return query, relevant_doc, irrelevant_doc

# Example usage
train_dataset = TripletsDataset(triplets_train)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)


Generate Data

In [None]:
# # Define dataset parameters
# num_examples = 64
# seq_length = 10
# embedding_dim = 300

# # Generate synthetic data
# queries = torch.randn(num_examples, seq_length, embedding_dim)
# relevant_docs = torch.randn(num_examples, seq_length, embedding_dim)
# irrelevant_docs = torch.randn(num_examples, seq_length, embedding_dim)

# # Convert to a DataFrame
# df = pd.DataFrame({
#     'queries': list(queries),
#     'relevant_docs': list(relevant_docs),
#     'irrelevant_docs': list(irrelevant_docs)
# })

DataLoader

In [None]:
# class TripletsDataset(Dataset):
#     def __init__(self, dataframe):
#         self.dataframe = dataframe
    
#     def __len__(self):
#         return len(self.dataframe)
    
#     def __getitem__(self, idx):
#         return (self.dataframe.iloc[idx]['queries'],
#                 self.dataframe.iloc[idx]['relevant_docs'],
#                 self.dataframe.iloc[idx]['irrelevant_docs'])

# dataset = TripletsDataset(df)
# dataloader = DataLoader(dataset, batch_size=4, shuffle=True)


Model

In [None]:
class TwoTowerModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super(TwoTowerModel, self).__init__()
        self.query_encoder = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.doc_encoder = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        
    def forward(self, query, doc):
        _, query_hidden = self.query_encoder(query)
        _, doc_hidden = self.doc_encoder(doc)
        return query_hidden.squeeze(0), doc_hidden.squeeze(0)

model = TwoTowerModel(embedding_dim=embedding_dim, hidden_dim=128)


Loss Function and Optimizer

In [None]:
criterion = nn.TripletMarginLoss(margin=1.0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 5

for epoch in range(num_epochs):
    for queries, relevant_docs, irrelevant_docs in dataloader:
        optimizer.zero_grad()
        
        # Encode queries and documents
        query_enc, rel_doc_enc = model(queries, relevant_docs)
        _, irr_doc_enc = model(queries, irrelevant_docs)
        
        # Compute loss and backpropagate
        loss = criterion(query_enc, rel_doc_enc, irr_doc_enc)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 1.0147972106933594
Epoch 2, Loss: 0.0
Epoch 3, Loss: 0.0
Epoch 4, Loss: 0.0
Epoch 5, Loss: 0.0
