Imports

In [19]:
import pandas as pd
from datasets import load_dataset
import random
from collections import defaultdict
from tqdm import tqdm

Download Data

In [20]:
def unravel_passages(dataset):
    """Flatten per-query passage lists into one row per (query, passage, url).

    Args:
        dataset: Column-addressable container (for example a pandas
            DataFrame or a dict of lists) with a ``'query'`` column and a
            ``'passages'`` column. Each ``'passages'`` entry is a mapping
            holding parallel ``'passage_text'`` and ``'url'`` sequences.

    Returns:
        pandas.DataFrame with the columns ``query``, ``passage`` and ``url``,
        one row per passage. The columns are present even when the input
        contains no rows.
    """
    columns = ['query', 'passage', 'url']
    unraveled_rows = []
    # Walk both columns in lockstep instead of looking passages up by the
    # enumerate() position: a position is not a valid label once the frame's
    # index is no longer 0..n-1 (e.g. after filtering or sampling).
    for query, passages in zip(dataset['query'], dataset['passages']):
        for passage_text, url in zip(passages['passage_text'], passages['url']):
            unraveled_rows.append((query, passage_text, url))
    # Passing the column names explicitly keeps the schema stable for empty input.
    return pd.DataFrame(unraveled_rows, columns=columns)


def sample_irrelevant_optimized(relevant_list, all_passages_ids, num_samples):
    """Randomly pick passage IDs that are not in ``relevant_list``.

    Args:
        relevant_list: Iterable of hashable IDs to exclude from the draw.
        all_passages_ids: Set of every candidate ID.
        num_samples: Maximum number of IDs to draw.

    Returns:
        A list of distinct IDs taken from ``all_passages_ids`` minus
        ``relevant_list``. It holds ``num_samples`` items, or fewer when not
        enough candidates remain.
    """
    excluded = frozenset(relevant_list)
    candidate_pool = list(all_passages_ids - excluded)
    # Cap the request so random.sample never asks for more than is available.
    draw_count = min(len(candidate_pool), num_samples)
    return random.sample(candidate_pool, draw_count)

def create_triplets_dataframe(unraveled_data):
    """Group passages per query and attach randomly sampled negatives.

    Args:
        unraveled_data: DataFrame with ``query_id`` and ``passage_id`` columns.

    Returns:
        DataFrame with one row per ``query_id`` and two list-valued columns:
        ``relevant`` (the passage IDs grouped under that query) and
        ``irrelevant`` (the result of ``sample_irrelevant_optimized`` called
        with the relevant list, the set of all passage IDs, and the length
        of the relevant list).
    """
    relevant_passages = (
        unraveled_data.groupby('query_id')['passage_id']
        .apply(list)
        .reset_index(name='relevant')
    )
    # Built once and shared by every per-query sampling call below.
    all_passages_ids = set(unraveled_data['passage_id'])
    relevant_passages['irrelevant'] = relevant_passages['relevant'].apply(
        lambda relevant: sample_irrelevant_optimized(
            relevant, all_passages_ids, len(relevant)
        )
    )
    return relevant_passages

def prepare_mappings_optim(unraveled_data):
    """Build ID lookup tables for the distinct queries and passages.

    Args:
        unraveled_data: DataFrame with ``query`` and ``passage`` columns.

    Returns:
        Tuple ``(unique_queries, unique_passages)``. Each is a DataFrame
        holding the distinct values in order of first appearance plus an
        ``*_id`` column equal to the row's positional index.
    """
    def _id_table(value_column, id_column):
        # The freshly built frame has a 0..n-1 index, which doubles as the ID.
        table = pd.DataFrame({value_column: unraveled_data[value_column].unique()})
        return table.assign(**{id_column: table.index})

    query_table = _id_table('query', 'query_id')
    passage_table = _id_table('passage', 'passage_id')
    return query_table, passage_table

def map_ids_optim(unraveled_data, unique_queries, unique_passages):
    """Attach the ID columns to every row via two left merges.

    Args:
        unraveled_data: DataFrame with ``query`` and ``passage`` columns.
        unique_queries: Lookup table joined on ``query``.
        unique_passages: Lookup table joined on ``passage``.

    Returns:
        A new DataFrame containing the rows of ``unraveled_data`` with the
        extra columns of both lookup tables appended.
    """
    # Left joins keep every input row even if a lookup value is missing.
    return (
        unraveled_data
        .merge(unique_queries, how='left', on='query')
        .merge(unique_passages, how='left', on='passage')
    )

def process_dataset_optim(dataset_split):
    """Run the whole pipeline for one split.

    The split is flattened with ``unravel_passages``, ID tables are built
    with ``prepare_mappings_optim``, the IDs are joined on with
    ``map_ids_optim``, and the result is passed to
    ``create_triplets_dataframe``.

    Args:
        dataset_split: Input accepted by ``unravel_passages``.

    Returns:
        Tuple ``(triplets_df, unique_queries, unique_passages)``.
    """
    flat_rows = unravel_passages(dataset_split)
    query_table, passage_table = prepare_mappings_optim(flat_rows)
    # The lookup tables stay DataFrames so the IDs can be attached by merging.
    rows_with_ids = map_ids_optim(flat_rows, query_table, passage_table)
    return create_triplets_dataframe(rows_with_ids), query_table, passage_table


def expand_triplets(triplets_df):
    """Expand list-valued triplet rows into one row per (query, pos, neg).

    Args:
        triplets_df: DataFrame with a ``query_id`` column and list-valued
            ``relevant`` and ``irrelevant`` columns.

    Returns:
        pandas.DataFrame with the columns ``query_id``,
        ``positive_passage_id`` and ``negative_passage_id``. Positives and
        negatives are paired by position; when the two lists differ in
        length the extra items of the longer list are dropped. The columns
        are present even when there are no triplets.
    """
    columns = ['query_id', 'positive_passage_id', 'negative_passage_id']
    expanded_triplets = []
    # Zipping the columns avoids iterrows(), which builds a Series per row.
    for query_id, positives, negatives in zip(
        triplets_df['query_id'], triplets_df['relevant'], triplets_df['irrelevant']
    ):
        expanded_triplets.extend(
            (query_id, positive_id, negative_id)
            for positive_id, negative_id in zip(positives, negatives)
        )
    # Passing the column names explicitly keeps the schema stable for empty input.
    return pd.DataFrame(expanded_triplets, columns=columns)


NameError: name 'profile' is not defined

In [None]:

# def expand_triplets_optimized(triplets_df):
#     # Create a list to hold DataFrames for each row's expanded triplets
#     dfs = []

#     # Iterate over the DataFrame without using iterrows()
#     for query_id, relevant, irrelevant in zip(triplets_df['query_id'], triplets_df['relevant'], triplets_df['irrelevant']):
#         # Generate a DataFrame from the zipped lists of positive and negative passage IDs for this query
#         df = pd.DataFrame({
#             'query_id': query_id,
#             'positive_passage_id': relevant,
#             'negative_passage_id': irrelevant
#         })
#         dfs.append(df)

#     # Concatenate all the individual DataFrames into one
#     expanded_triplets_df = pd.concat(dfs, ignore_index=True)
#     return expanded_triplets_df

# def expand_triplets_preallocated(triplets_df):
#     # Calculate the total number of triplets to preallocate DataFrame
#     total_triplets = sum(len(relevant) for relevant in triplets_df['relevant'])
    
#     # Preallocate DataFrame
#     expanded_df = pd.DataFrame(index=range(total_triplets), columns=['query_id', 'positive_passage_id', 'negative_passage_id'])
    
#     # Fill the DataFrame
#     idx = 0
#     for _, row in triplets_df.iterrows():
#         n = len(row['relevant'])
#         expanded_df.iloc[idx:idx+n, 0] = row['query_id']
#         expanded_df.iloc[idx:idx+n, 1] = row['relevant']
#         expanded_df.iloc[idx:idx+n, 2] = row['irrelevant']
#         idx += n
    
#     return expanded_df

# def expand_triplets_preallocated_2(triplets_df):
#     # Estimate total size needed for preallocation
#     total_size = sum(len(rel) for rel in triplets_df['relevant'])

#     # Preallocate DataFrame with appropriate data types
#     preallocated_df = pd.DataFrame({
#         'query_id': pd.Series(dtype='int'),
#         'positive_passage_id': pd.Series(dtype='int'),
#         'negative_passage_id': pd.Series(dtype='int'),
#     }, index=pd.RangeIndex(total_size))

#     # Example of bulk assignment (adapt as needed)
#     start_idx = 0
#     for _, row in triplets_df.iterrows():
#         end_idx = start_idx + len(row['relevant'])
#         preallocated_df.iloc[start_idx:end_idx] = pd.DataFrame({
#             'query_id': row['query_id'],
#             'positive_passage_id': row['relevant'],
#             'negative_passage_id': row['irrelevant'],
#         }).values  # Using .values for direct assignment to avoid index alignment issues
#         start_idx = end_idx

#     return preallocated_df


In [7]:
# Load the "v1.1" configuration of "ms_marco" via datasets.load_dataset.
# The cells below read its 'train', 'test' and 'validation' entries.
dataset = load_dataset("ms_marco", "v1.1")

In [8]:
# Train split: wrap it in a DataFrame, then build the triplets frame and the
# query / passage ID tables.
train_triplets, train_queries, train_passages = process_dataset_optim(pd.DataFrame(dataset['train']))

In [9]:
# Test split: same pipeline. IDs are assigned per call, so they are
# independent of the IDs produced for the other splits.
test_triplets, test_queries, test_passages = process_dataset_optim(pd.DataFrame(dataset['test']))

In [10]:
# Validation split: same pipeline, again with its own per-call ID tables.
val_triplets, val_queries, val_passages = process_dataset_optim(pd.DataFrame(dataset['validation']))