<a href="https://colab.research.google.com/github/matdjohnson-at-umass-dot-edu/cs646-final-project/blob/main/CS646_Final_Project_Preprocessing5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets
! pip install transformers
! pip install statsmodels

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from datasets import concatenate_datasets, Dataset, disable_caching, disable_progress_bars, load_dataset
from statsmodels.distributions.empirical_distribution import ECDF
from tqdm import tqdm
from google.colab import drive
import os
import torch
import torch.nn.functional as torch_func
import gc
import time
from threading import Lock
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer, AutoModel
import logging
import psutil
import numpy as np
from collections import Counter

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Mount Google Drive at /content/drive; every dataset path used below lives under it.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Root folder (on the mounted Drive) for every dataset this notebook reads or writes.
datasets_base_dir = "/content/drive/MyDrive/CS646-FinalProject/datasets"

# Example-id shards, listed in the order in which they are concatenated.
example_ids_dir_names = [
    "ms_marco_corpus_in_qrel_embs_avg_example_ids_0-75000",
    "ms_marco_corpus_in_qrel_embs_avg_example_ids_75000-150000",
    "ms_marco_corpus_in_qrel_embs_avg_example_ids_150000-225000",
    "ms_marco_corpus_in_qrel_embs_avg_example_ids_225000-300000",
    "ms_marco_corpus_in_qrel_embs_avg_example_ids_300000-375000",
    "ms_marco_corpus_in_qrel_embs_avg_example_ids_375000-450000",
    "ms_marco_corpus_in_qrel_embs_avg_example_ids_450000-475000",
    "ms_marco_corpus_in_qrel_embs_avg_example_ids_475000-500000",
]

combined_dataset = None
for dir_name in example_ids_dir_names:
    print(f"reading {dir_name}")
    shard = Dataset.from_parquet(f"{datasets_base_dir}/{dir_name}/{dir_name}.parquet")
    # The first shard seeds the result; each later shard is appended to what is there.
    combined_dataset = (
        shard
        if combined_dataset is None
        else concatenate_datasets([combined_dataset, shard])
    )
    # Drop the shard's cache files and our reference to it before reading the next one.
    shard.cleanup_cache_files()
    del shard
    gc.collect()

# Serve columns as torch tensors, on the GPU when one is available and on the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
combined_dataset = combined_dataset.with_format("torch", device=device)


reading ms_marco_corpus_in_qrel_embs_avg_example_ids_0-75000
reading ms_marco_corpus_in_qrel_embs_avg_example_ids_75000-150000
reading ms_marco_corpus_in_qrel_embs_avg_example_ids_150000-225000
reading ms_marco_corpus_in_qrel_embs_avg_example_ids_225000-300000
reading ms_marco_corpus_in_qrel_embs_avg_example_ids_300000-375000
reading ms_marco_corpus_in_qrel_embs_avg_example_ids_375000-450000
reading ms_marco_corpus_in_qrel_embs_avg_example_ids_450000-475000
reading ms_marco_corpus_in_qrel_embs_avg_example_ids_475000-500000


In [None]:
# Empirical CDF of the positive/negative cosine similarities, evaluated on an evenly
# spaced grid over [0, 1].
cosine_similarity = combined_dataset['cos_sim'].cpu()
ecdf = ECDF(cosine_similarity)
linspace_segments = 100
x_axis = torch.linspace(0, 1.0, linspace_segments)
cdf = ecdf(x_axis)

# Index of the last grid point whose CDF value is still below 0.50. Clamped at 0 so a
# CDF that already reaches 0.50 on the first grid point cannot yield a negative index.
idx_matching_50 = 0
for i, cdf_value in enumerate(cdf):
    if cdf_value >= 0.50:
        idx_matching_50 = max(i - 1, 0)
        break

# Read the threshold off the grid itself: linspace(0, 1, n) places points 1 / (n - 1)
# apart, so idx / n is not the x value at which the CDF was evaluated.
cos_sim_threshold = float(x_axis[idx_matching_50])

filtered_examples_dataset_file_name = "ms_marco_corpus_in_qrel_embs_avg_example_ids_filtered"
filtered_examples_dir = f"{datasets_base_dir}/{filtered_examples_dataset_file_name}"
filtered_examples_file_path_and_name = f"{datasets_base_dir}/{filtered_examples_dataset_file_name}/{filtered_examples_dataset_file_name}.parquet"
os.makedirs(filtered_examples_dir, exist_ok=True)

if os.path.isfile(filtered_examples_file_path_and_name):
    # Reuse the filtered dataset written by an earlier run.
    filtered_dataset = Dataset.from_parquet(filtered_examples_file_path_and_name)
else:
    # cosine similarity chosen to select roughly bottom 50 percent of cosine similarities
    # ref: https://docs.google.com/spreadsheets/d/1-g3FWBxhp5ZYYlWFru3Eik4Kmgwnhu4DtUdmpfYF4rE/edit?usp=sharing
    # this not only helps with hardness of the negative sample, but also incidentally reduces
    # the dataset size, which is larger than can be used within the time constraints
    filtered_dataset = combined_dataset.filter(lambda example: example['cos_sim'] < cos_sim_threshold)
    filtered_dataset.to_parquet(filtered_examples_file_path_and_name)

# The unfiltered dataset is not needed past this point.
del combined_dataset
gc.collect()
print(filtered_dataset)

Dataset({
    features: ['pos_example', 'neg_example', 'cos_sim'],
    num_rows: 211996
})


In [None]:
# Progress and memory statistics are printed once every `log_freq` loop iterations.
log_freq = 10_000

In [None]:
# Build an in-memory {corpus id -> embedding} lookup for the in-qrel corpus documents.
# Skipped when filtered_dataset already carries the 'pos_example_emb' column.
if 'pos_example_emb' not in filtered_dataset.column_names:
    corpus_in_qrel_embs_avg_file_name = "ms_marco_corpus_in_qrel_embs_avg"
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: reading dataset to memory")
    corpus_in_qrel_embs_avg_dataset = Dataset.from_parquet(
        f"{datasets_base_dir}/{corpus_in_qrel_embs_avg_file_name}/{corpus_in_qrel_embs_avg_file_name}-concatenated.parquet",
        columns=['_id', 'embedding'],
    )
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: read of dataset to memory complete")

    print(f"{timestamp}: converting dataset to numpy")
    corpus_in_qrel_embs_avg_dataset = corpus_in_qrel_embs_avg_dataset.with_format("numpy")
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: conversion of dataset to numpy complete")

    print(f"{timestamp}: converting dataset to dict")
    corpus_in_qrel_embs_avg_dataset = corpus_in_qrel_embs_avg_dataset.to_dict(batch_size=20000)
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: conversion of dataset to dict complete")

    # Walk the '_id' and 'embedding' columns in lockstep, keying each embedding by its integer id.
    in_qrel_total = len(corpus_in_qrel_embs_avg_dataset['_id'])
    corpus_in_qrel_embs_avg_lookup_dict = {}
    for position, (in_qrel_id, in_qrel_emb) in enumerate(
        zip(corpus_in_qrel_embs_avg_dataset['_id'], corpus_in_qrel_embs_avg_dataset['embedding'])
    ):
        should_log = position % log_freq == 0
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            print(f"{timestamp}: id to emb dict creation - starting {position + 1} of {in_qrel_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
        corpus_in_qrel_embs_avg_lookup_dict[int(in_qrel_id)] = in_qrel_emb
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            print(f"{timestamp}: id to emb dict creation - completed {position + 1} of {in_qrel_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")

    # Sanity check: number of ids in the lookup followed by a truncated listing of them.
    print(f"{len(corpus_in_qrel_embs_avg_lookup_dict)} {corpus_in_qrel_embs_avg_lookup_dict.keys()}"[0:1500])

    # The column dict is no longer needed once the lookup exists.
    del corpus_in_qrel_embs_avg_dataset
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()


2024-12-05T01:14:40: reading dataset to memory
2024-12-05T01:14:40: read of dataset to memory complete
2024-12-05T01:14:40: converting dataset to numpy
2024-12-05T01:14:40: conversion of dataset to numpy complete
2024-12-05T01:14:40: converting dataset to dict
2024-12-05T01:18:22: conversion of dataset to dict complete
2024-12-05T01:18:22: id to emb dict creation - starting 1 of 500000
2024-12-05T01:18:22: memory statistics: pmem(rss=26743607296, vms=37542342656, shared=4463865856, text=2842624, lib=0, data=24004939776, dirty=0)
2024-12-05T01:18:23: id to emb dict creation - completed 1 of 500000
2024-12-05T01:18:23: memory statistics: pmem(rss=26743607296, vms=37542342656, shared=4463865856, text=2842624, lib=0, data=24004939776, dirty=0)
2024-12-05T01:18:23: id to emb dict creation - starting 10001 of 500000
2024-12-05T01:18:23: memory statistics: pmem(rss=24688082944, vms=35486769152, shared=2408292352, text=2842624, lib=0, data=24004939776, dirty=0)
2024-12-05T01:18:23: id to emb d

In [None]:
# Attach the embedding of every positive example to filtered_dataset and persist the result.
# Skipped when the 'pos_example_emb' column is already present.
if 'pos_example_emb' not in filtered_dataset.column_names:
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: converting dataset to numpy and to dictionary")
    print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}\n")
    # Work on a plain dict of columns so the new column can be grown with list.append.
    filtered_dataset = filtered_dataset.with_format("numpy").to_dict(batch_size=20000)
    filtered_dataset['pos_example_emb'] = []
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: conversion of dataset to numpy and to dictionary complete")
    print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}\n")

    pos_example_total = len(filtered_dataset['pos_example'])
    for position, pos_example_id in enumerate(filtered_dataset['pos_example']):
        should_log = position % log_freq == 0
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            print(f"{timestamp}: embs assignment - starting {position + 1} of {pos_example_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
        # Raises KeyError if a positive id is missing from the in-qrel lookup.
        filtered_dataset['pos_example_emb'].append(
            corpus_in_qrel_embs_avg_lookup_dict[int(pos_example_id)]
        )
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            print(f"{timestamp}: embs assignment - completed {position + 1} of {pos_example_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")

    # Back to a Dataset, then overwrite the filtered parquet file with the extra column.
    filtered_dataset = Dataset.from_dict(filtered_dataset)
    filtered_dataset.to_parquet(filtered_examples_file_path_and_name)

    del corpus_in_qrel_embs_avg_lookup_dict
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()


2024-12-05T01:27:28: converting dataset to numpy and to dictionary
2024-12-05T01:27:28: memory statistics: pmem(rss=22623850496, vms=33421758464, shared=352718848, text=2842624, lib=0, data=23995551744, dirty=0)

2024-12-05T01:27:29: conversion of dataset to numpy and to dictionary complete
2024-12-05T01:27:29: memory statistics: pmem(rss=22630604800, vms=33429098496, shared=352718848, text=2842624, lib=0, data=24002891776, dirty=0)

2024-12-05T01:27:29: embs assignment - starting 1 of 211996
2024-12-05T01:27:29: memory statistics: pmem(rss=22630604800, vms=33429098496, shared=352718848, text=2842624, lib=0, data=24002891776, dirty=0)
2024-12-05T01:27:29: embs assignment - completed 1 of 211996
2024-12-05T01:27:29: memory statistics: pmem(rss=22630604800, vms=33429098496, shared=352718848, text=2842624, lib=0, data=24002891776, dirty=0)
2024-12-05T01:27:29: embs assignment - starting 10001 of 211996
2024-12-05T01:27:29: memory statistics: pmem(rss=22630604800, vms=33429098496, shared=3

Creating parquet from Arrow format:   0%|          | 0/212 [00:00<?, ?ba/s]

In [None]:
# Build an in-memory {corpus id -> embedding} lookup for the corpus documents that are
# NOT in the qrels (the source of negative examples).
# Skipped when filtered_dataset already carries the 'neg_example_emb' column.
if 'neg_example_emb' not in filtered_dataset.column_names:
    corpus_not_in_qrel_embs_avg_file_name = "ms_marco_corpus_not_in_qrel_embs_avg"
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: reading dataset to memory")
    corpus_not_in_qrel_embs_avg_dataset = Dataset.from_parquet(
        f"{datasets_base_dir}/{corpus_not_in_qrel_embs_avg_file_name}/{corpus_not_in_qrel_embs_avg_file_name}-concatenated.parquet",
        columns=['_id', 'embedding']
    )
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: read of dataset to memory complete")
    print(f"{timestamp}: converting dataset to numpy")
    corpus_not_in_qrel_embs_avg_dataset = corpus_not_in_qrel_embs_avg_dataset.with_format("numpy")
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: conversion of dataset to numpy complete")
    print(f"{timestamp}: converting dataset to dict")
    corpus_not_in_qrel_embs_avg_dataset = corpus_not_in_qrel_embs_avg_dataset.to_dict(batch_size=20000)
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: conversion of dataset to dict complete")

    # Walk the '_id' and 'embedding' columns in lockstep, keying each embedding by its integer id.
    not_in_qrel_total = len(corpus_not_in_qrel_embs_avg_dataset['_id'])
    corpus_not_in_qrel_embs_avg_lookup_dict = {}
    for i, (not_in_qrel_id, not_in_qrel_emb) in enumerate(
        zip(corpus_not_in_qrel_embs_avg_dataset['_id'], corpus_not_in_qrel_embs_avg_dataset['embedding'])
    ):
        should_log = i % log_freq == 0
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
            print(f"{timestamp}: id to emb dict creation - starting {i+1} of {not_in_qrel_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
        corpus_not_in_qrel_embs_avg_lookup_dict[int(not_in_qrel_id)] = not_in_qrel_emb
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
            print(f"{timestamp}: id to emb dict creation - completed {i+1} of {not_in_qrel_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")

    # Sanity check on the lookup that was just built (number of ids, then a truncated
    # listing), matching the check done for the in-qrel lookup. Previously this printed
    # the keys of the raw column dict, i.e. always "2 dict_keys(['_id', 'embedding'])".
    print(f"{len(corpus_not_in_qrel_embs_avg_lookup_dict)} {corpus_not_in_qrel_embs_avg_lookup_dict.keys()}"[0:1500])

    # The column dict is no longer needed once the lookup exists.
    del corpus_not_in_qrel_embs_avg_dataset
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()


2024-12-05T01:34:24: reading dataset to memory


Generating train split: 0 examples [00:00, ? examples/s]

2024-12-05T01:36:46: read of dataset to memory complete
2024-12-05T01:36:46: converting dataset to numpy
2024-12-05T01:36:46: conversion of dataset to numpy complete
2024-12-05T01:36:46: converting dataset to dict
2024-12-05T01:44:03: conversion of dataset to dict complete
2024-12-05T01:44:03: id to emb dict creation - starting 1 of 1000000
2024-12-05T01:44:03: memory statistics: pmem(rss=48720162816, vms=67143135232, shared=4465696768, text=2842624, lib=0, data=53032734720, dirty=0)
2024-12-05T01:44:03: id to emb dict creation - completed 1 of 1000000
2024-12-05T01:44:03: memory statistics: pmem(rss=48720162816, vms=67143135232, shared=4465696768, text=2842624, lib=0, data=53032734720, dirty=0)
2024-12-05T01:44:03: id to emb dict creation - starting 10001 of 1000000
2024-12-05T01:44:03: memory statistics: pmem(rss=48721244160, vms=67144724480, shared=4465696768, text=2842624, lib=0, data=53034323968, dirty=0)
2024-12-05T01:44:03: id to emb dict creation - completed 10001 of 1000000
20

In [None]:
# Attach the embedding of every negative example to filtered_dataset and persist the result.
# Skipped when the 'neg_example_emb' column is already present.
if 'neg_example_emb' not in filtered_dataset.column_names:
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: converting dataset to numpy and to dictionary")
    print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}\n")
    # Work on a plain dict of columns so the new column can be grown with list.append.
    filtered_dataset = filtered_dataset.with_format("numpy").to_dict(batch_size=20000)
    filtered_dataset['neg_example_emb'] = []
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: conversion of dataset to numpy and to dictionary complete")
    print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}\n")

    neg_example_total = len(filtered_dataset['neg_example'])
    for position, neg_example_id in enumerate(filtered_dataset['neg_example']):
        should_log = position % log_freq == 0
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            print(f"{timestamp}: embs assignment - starting {position + 1} of {neg_example_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
        # Raises KeyError if a negative id is missing from the not-in-qrel lookup.
        filtered_dataset['neg_example_emb'].append(
            corpus_not_in_qrel_embs_avg_lookup_dict[int(neg_example_id)]
        )
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            print(f"{timestamp}: embs assignment - completed {position + 1} of {neg_example_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")

    # Back to a Dataset, then overwrite the filtered parquet file with the extra column.
    filtered_dataset = Dataset.from_dict(filtered_dataset)
    filtered_dataset.to_parquet(filtered_examples_file_path_and_name)

    del corpus_not_in_qrel_embs_avg_lookup_dict
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()


2024-12-05T02:03:09: converting dataset to numpy and to dictionary
2024-12-05T02:03:09: memory statistics: pmem(rss=44656861184, vms=63079981056, shared=355434496, text=2842624, lib=0, data=53079859200, dirty=0)

2024-12-05T02:04:43: conversion of dataset to numpy and to dictionary complete
2024-12-05T02:04:43: memory statistics: pmem(rss=53490089984, vms=71966023680, shared=355434496, text=2842624, lib=0, data=61965910016, dirty=0)

2024-12-05T02:04:43: embs assignment - starting 1 of 211996
2024-12-05T02:04:43: memory statistics: pmem(rss=53490089984, vms=71966023680, shared=355434496, text=2842624, lib=0, data=61965910016, dirty=0)
2024-12-05T02:04:43: embs assignment - completed 1 of 211996
2024-12-05T02:04:43: memory statistics: pmem(rss=53490089984, vms=71966023680, shared=355434496, text=2842624, lib=0, data=61965910016, dirty=0)
2024-12-05T02:04:43: embs assignment - starting 10001 of 211996
2024-12-05T02:04:43: memory statistics: pmem(rss=53490089984, vms=71966023680, shared=3

Creating parquet from Arrow format:   0%|          | 0/212 [00:00<?, ?ba/s]

In [None]:
# Spot-check the first row: the ids and similarity, then for each embedding column its
# length followed by a view of the values truncated to 1500 characters.
print(filtered_dataset)
filtered_dataset_element = filtered_dataset.select([0])
for scalar_column in ('pos_example', 'neg_example', 'cos_sim'):
    print(f"{filtered_dataset_element[scalar_column]}")
for emb_column in ('pos_example_emb', 'neg_example_emb'):
    emb_values = filtered_dataset_element[emb_column]
    print(f"{len(emb_values[0])} {emb_values}"[0:1500] + " ... ")

Dataset({
    features: ['pos_example', 'neg_example', 'cos_sim', 'pos_example_emb', 'neg_example_emb'],
    num_rows: 211996
})
[657586]
[32293]
[0.11175079643726349]
1024 [[-0.5600718259811401, 0.7737520337104797, -1.0267841815948486, 0.16590581834316254, 1.4907433986663818, 0.5756590366363525, -0.45866355299949646, 1.3119560480117798, -0.5353403091430664, -0.5814063549041748, -0.20175421237945557, -0.5158028602600098, 0.6689209342002869, 1.269309639930725, 0.5356517434120178, 1.1946147680282593, -0.2437087446451187, 0.5267056226730347, -0.628906786441803, -0.09965250641107559, -1.3433955907821655, 0.4070412516593933, 0.5272387862205505, 0.06389933079481125, -0.2281103879213333, -0.7136669158935547, 0.256807416677475, -0.6540659070014954, -0.4976305365562439, 0.6791308522224426, -0.5389124751091003, -0.9851455688476562, -0.27874353528022766, 0.04295652359724045, -1.3697741031646729, 0.52895587682724, 0.49161118268966675, -1.3122892379760742, -1.3112765550613403, 1.6294536590576172, -

In [None]:
# Prepare for the query-embedding join: turn filtered_dataset into a plain dict of columns
# and index its rows by positive example id.
# Skipped when the 'query_emb' column is already present.
if 'query_emb' not in filtered_dataset.column_names:
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()
    queries_in_qrel_embs_avg_file_name = "ms_marco_queries_in_qrel_embs_avg"

    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: converting filtered dataset to numpy")
    filtered_dataset = filtered_dataset.with_format("numpy")
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: conversion of filtered dataset to numpy complete")

    print(f"{timestamp}: converting filtered dataset to dict")
    filtered_dataset = filtered_dataset.to_dict(batch_size=20000)
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
    print(f"{timestamp}: conversion of filtered dataset to dict complete")

    # Map each positive example id to its row index; when an id repeats, the last row wins.
    pos_example_total = len(filtered_dataset['pos_example'])
    pos_id_to_filtered_idx_map = {}
    for row_idx, pos_example_id in enumerate(filtered_dataset['pos_example']):
        should_log = row_idx % log_freq == 0
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            print(f"{timestamp}: pos_example id to dataset idx - starting {row_idx + 1} of {pos_example_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
        pos_id_to_filtered_idx_map[int(pos_example_id)] = row_idx
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
            print(f"{timestamp}: pos_example id to dataset idx - completed {row_idx + 1} of {pos_example_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")

    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()


2024-12-05T02:32:34: reading dataset to memory
2024-12-05T02:32:34: read of dataset to memory complete
2024-12-05T02:32:34: converting dataset to numpy
2024-12-05T02:32:34: conversion of dataset to numpy complete
2024-12-05T02:32:34: converting dataset to dict
2024-12-05T02:36:22: conversion of dataset to dict complete
2024-12-05T02:36:22: converting filtered dataset to numpy
2024-12-05T02:36:22: conversion of filtered dataset to numpy complete
2024-12-05T02:36:22: converting filtered dataset to dict
2024-12-05T02:39:27: conversion of filtered dataset to dict complete
2024-12-05T02:39:27: pos_example id to dataset idx - starting 1 of 211996
2024-12-05T02:39:27: memory statistics: pmem(rss=58121572352, vms=78227472384, shared=4542533632, text=2842624, lib=0, data=64099880960, dirty=0)
2024-12-05T02:39:27: pos_example id to dataset idx - completed 1 of 211996
2024-12-05T02:39:27: memory statistics: pmem(rss=56025481216, vms=76131352576, shared=2446413824, text=2842624, lib=0, data=640998

In [None]:
from collections import Counter

# Join a query id and its embedding onto every row of filtered_dataset, using the
# BeIR/msmarco-qrels relevance judgements to go from positive document id to query id.

# filtered_dataset is a plain dict of columns when the preceding cell converted it, but is
# still a Dataset (which has no .keys()) when that cell was skipped, so read the column
# names in whichever way applies.
if isinstance(filtered_dataset, dict):
    filtered_dataset_columns = list(filtered_dataset.keys())
else:
    filtered_dataset_columns = filtered_dataset.column_names

if 'query_emb' not in filtered_dataset_columns:
    if not isinstance(filtered_dataset, dict):
        # The join below appends to plain Python lists, so a dict of columns is required.
        filtered_dataset = filtered_dataset.with_format("numpy").to_dict(batch_size=20000)
    # Assigned here as well so this cell does not depend on a name that is otherwise only
    # set inside the preceding cell's guard.
    queries_in_qrel_embs_avg_file_name = "ms_marco_queries_in_qrel_embs_avg"

    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()
    qrels = load_dataset('BeIR/msmarco-qrels')
    corpus_id_to_query_ids_map = {}
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: beginning map of corpus ids to query ids")
    print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
    # Collect, per corpus id, every query id that references it across all three splits.
    for split_name in ('train', 'validation', 'test'):
        for entry in qrels[split_name]:
            corpus_id_to_query_ids_map.setdefault(int(entry['corpus-id']), []).append(int(entry['query-id']))

    # Number of query ids per corpus id, then a histogram of those counts. (Counting the
    # dict's keys, as was done before, always yields 1 because dict keys are unique.)
    key_to_count = {corpus_id: len(query_ids) for corpus_id, query_ids in corpus_id_to_query_ids_map.items()}
    count_to_countct = Counter(key_to_count.values())
    print("counts of query id count per corpus id:")
    print(count_to_countct)

    # Only the first query id seen for each corpus id is kept from here on.
    corpus_id_to_query_ids_map = {
        corpus_id: query_ids[0] for corpus_id, query_ids in corpus_id_to_query_ids_map.items()
    }
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: completed map of corpus ids to query ids")
    print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: reading dataset to memory")
    queries_in_qrel_embs_avg_dataset = Dataset.from_parquet(
        f"{datasets_base_dir}/{queries_in_qrel_embs_avg_file_name}/{queries_in_qrel_embs_avg_file_name}-concatenated.parquet",
        columns=['_id', 'embedding']
    )
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: read of dataset to memory complete")
    print(f"{timestamp}: converting dataset to numpy")
    queries_in_qrel_embs_avg_dataset = queries_in_qrel_embs_avg_dataset.with_format("numpy")
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: conversion of dataset to numpy complete")
    print(f"{timestamp}: converting dataset to dict")
    queries_in_qrel_embs_avg_dataset = queries_in_qrel_embs_avg_dataset.to_dict(batch_size=20000)
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
    print(f"{timestamp}: conversion of dataset to dict complete")

    # Index the query rows by query id. Progress is reported against the number of
    # queries, which is the collection this loop actually iterates over.
    query_total = len(queries_in_qrel_embs_avg_dataset['_id'])
    query_id_to_query_idx_map = {}
    for i, query_dataset_id in enumerate(queries_in_qrel_embs_avg_dataset['_id']):
        should_log = i % log_freq == 0
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
            print(f"{timestamp}: query id to query dataset idx - starting {i+1} of {query_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
        query_id_to_query_idx_map[int(query_dataset_id)] = i
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
            print(f"{timestamp}: query id to query dataset idx - completed {i+1} of {query_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")

    # For every row: positive document id -> query id -> query embedding.
    filtered_total = len(filtered_dataset['pos_example'])
    filtered_dataset['query_ids'] = list()
    filtered_dataset['query_embs'] = list()
    for i, pos_example_id in enumerate(filtered_dataset['pos_example']):
        should_log = i % log_freq == 0
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
            print(f"{timestamp}: filtered dataset to query id and emb lookup - starting {i+1} of {filtered_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")
        # int() keeps the key type identical to the int keys the map was built with.
        query_id = corpus_id_to_query_ids_map[int(pos_example_id)]
        filtered_dataset['query_ids'].append(query_id)
        query_dataset_idx = query_id_to_query_idx_map[query_id]
        filtered_dataset['query_embs'].append(queries_in_qrel_embs_avg_dataset['embedding'][query_dataset_idx])
        if should_log:
            timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
            print(f"{timestamp}: filtered dataset to query id and emb lookup - completed {i+1} of {filtered_total}")
            print(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}")

    # Back to a Dataset, then overwrite the filtered parquet file with the two new columns.
    filtered_dataset = Dataset.from_dict(filtered_dataset)
    filtered_dataset.to_parquet(filtered_examples_file_path_and_name)
    del queries_in_qrel_embs_avg_dataset, query_id_to_query_idx_map

2024-12-05T03:25:52: beginning map of corpus ids to query ids
2024-12-05T03:25:52: memory statistics: pmem(rss=37192622080, vms=57285713920, shared=378245120, text=2842624, lib=0, data=47326191616, dirty=0)
counts of query id count per corpus id:
Counter({1: 532024})
2024-12-05T03:26:14: completed map of corpus ids to query ids
2024-12-05T03:26:14: memory statistics: pmem(rss=37190651904, vms=57283923968, shared=364924928, text=2842624, lib=0, data=47337725952, dirty=0)
2024-12-05T03:26:21: reading dataset to memory
2024-12-05T03:26:21: read of dataset to memory complete
2024-12-05T03:26:21: converting dataset to numpy
2024-12-05T03:26:21: conversion of dataset to numpy complete
2024-12-05T03:26:21: converting dataset to dict
2024-12-05T03:29:59: conversion of dataset to dict complete
2024-12-05T03:29:59: query id to query dataset idx - starting 1 of 211996
2024-12-05T03:29:59: memory statistics: pmem(rss=56084140032, vms=76219125760, shared=2461044736, text=2842624, lib=0, data=641768

Creating parquet from Arrow format:   0%|          | 0/212 [00:00<?, ?ba/s]

In [None]:
# Give the columns their final names and overwrite the filtered parquet file.
# NOTE(review): once the renamed file is on disk, the 'pos_example_emb' / 'neg_example_emb'
# names that the earlier guards test for are no longer present in it - confirm that
# re-running from the cached file is not expected to work after this cell.
print(filtered_dataset)
for old_column_name, new_column_name in (
    ("query_embs", "query_emb"),
    ("query_ids", "query_id"),
    ("pos_example", "pos_doc_id"),
    ("pos_example_emb", "pos_doc_emb"),
    ("neg_example", "neg_doc_id"),
    ("neg_example_emb", "neg_doc_emb"),
    ("cos_sim", "pos_doc_neg_doc_cos_sim"),
):
    filtered_dataset = filtered_dataset.rename_column(old_column_name, new_column_name)
print(filtered_dataset)
filtered_dataset.to_parquet(filtered_examples_file_path_and_name)

# Release the in-memory copy; later cells read the data back from the parquet file.
del filtered_dataset
gc.collect()

Dataset({
    features: ['pos_example', 'neg_example', 'cos_sim', 'pos_example_emb', 'neg_example_emb', 'query_ids', 'query_embs'],
    num_rows: 211996
})
Dataset({
    features: ['pos_doc_id', 'neg_doc_id', 'pos_doc_neg_doc_cos_sim', 'pos_doc_emb', 'neg_doc_emb', 'query_id', 'query_emb'],
    num_rows: 211996
})


Creating parquet from Arrow format:   0%|          | 0/212 [00:00<?, ?ba/s]

5219341520

In [None]:
# Read the renamed dataset back from disk and spot-check its first row: the ids and
# similarity, then for each embedding column its length followed by a view of the values
# truncated to 1500 characters.
dataset_final = Dataset.from_parquet(filtered_examples_file_path_and_name)
print(dataset_final)
dataset_final_element = dataset_final.select([0])
for scalar_column in ('pos_doc_id', 'neg_doc_id', 'query_id', 'pos_doc_neg_doc_cos_sim'):
    print(f"{dataset_final_element[scalar_column]}")
for emb_column in ('pos_doc_emb', 'neg_doc_emb', 'query_emb'):
    emb_values = dataset_final_element[emb_column]
    print(f"{len(emb_values[0])} {emb_values}"[0:1500] + " ... ")

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['pos_doc_id', 'neg_doc_id', 'pos_doc_neg_doc_cos_sim', 'pos_doc_emb', 'neg_doc_emb', 'query_id', 'query_emb'],
    num_rows: 211996
})
[657586]
[32293]
[901246]
[0.11175079643726349]
1024 [[-0.5600718259811401, 0.7737520337104797, -1.0267841815948486, 0.16590581834316254, 1.4907433986663818, 0.5756590366363525, -0.45866355299949646, 1.3119560480117798, -0.5353403091430664, -0.5814063549041748, -0.20175421237945557, -0.5158028602600098, 0.6689209342002869, 1.269309639930725, 0.5356517434120178, 1.1946147680282593, -0.2437087446451187, 0.5267056226730347, -0.628906786441803, -0.09965250641107559, -1.3433955907821655, 0.4070412516593933, 0.5272387862205505, 0.06389933079481125, -0.2281103879213333, -0.7136669158935547, 0.256807416677475, -0.6540659070014954, -0.4976305365562439, 0.6791308522224426, -0.5389124751091003, -0.9851455688476562, -0.27874353528022766, 0.04295652359724045, -1.3697741031646729, 0.52895587682724, 0.49161118268966675, -1.3122892379760742, -1

In [None]:
# Split the final dataset 95/5 into train and test and write each part to its own parquet file.
# The path names are re-declared so this cell does not need the earlier cells' variables.
datasets_base_dir = "/content/drive/MyDrive/CS646-FinalProject/datasets"
filtered_examples_dataset_file_name = "ms_marco_corpus_in_qrel_embs_avg_example_ids_filtered"
filtered_examples_file_path_and_name = f"{datasets_base_dir}/{filtered_examples_dataset_file_name}/{filtered_examples_dataset_file_name}.parquet"
final_dataset_dir = f"{datasets_base_dir}/ms_marco_final_dataset_avg"
final_dataset_train_file_path_and_name = f"{final_dataset_dir}/ms_marco_final_dataset_avg_train.parquet"
final_dataset_test_file_path_and_name = f"{final_dataset_dir}/ms_marco_final_dataset_avg_test.parquet"

# Fixed seed so that re-running the cell reproduces the same train/test membership.
train_test_split_seed = 42

# Nothing earlier in the notebook creates the output directory, so make sure it exists.
os.makedirs(final_dataset_dir, exist_ok=True)

dataset_final = Dataset.from_parquet(filtered_examples_file_path_and_name)
dataset_final = dataset_final.train_test_split(test_size=0.05, seed=train_test_split_seed)
dataset_final['train'].to_parquet(final_dataset_train_file_path_and_name)
dataset_final['test'].to_parquet(final_dataset_test_file_path_and_name)

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

260972000