<a href="https://colab.research.google.com/github/matdjohnson-at-umass-dot-edu/cs646-final-project/blob/main/CS646_Final_Project_Preprocessing4_Instance0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets
! pip install transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import concatenate_datasets, Dataset, disable_caching, disable_progress_bars

from tqdm import tqdm
from google.colab import drive
import os
import torch
import torch.nn.functional as torch_func
import gc
import time
from threading import Lock
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer, AutoModel
import logging
import psutil

# Tell the Hugging Face tokenizers library not to use its own parallelism.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Switch off the datasets library's caching for this session.
disable_caching()

# Mount Google Drive; the dataset paths used below live under /content/drive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
class NegativeExampleSelector:
    """Pair each in-qrel document with a distinct, least-similar not-in-qrel document.

    For every row in the selected slice of the in-qrel corpus, the not-in-qrel
    row with the lowest cosine similarity is chosen as its negative example and
    then removed from the candidate pool, so no negative is used twice.  The
    resulting (pos_example, neg_example, cos_sim) triples are written to a
    parquet file, and progress is appended to ``output.log`` in the same
    directory.

    Args:
        in_qrel_dir_and_file: Parquet path (relative to ``datasets_root_dir``)
            holding the ``_id`` and ``embedding`` columns of the in-qrel corpus.
        not_in_qrel_dir_and_file: Same, for the not-in-qrel corpus.
        output_dir: Directory name (relative to ``datasets_root_dir``); also
            used as the stem of the output parquet file.
        start_index: First in-qrel row to process (inclusive).
        end_index: Last in-qrel row to process (exclusive).  The candidate pool
            is rows ``[start_index * 2, end_index * 2)`` of the not-in-qrel
            corpus.
    """

    def __init__(self, in_qrel_dir_and_file, not_in_qrel_dir_and_file, output_dir, start_index, end_index):
        self.datasets_root_dir = "/content/drive/MyDrive/CS646-FinalProject/datasets"
        self.in_qrel_dir_and_file = in_qrel_dir_and_file
        self.not_in_qrel_dir_and_file = not_in_qrel_dir_and_file
        self.output_dir = output_dir
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        """Select one negative example per positive and write the pairs to parquet.

        Raises:
            ValueError: if ``end_index`` is not greater than ``start_index``
                (nothing to process).  Raised before any data is loaded.
        """
        # Fail fast: an empty slice would otherwise only surface as an opaque
        # torch.stack error after both parquet files have been read.
        if self.end_index <= self.start_index:
            raise ValueError(
                f"end_index ({self.end_index}) must be greater than start_index ({self.start_index})"
            )
        torch.cuda.empty_cache()
        gc.collect()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        corpus_in_qrel_ids, corpus_in_qrel_embs = self._load_ids_and_embeddings(
            self.in_qrel_dir_and_file, self.start_index, self.end_index, device
        )
        # The candidate pool is twice the size of the positive slice.
        corpus_not_in_qrel_ids, corpus_not_in_qrel_embs = self._load_ids_and_embeddings(
            self.not_in_qrel_dir_and_file, self.start_index * 2, self.end_index * 2, device
        )
        gc.collect()
        torch.cuda.empty_cache()

        log_per_entry_ct = 1000
        output_path = f"{self.datasets_root_dir}/{self.output_dir}"
        os.makedirs(output_path, exist_ok=True)
        log_path = f"{output_path}/output.log"

        pos_examples = []
        neg_examples = []
        cos_sims = []
        for i in range(len(corpus_in_qrel_embs)):
            query_emb = torch.unsqueeze(corpus_in_qrel_embs[i], 0)
            if i % log_per_entry_ct == 0:
                timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
                # Re-opened in append mode for every progress record so the log
                # on disk stays current; `with` guarantees the handle is closed
                # even if a write fails.
                with open(log_path, "a") as log_file:
                    log_file.write(f"{timestamp}: finding negative sample for entry {i}\n")
                    log_file.write(
                        f"{timestamp}: _id:{corpus_in_qrel_ids[i]}, "
                        f"corpus_in_qrel_embs.shape:{corpus_in_qrel_embs.shape}, "
                        f"corpus_not_in_qrel_embs.shape:{corpus_not_in_qrel_embs.shape}, "
                        f"torch.unsqueeze(corpus_in_qrel_embs[i], 0).shape:{query_emb.shape}\n"
                    )
                    log_file.write(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}\n")
            # Similarity of this positive against every remaining candidate;
            # the minimum is the most dissimilar document.
            min_value, min_index = torch.min(
                torch_func.cosine_similarity(query_emb, corpus_not_in_qrel_embs),
                0
            )
            # Convert once to a plain int so it can index tensors on any device.
            neg_position = min_index.item()
            pos_examples.append(corpus_in_qrel_ids[i].item())
            neg_examples.append(corpus_not_in_qrel_ids[neg_position].item())
            cos_sims.append(min_value.item())
            # Drop the chosen candidate so it cannot be paired again.
            corpus_not_in_qrel_ids = self.remove_elements(corpus_not_in_qrel_ids, [neg_position])
            corpus_not_in_qrel_embs = self.remove_elements(corpus_not_in_qrel_embs, [neg_position])
            # NOTE(review): collecting on every iteration is costly; kept as in
            # the original to hold peak memory down — confirm it is still needed.
            torch.cuda.empty_cache()
            gc.collect()

        dataset_dict = {
            "pos_example": pos_examples,
            "neg_example": neg_examples,
            "cos_sim": cos_sims
        }
        example_ids_dataset = Dataset.from_dict(dataset_dict)
        example_ids_dataset.to_parquet(f"{output_path}/{self.output_dir}.parquet")
        example_ids_dataset.cleanup_cache_files()
        del example_ids_dataset, dataset_dict, pos_examples, neg_examples, cos_sims
        del corpus_not_in_qrel_ids, corpus_not_in_qrel_embs, corpus_in_qrel_ids, corpus_in_qrel_embs
        gc.collect()
        torch.cuda.empty_cache()

    def _load_ids_and_embeddings(self, dir_and_file, first_row, last_row, device):
        """Read rows ``[first_row, last_row)`` of a parquet file into an id tensor and a stacked embedding tensor."""
        dataset = Dataset.from_parquet(f"{self.datasets_root_dir}/{dir_and_file}", columns=["_id", "embedding"])
        dataset = dataset.with_format("torch", device=device).select(range(first_row, last_row))
        ids_list = []
        embs_list = []
        for entry in dataset:
            ids_list.append(int(entry['_id']))
            embs_list.append(entry['embedding'])
        ids = torch.tensor(ids_list)
        # assumes every embedding has the same shape (required by torch.stack) — TODO confirm
        embs = torch.stack(embs_list)
        dataset.cleanup_cache_files()
        return ids, embs

    def remove_elements(self, tensor, element_indicies):
        """Return a copy of ``tensor`` without the given positions along dim 0.

        Args:
            tensor: Tensor to filter; it is not modified.
            element_indicies: Positions to drop — a sequence of ints / 0-dim
                tensors, or an index tensor.

        Returns:
            A new tensor holding the rows of ``tensor`` that were not dropped.
        """
        # Build the mask on the tensor's own device so filtering a CUDA tensor
        # does not go through a CPU mask.
        selection_mask = torch.ones(tensor.shape[0], dtype=torch.bool, device=tensor.device)
        if torch.is_tensor(element_indicies):
            drop_positions = element_indicies.to(tensor.device)
        else:
            # Normalise to plain ints so indices living on another device
            # (e.g. a CUDA scalar indexing a CPU tensor) are handled uniformly.
            drop_positions = torch.tensor(
                [int(index) for index in element_indicies],
                dtype=torch.long,
                device=tensor.device
            )
        selection_mask[drop_positions] = False
        return tensor[selection_mask]


In [None]:
# Half-open slice [start_index, end_index) of the in-qrel corpus handled by this run.
start_index, end_index = 450000, 475000

# Input parquet files, relative to the selector's datasets root directory.
corpus_in_qrel_avg_dir_and_file = "ms_marco_corpus_in_qrel_embs_avg/ms_marco_corpus_in_qrel_embs_avg-concatenated.parquet"
corpus_not_in_qrel_avg_dir_and_file = "ms_marco_corpus_not_in_qrel_embs_avg/ms_marco_corpus_not_in_qrel_embs_avg-concatenated.parquet"

# Passed as the selector's output_dir: names both the output directory and the parquet file inside it.
corpus_in_qrel_avg_example_ids_file = f"ms_marco_corpus_in_qrel_embs_avg_example_ids_{start_index}-{end_index}"

negative_sample_selector = NegativeExampleSelector(
    in_qrel_dir_and_file=corpus_in_qrel_avg_dir_and_file,
    not_in_qrel_dir_and_file=corpus_not_in_qrel_avg_dir_and_file,
    output_dir=corpus_in_qrel_avg_example_ids_file,
    start_index=start_index,
    end_index=end_index,
)
negative_sample_selector.run()

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]