<a href="https://colab.research.google.com/github/matdjohnson-at-umass-dot-edu/cs646-final-project/blob/main/CS646_Final_Project_Preprocessing2_Instance_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets
! pip install transformers
! pip install psutil

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import concatenate_datasets, Dataset, disable_caching, disable_progress_bars
from google.colab import drive
from tqdm import tqdm
import numpy as np
import re
import os
import torch
import time
import gc
import psutil

# Turn off the `datasets` library's caching and progress bars for this session.
disable_caching()
disable_progress_bars()
# Mount Google Drive at /content/drive; DatasetProcessor reads and writes under
# /content/drive/MyDrive/.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class DatasetProcessor:
    """Apply a per-row embedding transformation to parquet shards on Google Drive.

    For every directory in ``dataset_dirs`` (relative to ``dataset_parent_dir``),
    each file whose name matches ``subset_.*\\.parquet`` is loaded, its
    ``'embedding'`` column is reshaped to ``(-1, embedding_dim)``, the supplied
    ``embedding_manipulation_function`` is mapped over the rows, and the result
    is written under the same file name into ``<directory><output_dir_suffix>``.

    Args:
        dataset_dir_pattern: Regular expression source; compiled and stored as
            ``self.dataset_dir_pattern``. ``process_dataset`` does not consult
            it — the explicit ``dataset_dirs`` list is used instead.
        dataset_dirs: Iterable of directory names to process.
        output_dir_suffix: Suffix appended to each input directory name to form
            the output directory; also embedded in the log file name.
        embedding_manipulation_function: Callable taking and returning one
            dataset row (a dict with an ``'embedding'`` entry).
        embedding_dim: Size of the last dimension used when reshaping each
            embedding. Defaults to 1024.
    """

    def __init__(self, dataset_dir_pattern, dataset_dirs, output_dir_suffix,
                 embedding_manipulation_function, embedding_dim=1024):
        self.dataset_parent_dir = '/content/drive/MyDrive/CS646-FinalProject/datasets/'
        self.dataset_dir_pattern = re.compile(dataset_dir_pattern)
        self.dataset_dirs = dataset_dirs
        self.dataset_subsection_pattern = re.compile('subset_.*\\.parquet')
        self.output_dir_suffix = output_dir_suffix
        self.embedding_manipulation_function = embedding_manipulation_function
        self.embedding_dim = embedding_dim
        self.embedding_list = list()
        # Shard currently being processed; None whenever no shard is loaded.
        self.dataset = None

    def reshape_dataset_element(self, dataset_element):
        """Reshape the row's ``'embedding'`` tensor to ``(-1, embedding_dim)`` in place.

        Presumably the stored embedding is a flattened stack of per-token
        vectors — TODO confirm against the notebook that produced the shards.

        Returns:
            The same ``dataset_element`` dict, with ``'embedding'`` replaced.
        """
        dataset_element['embedding'] = torch.reshape(
            dataset_element['embedding'], (-1, self.embedding_dim)
        )
        return dataset_element

    def _log_event(self, log_file_path, message):
        """Append a timestamped message and the process memory statistics to the log."""
        timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
        # The context manager closes the handle even if a write raises.
        with open(log_file_path, "a") as log_file:
            log_file.write(f"{timestamp}: {message}\n")
            log_file.write(f"{timestamp}: memory statistics: {psutil.Process().memory_info()}\n")

    def process_dataset(self):
        """Transform every matching parquet shard in ``self.dataset_dirs``.

        A "starting" and a "completed" entry (each with memory statistics) are
        appended to a per-run log file in ``dataset_parent_dir`` for every
        shard. The loaded shard is released after each file, whether or not
        processing succeeded, so memory is reclaimed between shards.
        """
        timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
        log_file_path = self.dataset_parent_dir + f"instance0_{timestamp}_compute_{self.output_dir_suffix}.log"
        # Device selection does not change between shards, so resolve it once.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        for dataset_dir in self.dataset_dirs:
            input_dir = self.dataset_parent_dir + dataset_dir
            output_dir = input_dir + self.output_dir_suffix
            print(f"reading dataset from directory {input_dir}")
            # os.listdir order is arbitrary; sort for a reproducible processing/log order.
            for file_name in sorted(os.listdir(input_dir)):
                if self.dataset_subsection_pattern.match(file_name) is None:
                    continue
                self._log_event(log_file_path, f"starting dataset='{dataset_dir}' file='{file_name}'")
                try:
                    self.dataset = Dataset.from_parquet(os.path.join(input_dir, file_name))
                    self.dataset = self.dataset.with_format("torch", device=device)
                    self.dataset = self.dataset.map(self.reshape_dataset_element)
                    self.dataset = self.dataset.map(self.embedding_manipulation_function)
                    # Make sure the output directory exists before writing into it.
                    os.makedirs(output_dir, exist_ok=True)
                    self.dataset.to_parquet(os.path.join(output_dir, file_name))
                    self.dataset.cleanup_cache_files()
                finally:
                    # Reset rather than `del` so the attribute declared in __init__
                    # still exists after a run; then reclaim host and GPU memory.
                    self.dataset = None
                    gc.collect()
                    torch.cuda.empty_cache()
                self._log_event(log_file_path, f"completed dataset='{dataset_dir}' file='{file_name}'")


In [None]:
# Directory-name pattern handed to DatasetProcessor.
# NOTE(review): this pattern names "corpus" directories while the list below
# names "queries" directories — confirm which is intended. DatasetProcessor
# compiles and stores the pattern, but process_dataset iterates dataset_dirs.
dataset_dir_pattern = '^ms_marco_corpus_in_qrel_embs_[0-9]+-[0-9]+$'

# Boundaries embedded in the directory names handled by this notebook instance;
# each consecutive pair (start, stop) names one directory.
_shard_bounds = (187500, 250000, 312500, 375000, 432500, 500000, 509962)
dataset_dirs = [
    f"ms_marco_queries_in_qrel_embs_{start}-{stop}"
    for start, stop in zip(_shard_bounds, _shard_bounds[1:])
]

def average_embedding(dataset_element):
    """Replace the row's 'embedding' tensor with its mean along dim 0.

    The row dict is modified in place and also returned, so the function can be
    passed to DatasetProcessor as its embedding manipulation function.
    """
    stacked = dataset_element['embedding']
    dataset_element['embedding'] = stacked.mean(dim=0)
    return dataset_element

# Mean-pool every shard in dataset_dirs; DatasetProcessor writes each result
# into the matching "<directory>_avg" directory.
dataset_processor = DatasetProcessor(dataset_dir_pattern, dataset_dirs, '_avg', average_embedding)
dataset_processor.process_dataset()

reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_187500-250000
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_250000-312500
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_312500-375000
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_375000-432500
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_432500-500000
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_500000-509962


In [None]:
def sum_embedding(dataset_element):
    """Replace the row's 'embedding' tensor with its sum along dim 0.

    The row dict is modified in place and also returned, so the function can be
    passed to DatasetProcessor as its embedding manipulation function.
    """
    stacked = dataset_element['embedding']
    dataset_element['embedding'] = stacked.sum(dim=0)
    return dataset_element

# Sum-pool every shard in dataset_dirs; DatasetProcessor writes each result
# into the matching "<directory>_sum" directory. This rebinds dataset_processor.
dataset_processor = DatasetProcessor(dataset_dir_pattern, dataset_dirs, "_sum", sum_embedding)
dataset_processor.process_dataset()

reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_187500-250000
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_250000-312500
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_312500-375000
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_375000-432500
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_432500-500000
reading dataset from directory /content/drive/MyDrive/CS646-FinalProject/datasets/ms_marco_queries_in_qrel_embs_500000-509962
