In [1]:
# Add the project root directory to the system path
import os
import sys

# Resolve the parent of the current working directory and make it importable,
# so that the `src.*` imports below resolve when the notebook is run from a
# sub-directory of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
# Set before the tokenizer-backed libraries below are imported.
os.environ["TOKENIZERS_PARALLELISM"] = "true"


from src.etl.etl_funcs import load_documents
from src.etl.embedding_funcs import embed_index
from src.algorithms import v0, v1
from src.processing import graph_construction, distance_metrics

from sentence_transformers import SentenceTransformer
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the document index
document_index = load_documents()

# Embed the document index once, up front; the cells below reuse the result
# instead of re-embedding.
embedded_index = embed_index(document_index)

# Number of leading entries kept in the reduced index. The V1 retriever cells
# below run on this subset rather than on the full index.
SUB_INDEX_SIZE = 50
sub_index = embedded_index[:SUB_INDEX_SIZE]

Embedding Documents: 100%|██████████| 197/197 [00:06<00:00, 29.31it/s]


# Single-process timing

In [3]:
# NOTE(review): `embed_model` is not referenced anywhere else in this cell;
# it is kept so the notebook-level name stays defined — confirm whether it is
# still needed.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

retriever = v0.V0Retriever()

# Pairwise distances over the full index, nested as
# {doc id: {other doc id: distance}}.
doc_distance_vectors = {}

# For each document, calculate the distance to every document in the index
# (including itself) and collect them into one inner dictionary. Building the
# inner dictionary in a single expression keeps every pair; assigning a fresh
# one-entry dict per inner iteration would retain only the last pair.
for doc in tqdm.tqdm(embedded_index):
    doc_distance_vectors[doc.id_] = {
        other_doc.id_: retriever.calculate_distance(doc, other_doc)
        for other_doc in embedded_index
    }

100%|██████████| 197/197 [00:05<00:00, 34.03it/s]


In [4]:
# NOTE(review): `embed_model` is not referenced anywhere else in this cell;
# it is kept so the notebook-level name stays defined — confirm whether it is
# still needed.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

retriever = v1.V1Retriever()

# Pairwise distances over the reduced index, nested as
# {doc id: {other doc id: distance}}.
doc_distance_vectors = {}

# For each document in the subset, calculate the distance to every document in
# the subset (including itself) and collect them into one inner dictionary.
# Building the inner dictionary in a single expression keeps every pair;
# assigning a fresh one-entry dict per inner iteration would retain only the
# last pair.
for doc in tqdm.tqdm(sub_index):
    doc_distance_vectors[doc.id_] = {
        other_doc.id_: retriever.calculate_distance(doc, other_doc)
        for other_doc in sub_index
    }

100%|██████████| 50/50 [00:02<00:00, 19.40it/s]


# Multi-process timing

In [5]:
# Run the project's parallel adjacency construction over the full embedded
# index, passing the V0 retriever class (the class itself, not an instance).
# NOTE(review): the helper is named "adjacency_dict", so `adj_matrix` is
# presumably a dict rather than a matrix — confirm against graph_construction.
adj_matrix = graph_construction.construct_adjacency_dict_parallel(
    embedded_index=embedded_index,
    retrieval_class=v0.V0Retriever
)

Processing chunk 1026c639-9265-4ddb-a0cf-2a7e91737602: 100%|██████████| 22/22 [00:01<00:00, 16.81it/s]
Processing chunk e983f9b7-10a1-46b4-a460-e912349e0ade: 100%|██████████| 22/22 [00:00<00:00, 23.48it/s]
Processing chunk 54c40cb7-ffcf-4bab-99ab-4c3ea548c7c8: 100%|██████████| 22/22 [00:00<00:00, 27.22it/s]
Processing chunk d4e5638e-41a0-44a5-92ed-a4c2521877ef: 100%|██████████| 22/22 [00:00<00:00, 35.45it/s]
Processing chunk c1fee0bc-c189-4faa-a845-927623fb517d: 100%|██████████| 22/22 [00:00<00:00, 34.89it/s]
Processing chunk ee0bfddf-e01a-4c96-bbf5-9caa7f73f531: 100%|██████████| 22/22 [00:00<00:00, 35.05it/s]
Processing chunk 6d3ed7e9-5c9b-40d7-a5ae-16106ceeb90a: 100%|██████████| 22/22 [00:00<00:00, 36.58it/s]
Processing chunk 0189064d-82b4-48b4-ab30-0cca6a9d3f22: 100%|██████████| 22/22 [00:00<00:00, 36.60it/s]
Processing chunk f344b0e5-4f18-4618-8d7b-b5c6310edf66: 100%|██████████| 21/21 [00:00<00:00, 38.45it/s]


In [6]:
# Run the project's parallel adjacency construction over the reduced
# `sub_index`, passing the V1 retriever class (the class itself, not an
# instance). This rebinds `adj_matrix`, so the value from the earlier V0 call
# is no longer reachable under that name.
# NOTE(review): the helper is named "adjacency_dict", so `adj_matrix` is
# presumably a dict rather than a matrix — confirm against graph_construction.
adj_matrix = graph_construction.construct_adjacency_dict_parallel(
    embedded_index=sub_index,
    retrieval_class=v1.V1Retriever
)

Processing chunk e983f9b7-10a1-46b4-a460-e912349e0ade: 100%|██████████| 6/6 [00:00<00:00, 17.18it/s]
Processing chunk d4e5638e-41a0-44a5-92ed-a4c2521877ef: 100%|██████████| 6/6 [00:00<00:00, 13.81it/s]
Processing chunk 54c40cb7-ffcf-4bab-99ab-4c3ea548c7c8: 100%|██████████| 6/6 [00:00<00:00, 13.76it/s]
Processing chunk 1026c639-9265-4ddb-a0cf-2a7e91737602: 100%|██████████| 6/6 [00:00<00:00,  8.46it/s]
Processing chunk c1fee0bc-c189-4faa-a845-927623fb517d: 100%|██████████| 6/6 [00:00<00:00, 10.76it/s]
Processing chunk 6d3ed7e9-5c9b-40d7-a5ae-16106ceeb90a: 100%|██████████| 5/5 [00:00<00:00, 15.62it/s]
Processing chunk ee0bfddf-e01a-4c96-bbf5-9caa7f73f531: 100%|██████████| 5/5 [00:00<00:00, 16.10it/s]
Processing chunk f344b0e5-4f18-4618-8d7b-b5c6310edf66: 100%|██████████| 5/5 [00:00<00:00, 19.59it/s]
Processing chunk 0189064d-82b4-48b4-ab30-0cca6a9d3f22: 100%|██████████| 5/5 [00:00<00:00, 19.54it/s]


# The solution was to move all of the embedding logic into the initial call. This uses the built-in parallelism of the Hugging Face library much more effectively.

The remaining function calls are so quick, and carry so little overhead, that we don't really need multiprocessing for them.