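# Embed all relevant docs and export the vectors

Embeds the `text` column of the relevant-documents dataset with the `all-MiniLM-L6-v2` sentence-transformer model and uploads the resulting vectors to S3 as a parquet file.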

In [1]:
# Imports are grouped at the top of the cell (standard library, third-party,
# project-local) so that every dependency is visible before any work starts.
from time import time

import pandas as pd
from nesta_ds_utils.loading_saving import S3
from sentence_transformers import SentenceTransformer

# NOTE: `logging` is deliberately taken from the project package, not imported
# directly from the standard library.
from discovery_child_development import PROJECT_DIR, logging, S3_BUCKET

# Sentence-embedding model used to vectorise the documents.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Input: local CSV that is read later in the notebook.
ENRICHED_DATA_DIR = PROJECT_DIR / 'outputs/enrichments'
PATH_TO_DATASET = ENRICHED_DATA_DIR / 'openalex_patents_relevance_labels_only_relevant.csv'

# Output: the S3 object key is VECTORS_PATH + VECTORS_FILE (VECTORS_PATH must
# keep its trailing slash because the two are joined by plain concatenation).
VECTORS_PATH = "data/outputs/vectors/"
VECTORS_FILE = "sentence_vectors_384_labelled.parquet"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset from the local enrichments directory.
relevant_df = pd.read_csv(PATH_TO_DATASET)

# Parallel lists taken from the same frame: ids[i] identifies the document
# whose text is docs[i].
ids, docs = (relevant_df[column].tolist() for column in ("id", "text"))

In [3]:
# Number of rows (documents) that will be embedded.
relevant_df.shape[0]

51234

In [4]:
# Embed the titles & abstracts (these have already been concatenated in the
# column 'text'), timing the run so that readers know what the cell costs.
t0 = time()
sentence_vectors_384 = model.encode(docs, show_progress_bar=True)
print(f"vectorization done in {time() - t0:.3f} s")

# Validate the encoder output BEFORE building the frame. Comparing len(docs)
# with the length of a frame built from `ids` can never fail (both come from
# the same DataFrame), and a genuine mismatch would make the DataFrame
# constructor raise before any warning could be logged. Checking the vectors
# directly makes the warning reachable and guarantees that an incomplete set
# of embeddings is never uploaded.
n_docs, n_vectors = len(docs), len(sentence_vectors_384)
if n_vectors != n_docs:
    logging.warning("Embeddings were not created for all docs")
    raise ValueError(
        f"Expected {n_docs} embeddings but got {n_vectors}; nothing was uploaded"
    )

# One plain Python list per document so each vector fits in a single
# DataFrame cell. The per-vector conversion is intentionally kept as it was,
# so the element type written to the parquet file does not change.
vectors_as_list = [list(vec) for vec in sentence_vectors_384]

vector_df = pd.DataFrame({"id": ids, "miniLM_384_vector": vectors_as_list})
logging.info(f"Successfully embedded {len(docs)} docs")

# Export the id -> vector table to S3 under VECTORS_PATH + VECTORS_FILE.
S3.upload_obj(vector_df, S3_BUCKET, f"{VECTORS_PATH}{VECTORS_FILE}")

Batches: 100%|██████████| 1602/1602 [14:29<00:00,  1.84it/s]


vectorization done in 871.939 s
2024-02-29 15:00:21,339 - root - INFO - Successfully embedded 51234 docs
2024-02-29 15:00:23,763 - botocore.credentials - INFO - Found credentials in environment variables.
