## SCRIPT TO EXTRACT EXISTING TEXT EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES

In [74]:
import sys

# Insert "../../" at position 1 of the module search path so that imports can
# also be resolved from two directory levels up. The entry is relative, so it
# is interpreted against the current working directory.
sys.path.insert(1, "../../")

In [75]:
import re
from pathlib import Path

import pandas as pd


### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED

In [76]:
# Local folder that holds the index parquet files. Only files directly inside
# this folder are scanned.
# NOTE: the value below is a placeholder -- replace it with a real path before running.
LOCAL_ROOT = "<local-path-to-data-folder"

# When True, each source parquet file is overwritten without the embedding
# column once that column has been extracted. Any other value leaves the source
# files untouched.
REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True

# Name of the identifier column that is copied next to the embedding into every
# extracted embeddings file.
STANDARD_IDENTIFIER_FIELD = "id"

# Name given to the embedding column inside every extracted embeddings file.
NEW_STANDARD_EMBEDDING_FIELD = "embedding"

### GENERIC METHOD TO EXTRACT AN EMBEDDING COLUMN FROM A FILE AND CREATE A NEW EMBEDDINGS-SPECIFIC FILE

In [77]:
def extract_text_embedding_from_table(input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str) -> None:
    """Split one embedding column of a parquet table out into its own parquet file.

    The new file contains two columns: the identifier column
    (``STANDARD_IDENTIFIER_FIELD``) and the embedding column renamed to
    ``NEW_STANDARD_EMBEDDING_FIELD``. When
    ``REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE`` is ``True`` the source
    file is then overwritten in place without the embedding column.

    Args:
        input_path: Path of the parquet file to read (and possibly rewrite).
        original_embedding_field: Name of the embedding column in the source table.
        embeddings_parquet_output_field: Path of the parquet file to write the
            extracted embeddings to (despite the name, this is a file path,
            not a column name).

    Raises:
        KeyError: If the identifier column or the embedding column is missing
            from the source table. Nothing is written in that case.
    """
    original_df = pd.read_parquet(input_path)

    # Validate up front so a bad table fails with a message that names the file
    # and every missing column, before any output is written.
    required_columns = (STANDARD_IDENTIFIER_FIELD, original_embedding_field)
    missing_columns = [name for name in required_columns if name not in original_df.columns]
    if missing_columns:
        raise KeyError(f"{input_path} is missing required column(s): {missing_columns}")

    embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, original_embedding_field]]
    embeddings_df = embeddings_df.rename(columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD})  # type: ignore
    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)

    # Strict identity check on purpose: only an explicit boolean True may trigger
    # the destructive overwrite of the source file.
    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:
        # Build the embedding-free copy only when it is actually written back.
        no_embeddings_df = original_df.drop(columns=[original_embedding_field])
        no_embeddings_df.to_parquet(input_path, index=False)

### ITERATE OVER ALL PARQUET FILES INSIDE THE FOLDER AND DETECT THE EMBEDDING COLUMNS IN EACH OF THEM

In [None]:
# Scan every parquet file directly inside LOCAL_ROOT and split each column whose
# name ends in "_embedding" out into its own parquet file next to the source,
# e.g. column "x_embedding" of "table.parquet" goes to "table_x_embeddings.parquet".
folder_path = Path(LOCAL_ROOT)
pattern = r"^(.*?)(_embedding)$"
embedding_column_pattern = re.compile(pattern)  # compiled once, reused for every column

# Snapshot the directory listing before processing anything: the loop below
# writes new parquet files into this same folder, and pathlib leaves it
# unspecified whether entries added during iteration are yielded by iterdir().
# Sorting also makes the processing order deterministic.
parquet_files = sorted(
    candidate
    for candidate in folder_path.iterdir()
    if candidate.is_file() and candidate.suffix == ".parquet"
)

for file_path in parquet_files:
    # Only the column names are needed here; the DataFrame itself is not kept so
    # it can be released before the helper re-reads the same file.
    columns = pd.read_parquet(str(file_path)).columns.tolist()

    for column in columns:
        if not embedding_column_pattern.match(column):
            continue

        print(f"Reading {file_path}")
        filename_without_extension = str(file_path.with_suffix("").as_posix())
        embedding_file_name = f"{filename_without_extension}_{column}s{file_path.suffix}"
        extract_text_embedding_from_table(str(file_path), column, embedding_file_name)
