# bootstrap_words_02
- Read in parquet files from previous stage as pandas
- ensure that vectors are lists of int32s
- convert to pyspark with bronze_words schema and write to delta table

In [None]:
# Execute the shared setup notebook in this kernel before any other cell runs.
# NOTE(review): `spark` is used further down without being defined in this
# notebook -- presumably it is provided by the setup notebook; confirm.
%run "./00_setup.ipynb"

In [None]:
import numpy as np
from src.bronzeutils import validate_embeddings, words_schema
from src.fileutils import get_local_path
from src.constants import WORDLIST_PATH, WORDS_PARQUET_FILENAME
from src.sparkdbutils import create_unpartitioned_table
import pyspark.sql.functions as F

In [None]:
# START REPAIR SCRIPT
# INPUT_FILENAME = "words_old.parquet"

In [None]:
# input_parquet_path = get_local_path(f"{WORDLIST_PATH}/{INPUT_FILENAME}")
# output_parquet_path = get_local_path(f"{WORDLIST_PATH}/{WORDS_PARQUET_FILENAME}")

In [None]:
# df = pd.read_parquet(input_parquet_path, engine="pyarrow")

In [None]:
# df = df.drop(columns=["date_added"])

In [None]:
# df.to_parquet(output_parquet_path)

In [None]:
# END REPAIR SCRIPT

In [None]:
# Database and table names passed to create_unpartitioned_table further down.
# TODO: parameterize _TARGET_DB_NAME in pipeline (or use as constants?)
_TARGET_DB_NAME, _TARGET_TABLE_NAME = "bronze", "words"

In [None]:
# Build the location of the words parquet file under WORDLIST_PATH, then let
# get_local_path turn it into the path that is handed to the Spark reader.
relative_parquet_path = f"{WORDLIST_PATH}/{WORDS_PARQUET_FILENAME}"
parquet_path = get_local_path(relative_parquet_path)

In [None]:
# Load the parquet file using the schema stored in the file itself.
# DataFrameReader.parquet() accepts only reader options as keyword arguments,
# so a schema cannot be supplied through this call; conformance to
# words_schema is handled afterwards by apply_schema().
spark_df = spark.read.parquet(parquet_path)

In [None]:
# Validate that no embedding array contains a null element, so that a schema
# in which embeddings cannot contain null elements can be applied afterwards.
# Nothing is cleaned or dropped here: the notebook aborts if any offending row
# is found.
# NOTE(review): a row whose whole `embedding` is NULL is not matched by this
# predicate (exists() over a NULL array yields NULL) -- confirm that is intended.

null_elements_df = spark_df.filter(F.expr("exists(embedding, x -> x IS NULL)"))

# Count how many such rows exist
count_null_elements = null_elements_df.count()

if count_null_elements > 0:
    # ValueError is still an Exception subclass, so broad handlers keep working;
    # the row count is included to make the failure actionable.
    raise ValueError(
        "Source data has null values in its embeddings "
        f"({count_null_elements} row(s) affected)"
    )

print("✅ No null values in any embeddings")


In [None]:
def apply_schema(df, new_schema):
    """Project ``df`` onto the fields of ``new_schema``.

    The result has exactly the columns listed in ``new_schema``, in that
    order. A column already present with the requested data type is selected
    unchanged, a column present with a different type is cast to the requested
    type, and a column missing from ``df`` is added as a typed null literal.
    Columns of ``df`` that ``new_schema`` does not mention are not selected.

    Args:
        df: DataFrame to reshape.
        new_schema: Schema object exposing ``fields``, each with a ``name``
            and a ``dataType``.

    Returns:
        The DataFrame produced by ``df.select`` over the derived columns.
    """
    existing_types = {f.name: f.dataType for f in df.schema.fields}

    def _column_for(field):
        # Derive the select expression for a single target field.
        name, wanted_type = field.name, field.dataType
        if name not in existing_types:
            # Absent from the input: emit a null of the requested type.
            return F.lit(None).cast(wanted_type).alias(name)
        column = F.col(name)
        if existing_types[name] != wanted_type:
            # Present but typed differently: cast to the requested type.
            column = column.cast(wanted_type)
        return column.alias(name)

    return df.select(*[_column_for(field) for field in new_schema.fields])

In [None]:
# Reshape the loaded data to the column set and types of words_schema.
final_df = apply_schema(df=spark_df, new_schema=words_schema)

In [None]:
# Run a count action on the reshaped DataFrame; as the last expression in the
# cell, the resulting row count is displayed as the cell output.
final_df.count()

In [None]:
# Hand final_df to the project helper together with the target table and
# database names.
# NOTE(review): the helper's signature is not visible here -- the argument
# order (spark, df, table name, db name) should be confirmed against
# src.sparkdbutils.
create_unpartitioned_table(spark, final_df, _TARGET_TABLE_NAME, _TARGET_DB_NAME)

In [None]:
# Final confirmation message naming the database and table that were targeted.
print(f"✅ Successfully created bootstrapped words table {_TARGET_DB_NAME}.{_TARGET_TABLE_NAME}")