In [None]:
%run "./00_setup.ipynb"

In [None]:
from pyspark.sql.types import DateType
import pyspark.sql.functions as F
from src.sparkdbutils import create_db, create_unpartitioned_table

In [None]:
# TODO: These should be pipeline parameters
# Source side: one database containing the three tables that the next cells
# read and join on the "word" column.
_SOURCE_DB_NAME = "raw"
_SOURCE_WORDS_TABLE_NAME = "words"
_SOURCE_EMBEDDINGS_TABLE_NAME = "word_embeddings"
_SOURCE_FREQUENCIES_TABLE_NAME = "word_frequencies"
# Target side: database and table names passed to create_db /
# create_unpartitioned_table at the end of the notebook.
_TARGET_DB_NAME = "bronze"
_TARGET_TABLE_NAME = "words"

In [None]:
# Load the three source tables (words, embeddings, frequencies) from the source
# database. Each one is selected in full; they are merged in the next cell.
words_df, word_embeddings_df, word_frequencies_df = (
    spark.sql(f"SELECT * FROM {_SOURCE_DB_NAME}.{source_table}")
    for source_table in (
        _SOURCE_WORDS_TABLE_NAME,
        _SOURCE_EMBEDDINGS_TABLE_NAME,
        _SOURCE_FREQUENCIES_TABLE_NAME,
    )
)

In [None]:
# Merge the three sources on "word" with full outer joins so that a word present
# in any one source still produces a row; columns belonging to a source that
# lacks the word come back as NULL. Only the listed columns are kept.
result_df = (
    words_df
    .join(word_embeddings_df, on="word", how="full_outer")
    .join(word_frequencies_df, on="word", how="full_outer")
    .select(
        "word",
        "letter_set",
        "date_added",
        "version",
        "frequency",
        "embedding",
    )
)

In [None]:
# Validation check
# 1. For every word, derive one boolean flag per checked column telling whether
#    that column is NULL in the joined result.
missing_analysis = result_df.select(
    "word",
    F.isnull("letter_set").alias("missing_letter_set"),
    F.isnull("version").alias("missing_version"),
    F.isnull("frequency").alias("missing_frequency"),
    F.isnull("embedding").alias("missing_embedding"),
)

# 2. Count how many words fall into each combination of missing flags.
print("Missing data summary:")
(
    missing_analysis
    .groupBy(
        "missing_letter_set",
        "missing_version",
        "missing_frequency",
        "missing_embedding",
    )
    .count()
    .show()
)

# 3. Find words that exist in some but not all DataFrames.
#    result_df is built from full outer joins, so a word missing from one of the
#    sources shows up as a row with NULL in at least one of the checked columns.
incomplete_words = result_df.filter(
    F.col("letter_set").isNull()
    | F.col("version").isNull()
    | F.col("frequency").isNull()
    | F.col("embedding").isNull()
)

# count() is a Spark action: every call re-executes the filter on top of the
# joins. Run it once and reuse the number for both the message and the check, so
# the printed value and the branch taken can never disagree.
incomplete_words_count = incomplete_words.count()
print(f"Words with incomplete data: {incomplete_words_count}")
if incomplete_words_count > 0:
    # Show a small sample for diagnosis, then stop the notebook before anything
    # is written to the target table.
    print("Examples of incomplete words:")
    incomplete_words.show(10)
    raise Exception("Rows above have incomplete data.")

# 4. Check that the three sources and the joined result all have the same number
#    of rows. Any difference means the sources do not line up one-to-one on
#    "word", so the notebook stops.
print("Row count comparison:")
words_df_count = words_df.count()
word_embeddings_df_count = word_embeddings_df.count()
word_frequencies_df_count = word_frequencies_df.count()
result_df_count = result_df.count()
print(f"words_df: {words_df_count}")
print(f"word_embeddings_df: {word_embeddings_df_count}")
print(f"word_frequencies_df: {word_frequencies_df_count}")
print(f"Full outer join: {result_df_count}")

# Collapsing the four counts into a set leaves a single element exactly when
# they are all equal.
all_counts = {
    words_df_count,
    word_embeddings_df_count,
    word_frequencies_df_count,
    result_df_count,
}

if len(all_counts) != 1:
    raise Exception("Row counts do not match.")

In [None]:
# Call the project helper from src.sparkdbutils with the Spark session and the
# target database name ("bronze").
# NOTE(review): presumably creates the database only if it does not already
# exist — confirm in src.sparkdbutils.
create_db(spark, _TARGET_DB_NAME)

In [None]:
# Hand the validated, merged DataFrame to the project helper together with the
# target table name ("words") and target database name ("bronze").
# NOTE(review): presumably persists result_df as bronze.words; whether an
# existing table is overwritten or appended to is not visible here — confirm in
# src.sparkdbutils.
create_unpartitioned_table(spark, result_df, _TARGET_TABLE_NAME, _TARGET_DB_NAME)