In [None]:
%run "./00_setup.ipynb"

In [None]:
import csv
import pandas as pd
import numpy as np
import os
from src.wordutils import get_letter_set, filter_wordlist
from src.fileutils import word_file_to_set, get_local_path
from src.constants import (WORDLIST_PATH, 
                           RAW_WORDLIST_FILENAME,
                           WORDLIST_TEMP_CSV_FILENAME,
                           WORDS_PKL_FILENAME,
                           WORDS_PARQUET_FILENAME,
                           NGRAMS_API_BASE,
                           NGRAMS_BATCH_SIZE)
from src.ngramsutils import get_word_frequencies
from src.embeddingutils import get_word_embeddings

In [None]:
# Load the raw word file and keep only the words that pass filter_wordlist
raw_wordlist_path = f"{WORDLIST_PATH}/{RAW_WORDLIST_FILENAME}"
wordlist = filter_wordlist(word_file_to_set(raw_wordlist_path))
print(f"{len(wordlist)} words")

In [None]:
# Create temp csv of word, letter_set, version for initial wordlist
VERSION = 1

# Sort the words so the row order is reproducible. The later batch job resumes
# by row offset, which only works if the order is identical on every run.
# NOTE(review): wordlist looks like a set (see word_file_to_set), whose
# iteration order can change between kernel restarts - confirm.
rows = [(word, get_letter_set(word), VERSION) for word in sorted(wordlist)]

temp_path = get_local_path(f"{WORDLIST_PATH}/{WORDLIST_TEMP_CSV_FILENAME}")
# newline="" is required by the csv module (otherwise Windows gets blank rows);
# the explicit encoding matches pandas' default when the file is read back.
with open(temp_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["word", "letter_set", "version"])
    writer.writerows(rows)
    

In [None]:
# Read in csv to pandas df
# keep_default_na=False: by default pandas parses tokens such as "null", "NA"
# or "nan" as missing values, which would silently replace real words with NaN.
source_df = pd.read_csv(
    temp_path,
    keep_default_na=False,
    dtype={"word": str, "letter_set": str},
)

# date_added starts out entirely missing; coerce it to a datetime column of NaT
source_df["date_added"] = [None] * len(source_df)
source_df["date_added"] = pd.to_datetime(source_df["date_added"], errors="coerce")

# Output locations for the batch checkpoint (pickle) and the final dataset (parquet)
pkl_path = get_local_path(f"{WORDLIST_PATH}/{WORDS_PKL_FILENAME}")
parquet_path = get_local_path(f"{WORDLIST_PATH}/{WORDS_PARQUET_FILENAME}")

# TODO: TESTING ONLY - REMOVE
print(f"Full dataset: {len(source_df)} rows")

# Create test subset (first 1000 rows) for a quick end-to-end trial run
test_source_df = source_df.head(1000).copy()
print(f"Test subset: {len(test_source_df)} rows")
# END TODO


def add_frequency_and_embedding(source_df: pd.DataFrame,
                                pkl_path: str = pkl_path,
                                resume_job: bool = False
) -> pd.DataFrame:
    """Add ``frequency`` and ``embedding`` columns to ``source_df`` in batches.

    Rows are processed ``NGRAMS_BATCH_SIZE`` at a time. After every batch the
    accumulated result is pickled to ``pkl_path`` so an interrupted run can be
    continued by calling again with ``resume_job=True``.

    Args:
        source_df: Frame with at least a ``word`` column. When resuming, its
            row order must match the interrupted run, because the first
            ``len(checkpoint)`` rows are skipped by position.
        pkl_path: Checkpoint file; written after each batch and read on resume.
        resume_job: If True and ``pkl_path`` exists, load it and continue
            after the rows it already contains. Otherwise start from row 0.

    Returns:
        A new DataFrame holding the processed rows plus the ``frequency`` and
        ``embedding`` columns. ``source_df`` itself is not modified. For an
        empty ``source_df`` an empty frame with those two columns is returned.

    Raises:
        Exception: Anything raised while processing a batch (including a
            ``KeyError`` when a lookup result lacks one of the batch words) is
            re-raised after the offset to resume from has been printed.
    """
    # Pick up job where we left off, if necessary
    target_df = None
    offset = 0
    if resume_job and os.path.exists(pkl_path):
        # NOTE: unpickling can execute arbitrary code - only resume from a
        # checkpoint file that this notebook wrote itself.
        target_df = pd.read_pickle(pkl_path)
        offset = len(target_df)
        print(f"Resuming from offset {offset} with {len(target_df)} rows already processed")

    try:
        while offset < len(source_df):
            print(f"processing batch {offset}:{offset + NGRAMS_BATCH_SIZE}")

            # Positional slice; copy so the new columns never touch source_df
            batch_df = source_df.iloc[offset:offset + NGRAMS_BATCH_SIZE].copy()
            batch_words = batch_df["word"].tolist()

            # add frequencies for this batch (looked up per word to keep row order)
            freq_dict = get_word_frequencies(batch_words)
            batch_df["frequency"] = [freq_dict[word] for word in batch_words]

            # add embeddings for this batch
            embeddings_dict = get_word_embeddings(batch_words)
            batch_df["embedding"] = [embeddings_dict[word] for word in batch_words]

            # Combine batches
            if target_df is None:
                target_df = batch_df
            else:
                target_df = pd.concat([target_df, batch_df], ignore_index=True)

            # Save checkpoint to the same path the resume branch reads from
            target_df.to_pickle(pkl_path)

            # Quick and dirty logging, so we know where to pick up
            print(f"completed batch of rows {offset}-{offset + NGRAMS_BATCH_SIZE}")

            # Finally increment the offset for the next iteration
            offset += NGRAMS_BATCH_SIZE

        if target_df is None:
            # Nothing to process and nothing resumed: return an empty frame
            # with the expected schema instead of None.
            target_df = source_df.iloc[0:0].copy()
            target_df["frequency"] = pd.Series(dtype="float64")
            target_df["embedding"] = pd.Series(dtype="object")

        print(f"Processing complete! Final dataset has {len(target_df)} rows")
        return target_df

    except Exception:
        print(f"Exception occurred! Resume job at offset {offset}")
        print(f"Current progress: {len(target_df) if target_df is not None else 0} rows processed")
        # Bare raise keeps the original traceback intact
        raise

In [None]:
# First run - comment out before rerunning
# result_df = add_frequency_and_embedding(source_df)

# Resume if it crashed - uncomment to use
# result_df = add_frequency_and_embedding(source_df, resume_job=True)

In [None]:
# result_df.to_parquet(parquet_path)

In [None]:
# df2 = pd.read_parquet(parquet_path)

In [None]:
# Comprehensive validation checks for your word embeddings dataset

def validate_processed_data(original_df, processed_df, wordlist):
    """
    Print a validation report for the processed word embeddings dataset.

    The report covers row counts, schema, dtypes, nulls, word constraints
    (length >= 4, at most 7 distinct letters, letter_set spot check), version,
    frequency, embeddings, word-set equality and duplicates. Expected values
    are hard-coded: version 1, embedding dimension 768, date_added all null.

    A check whose column is absent from ``processed_df`` is reported as
    missing/skipped instead of raising ``KeyError``, so the report always
    runs to completion.

    Args:
        original_df: The source frame the processed data was derived from;
            only its row count is used.
        processed_df: The frame to validate.
        wordlist: Iterable of the words expected in ``processed_df['word']``;
            must support ``len()``.

    Returns:
        None. All findings are printed.
    """
    available = set(processed_df.columns)

    def _skip_if_missing(*columns):
        """Print a notice and return True when any of ``columns`` is absent."""
        absent = [col for col in columns if col not in available]
        if absent:
            print(f"   ❌ Check skipped, missing column(s): {absent}")
        return bool(absent)

    print("=== DATA VALIDATION REPORT ===\n")

    # 1. ROW COUNT VALIDATION
    print("1. ROW COUNT VALIDATION")
    print(f"   Original CSV rows: {len(original_df)}")
    print(f"   Processed DF rows: {len(processed_df)}")
    print(f"   Filtered wordlist: {len(wordlist)}")

    if len(original_df) == len(processed_df) == len(wordlist):
        print("   ✅ Row counts match perfectly")
    else:
        print("   ❌ Row count mismatch!")
    print()

    # 2. SCHEMA VALIDATION
    print("2. SCHEMA VALIDATION")
    expected_columns = ['word', 'letter_set', 'version', 'date_added', 'frequency', 'embedding']
    actual_columns = list(processed_df.columns)
    print(f"   Expected columns: {expected_columns}")
    print(f"   Actual columns: {actual_columns}")

    if set(expected_columns) == set(actual_columns):
        print("   ✅ All expected columns present")
    else:
        missing = set(expected_columns) - set(actual_columns)
        extra = set(actual_columns) - set(expected_columns)
        if missing:
            print(f"   ❌ Missing columns: {missing}")
        if extra:
            print(f"   ❌ Extra columns: {extra}")
    print()

    # 3. DATA TYPE VALIDATION
    print("3. DATA TYPE VALIDATION")
    print(f"   Data types:\n{processed_df.dtypes}")

    # Expected dtype name per column, compared against str(dtype)
    checks = [
        ('word', 'object'),
        ('letter_set', 'object'),
        ('version', 'int64'),
        ('date_added', 'datetime64[ns]'),
        ('frequency', 'float64'),
    ]

    for col, expected_type in checks:
        if col not in available:
            print(f"   ❌ {col}: column is missing")
            continue
        actual_type = str(processed_df[col].dtype)
        if actual_type == expected_type:
            print(f"   ✅ {col}: {actual_type}")
        else:
            print(f"   ❌ {col}: expected {expected_type}, got {actual_type}")
    print()

    # 4. NULL/MISSING VALUE VALIDATION
    print("4. NULL/MISSING VALUE VALIDATION")
    null_counts = processed_df.isnull().sum()
    print(f"   Null counts per column:\n{null_counts}")

    # Check expected nulls
    if 'date_added' not in available:
        print("   ❌ date_added: column is missing")
    elif null_counts['date_added'] == len(processed_df):
        print("   ✅ All date_added values are null (as expected)")
    else:
        print(f"   ❌ Expected all date_added to be null, but {len(processed_df) - null_counts['date_added']} are not null")

    # Check no unexpected nulls
    critical_cols = ['word', 'letter_set', 'version', 'frequency', 'embedding']
    for col in critical_cols:
        if col not in available:
            print(f"   ❌ {col}: column is missing")
        elif null_counts[col] == 0:
            print(f"   ✅ No nulls in {col}")
        else:
            print(f"   ❌ Found {null_counts[col]} nulls in {col}")
    print()

    # 5. WORD CONSTRAINT VALIDATION
    print("5. WORD CONSTRAINT VALIDATION")

    # Check word length (should be >= 4 for Spelling Bee)
    if not _skip_if_missing('word'):
        short_words = processed_df[processed_df['word'].str.len() < 4]
        if len(short_words) == 0:
            print("   ✅ No words shorter than 4 characters")
        else:
            print(f"   ❌ Found {len(short_words)} words shorter than 4 characters")
            print(f"       Examples: {list(short_words['word'].head())}")

    # Check letter_set constraint (should be <= 7 distinct letters)
    if not _skip_if_missing('word', 'letter_set'):
        long_letter_sets = processed_df[processed_df['letter_set'].str.len() > 7]
        if len(long_letter_sets) == 0:
            print("   ✅ No words with more than 7 distinct letters")
        else:
            print(f"   ❌ Found {len(long_letter_sets)} words with more than 7 distinct letters")
            print(f"       Examples: {list(long_letter_sets[['word', 'letter_set']].head().to_dict('records'))}")

        # Verify letter_set calculation on the first 10 rows: the stored value
        # is compared with the sorted, upper-cased distinct letters of the word
        sample_check = processed_df.head(10).copy()
        sample_check['calculated_letter_set'] = sample_check['word'].apply(lambda w: ''.join(sorted(set(w.upper()))))
        letter_set_matches = (sample_check['letter_set'] == sample_check['calculated_letter_set']).all()
        if letter_set_matches:
            print("   ✅ Letter set calculation appears correct (spot check)")
        else:
            print("   ❌ Letter set calculation may be incorrect")
            print(f"       Sample mismatches:\n{sample_check[['word', 'letter_set', 'calculated_letter_set']]}")
    print()

    # 6. VERSION VALIDATION
    print("6. VERSION VALIDATION")
    if not _skip_if_missing('version'):
        version_values = processed_df['version'].unique()
        if len(version_values) == 1 and version_values[0] == 1:
            print("   ✅ All versions are 1 (as expected)")
        else:
            print(f"   ❌ Expected all versions to be 1, found: {version_values}")
    print()

    # 7. FREQUENCY VALIDATION
    print("7. FREQUENCY VALIDATION")
    if not _skip_if_missing('frequency'):
        freq_stats = processed_df['frequency'].describe()
        print(f"   Frequency statistics:\n{freq_stats}")

        # Check for reasonable frequency range
        zero_freq = (processed_df['frequency'] == 0).sum()
        negative_freq = (processed_df['frequency'] < 0).sum()

        print(f"   Words with zero frequency: {zero_freq}")
        print(f"   Words with negative frequency: {negative_freq}")

        if negative_freq == 0:
            print("   ✅ No negative frequencies")
        else:
            print("   ❌ Found negative frequencies")
    print()

    # 8. EMBEDDING VALIDATION
    print("8. EMBEDDING VALIDATION")
    sequence_types = (list, tuple, np.ndarray)

    # Check embedding dimensions; the first row decides which dimension to report
    if not _skip_if_missing('embedding') and len(processed_df) > 0:
        first_embedding = processed_df['embedding'].iloc[0]
        if isinstance(first_embedding, sequence_types):
            embedding_dim = len(first_embedding)
            print(f"   Embedding dimension: {embedding_dim}")

            if embedding_dim == 768:
                print("   ✅ Embedding dimension is 768 (as expected)")
            else:
                print(f"   ❌ Expected embedding dimension 768, got {embedding_dim}")

            # Check all embeddings have same dimension (non-sequences count as 0)
            dims = processed_df['embedding'].apply(lambda x: len(x) if isinstance(x, sequence_types) else 0)
            if dims.nunique() == 1:
                print("   ✅ All embeddings have consistent dimensions")
            else:
                print(f"   ❌ Inconsistent embedding dimensions: {dims.value_counts()}")

            # Check for null embeddings
            null_embeddings = processed_df['embedding'].apply(lambda x: x is None or (isinstance(x, sequence_types) and len(x) == 0))
            if null_embeddings.sum() == 0:
                print("   ✅ No null or empty embeddings")
            else:
                print(f"   ❌ Found {null_embeddings.sum()} null or empty embeddings")
        else:
            print(f"   ❌ Embedding data type unexpected: {type(first_embedding)}")
    print()

    # 9. WORD SET VALIDATION
    print("9. WORD SET VALIDATION")
    if not _skip_if_missing('word'):
        original_words = set(wordlist)
        processed_words = set(processed_df['word'])

        if original_words == processed_words:
            print("   ✅ Processed words exactly match original wordlist")
        else:
            missing_from_processed = original_words - processed_words
            extra_in_processed = processed_words - original_words

            if missing_from_processed:
                print(f"   ❌ Missing from processed: {len(missing_from_processed)} words")
                print(f"       Examples: {list(list(missing_from_processed)[:5])}")

            if extra_in_processed:
                print(f"   ❌ Extra in processed: {len(extra_in_processed)} words")
                print(f"       Examples: {list(list(extra_in_processed)[:5])}")

        # Check for duplicates
        duplicates = processed_df['word'].duplicated().sum()
        if duplicates == 0:
            print("   ✅ No duplicate words")
        else:
            print(f"   ❌ Found {duplicates} duplicate words")
    print()

    # 10. PARQUET ROUNDTRIP VALIDATION
    print("10. PARQUET ROUNDTRIP VALIDATION")

    # No comparison is done here: the dtypes are only listed so they can be
    # eyeballed when processed_df was read back from parquet.
    print("   Check dtypes after parquet roundtrip:")
    for col in processed_df.columns:
        print(f"     {col}: {processed_df[col].dtype}")

    print("\n=== VALIDATION COMPLETE ===")

# Usage in your notebook:
# validate_processed_data(source_df, df2, wordlist)

In [None]:
# TODO: Remove testing
# Enrich the test subset with frequencies and embeddings
test_result_df = add_frequency_and_embedding(test_source_df)

# Round-trip the result through parquet so the validation sees what a reader would load
test_parquet_path = get_local_path(f"{WORDLIST_PATH}/test_words.parquet")
test_result_df.to_parquet(test_parquet_path)
test_df2 = pd.read_parquet(test_parquet_path)

# Validate against the words of the subset (stand-in for the filtered wordlist)
test_wordlist = test_source_df['word'].tolist()
validate_processed_data(test_source_df, test_df2, test_wordlist)

In [None]:
# Eyeball the enriched test data

# Check some high-frequency words make sense
print("Top 10 most frequent words:")
print(test_df2.nlargest(10, 'frequency')[['word', 'frequency']])

# Check some low-frequency words
print("\nSample low-frequency words:")
print(test_df2.nsmallest(10, 'frequency')[['word', 'frequency']])

# Verify letter_set examples
print("\nSample letter_set validation:")
# Cap n at the frame size (sample raises if n > len) and fix the seed so the
# same rows are shown on every Restart & Run All.
sample = test_df2[['word', 'letter_set']].sample(n=min(5, len(test_df2)), random_state=0)
for word, letter_set in sample.itertuples(index=False):
    calculated = ''.join(sorted(set(word.upper())))
    print(f"'{word}' -> expected: '{calculated}', actual: '{letter_set}'")