In [0]:
from pyspark.sql.functions import *

# Source silver table name and destination gold path for this run.
silver_table = "nlp_dev.silver.fasttext_silver"
# Plain literal: the path contains no placeholders, so no f-string is needed.
gold_path = "abfss://nlp@nlplakeadls001.dfs.core.windows.net/gold/fasttext/ingest_date=2025-10-10/"

# Load the silver table as a DataFrame.
df_silver = spark.read.table(silver_table)

# Preview the first five rows, then report the total row count.
silver_preview = df_silver.take(5)
display(silver_preview)

silver_count = df_silver.count()
print("Silver Count", silver_count)


In [0]:
# Tokenize the review text with a simple whitespace regex.
from pyspark.sql import functions as F

# Splitting on runs of whitespace yields an empty-string token whenever the
# text starts or ends with whitespace (and [""] for an empty string).
# array_remove drops those so "" is never hashed as a feature downstream.
_raw_tokens = F.split(F.col("review_text"), r"\s+")

# A null review_text produces a null array; coalesce it to an empty token list
# so every row carries a real (possibly empty) array.
# NOTE(review): assumes the downstream ML transformers should receive an empty
# array rather than null for missing text -- confirm against the silver schema.
df_tokenized = df_silver.withColumn(
    "tokens",
    F.coalesce(
        F.array_remove(_raw_tokens, ""),
        F.array().cast("array<string>"),
    ),
)

In [0]:
from pyspark.ml.feature import StopWordsRemover

# Filter stop words out of "tokens" into a new "tokens_clean" column. No
# stopWords argument is passed, so the transformer's default list applies.
remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_clean")
df_clean = remover.transform(df_tokenized)

# Show a five-row sample, followed by the row count after cleaning.
clean_preview = df_clean.take(5)
display(clean_preview)

clean_count = df_clean.count()
print("Clean Count", clean_count)

In [0]:
from pyspark.ml.feature import HashingTF, IDF

# Term-frequency step: hash the cleaned tokens into a fixed-width vector
# ("raw_features"). IDF weighting is applied to this column afterwards.
# NOTE(review): Spark's MLlib guide advises a power of two for numFeatures so
# hashed terms map evenly onto indices; 100000 is kept as-is here because
# changing it alters the vector size seen downstream -- confirm before tuning.
NUM_HASH_FEATURES = 100000  # good balance for text

hasttag_tf = HashingTF(
    numFeatures=NUM_HASH_FEATURES,
    outputCol="raw_features",
    inputCol="tokens_clean",
)
df_hashed = hasttag_tf.transform(df_clean)


In [0]:
# Fit inverse-document-frequency weights on the hashed term counts and use the
# fitted model to produce the "tfidf_features" column.
# NOTE(review): the IDF model is fit on every row of df_hashed, before any
# train/val/test split column is assigned -- confirm that is intended.
idf = IDF(inputCol="raw_features", outputCol="tfidf_features")
idf_model = idf.fit(df_hashed)
df_tfidf = idf_model.transform(df_hashed)

# Spot-check five rows of label alongside the resulting TF-IDF vector.
tfidf_sample = df_tfidf.select("label", "tfidf_features")
tfidf_sample.show(5)


In [0]:
# Assign each row to train / val / test (80% / 10% / 10%) and stamp the ingest date.
#
# The split must be driven by ONE random draw per row. Calling F.rand() once
# per `when` branch gives each branch an independent draw, so the second
# threshold is tested against a fresh value: rows land in "val" with
# probability 0.2 * 0.9 = 18% and in "test" with only 2%.
#
# The draw is seeded so re-evaluating df_final (e.g. a later count after a
# write) yields the same assignment, provided the input partitioning is unchanged.
SPLIT_SEED = 42

_split_draw = F.col("_split_rand")

df_final = (
    df_tfidf
    # Materialize the single uniform [0, 1) draw as a temporary column.
    .withColumn("_split_rand", F.rand(seed=SPLIT_SEED))
    .withColumn(
        "split",
        F.when(_split_draw < 0.80, "train")
        .when(_split_draw < 0.90, "val")
        .otherwise("test"),
    )
    # The helper column is not part of the gold schema.
    .drop("_split_rand")
    .withColumn("ingest_date", F.lit("2025-10-10"))
)

In [0]:
# Persist the gold dataset as a Delta table at gold_path, replacing any
# existing data and schema at that location.
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_path)
)

# Report the row count of what was actually written. Reading it back from the
# Delta path avoids df_final.count(), which would re-execute the whole
# upstream lineage a second time and would count the plan rather than the
# persisted table.
gold_count = spark.read.format("delta").load(gold_path).count()
print("Gold Count", gold_count)

In [0]:
%sql

-- Select the catalog and schema for the statements in this cell.
USE CATALOG nlp_dev;
USE SCHEMA gold;

-- Register the Delta files at the given location as a table; this is a no-op
-- if the table already exists.
CREATE TABLE IF NOT EXISTS nlp_dev.gold.fasttext_gold
  USING DELTA
  LOCATION 'abfss://nlp@nlplakeadls001.dfs.core.windows.net/gold/fasttext/ingest_date=2025-10-10';


In [0]:
%sql
-- Number of rows in the gold table for each value of the split column.
SELECT
  split,
  COUNT(*)
FROM
  nlp_dev.gold.fasttext_gold
GROUP BY
  split;
