In [0]:
from pyspark.sql import functions as F

# --- Run configuration -------------------------------------------------------
# Source (bronze) Delta table that this notebook reads from.
bronze_table = "nlp_dev.bronze.fasttext_bronze"

# Ingest date of this run. It is interpolated into the silver path below so the
# date is stated once here instead of being buried inside the URL.
# NOTE: the same date also appears as a literal in later cells (the
# `ingest_date` column and the CREATE TABLE location) - keep them in sync.
ingest_date = "2025-10-10"

# Destination folder (ADLS Gen2) for the cleaned silver Delta data.
silver_path = f"abfss://nlp@nlplakeadls001.dfs.core.windows.net/silver/fasttext/ingest={ingest_date}"

# --- Load bronze -------------------------------------------------------------
df_bronze = spark.table(bronze_table)

# Quick sanity check: preview a few rows and print the schema.
display(df_bronze.limit(5))
df_bronze.printSchema()

In [0]:
# Build the silver DataFrame from bronze:
#   1. keep only `label` and `review_text`
#   2. clean `review_text` (whitespace, control chars, case, punctuation)
#   3. derive character / word counts and stamp the ingest date
#
# The cleaning steps run in order; each one overwrites `review_text`.
df_silver = (df_bronze.select("label","review_text")
             # Collapse every whitespace run (tabs, newlines, ...) into one space and trim.
             # Done first so that line breaks become spaces instead of being deleted by
             # the control-character step below, which would glue words together.
             .withColumn("review_text", F.trim(F.regexp_replace("review_text", r"\s+", " ")))
             # Remove the remaining (non-whitespace) ASCII control characters.
             .withColumn("review_text", F.regexp_replace("review_text", r"[\x00-\x1F]", ""))
             # Convert to lower case.
             .withColumn("review_text", F.lower(F.col("review_text")))
             # Remove everything that is not a letter, a digit or whitespace (punctuation, symbols).
             .withColumn("review_text", F.regexp_replace("review_text", r"[^\p{L}\p{N}\s]", ""))
             # Removing punctuation can leave double spaces or leading/trailing spaces: collapse again.
             .withColumn("review_text", F.trim(F.regexp_replace("review_text", r"\s+", " ")))
             # Character count of the cleaned text.
             .withColumn("text_length_chars", F.length("review_text"))
             # Word count of the cleaned text.
             # Splitting an empty string yields [""] (one empty token), which would be
             # counted as 1 word; report 0 for text that cleaned down to nothing.
             # Null text does not match the `when` branch and falls through to the
             # original size(split(...)) expression unchanged.
             .withColumn(
                 "text_length_words",
                 F.when(F.col("review_text") == "", F.lit(0))
                  .otherwise(F.size(F.split("review_text", r"\s+"))),
             )
             # Ingest date of this batch (string literal; could be derived from the run date).
             .withColumn("ingest_date", F.lit("2025-10-10"))
             )

In [0]:
# Row-level quality rules applied to the cleaned data.
has_review_text = F.col("review_text").isNotNull()            # drop null text
has_label = F.col("label").isNotNull()                        # drop null labels
# Keep reviews of 3 to 512 words (both bounds inclusive): drops ultra-short
# and absurdly large reviews.
word_count_in_range = F.col("text_length_words").between(3, 512)

df_silver = df_silver.filter(has_review_text & has_label & word_count_in_range)

# Preview the surviving rows and show how many are left.
display(df_silver.limit(5))
display(df_silver.count())


In [0]:
# Persist the silver DataFrame as Delta at `silver_path`.
# mode("overwrite") replaces whatever is already stored there, and
# overwriteSchema=true allows the stored schema to be replaced as well.
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.save(silver_path)

print("Data saved to :", silver_path)

In [0]:
%sql

-- Point the session at the silver schema of the nlp_dev catalog.
USE CATALOG nlp_dev;
USE SCHEMA silver;

-- Register the Delta files at the given location as table `fasttext_silver`.
-- IF NOT EXISTS makes this a no-op when the table is already registered.
CREATE TABLE IF NOT EXISTS fasttext_silver
USING DELTA
LOCATION "abfss://nlp@nlplakeadls001.dfs.core.windows.net/silver/fasttext/ingest=2025-10-10/"

In [0]:
%sql

-- Row count of the silver table.
SELECT COUNT(1) FROM fasttext_silver

In [0]:
%sql

-- Point the session at the silver schema of the nlp_dev catalog.
USE CATALOG nlp_dev;
USE SCHEMA silver;

-- Compact the table's files and Z-order them by `label`.
OPTIMIZE fasttext_silver
ZORDER BY (label)