In [0]:
from pyspark.sql.functions import current_timestamp, input_file_name

# ADLS Gen2 account and container that hold the NLP lake.
storage_account = "nlplakeadls001"
container = "nlp"

# Both zones live under the same abfss:// root; only the top-level folder differs.
lake_root = f"abfss://{container}@{storage_account}.dfs.core.windows.net"
raw_base = lake_root + "/raw"
bronze_base = lake_root + "/bronze"

# Ingest date being processed; it selects the folder in both zones.
ingest_date = "2025-10-10"
dataset_dir = f"fasttext/ingest_date={ingest_date}"

# Raw input is a glob over the .bz2 files; bronze output is the folder itself.
raw_path = f"{raw_base}/{dataset_dir}/*.bz2"
bronze_path = f"{bronze_base}/{dataset_dir}/"

# Show the resolved paths as the cell output.
raw_path, bronze_path

In [0]:


# Read every file matched by raw_path as plain text; the downstream cell
# parses the single string column `value` produced by the text reader.
df_raw = spark.read.format("text").load(raw_path)

# Peek at a handful of rows to sanity-check the input.
df_raw.limit(5).display()


In [0]:
from pyspark.sql.functions import col, regexp_extract, regexp_replace, split

# Each raw line is expected to look like "__label__<digits><whitespace><review text>".
# Both output columns are derived from the same anchored prefix so they cannot
# disagree about where the label ends. Previously the label was taken by splitting
# on a single literal space while the text was stripped with \s+, so a
# tab-separated line lost its prefix but ended up with a NULL label.
df_bronze = df_raw.select(
    # Strip the leading "__label__<n><whitespace>" prefix; lines that do not
    # match the pattern keep their full original text.
    regexp_replace(col("value"), r"^__label__\d+\s+", "").alias("review_text"),
    # Capture the digits of the prefix. The label token must be followed by
    # whitespace or end-of-line, so "__label__2abc" is not read as 2.
    # Non-matching lines produce "" here, which the int cast turns into NULL
    # when ANSI mode is off (NOTE(review): confirm spark.sql.ansi.enabled on this cluster).
    regexp_extract(col("value"), r"^__label__(\d+)(?:\s|$)", 1)
    .cast("int")
    .alias("label"),
)

df_bronze.limit(5).display()
# count() is an action: it runs the read + parse over the whole input.
print("Row count (bronze):", df_bronze.count())

In [0]:
# Persist the parsed rows as a Delta table at bronze_path. "overwrite" mode
# replaces whatever was previously stored at that location.
df_bronze.write.save(bronze_path, format="delta", mode="overwrite")
print("Bronze saved to:", bronze_path)


In [0]:
%sql
-- Show which catalog and schema this session currently resolves unqualified names against.
SELECT
  current_catalog(),
  current_schema();


In [0]:
# Make sure you're in the right catalog/schema before registering the table.
spark.sql("USE CATALOG nlp_dev")
spark.sql("USE SCHEMA bronze")

# Register the Delta table over the folder written by the bronze save step.
# The location is taken from `bronze_path` (defined in the configuration cell)
# instead of a hard-coded copy of the URI, so it always matches the path the
# data was actually written to for the current `ingest_date`.
# NOTE: IF NOT EXISTS leaves an existing registration untouched, so a table that
# already points at a different ingest_date folder is not repointed by this cell.
spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS fasttext_bronze
    USING DELTA
    LOCATION '{bronze_path}'
    """
)


In [0]:
%sql
-- Verify the registration: list the tables in the bronze schema,
-- then count the rows readable through the new table.
SHOW TABLES IN nlp_dev.bronze;

SELECT COUNT(*)
FROM nlp_dev.bronze.fasttext_bronze;