In [0]:
# === CONFIG ===
storage_acct = "goodreadsreviews60107070"

spark.conf.set(
    "fs.azure.account.key.goodreadsreviews60107070.dfs.core.windows.net",
    "W68YOwummMkTfxvE8uSyeHwSn2ISU3fxF43SpgTIU/zdUDmwquZ95QpaxDJnze6PRovNww3bWamU+AStmbtZLg=="
)

container   = "lakehouse"
silver_path = f"abfss://{container}@{storage_acct}.dfs.core.windows.net/processed"
gold_path   = f"abfss://{container}@{storage_acct}.dfs.core.windows.net/gold"

# Output locations for splits
features_v2_base = f"{gold_path}/features_v2"
train_out = f"{features_v2_base}/train"
val_out   = f"{features_v2_base}/val"
test_out  = f"{features_v2_base}/test"

In [0]:
# === 1) LOAD CLEAN SOURCE ===
# Use features_v1 as the leakage-safe source for downstream featurization
df = (
    spark.read.format("delta")
    .load(f"{gold_path}/features_v1")  # load by path instead of table()
)

# --- Optional: Basic hygiene filters ---
from pyspark.sql import functions as F

df = (
    df.dropDuplicates(["review_id"])
      .filter(F.col("review_text").isNotNull() & (F.length(F.col("review_text")) >= 10))
      .filter(F.col("rating").isNotNull())
)

# Verify schema and sample
df.printSchema()
df.show(5, truncate=False)

In [0]:
# === 2) MAKE REPRODUCIBLE SPLITS (70/15/15) ===
# Note: split BEFORE TF-IDF or encoders to avoid data leakage

from pyspark.sql import functions as F

seed = 67

splits = (
    df.withColumn("_rand", F.rand(seed))
      .withColumn(
          "_split",
          F.when(F.col("_rand") < 0.70, F.lit("train"))
           .when(F.col("_rand") < 0.85, F.lit("val"))
           .otherwise(F.lit("test"))
      )
)

train_df = splits.filter(F.col("_split") == "train").drop("_rand", "_split")
val_df   = splits.filter(F.col("_split") == "val").drop("_rand", "_split")
test_df  = splits.filter(F.col("_split") == "test").drop("_rand", "_split")

# Optional sanity check
print("Train:", train_df.count(), "Val:", val_df.count(), "Test:", test_df.count())

In [0]:
# === 3) WRITE SPLITS TO GOLD/features_v2 ===
# Overwrite to keep paths stable while iterating

out_path = f"{gold_path}/features_v2"

train_out = f"{out_path}/train"
val_out   = f"{out_path}/val"
test_out  = f"{out_path}/test"

(train_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(train_out))

(val_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(val_out))

(test_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(test_out))

In [0]:
# === 4) QUICK VERIFICATION ===
def load_and_count(p):
    df = spark.read.format("delta").load(p)
    return df, df.count()

train_loaded, n_train = load_and_count(train_out)
val_loaded,   n_val   = load_and_count(val_out)
test_loaded,  n_test  = load_and_count(test_out)

print("Split counts →",
      "train:", n_train,
      "val:",   n_val,
      "test:",  n_test,
      "total:", n_train + n_val + n_test)

# Peek a few rows to ensure schema/fields look right
train_loaded.show(5, truncate=False)
val_loaded.show(5, truncate=False)
test_loaded.show(5, truncate=False)

In [0]:
# === 5) SAVE SPLIT MANIFEST WITH COUNTS + PERCENTAGES ===
total_records = n_train + n_val + n_test

manifest_data = [
    ("train", n_train, round((n_train / total_records) * 100, 2)),
    ("val",   n_val,   round((n_val / total_records) * 100, 2)),
    ("test",  n_test,  round((n_test / total_records) * 100, 2))
]

manifest = spark.createDataFrame(
    manifest_data,
    ["split", "count", "percentage"]
)

(manifest.write
  .format("delta")
  .mode("overwrite")
  .option("overwriteSchema", "true")
  .save(f"{features_v2_base}/_manifest_counts"))

manifest.show(truncate=False)

In [0]:
# Databricks notebook source
# =========================================================
# GOODREADS TEXT FEATURE EXTRACTION
# =========================================================
# Purpose:
#   Load train split from feature_v2 (Gold layer)
#   Work with review_text column for NLP feature engineering
# =========================================================

# === 1. CONFIGURE STORAGE ACCESS ===
storage_acct = "goodreadsreviews60107070"

spark.conf.set(
    f"fs.azure.account.key.{storage_acct}.dfs.core.windows.net",
    "W68YOwummMkTfxvE8uSyeHwSn2ISU3fxF43SpgTIU/zdUDmwquZ95QpaxDJnze6PRovNww3bWamU+AStmbtZLg=="
)

container = "lakehouse"
gold_path = f"abfss://{container}@{storage_acct}.dfs.core.windows.net/gold"
train_path = f"{gold_path}/features_v2/train"

In [0]:
# === 2. LOAD DATASET (feature_v2/train) ===
train_df = spark.read.format("delta").load(train_path)

print("Total records:", train_df.count())
train_df.printSchema()

Total records: 10480029
root
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- date_added: string (nullable = true)
 |-- date_added_parsed: timestamp (nullable = true)
 |-- date_added_iso: date (nullable = true)
 |-- review_length: integer (nullable = true)
 |-- word_count: integer (nullable = true)
 |-- review_length_words: integer (nullable = true)
 |-- avg_rating_per_book: double (nullable = true)
 |-- n_reviews_per_book: long (nullable = true)



In [0]:
# === 3. TEXT CLEANING & NORMALIZATION (review_text) ===
# If not already available on the cluster, install once:
%pip install emoji==2.14.0

import re
from pyspark.sql.functions import col, length, lower, regexp_replace, trim
from pyspark.sql import functions as F

# --- 3.1 Define regex patterns (Java/PCRE compatible for Spark) ---
URL_PATTERN   = r'(https?://\S+|www\.\S+)'
NUM_PATTERN   = r'\d+'
# Remove punctuation EXCEPT < and > so placeholders like <URL> survive
PUNCT_EXCEPT_PLACEHOLDERS = r'[\\p{Punct}&&[^<>]]'

# --- 3.2 Emoji replacement via Python UDF (uses emoji lib) ---
# Databricks note: ensure `emoji` package is installed on the cluster
try:
    import emoji
except Exception as e:
    raise RuntimeError("Install the `emoji` package on the cluster: %pip install emoji==2.14.0") from e

def replace_emojis_to_placeholder(text: str) -> str:
    if text is None:
        return None
    # replace each emoji grapheme with <EMOJI>
    # emoji.replace_emoji handles combined emojis and skin tones correctly
    return emoji.replace_emoji(text, replace='<EMOJI>')

replace_emojis_udf = F.udf(replace_emojis_to_placeholder, returnType=F.StringType())

# --- 3.3 Apply cleaning pipeline in order ---
# Order matters: placeholders first, then punctuation/spacing, then trim + filter
cleaned_df = (
    train_df
      .withColumn("raw_text", col("review_text"))
      # lowercase
      .withColumn("clean_text", lower(col("review_text")))
      # URLs -> <URL>
      .withColumn("clean_text", regexp_replace(col("clean_text"), URL_PATTERN, " <URL> "))
      # numbers -> <NUM>
      .withColumn("clean_text", regexp_replace(col("clean_text"), NUM_PATTERN, " <NUM> "))
      # emojis -> <EMOJI> (UDF)
      .withColumn("clean_text", replace_emojis_udf(col("clean_text")))
      # remove punctuation except <> to keep placeholders
      .withColumn("clean_text", regexp_replace(col("clean_text"), PUNCT_EXCEPT_PLACEHOLDERS, " "))
      # collapse multiple spaces
      .withColumn("clean_text", regexp_replace(col("clean_text"), r"\s+", " "))
      # trim
      .withColumn("clean_text", trim(col("clean_text")))
      # filter out empty or very short reviews (<10 chars)
      .filter(length(col("clean_text")) >= 10)
)

# --- 3.4 Quick sanity checks ---
print("After cleaning:", cleaned_df.count())
display(
    cleaned_df.select("review_id", "raw_text", "clean_text").limit(10)
)

In [0]:
# === 3.5 SAVE CLEANED TEXT (ALL COLUMNS) ===
cleaned_out_path = f"{gold_path}/features_v2/text_cleaned"

(
    cleaned_df
    .write
    .mode("overwrite")
    .format("delta")
    .save(cleaned_out_path)
)

print(f"Full cleaned dataset saved to: {cleaned_out_path}")

In [0]:
# === III 4a. BASIC TEXT FEATURES ===
from pyspark.sql import functions as F

text_basic_df = (
    cleaned_df
    .withColumn("review_length_words", F.size(F.split(F.col("clean_text"), r"\s+")))
    .withColumn("review_length_chars", F.length(F.col("clean_text")))
    .filter(F.col("review_length_words") > 0)
)

# Quick sample
display(
    text_basic_df.select("review_id","clean_text","review_length_words","review_length_chars").limit(10)
)

# Summary (use percentile_approx for median)
summary_df = (
    text_basic_df.agg(
        F.count("*").alias("n_rows"),
        F.avg("review_length_words").alias("avg_words"),
        F.percentile_approx("review_length_words", 0.5).alias("p50_words"),
        F.max("review_length_words").alias("max_words"),
        F.avg("review_length_chars").alias("avg_chars"),
        F.max("review_length_chars").alias("max_chars"),
    )
)
display(summary_df)

# Save (ALL columns retained + new features); path aligned with earlier convention
basic_out = f"{gold_path}/features_v2/text_basic"
(text_basic_df
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(basic_out))

print("Basic text features saved to:", basic_out)

# Optional reload
reloaded = spark.read.format("delta").load(basic_out)
reloaded.printSchema()
print("Count:", reloaded.count())


In [0]:
# === 4b. SENTIMENT FEATURES (VADER) ===
# If not installed on cluster:
%pip install nltk==3.9.1

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, DoubleType

# Download VADER lexicon if not already available
nltk.download("vader_lexicon")

# Initialize analyzer once (broadcast to executors)
sia = SentimentIntensityAnalyzer()

# Define function to compute sentiment scores
def vader_scores(text):
    if text is None:
        return (0.0, 0.0, 0.0, 0.0)
    s = sia.polarity_scores(text)
    return (float(s["pos"]), float(s["neu"]), float(s["neg"]), float(s["compound"]))

schema = StructType([
    StructField("sentiment_pos", DoubleType(), True),
    StructField("sentiment_neu", DoubleType(), True),
    StructField("sentiment_neg", DoubleType(), True),
    StructField("sentiment_compound", DoubleType(), True),
])

vader_udf = F.udf(vader_scores, schema)

# Apply to dataset
sentiment_df = (
    text_basic_df
    .withColumn("sentiment", vader_udf(F.col("clean_text")))
    .withColumn("sentiment_pos", F.col("sentiment.sentiment_pos"))
    .withColumn("sentiment_neu", F.col("sentiment.sentiment_neu"))
    .withColumn("sentiment_neg", F.col("sentiment.sentiment_neg"))
    .withColumn("sentiment_compound", F.col("sentiment.sentiment_compound"))
    .drop("sentiment")
)

# Quick inspection
display(
    sentiment_df.select(
        "review_id", "clean_text",
        "sentiment_pos", "sentiment_neu", "sentiment_neg", "sentiment_compound"
    ).limit(10)
)

# Save sentiment-enriched data
sentiment_out = f"{gold_path}/features_v2/text_sentiment"
(sentiment_df
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(sentiment_out))

print("Sentiment features saved to:", sentiment_out)


In [0]:
sentiment_df = spark.read.format("delta").load(f"{gold_path}/features_v2/text_sentiment")
sentiment_df.printSchema()
print("Total records:", sentiment_df.count())
display(sentiment_df.limit(5))

root
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- date_added: string (nullable = true)
 |-- date_added_parsed: timestamp (nullable = true)
 |-- date_added_iso: date (nullable = true)
 |-- review_length: integer (nullable = true)
 |-- word_count: integer (nullable = true)
 |-- review_length_words: integer (nullable = true)
 |-- avg_rating_per_book: double (nullable = true)
 |-- n_reviews_per_book: long (nullable = true)
 |-- raw_text: string (nullable = true)
 |-- clean_text: string (nullable = true)
 |-- review_length_chars: integer (nullable = true)
 |-- sentiment_pos: double (nullable = true)
 |-- sentiment_neu: double (nullable =

book_id,review_id,title,author_id,name,user_id,rating,review_text,language_code,n_votes,date_added,date_added_parsed,date_added_iso,review_length,word_count,review_length_words,avg_rating_per_book,n_reviews_per_book,raw_text,clean_text,review_length_chars,sentiment_pos,sentiment_neu,sentiment_neg,sentiment_compound
18375252,bec7e62ae812fada353b8456d2371a87,Au Revoir Là-haut,822613,Pierre Lemaitre,c55bde87d5dce88e7b7a4ba5a4d2257d,1,banalities on banalities old tunes nothing new. dumped.,fre,0,Sun Mar 05 01:41:18 -0800 2017,2017-03-05T09:41:18Z,2017-03-05,55,8,16,4.225806451612903,31,banalities on banalities old tunes nothing new. dumped.,ba ali ies o ba ali ies old es o hi g ew. d m ed.,49,0.0,1.0,0.0,0.0
41865,9f2ca2cc167c2f16892858c09fe51b7c,"Twilight (twilight, #1)",941441,Stephenie Meyer,3b92b0352627e473e429e80ff1ef7dd5,2,"this is a book made for teenagers. the whole notion that you can have a relationship based on platonic love is very appealing for teenagers, specially girls, that feel targeted for their looks only. of course let's put in the mix some seriously sexy pretty vampire, that is rich and all powerful and that adores and is crazy about the main girl character, and the cocktail is just perfect. it is not very understandable this all powerful, pretty vampire is so crazy about this boringly plain girl, unless you read the book and realize he kind of didn't have much to choose from.... all said, of course we all have a teenager in our hearts and this is a very easy book to read. unless you get bored by teenager angst.",en-us,0,Tue Sep 10 00:39:23 -0700 2013,2013-09-10T07:39:23Z,2013-09-10,717,131,177,3.4414496264889967,9906,"this is a book made for teenagers. the whole notion that you can have a relationship based on platonic love is very appealing for teenagers, specially girls, that feel targeted for their looks only. of course let's put in the mix some seriously sexy pretty vampire, that is rich and all powerful and that adores and is crazy about the main girl character, and the cocktail is just perfect. it is not very understandable this all powerful, pretty vampire is so crazy about this boringly plain girl, unless you read the book and realize he kind of didn't have much to choose from.... all said, of course we all have a teenager in our hearts and this is a very easy book to read. unless you get bored by teenager angst.","his is a book made for ee agers. he whole o io ha yo a have a rela io shi based o la o i love is very a eali g for ee agers, s e ially girls, ha feel arge ed for heir looks o ly. of o rse le 's i he mix some serio sly sexy re y vam ire, ha is ri h a d all owerf l a d ha adores a d is razy abo he mai girl hara er, a d he o k ail is j s erfe . i is o very ders a dable his all owerf l, re y vam ire is so razy abo his bori gly lai girl, less yo read he book a d realize he ki d of did ' have m h o hoose from.... all said, of o rse we all have a ee ager i o r hear s a d his is a very easy book o read. less yo ge bored by ee ager a gs .",637,0.187,0.8,0.013,0.9743
30109238,804b10dcc6b047605074a5c833373384,"Lake Of Dreams (fortune Bay, Prequel Novella)",15240324,Judith Hudson,600c811b96fed8dd0181b7024aac0524,5,very nice start to a series. love the setting of the book. looking forward to reading more.,eng,0,Thu Sep 08 11:35:19 -0700 2016,2016-09-08T18:35:19Z,2016-09-08,91,17,23,4.4,5,very nice start to a series. love the setting of the book. looking forward to reading more.,very i e s ar o a series. love he se i g of he book. looki g forward o readi g more.,84,0.271,0.729,0.0,0.6666
15803173,281289cad67a54610e731307e42bf728,Golden Boy,4818033,Abigail Tarttelin,17aaae5b58b453a8cdd4bc54c2ff3f0b,5,great book! so much food for thought regarding gender and identity... this book is powerful and touching.,eng,0,Sat Nov 21 23:16:47 -0800 2015,2015-11-22T07:16:47Z,2015-11-22,105,17,28,4.32258064516129,279,great book! so much food for thought regarding gender and identity... this book is powerful and touching.,grea book! so m h food for ho gh regardi g ge der a d ide i y... his book is owerf l a d o hi g.,96,0.0,1.0,0.0,0.0
1158706,e737dd555f23e1eaf45647e0cfaeb297,"Strangers In Death (in Death, #26)",17065,J.d. Robb,fca26c34be8fe623ee340061f1281796,4,"strangers in death (police proc-eve dallas-nyc-2060) - vg robb, j.d. (aka nora roberts) - 26th in series g.p. putnam's sons, 2008, us hardcover - isbn: 9780399154706 first sentence: murder harbored no bigotry, no bias. when a wealthy man is found murdered in his apartment, lt. eve dallas first looks to the wife as a suspect. the wife, however, was out of the country with friends and has an air-tight alibi. dallas has a feeling, however, and a determination to find justice for the victim. i have long admitted to being of fan of this series and this book doesn't change that. the strengths are all there; crisp dialogue with wonderful interjections of humor, wonderful characters and the portrayal of the relationship between them, the fun slightly-futuristic-but-not-unbelievable technology and, yes, some nice scenes between eve and her husband, roarke. the plot didn't have the same emotional charge some have had, but it did have a delightfully twisted villain. a slight weakness was whomever relied on spell-check to catch errors (hear versus here), but that's minor. somewhat more disappointing was that i saw where the plot was going a bit earlier than i'd have liked. however, that didn't prevent my reading the book all in one day and enjoying it.",eng,0,Tue Mar 11 22:21:13 -0700 2008,2008-03-12T05:21:13Z,2008-03-12,1265,208,324,3.953488372093023,129,"strangers in death (police proc-eve dallas-nyc-2060) - vg robb, j.d. (aka nora roberts) - 26th in series g.p. putnam's sons, 2008, us hardcover - isbn: 9780399154706 first sentence: murder harbored no bigotry, no bias. when a wealthy man is found murdered in his apartment, lt. eve dallas first looks to the wife as a suspect. the wife, however, was out of the country with friends and has an air-tight alibi. dallas has a feeling, however, and a determination to find justice for the victim. i have long admitted to being of fan of this series and this book doesn't change that. the strengths are all there; crisp dialogue with wonderful interjections of humor, wonderful characters and the portrayal of the relationship between them, the fun slightly-futuristic-but-not-unbelievable technology and, yes, some nice scenes between eve and her husband, roarke. the plot didn't have the same emotional charge some have had, but it did have a delightfully twisted villain. a slight weakness was whomever relied on spell-check to catch errors (hear versus here), but that's minor. somewhat more disappointing was that i saw where the plot was going a bit earlier than i'd have liked. however, that didn't prevent my reading the book all in one day and enjoying it.","s ra gers i dea h ( oli e ro -eve dallas- y - ) - vg robb, j.d. (aka ora rober s) - h i series g. . am's so s, , s hard over - isb : firs se e e: m rder harbored o bigo ry, o bias. whe a weal hy ma is fo d m rdered i his a ar me , l . eve dallas firs looks o he wife as a s s e . he wife, however, was o of he o ry wi h frie ds a d has a air- igh alibi. dallas has a feeli g, however, a d a de ermi a io o fi d j s i e for he vi im. i have lo g admi ed o bei g of fa of his series a d his book does ' ha ge ha . he s re g hs are all here; ris dialog e wi h wo derf l i erje io s of h mor, wo derf l hara ers a d he or rayal of he rela io shi be wee hem, he f sligh ly-f ris i -b - o - believable e h ology a d, yes, some i e s e es be wee eve a d her h sba d, roarke. he lo did ' have he same emo io al harge some have had, b i did have a deligh f lly wis ed villai . a sligh weak ess was whomever relied o s ell- he k o a h errors (hear vers s here), b ha 's mi or. somewha more disa oi i g was ha i saw where he lo was goi g a bi earlier ha i'd have liked. however, ha did ' reve my readi g he book all i o e day a d e joyi g i .",1155,0.089,0.873,0.037,0.8894


In [0]:
# =========================================================
# III.4(c) TF-IDF FEATURES — scikit-learn + Pandas UDF (final, rubric-compliant)
# =========================================================
# Prereqs:
#   - sentiment_df (or cleaned_df) has columns ["review_id", "clean_text"]
#   - gold_path already defined (e.g., abfss://lakehouse@.../gold)

# %pip install scikit-learn==1.5.2

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd, pickle, json
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import ArrayType, FloatType

# Spark safety config
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")

# --- PARAMETERS (recommended rubric defaults) ---
MAX_FEATURES = 1000           # limit vocabulary size (top N words)
NGRAM_RANGE  = (1, 2)         # unigrams + bigrams → captures short context (“not good”)
STOP_WORDS   = "english"      # remove common filler words
SAMPLE_ROWS  = 10_000         # sample size for fitting vocab
REPARTITIONS = 200            # partition count for distributed transform

# --- 1) Select necessary columns & rebalance partitions ---
base_df = sentiment_df.select("review_id", "clean_text").repartition(REPARTITIONS)

# --- 2) Fit TF-IDF vocabulary on manageable sample (driver-safe) ---
sample_pdf = base_df.select("clean_text").limit(SAMPLE_ROWS).toPandas()
sample_texts = sample_pdf["clean_text"].fillna("").astype(str).tolist()

tfidf = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=NGRAM_RANGE,
    stop_words=STOP_WORDS
)
tfidf.fit(sample_texts)

vocab = tfidf.get_feature_names_out().tolist()
print("TF-IDF vocabulary size:", len(vocab))

# --- 3) Broadcast fitted vectorizer to executors ---
bc_tfidf = spark.sparkContext.broadcast(pickle.dumps(tfidf))

@pandas_udf(ArrayType(FloatType()))
def tfidf_transform_batch(texts: pd.Series) -> pd.Series:
    vec = pickle.loads(bc_tfidf.value)
    X = vec.transform(texts.fillna("").astype(str))
    # convert each row to dense float32 array
    return pd.Series([row.astype("float32").toarray().ravel().tolist() for row in X])

# --- 4) Apply TF-IDF transformation across all rows ---
tfidf_df = base_df.withColumn("tfidf_features", tfidf_transform_batch(col("clean_text")))

# --- 5) Save TF-IDF vectors (compact array form) ---
tfidf_out = f"{gold_path}/features_v2/text_tfidf"
(
    tfidf_df
      .write
      .format("delta")
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .save(tfidf_out)
)
print("TF-IDF features saved to:", tfidf_out)

# --- 6) Save vocabulary (Delta + JSON for reproducibility) ---
vocab_delta_out = f"{gold_path}/features_v2/_tfidf_vocab_sklearn"
(
    spark.createDataFrame([(i, t) for i, t in enumerate(vocab)], ["index","term"])
      .write
      .format("delta")
      .mode("overwrite")
      .save(vocab_delta_out)
)


print("Vocabulary saved to Delta:", vocab_delta_out)

# --- 7) Sanity check ---
display(tfidf_df.select("review_id", F.size("tfidf_features").alias("vector_length")).limit(5))

TF-IDF vocabulary size: 1000
TF-IDF features saved to: abfss://lakehouse@goodreadsreviews60107070.dfs.core.windows.net/gold/features_v2/text_tfidf
Vocabulary saved to Delta: abfss://lakehouse@goodreadsreviews60107070.dfs.core.windows.net/gold/features_v2/_tfidf_vocab_sklearn


review_id,vector_length
00213a878c5438e52ca638b95c7df6cd,1000
0048829855ac3a7b99053ca11f859cf9,1000
0073464460a9110021c299cc7bdf95ba,1000
009aa2d16f5c8891925157004333c751,1000
00c876900904f25d3a3b59dd3afdc5f6,1000


In [0]:
import json
import os

vocab_json_out = f"/dbfs/mnt/lakehouse/gold/features_v2/tfidf_vocab.json"
os.makedirs(os.path.dirname(vocab_json_out), exist_ok=True)

with open(vocab_json_out, "w") as f:
    json.dump(vocab, f)

print("Vocabulary JSON saved to:", vocab_json_out)

Vocabulary JSON saved to: /dbfs/mnt/lakehouse/gold/features_v2/tfidf_vocab.json


In [0]:
# === Save vocab JSON to /gold/features_v2 safely ===
import os, json

# 1) Write locally (Databricks driver)
os.makedirs("/dbfs/tmp", exist_ok=True)
local_vocab_path = "/dbfs/tmp/vocab.json"

with open(local_vocab_path, "w") as f:
    json.dump(vocab, f)

# 2) Copy to Azure Data Lake (ABFSS)
dst_path = f"{gold_path}/features_v2/tfidf_vocab.json"
dbutils.fs.cp("dbfs:/tmp/vocab.json", dst_path, True)

print("Vocabulary JSON saved to:", dst_path)

Vocabulary JSON saved to: abfss://lakehouse@goodreadsreviews60107070.dfs.core.windows.net/gold/features_v2/tfidf_vocab.json


In [0]:
# Safety: moderate Arrow batches
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "512")

In [0]:
from pyspark.sql import functions as F, Window as W

# Use the sentiment-enriched table
source_df = spark.read.format("delta").load(f"{gold_path}/features_v2/text_sentiment") \
                     .select("review_id","clean_text") \
                     .dropna(subset=["clean_text"])

# Stable row index for chunking (order by review_id for determinism)
w = W.orderBy("review_id")
indexed_df = source_df.withColumn("row_idx", F.row_number().over(w) - 1).cache()
total_rows = indexed_df.count()
print("Total rows:", total_rows)

Total rows: 10442623


In [0]:
%pip install sentence-transformers==2.7.0 transformers==4.44.2 torch

import pandas as pd
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql.functions import pandas_udf, col

MODEL_NAME   = "sentence-transformers/all-MiniLM-L6-v2"  # 384-d
BATCH_SIZE   = 64
USE_GPU      = False  # set True if your cluster has GPUs

_model = None
def _get_model():
    global _model
    if _model is None:
        from sentence_transformers import SentenceTransformer
        device = "cuda" if USE_GPU else "cpu"
        _model = SentenceTransformer(MODEL_NAME, device=device)
    return _model

@pandas_udf(ArrayType(FloatType()))
def sbert_embed(texts: pd.Series) -> pd.Series:
    m = _get_model()
    embs = m.encode(
        texts.fillna("").astype(str).tolist(),
        batch_size=BATCH_SIZE,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,  # cosine-ready
    )
    return pd.Series([e.astype("float32").tolist() for e in embs])


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
emb_out = f"{gold_path}/features_v2/text_embeddings_sbert"

# Create empty table if not exists (schema)
empty_df = indexed_df.limit(0).withColumn("bert_embedding", F.array().cast(ArrayType(FloatType())))
(empty_df
    .select("review_id","clean_text","bert_embedding")
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema","true")
    .save(emb_out))

# Track progress in DBFS
progress_path = "dbfs:/tmp/emb_progress.txt"
def write_progress(msg):
    dbutils.fs.put(progress_path, msg, True)
def read_progress():
    try:
        return dbutils.fs.head(progress_path)
    except:
        return ""

In [0]:
# Quick verification
final_df = spark.read.format("delta").load(emb_out)
print("Rows in embeddings table:", final_df.count())
display(final_df.select("review_id", F.size("bert_embedding").alias("dim")).limit(10))

# Optional: Z-ORDER by review_id if you’ll join/filter by it frequently (Databricks SQL / OPTIMIZE):
# spark.sql(f"OPTIMIZE delta.`{emb_out}` ZORDER BY (review_id)")

In [0]:
# --- Speed knobs ---
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")
spark.conf.set("spark.sql.shuffle.partitions", "200")  # moderate shuffle fanout

# SBERT params (keep model same)
MODEL_NAME   = "sentence-transformers/all-MiniLM-L6-v2"
USE_GPU      = False          # set True if you have GPUs
BATCH_SIZE   = 128            # ↑ batch
REPARTITIONS = 32             # fewer, larger partitions
CHUNK_SIZE   = 300_000        # ↑ chunk size to reduce driver/job overhead

# Ensure base/indexed DF prepared once (outside loop), cached:
# indexed_df: columns [review_id, clean_text, row_idx] with row_idx = row_number()-1, cache() called.

# UDF (ensure global cache works)
_model = None
def _get_model():
    global _model
    if _model is None:
        from sentence_transformers import SentenceTransformer
        device = "cuda" if USE_GPU else "cpu"
        # Optional: cache model to DBFS to avoid repeated downloads across clusters
        _model = SentenceTransformer(MODEL_NAME, device=device)
    return _model

import pandas as pd
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql.functions import pandas_udf, col

@pandas_udf(ArrayType(FloatType()))
def sbert_embed(texts: pd.Series) -> pd.Series:
    m = _get_model()
    embs = m.encode(
        texts.fillna("").astype(str).tolist(),
        batch_size=BATCH_SIZE,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return pd.Series([e.astype("float32").tolist() for e in embs])

# Destination (append mode)
emb_out = f"{gold_path}/features_v2/text_embeddings_sbert"

from math import ceil
total_rows = indexed_df.count()
num_chunks = ceil(total_rows / CHUNK_SIZE)
print(f"Planned chunks: {num_chunks} of ~{CHUNK_SIZE} rows")

for k in range(num_chunks):
    start = k * CHUNK_SIZE
    end   = min((k+1) * CHUNK_SIZE, total_rows) - 1

    done = read_progress()
    tag = f"[{start}-{end}]"
    if tag in done:
        print(f"Skip chunk {tag}")
        continue

    print(f"Processing chunk {tag} ...")
    # No extra shuffle columns; single repartition once per chunk
    chunk_df = (
        indexed_df
          .where((F.col("row_idx") >= start) & (F.col("row_idx") <= end))
          .select("review_id","clean_text")
          .repartition(REPARTITIONS)
    )

    emb_df = chunk_df.withColumn("bert_embedding", sbert_embed(col("clean_text")))

    (emb_df
        .select("review_id","clean_text","bert_embedding")
        .write
        .format("delta")
        .mode("append")
        .save(emb_out))

    write_progress(done + f"{tag} ")
    print(f"Done chunk {tag}")


Planned chunks: 35 of ~300000 rows
Processing chunk [0-299999] ...
Wrote 85 bytes.
Done chunk [0-299999]
Processing chunk [300000-599999] ...
Wrote 101 bytes.
Done chunk [300000-599999]
Processing chunk [600000-899999] ...
Wrote 117 bytes.
Done chunk [600000-899999]
Processing chunk [900000-1199999] ...
Wrote 134 bytes.
Done chunk [900000-1199999]
Processing chunk [1200000-1499999] ...
Wrote 152 bytes.
Done chunk [1200000-1499999]
Processing chunk [1500000-1799999] ...
Wrote 170 bytes.
Done chunk [1500000-1799999]
Processing chunk [1800000-2099999] ...
Wrote 188 bytes.
Done chunk [1800000-2099999]
Processing chunk [2100000-2399999] ...
Wrote 206 bytes.
Done chunk [2100000-2399999]
Processing chunk [2400000-2699999] ...
Wrote 224 bytes.
Done chunk [2400000-2699999]
Processing chunk [2700000-2999999] ...
Wrote 242 bytes.
Done chunk [2700000-2999999]
Processing chunk [3000000-3299999] ...
Wrote 260 bytes.
Done chunk [3000000-3299999]
Processing chunk [3300000-3599999] ...
Wrote 278 bytes.

In [0]:
from pyspark.sql import functions as F

def add_extra_features(df):
    # tokens + counts
    df = df.withColumn("tokens", F.split(F.col("clean_text"), r"\s+")) \
           .withColumn("word_count", F.size(F.col("tokens"))) \
           .filter(F.col("word_count") > 0)

    # unique words + type–token ratio (TTR)
    df = df.withColumn("unique_tokens", F.array_distinct(F.col("tokens"))) \
           .withColumn("unique_word_count", F.size(F.col("unique_tokens"))) \
           .withColumn("ttr", F.col("unique_word_count") / F.col("word_count"))

    # average word length
    df = df.withColumn("avg_word_len",
                       F.aggregate(
                           F.transform(F.col("tokens"), lambda x: F.length(x)),
                           F.lit(0), lambda acc, x: acc + x
                       ) / F.col("word_count"))

    # placeholder counts created during cleaning
    df = df.withColumn("url_count",  F.size(F.expr("filter(tokens, x -> x = '<URL>')"))) \
           .withColumn("num_count",  F.size(F.expr("filter(tokens, x -> x = '<NUM>')"))) \
           .withColumn("emoji_count",F.size(F.expr("filter(tokens, x -> x = '<EMOJI>')")))

    # negation markers
    NEG_SET = F.array(F.lit("not"), F.lit("no"), F.lit("never"))
    df = df.withColumn("negation_count",
                       F.size(F.expr("filter(tokens, x -> array_contains({}, x))".format(NEG_SET.sql))))

    # exclamation / question counts on cleaned_text (if punctuation retained elsewhere, else 0)
    df = df.withColumn("exclaim_count", F.length(F.regexp_replace(F.col("clean_text"), r"[^!]", ""))) \
           .withColumn("question_count",F.length(F.regexp_replace(F.col("clean_text"), r"[^?]", "")))

    # elongated words (e.g., sooooo, niiiice): count tokens with any char repeated ≥3
    df = df.withColumn("elongated_count",
                       F.size(F.expr("filter(tokens, x -> x rlike '(.)\\1{2,}')")))

    # punctuation density proxy (after cleaning many puncts may be gone; still keep)
    df = df.withColumn("punct_chars", F.length(F.regexp_replace(F.col("clean_text"), r"[A-Za-z0-9<>\\s]", ""))) \
           .withColumn("punct_ratio", F.when(F.length("clean_text") > 0,
                                             F.col("punct_chars") / F.length("clean_text")).otherwise(F.lit(0.0)))

    # drop helpers; keep final set
    keep = ["review_id","clean_text","word_count","unique_word_count","ttr","avg_word_len",
            "url_count","num_count","emoji_count","negation_count",
            "exclaim_count","question_count","elongated_count","punct_ratio"]
    return df.select(*keep)

for split in ["train","val","test"]:
    src = f"{gold_path}/features_v2/{split}_text_cleaned"   # must exist
    dst = f"{gold_path}/features_v2/{split}_text_extra"

    base = spark.read.format("delta").load(src).select("review_id","clean_text")
    extra = add_extra_features(base)
    (extra.write.format("delta").mode("overwrite").option("overwriteSchema","true").save(dst))

    print(f"Wrote extra features → {dst}")


In [0]:
from pyspark.sql import functions as F

def load_or_empty(path, cols_keep):
    try:
        df = spark.read.format("delta").load(path)
        # keep only requested columns that exist
        keep = [c for c in cols_keep if c in df.columns]
        return df.select(*keep)
    except Exception:
        # return empty DF with the requested schema (if any)
        return spark.createDataFrame([], schema=",".join([f"{c} string" for c in cols_keep]))  # will be inner-joined safely later

def combine_split(split: str):
    base_path = f"{gold_path}/features_v2"
    # metadata (from original split folder)
    meta_cols = ["review_id","book_id","rating"]
    meta = spark.read.format("delta").load(f"{base_path}/{split}").select(*meta_cols)

    # features (each must contain review_id + its columns)
    basic = load_or_empty(f"{base_path}/{split}_text_basic",
                          ["review_id","review_length_words","review_length_chars"])
    senti = load_or_empty(f"{base_path}/{split}_text_sentiment",
                          ["review_id","sentiment_pos","sentiment_neu","sentiment_neg","sentiment_compound"])
    extra = load_or_empty(f"{base_path}/{split}_text_extra",
                          ["review_id","word_count","unique_word_count","ttr","avg_word_len",
                           "url_count","num_count","emoji_count","negation_count",
                           "exclaim_count","question_count","elongated_count","punct_ratio"])
    tfidf = load_or_empty(f"{base_path}/{split}_text_tfidf",
                          ["review_id","tfidf_features"])
    emb   = load_or_empty(f"{base_path}/{split}_text_embeddings",
                          ["review_id","bert_embedding"])

    # inner-join on review_id to guarantee aligned rows with all features
    joined = (meta
              .join(basic, "review_id", "inner")
              .join(senti, "review_id", "inner")
              .join(extra, "review_id", "left")   # extra is optional; left-join to not drop rows
              .join(tfidf, "review_id", "inner")
              .join(emb,   "review_id", "left"))  # embeddings optional; left-join if not computed yet
    # enforce final column order
    front = ["review_id","book_id","rating",
             "review_length_words","review_length_chars",
             "sentiment_pos","sentiment_neu","sentiment_neg","sentiment_compound"]
    extras = [c for c in ["word_count","unique_word_count","ttr","avg_word_len",
                          "url_count","num_count","emoji_count","negation_count",
                          "exclaim_count","question_count","elongated_count","punct_ratio"] if c in joined.columns]
    tails = ["tfidf_features"] + (["bert_embedding"] if "bert_embedding" in joined.columns else [])
    ordered_cols = [c for c in front if c in joined.columns] + extras + tails
    out = joined.select(*ordered_cols)

    # minimal sanity checks
    assert out.select("review_id").distinct().count() == out.count(), "duplicate review_id after joins"
    assert out.count() > 0, f"no rows produced for split={split}"

    dst = f"{base_path}/{split}_allfeatures"
    (out.write.format("delta").mode("overwrite").option("overwriteSchema","true").save(dst))
    print(f"Wrote → {dst} | rows={out.count()} | cols={len(out.columns)}")

for sp in ["train","val","test"]:
    combine_split(sp)