In [1]:
import os

print("Workspace:", os.listdir("/workspace"))
print("Data:", os.listdir("/workspace/data"))

Workspace: ['hive_warehouse', 'data', 'notebooks']
Data: ['categories.json', 'database.db', 'database.db.wal', 'podcasts.json', 'podcasts.parquet', 'podcasts.parquet.tmp', 'reviews.json']


In [None]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("PodcastPopularity")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config(
        "spark.jars.packages",
        "com.johnsnowlabs.nlp:spark-nlp_2.12:6.2.0"
    )
    .getOrCreate()
)

print("Spark version:", spark.version)
print("Test RDD count:", spark.sparkContext.parallelize(range(10)).count())

In [2]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
        .appName("PodcastPopularity")
        .master("local[*]")                      # still local, that is fine
        .config("spark.driver.memory", "4g")
        # tell Spark where the Hive warehouse lives (Hadoop-style path)
        # if you do not have real HDFS, just use a local folder:
        .config("spark.sql.warehouse.dir", "/workspace/hive_warehouse")
        .enableHiveSupport()                    # <<< this is the key change
        .getOrCreate()
)

print("Spark version:", spark.version)
print("Test RDD count:", spark.sparkContext.parallelize(range(10)).count())


/opt/spark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/25 18:32:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 3.5.0


[Stage 0:>                                                          (0 + 8) / 8]

Test RDD count: 10


                                                                                

In [3]:
reviews_df    = spark.read.json("/workspace/data/reviews.json")
podcasts_df   = spark.read.json("/workspace/data/podcasts.json")
categories_df = spark.read.json("/workspace/data/categories.json")

print("Reviews rows:", reviews_df.count())
print("Podcasts rows:", podcasts_df.count())
print("Categories rows:", categories_df.count())

25/11/25 18:32:37 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

Reviews rows: 5607021


                                                                                

Podcasts rows: 2077665


[Stage 10:>                                                         (0 + 8) / 8]

Categories rows: 3706368


                                                                                

In [4]:
# create a dedicated database for your project
spark.sql("CREATE DATABASE IF NOT EXISTS podcast_db")
spark.sql("USE podcast_db")

# persist raw dataframes as Hive tables (Parquet under the hood)
reviews_df.write.mode("overwrite").saveAsTable("podcast_reviews_raw")
podcasts_df.write.mode("overwrite").saveAsTable("podcasts_raw")
categories_df.write.mode("overwrite").saveAsTable("categories_raw")


25/11/25 18:33:22 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/11/25 18:33:22 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/11/25 18:33:33 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/11/25 18:33:33 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.17.0.2
25/11/25 18:33:34 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
25/11/25 18:34:00 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/11/25 18:34:00 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
25/11/25 18:34:00 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/11/25 18:34:00 W

In [5]:
spark.sql("USE podcast_db")

reviews_df    = spark.table("podcast_reviews_raw")
podcasts_df   = spark.table("podcasts_raw")
categories_df = spark.table("categories_raw")


In [6]:
reviews_df.printSchema()
podcasts_df.printSchema()
categories_df.printSchema()

root
 |-- author_id: string (nullable = true)
 |-- content: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- podcast_id: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- title: string (nullable = true)

root
 |-- author: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- description: string (nullable = true)
 |-- itunes_id: string (nullable = true)
 |-- itunes_url: string (nullable = true)
 |-- podcast_id: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- scraped_at: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- title: string (nullable = true)

root
 |-- category: string (nullable = true)
 |-- itunes_id: string (nullable = true)
 |-- podcast_id: string (nullable = true)



In [19]:
from pyspark.sql import functions as F
import pandas as pd
from IPython.display import display

# Make Pandas tables easier to read
pd.set_option("display.max_colwidth", 120)
pd.set_option("display.max_rows", 20)

# -------------------------------------------
# 1. Aggregate reviews at podcast level
# -------------------------------------------
podcast_reviews = (
    reviews_df
    .groupBy("podcast_id")
    .agg(
        F.count("*").alias("num_reviews"),
        F.avg("rating").alias("avg_review_rating_from_reviews")
    )
)

# -------------------------------------------
# 2. Core podcast info
# -------------------------------------------
podcast_core = (
    podcasts_df
    .select(
        "podcast_id",
        "title",
        "description",
        F.col("average_rating").alias("avg_rating_platform"),
        F.col("ratings_count").cast("long").alias("rating_count")
    )
)

# -------------------------------------------
# 3. Join podcasts with review aggregates
# -------------------------------------------
podcast_with_engagement = (
    podcast_core
    .join(podcast_reviews, on="podcast_id", how="left")
    .fillna({"num_reviews": 0})
)

# -------------------------------------------
# 4. Clean subset: only podcasts with ratings
#    and ensure description_length is not null
# -------------------------------------------
podcast_clean = (
    podcast_with_engagement
    .where(
        F.col("avg_rating_platform").isNotNull() &
        F.col("rating_count").isNotNull()
    )
)

podcast_clean = (
    podcast_clean
    .withColumn("description", F.coalesce("description", F.lit("")))
    .withColumn("description_length", F.length(F.col("description")))
)

print("Clean podcasts:", podcast_clean.count())
print("\nSample of clean podcasts (Pandas view):")
clean_sample = (
    podcast_clean
    .select(
        "podcast_id",
        "title",
        "avg_rating_platform",
        "rating_count",
        "num_reviews",
        "description_length"
    )
    .limit(10)
    .toPandas()
)
display(clean_sample)

# -------------------------------------------
# 5. Attach category information
# -------------------------------------------
category_lookup = (
    categories_df
    .select(
        "podcast_id",
        F.col("category").alias("category_name"),
        F.col("itunes_id").alias("category_itunes_id")
    )
    .dropDuplicates(["podcast_id", "category_name"])
)

podcast_final = (
    podcast_clean
    .join(category_lookup, on="podcast_id", how="left")
)

print("Podcast_final rows:", podcast_final.count())
print("\nSample of podcast_final (Pandas view):")
podcast_final_sample = (
    podcast_final
    .select(
        "podcast_id",
        "title",
        "category_name",
        "avg_rating_platform",
        "rating_count",
        "num_reviews"
    )
    .limit(10)
    .toPandas()
)
display(podcast_final_sample)

# -------------------------------------------
# 6. Category-level engagement summary
# -------------------------------------------
category_engagement = (
    podcast_final
    .groupBy("category_name")
    .agg(
        F.countDistinct("podcast_id").alias("num_podcasts"),
        F.sum("num_reviews").alias("total_reviews"),
        F.sum("rating_count").alias("total_ratings_recorded"),
        F.avg("avg_rating_platform").alias("avg_platform_rating")
    )
    .orderBy(F.desc("total_reviews"))
)

# -------------------------------------------
# 7. Tidy category view: drop nulls and
#    require at least 5 podcasts per category
# -------------------------------------------
category_engagement_clean = (
    category_engagement
    .where(F.col("category_name").isNotNull())
)

category_engagement_filtered = (
    category_engagement_clean
    .where(F.col("num_podcasts") >= 5)
    .orderBy(F.desc("total_reviews"))
)

print("\nFiltered category_engagement (top 20, Pandas view):")
cat_top = category_engagement_filtered.limit(20).toPandas()
display(cat_top)


Clean podcasts: 81842

Sample of clean podcasts (Pandas view):


                                                                                

Unnamed: 0,podcast_id,title,avg_rating_platform,rating_count,num_reviews,description_length
0,068bd11a5c3e839d734b513cb73a34c4,Adobe ® Creative Suite Developer Podcast,3.0,3,0,82
1,22f28ab280851f27cfa87256d5fc5a04,Scare You To Sleep,4.7,83,0,269
2,3f6f88d45b1ccdc00c1f0ba0fc2549c6,PE's Chess Cast,4.5,31,0,633
3,796b29ff2e5184d5cdd702dd9c0df50f,Nana's Knitting Podcast,2.0,2,0,54
4,8d5dc7e5aaff897b181dfb2e2516d82f,Sacred Dub Podcast,5.0,12,0,176
5,d3b4549a73cf7db0aa22f37c2daf338b,No Holds Barred with Eddie Goldman,4.6,10,0,345
6,d8991e5fb326b4dae79d24e0cb6cd442,Dr. David Brodbeck's Psychology Lectures from Algoma University,4.6,16,0,251
7,da93a415211ea4bfc2f37361927273db,Quirks and Quarks Complete Show from CBC Radio,4.8,18,0,138
8,f04c5cce700c016123482229d1e18fe3,"Broadway Bullet: Theatre from Broadway, Off-Broadway and beyond.",4.8,20,0,415
9,0bcb3876862ba1a86725a81e64c01e72,FINRA Unscripted,4.8,23,0,507


                                                                                

Podcast_final rows: 171009

Sample of podcast_final (Pandas view):


                                                                                

Unnamed: 0,podcast_id,title,category_name,avg_rating_platform,rating_count,num_reviews
0,068bd11a5c3e839d734b513cb73a34c4,Adobe ® Creative Suite Developer Podcast,technology,3.0,3,0
1,22f28ab280851f27cfa87256d5fc5a04,Scare You To Sleep,arts,4.7,83,0
2,3f6f88d45b1ccdc00c1f0ba0fc2549c6,PE's Chess Cast,education,4.5,31,0
3,3f6f88d45b1ccdc00c1f0ba0fc2549c6,PE's Chess Cast,leisure,4.5,31,0
4,796b29ff2e5184d5cdd702dd9c0df50f,Nana's Knitting Podcast,society-culture-personal-journals,2.0,2,0
5,796b29ff2e5184d5cdd702dd9c0df50f,Nana's Knitting Podcast,society-culture,2.0,2,0
6,8d5dc7e5aaff897b181dfb2e2516d82f,Sacred Dub Podcast,music,5.0,12,0
7,d3b4549a73cf7db0aa22f37c2daf338b,No Holds Barred with Eddie Goldman,sports,4.6,10,0
8,d8991e5fb326b4dae79d24e0cb6cd442,Dr. David Brodbeck's Psychology Lectures from Algoma University,education-courses,4.6,16,0
9,d8991e5fb326b4dae79d24e0cb6cd442,Dr. David Brodbeck's Psychology Lectures from Algoma University,science,4.6,16,0



Filtered category_engagement (top 20, Pandas view):


                                                                                

Unnamed: 0,category_name,num_podcasts,total_reviews,total_ratings_recorded,avg_platform_rating
0,business,10535,119,131304,4.787271
1,education,10148,108,123105,4.632036
2,health-fitness,7243,59,102974,4.695016
3,education-self-improvement,1333,59,24694,4.840585
4,arts-visual-arts,1815,55,22980,4.728815
5,business-investing,1925,42,27285,4.746026
6,education-how-to,709,42,10877,4.707757
7,society-culture,12652,39,163138,4.750277
8,comedy,6449,29,82668,4.814808
9,sports,5789,25,77852,4.751961


In [8]:
from pyspark.sql import functions as F

# --- 1. Base reviews ---
reviews_base = (
    reviews_df
    .select(
        F.col("podcast_id"),
        F.col("rating"),
        F.col("content").alias("text")
    )
    .where(F.col("text").isNotNull())
)
# Print count
count_reviews = reviews_base.count()
print(f"Base Reviews Count: {count_reviews}")


# --- 2. Podcast Meta ---
podcast_meta = (
    podcasts_df
    .select(
        "podcast_id",
        "title",
        "description",
        F.col("average_rating").alias("avg_rating_platform"),
        F.col("ratings_count").cast("long").alias("rating_count")
    )
)
# Print count
count_podcasts = podcast_meta.count()
print(f"Podcast Meta Count: {count_podcasts}")


# --- 3. Category Meta ---
category_meta = (
    categories_df
    .select(
        "podcast_id",
        F.col("category").alias("category_name"),
        F.col("itunes_id").alias("category_itunes_id")
    )
    .dropDuplicates(["podcast_id", "category_name"])
)
# Print count
count_categories = category_meta.count()
print(f"Category Meta Count: {count_categories}")


# --- 4. Join: Reviews + Meta + Category ---
reviews_with_meta = (
    reviews_base.alias("r")
    .join(podcast_meta.alias("p"), on="podcast_id", how="inner")
    .join(category_meta.alias("c"), on="podcast_id", how="left")
)

count_joined = reviews_with_meta.count()
print(f"Total reviews WITH podcast metadata: {count_joined}")
print(f"Distinct podcasts in joined data: {reviews_with_meta.select('podcast_id').distinct().count()}")


# --- Output to Pandas (Safe Mode) ---
# WARNING: .toPandas() collects ALL data to memory. 
# Using limit(1000) to prevent kernel crash. Remove .limit(1000) only if you have enough RAM.

print("Converting samples to Pandas DataFrames...")

pdf_reviews_base = reviews_base.limit(1000).toPandas()
pdf_podcast_meta = podcast_meta.limit(1000).toPandas()
pdf_category_meta = category_meta.limit(1000).toPandas()
pdf_reviews_with_meta = reviews_with_meta.limit(1000).toPandas()

print("\nSample of Joined Data (Pandas):")
display(pdf_reviews_with_meta.head())

                                                                                

Base Reviews Count: 5607021
Podcast Meta Count: 2077665


                                                                                

Category Meta Count: 3706368


                                                                                

Total reviews WITH podcast metadata: 785


                                                                                

Distinct podcasts in joined data: 31
Converting samples to Pandas DataFrames...


                                                                                


Sample of Joined Data (Pandas):


Unnamed: 0,podcast_id,rating,text,title,description,avg_rating_platform,rating_count,category_name,category_itunes_id
0,286898691d0495a5875ea48852210662,5,A dose perfeita entre análise do esporte e gro...,NFL no ProFootballcast com Antony Curti e Edua...,Top 10 em esportes e o podcast de NFL e futebo...,4.8,29,sports,1228890010
1,286898691d0495a5875ea48852210662,5,A dose perfeita entre análise do esporte e gro...,NFL no ProFootballcast com Antony Curti e Edua...,Top 10 em esportes e o podcast de NFL e futebo...,4.8,29,sports-football,1228890010
2,286898691d0495a5875ea48852210662,5,Melhor Podcast de FA!,NFL no ProFootballcast com Antony Curti e Edua...,Top 10 em esportes e o podcast de NFL e futebo...,4.8,29,sports,1228890010
3,286898691d0495a5875ea48852210662,5,Melhor Podcast de FA!,NFL no ProFootballcast com Antony Curti e Edua...,Top 10 em esportes e o podcast de NFL e futebo...,4.8,29,sports-football,1228890010
4,286898691d0495a5875ea48852210662,5,Por um podcast só de groselha,NFL no ProFootballcast com Antony Curti e Edua...,Top 10 em esportes e o podcast de NFL e futebo...,4.8,29,sports,1228890010


In [17]:
from pyspark.sql import functions as F
from transformers import pipeline
import torch
import pandas as pd

pd.set_option("display.max_colwidth", 120)

# ---------------------------------------------------
# 1. Create "Virtual" Metadata & Sample
# ---------------------------------------------------
print("--- Step 1: Filtering for podcasts with sufficient data ---")

# A. Calculate real stats from the reviews themselves
# We only want podcasts that have at least 20 reviews to be statistically significant
virtual_podcast_meta = (
    reviews_base
    .groupBy("podcast_id")
    .agg(
        F.count("*").alias("total_reviews_count"),
        F.avg("rating").alias("avg_rating_calculated")
    )
    .where(F.col("total_reviews_count") >= 20)
)

print(f"Podcasts with >20 reviews available: {virtual_podcast_meta.count()}")

# B. Join back to get the actual review text for these valid podcasts
clean_reviews = reviews_base.join(virtual_podcast_meta, "podcast_id", "inner")

# C. Sample 2000 reviews from this high-quality pool
# We select 'podcast_id', 'rating', and 'text'. 
# Note: We DO NOT select 'title' because it was in the missing external file.
sample_reviews_spark = (
    clean_reviews
    .orderBy(F.rand(42))
    .limit(2000)
)

print("Sampled reviews for BERT:", sample_reviews_spark.count())

# Convert to Pandas for the Hugging Face pipeline
sample_reviews = sample_reviews_spark.select(
    "podcast_id",
    "rating",
    "text"
).toPandas()

print("Pandas rows for BERT:", len(sample_reviews))

# ---------------------------------------------------
# 2. Load Multilingual BERT sentiment model
# ---------------------------------------------------
device = 0 if torch.cuda.is_available() else -1
print("\n--- Step 2: Loading Model (Device: {'GPU' if device == 0 else 'CPU'}) ---")

sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=device
)

texts = sample_reviews["text"].tolist()
batch_size = 16

labels = []
scores = []

print(f"Running sentiment analysis on {len(texts)} texts...")

for i in range(0, len(texts), batch_size):
    if i % 100 == 0 and i > 0:
        print(f"Processing batch starting at index {i}...")
        
    batch = texts[i:i + batch_size]
    outputs = sentiment_pipe(
        batch,
        truncation=True,
        max_length=512
    )
    for out in outputs:
        labels.append(out["label"])
        scores.append(out["score"])

sample_reviews["hf_label"] = labels
sample_reviews["hf_score_raw"] = scores

def star_to_score(label):
    return int(label.split(' ')[0])

sample_reviews["sentiment_score"] = sample_reviews["hf_label"].apply(star_to_score)

print("\nExample Multilingual-scored reviews:")
display(sample_reviews[[
    "podcast_id", "rating", "hf_label", "sentiment_score", "text"
]].head(5))

# ---------------------------------------------------
# 3. Back to Spark: aggregate sentiment per podcast_id
# ---------------------------------------------------
print("\n--- Step 3: Aggregating Results ---")

# Create a Spark DF from the scored pandas DF
sample_scored_spark = spark.createDataFrame(
    sample_reviews[["podcast_id", "rating", "sentiment_score"]]
)

# Group by podcast_id
podcast_sentiment_bert = (
    sample_scored_spark
    .groupBy("podcast_id")
    .agg(
        F.avg("sentiment_score").alias("avg_sentiment_bert"),
        F.count("*").alias("num_reviews_with_bert"),
        F.avg("rating").alias("avg_rating_sample")
    )
).cache()

print("Podcasts with calculated sentiment:", podcast_sentiment_bert.count())
podcast_sentiment_bert.show(5, truncate=False)

# ---------------------------------------------------
# 4. Output as Pandas DataFrame
# ---------------------------------------------------
print("\nFinal Aggregated Results (Pandas DataFrame):")
pdf_podcast_sentiment = podcast_sentiment_bert.toPandas()
display(pdf_podcast_sentiment.head(10))

--- Step 1: Filtering for podcasts with sufficient data ---


                                                                                

Podcasts with >20 reviews available: 38113


                                                                                

Sampled reviews for BERT: 2000


                                                                                

Pandas rows for BERT: 2000

--- Step 2: Loading Model (Device: {'GPU' if device == 0 else 'CPU'}) ---


Device set to use cpu


Running sentiment analysis on 2000 texts...
Processing batch starting at index 400...
Processing batch starting at index 800...
Processing batch starting at index 1200...
Processing batch starting at index 1600...

Example Multilingual-scored reviews:


Unnamed: 0,podcast_id,rating,hf_label,sentiment_score,text
0,2510c343b5be08127ce9f48f3a4ba4c9,5,4 stars,4,"Three great guys that taste great together. Am I not supposed to eat this podcast? Well, Jeff talked me into it! 7 o..."
1,1e87a073b1c5c460b9c884e55ebbcf23,5,5 stars,5,What a great idea!
2,bf27a5d6b4a958fdc5db286e6beb71bc,5,5 stars,5,This is a great way to get connected with God even while doing your simplest daily tasks. Whether you're driving in ...
3,dfaf307538d743494727c034ac9bc399,5,5 stars,5,"Love you guys, one of my favorite bravo podcasts! I miss A, but it’s fun having the brother this week! You guys are ..."
4,903cba42379bf35ffe9d7172b482c91e,1,3 stars,3,"Pretty cringy of two middle-aged white men to be ragging so hard on Betty Hill, especially for a story set in the ‘6..."



--- Step 3: Aggregating Results ---


                                                                                

Podcasts with calculated sentiment: 1711
+--------------------------------+------------------+---------------------+-----------------+
|podcast_id                      |avg_sentiment_bert|num_reviews_with_bert|avg_rating_sample|
+--------------------------------+------------------+---------------------+-----------------+
|4376670eff5a24431539bed74eba9a9c|5.0               |1                    |5.0              |
|e7d6c4bdffba98ea2267f91a9a9a2763|5.0               |1                    |5.0              |
|3d00b9049c3e0e781e00369617ce7b2a|5.0               |1                    |5.0              |
|72280e0dd4a3590e7983392db50e3ccf|1.0               |1                    |1.0              |
|62499249e13cedf448186526a2034694|5.0               |1                    |5.0              |
+--------------------------------+------------------+---------------------+-----------------+
only showing top 5 rows


Final Aggregated Results (Pandas DataFrame):


Unnamed: 0,podcast_id,avg_sentiment_bert,num_reviews_with_bert,avg_rating_sample
0,4376670eff5a24431539bed74eba9a9c,5.0,1,5.0
1,e7d6c4bdffba98ea2267f91a9a9a2763,5.0,1,5.0
2,3d00b9049c3e0e781e00369617ce7b2a,5.0,1,5.0
3,72280e0dd4a3590e7983392db50e3ccf,1.0,1,1.0
4,62499249e13cedf448186526a2034694,5.0,1,5.0
5,687c85819c7fe71454a68286dc756ef2,5.0,1,5.0
6,9766d390b3459164b4158d492abbbca3,5.0,1,5.0
7,caba7b32181b486496ff69dc9a49d2f3,3.5,2,3.5
8,786ceb4e270f3ff580aa520a6cbe6b87,5.0,1,5.0
9,e3bb27bb4f48eb016de7981e5011bd41,5.0,1,5.0


In [18]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# -------------------------------------------------------------------
# 1. Join BERT Results with "Virtual" Podcast Metadata
# -------------------------------------------------------------------
# We join the BERT sentiment (from the sample) with the TRUE average stats (from all reviews)
final_analysis_df = (
    podcast_sentiment_bert.alias("bert")
    .join(virtual_podcast_meta.alias("meta"), "podcast_id", "inner")
    .select(
        "podcast_id",
        F.col("bert.avg_sentiment_bert"),          # AI Sentiment (1-5)
        F.col("meta.avg_rating_calculated"),       # Real User Average (1-5)
        F.col("meta.total_reviews_count")          # Popularity
    )
)

print(f"Final Analysis Dataset Size: {final_analysis_df.count()} podcasts")
final_analysis_df.show(5)

# -------------------------------------------------------------------
# 2. Correlation Analysis
# -------------------------------------------------------------------
print("\n--- Correlation Analysis ---")
# Does high AI sentiment actually correlate with high user ratings?
corr_rating = final_analysis_df.stat.corr("avg_sentiment_bert", "avg_rating_calculated")
print(f"Correlation (Sentiment vs User Rating): {corr_rating:.4f}")

# Does sentiment correlate with popularity?
corr_pop = final_analysis_df.stat.corr("avg_sentiment_bert", "total_reviews_count")
print(f"Correlation (Sentiment vs Popularity): {corr_pop:.4f}")

# -------------------------------------------------------------------
# 3. Predictive Model (Can Sentiment Predict 'Great' Podcasts?)
# -------------------------------------------------------------------
print("\n--- Predictive Model (Logistic Regression) ---")

# Define "Great" as > 4.5 stars
model_data = final_analysis_df.withColumn(
    "label", 
    (F.col("avg_rating_calculated") >= 4.5).cast("int")
)

# Features: Sentiment and Popularity
assembler = VectorAssembler(
    inputCols=["avg_sentiment_bert", "total_reviews_count"],
    outputCol="features"
)

model_data_assembled = assembler.transform(model_data).select("features", "label")

# Split & Train
train_df, test_df = model_data_assembled.randomSplit([0.8, 0.2], seed=42)
print(f"Train rows: {train_df.count()}, Test rows: {test_df.count()}")

if train_df.count() > 10:
    lr = LogisticRegression(featuresCol="features", labelCol="label")
    lr_model = lr.fit(train_df)

    preds = lr_model.transform(test_df)
    evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
    auc = evaluator.evaluate(preds)

    print(f"Test AUC: {auc:.4f}")
    print(f"Coefficients: {lr_model.coefficients}")
else:
    print("Not enough data to train model (need more samples).")

                                                                                

Final Analysis Dataset Size: 1711 podcasts


                                                                                

+--------------------+------------------+---------------------+-------------------+
|          podcast_id|avg_sentiment_bert|avg_rating_calculated|total_reviews_count|
+--------------------+------------------+---------------------+-------------------+
|445bd0e012aab91e7...|               1.0|    4.968871595330739|                257|
|4376670eff5a24431...|               5.0|    4.743589743589744|                156|
|807dea79bef8af455...|               5.0|    4.783687943262412|                282|
|a1fcaeb644c2e7cbf...|               5.0|             4.796875|                256|
|9f04d8da786c88e05...|               5.0|    4.178294573643411|                129|
+--------------------+------------------+---------------------+-------------------+
only showing top 5 rows


--- Correlation Analysis ---


                                                                                

Correlation (Sentiment vs User Rating): 0.3834


                                                                                

Correlation (Sentiment vs Popularity): -0.0890

--- Predictive Model (Logistic Regression) ---


                                                                                

Train rows: 1367, Test rows: 344


25/11/25 19:06:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/11/25 19:06:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

Test AUC: 0.7137
Coefficients: [0.4990162001432948,-0.0001638295457611611]
