In [1]:
from pyspark.sql.functions import explode, count, row_number, desc
from pyspark.sql.types import *
import pyspark 
from delta import *
from delta.tables import DeltaTable
from pyspark.sql.window import Window
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

def main():
    """Compute the top words of every review and upsert them into a Delta table.

    Steps:
      1. Read the review CSV (columns Review_id, User_id, Business_id, Text).
      2. Tokenize ``Text`` on non-word characters and drop English stop words.
      3. Count each remaining word per ``Review_id``.
      4. Keep the ``TOP_N`` most frequent words per review, ranked 1..TOP_N.
      5. If no Delta table exists at ``TABLE_PATH``, write the result there;
         otherwise merge it into the existing table keyed on
         ``(Review_id, Rank)``.

    The function takes no arguments and returns ``None``; its only effects
    are the printed partition count and the Delta table write/merge.
    """
    # Constants
    TABLE_PATH = "hdfs:///project/data/review_data/delta_table"
    CSV_PATH = "hdfs:///project/data/review_data/yelp_academic_dataset_review_output.csv"
    TOP_N = 3  # number of ranked words kept per review

    # Initiate spark. The chain is parenthesized rather than joined with
    # backslashes so that adding or commenting out a .config(...) line cannot
    # silently break the statement.
    # Tuning options that were tried and are currently disabled:
    #   spark.sql.adaptive.coalescePartitions.enabled = true
    #   spark.databricks.adaptive.autoOptimizeShuffle.enabled = true
    #   spark.sql.shuffle.partitions = 8
    #   spark.databricks.delta.merge.repartitionBeforeWrite.enabled = true
    #   spark.databricks.delta.optimizeWrite.enabled = true
    #   spark.databricks.delta.autoCompact.enabled = true
    builder = (
        pyspark.sql.SparkSession.builder.appName("spark_review")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.executor.cores", 4)
    )
    spark = configure_spark_with_delta_pip(builder).getOrCreate()

    # Create dataframe from csv file, hash-partitioned on Review_id so the
    # per-review aggregation and window below work on co-located rows.
    schema = "Review_id STRING, User_id STRING, Business_id STRING, Text STRING"
    df = spark.read.csv(CSV_PATH, schema=schema).repartition(200, "Review_id")
    print(df.rdd.getNumPartitions())

    # Split the review text into words and remove English stop words.
    tokenizer = RegexTokenizer(inputCol="Text", outputCol="Words", pattern="\\W")
    df_words = tokenizer.transform(df)
    stopwords = StopWordsRemover.loadDefaultStopWords("english")
    remover = StopWordsRemover(inputCol="Words", outputCol="Cleaned_Words", stopWords=stopwords)
    df_cleaned_words = remover.transform(df_words).select(
        "Review_id", explode("Cleaned_Words").alias("Word")
    )

    # Count the frequency of each word within each Review_id group
    df_word_counts = df_cleaned_words.groupBy("Review_id", "Word").agg(count("Word").alias("Count"))

    # Rank the words of each review by frequency. Ordering by Count alone
    # leaves ties unresolved, so row_number() could hand the same rank to a
    # different word on every run and the merge below would then rewrite
    # existing rows for no reason. Word is unique within a review after the
    # groupBy, so using it as a secondary key makes the ranking deterministic.
    window = Window.partitionBy("Review_id").orderBy(desc("Count"), "Word")
    df_top_words = df_word_counts.select(
        "*", row_number().over(window).alias("Rank")
    ).filter(f"Rank <= {TOP_N}")

    # If no delta table exists, save and exit
    if not DeltaTable.isDeltaTable(spark, TABLE_PATH):
        df_top_words.write.format("delta").save(TABLE_PATH)
        return

    # Upsert delta table. (Review_id, Rank) is unique in the source because
    # Rank comes from row_number() within each Review_id.
    column_mapping = {
        "Review_id": "new.Review_id",
        "Word": "new.Word",
        "Count": "new.Count",
        "Rank": "new.Rank",
    }
    delta_table = DeltaTable.forPath(spark, TABLE_PATH)
    (
        delta_table.alias("old")
        .merge(
            df_top_words.alias("new"),
            "old.Review_id = new.Review_id AND old.Rank = new.Rank",
        )
        # Only touch matched rows whose content actually changed; the
        # null-safe <=> comparison also treats a NULL on one side as a change.
        .whenMatchedUpdate(
            condition="NOT (old.Word <=> new.Word AND old.Count <=> new.Count)",
            set=column_mapping,
        )
        .whenNotMatchedInsert(values=column_mapping)
        .execute()
    )


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

:: loading settings :: url = jar:file:/home/ubuntu/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ebd2956e-2dcb-49df-8a52-a62ed824df92;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.3.0 in central
	found io.delta#delta-storage;2.3.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 213ms :: artifacts dl 9ms
	:: modules in use:
	io.delta#delta-core_2.12;2.3.0 from central in [default]
	io.delta#delta-storage;2.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0

23/05/03 21:33:14 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/05/03 21:33:40 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar added multiple times to distributed cache.
23/05/03 21:33:40 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar added multiple times to distributed cache.
23/05/03 21:33:40 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar added multiple times to distributed cache.


[Stage 0:>                                                         (0 + 0) / 34]

23/05/03 21:34:11 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/05/03 21:34:26 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/05/03 21:34:41 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/05/03 21:34:56 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources


[Stage 0:>                                                         (0 + 0) / 34]

23/05/03 21:35:11 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/05/03 21:35:26 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/05/03 21:35:41 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/05/03 21:35:56 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources




200




23/05/03 21:41:36 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                