In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta import *
from chunkipy import TextChunker, TokenEstimator
from numpy import exp
import boto3
import builtins
import json
import math
import os
import pandas as pd
import numpy as np
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, BertTokenizer, pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", framework="pt", device=0)

2024-03-17 17:42:24.192976: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-17 17:42:24.193030: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-17 17:42:24.194439: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-17 17:42:24.202284: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
builder = SparkSession.builder.appName("amzn-reviews") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.driver.cores", "2") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory","28g") \
    .config("spark.executor.cores","5") \
    .config("spark.executor.instances","2") \
    .config("spark.sql.execution.arrow.pyspark.enabled",True) \
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", True) \
    .config("spark.sql.parquet.mergeSchema", False) \
    .config("spark.hadoop.parquet.enable.summary-metadata", False) \
    .enableHiveSupport()

spark = configure_spark_with_delta_pip(builder).getOrCreate()

sc = spark.sparkContext

:: loading settings :: url = jar:file:/opt/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a48b2c38-b490-4cbd-ba8e-58acddd3c61d;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in local-m2-cache
	found io.delta#delta-storage;3.1.0 in local-m2-cache
	found org.antlr#antlr4-runtime;4.9.3 in local-m2-cache
:: resolution report :: resolve 136ms :: artifacts dl 7ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from local-m2-cache in [default]
	io.delta#delta-storage;3.1.0 from local-m2-cache in [default]
	org.antlr#antlr4-runtime;4.9.3 from local-m2-cache in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	--------------------------------------------------------------

In [4]:
## Variables
BOOTSTRAP_SERVERS = "boot-uuoa1swb.c1.kafka-serverless.us-east-1.amazonaws.com:9098"
REGION = "us-east-1"
TOPIC = "amznbookreviews"
SINK_BUCKET = "amzn-customer-reviews-228924278364"
SINK_PREFIX = "sink/llm-transformed-stream-foreachbatch/"
SINK_PATH = f"s3a://{SINK_BUCKET}/{SINK_PREFIX}"
CHECKPOINT_PATH = "s3a://amzn-customer-reviews-228924278364/checkpoint/"
MAX_ROW_SINK = 30

In [5]:
# Get secrets credential for S3a
REGION = "us-east-1"
client = boto3.client('secretsmanager',region_name=REGION)
response = client.get_secret_value(
    SecretId='s3all'
)
accessJson = json.loads(response['SecretString'])
accessKeyId = accessJson['accessKey']
secretAccessKey = accessJson['secretAccess']

# Configure S3a
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", accessKeyId)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", secretAccessKey)
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")

In [6]:
class BertTokenEstimator(TokenEstimator):
    def __init__(self):
        self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def estimate_tokens(self, text):
        return len(self.bert_tokenizer.encode(text))

In [7]:
def getPdfLLM(df):
    # Variables
    # Keep max token input much lower than max of 1024
    max_token_input = 900
    min_token_output = 130
    chunk_size = 512
    min_token_size = 200
    
    # Initializing summary to return
    summary_text = ""
    
    # Initialize BertEstimator
    bert_token_estimator = BertTokenEstimator()

    # Initialize Classifier
    classifier = pipeline(
        task="zero-shot-classification",
        device=0,
        model="facebook/bart-large-mnli"
    )

    classifier_labels = ['negative', 'positive', 'neutral']

    def batchSize(token_count, max_token_input):
        quotient = token_count / max_token_input
        remainder = token_count % max_token_input
        return math.floor(quotient), remainder

    def chunk(txt):
        token_count = bert_token_estimator.estimate_tokens(txt)
        text_chunker = TextChunker(chunk_size, tokens=True, token_estimator=BertTokenEstimator())
        chunks = text_chunker.chunk(txt)

        for i, chunk in enumerate(chunks):
            yield chunk

    def sentiment_func(result):
#        result = classifier(txt,classifier_labels,multi_label=True)
        labels = result["labels"]
        score = result["scores"]
        result_dict = {labels[i]: score[i] for i in range(len(labels))}

        # builtins required so as not to confuse with pyspark max() function
        sentiment = builtins.max(result_dict, key=result_dict.get)

        return sentiment

    def summarizer_func(corpus):
        summary_response = summarizer(corpus, max_length=130, min_length=30, do_sample=False)
        summary = summary_response[0]["summary_text"]
        return summary
        
    # Main code
    bartUpdated_list = []
    sentimentAnalyzed_list = []
    sentiment_list = []
    bartSummary_list = []
    reviewText = [d.reviewText for idx, d in df.iterrows()]
    
    for r in reviewText:
 
        token_count = bert_token_estimator.estimate_tokens(r)

        if token_count > max_token_input:

            # Sentiment chunking

            text_chunker = TextChunker(chunk_size, tokens=True, token_estimator=BertTokenEstimator())
            chunks = text_chunker.chunk(r)

            sentiment_results_list = []

            sum_positive = 0
            sum_negative = 0
            sum_neutral = 0

            corpus_chunks = []
            
            for i, chunk in enumerate(chunks):

                # Sentiment
                result = classifier(chunk,classifier_labels,multi_label=True)
                each_label = result["labels"]
                each_score = result["scores"]

                labels_positive_index = each_label.index("positive")
                labels_negative_index = each_label.index("negative")
                labels_neutral_index = each_label.index("neutral")

                sum_negative = sum_negative + result["scores"][labels_negative_index]
                sum_positive = sum_positive + result["scores"][labels_positive_index]
                sum_neutral = sum_neutral + result["scores"][labels_neutral_index]

                sentiment_results_list.append(each_score)

                # Summarization
                if bert_token_estimator.estimate_tokens(chunk)  <= min_token_size:
                    corpus_chunks.append(chunk)
                else:
                    logging.warning(f"CHUNK, token count: {token_count}")
                    corpus_chunks.append(summarizer_func(chunk))
            
            # Sentiment
            average_negative = sum_negative / i
            average_positive = sum_positive / i
            average_neutral = sum_neutral / i

            score = []
            score.append(average_negative)
            score.append(average_positive)
            score.append(average_neutral)

            result_dict = {classifier_labels[i]: score[i] for i in range(len(classifier_labels))}

            # builtins required so as not to confuse with pyspark max() function
            sentiment_label = builtins.max(result_dict, key=result_dict.get)


            # Summarization
            s = " "
            summary_text = s.join(corpus_chunks)
        
        else:
            # Sentiment
            result = classifier(r,classifier_labels,multi_label=True)
            sentiment_label = sentiment_func(result)

            # Summarization
            if token_count < min_token_output:
                summary_text = r
            else:
                logging.warning(f"No chunk, token count: {token_count}")
                summary_text = summarizer_func(r)

        bartUpdated_list.append("Y")
        sentimentAnalyzed_list.append("Y")
        sentiment_list.append(sentiment_label)
        bartSummary_list.append(summary_text)
        #print("bartsummarylist:", bartSummary_list)
        logging.warning(f"Length of summary list: {len(bartSummary_list)}")
    
    bartUpdated_array = np.array([bartUpdated_list])
    bartUpdated_concat = np.concatenate(bartUpdated_array)

    sentimentAnalyzed_array = np.array([sentimentAnalyzed_list])
    sentimentAnalyzed_concat = np.concatenate(sentimentAnalyzed_array)

    sentiment_array = np.array([sentiment_list])
    sentiment_concat = np.concatenate(sentiment_array)
    
    bartSummary_array = np.array([bartSummary_list])
    bartSummary_concat = np.concatenate(bartSummary_array)

    return_df = (
        df[[
            "asin","overall","reviewText","reviewTimeTS",
            "reviewerID","reviewerName","summary","verified",
            "row_number","asin_key","batch_id"]]
        .assign(bartUpdated=list(bartUpdated_concat))
        .assign(sentimentAnalyzed=list(sentimentAnalyzed_concat))
        .assign(sentiment=list(sentiment_concat))
        .assign(bartSummary=list(bartSummary_concat))
    )

    count_df_rows = return_df.count()

    logging.warning(f"DF rows: {count_df_rows}")
    
    return return_df

In [8]:
def llm_batch(schema, df):

    df_summary = ( df
      .groupBy(spark_partition_id().alias("_pid"))
      .applyInPandas(getPdfLLM,schema)
    )

    df_summary.write.format("delta").mode("append").save(SINK_PATH)


# ForeachBatch definition

In [9]:
def preBatchRecords(microBatchDf, batchSize):
    batch_count = math.ceil(microBatchDf.count() / batchSize)
    # % sign is modulus -- remainder after division
    microBatchDf = microBatchDf.withColumn("batch_id", col("row_number") % batch_count)
    microBatchDf = microBatchDf.withColumn("bartUpdated", lit("N"))
    microBatchDf = microBatchDf.withColumn("sentimentAnalyzed", lit("N"))
    microBatchDf = microBatchDf.withColumn("sentiment", lit(""))
    microBatchDf = microBatchDf.withColumn("bartSummary", lit(""))

    return microBatchDf

In [10]:
def callUdfBatch(df, batchId):
#    path = "s3a://amzn-customer-reviews-228924278364/sink/test/test-streaming-foreach-pandas/"
    submitted_df = preBatchRecords(df, 5)

    ## Temporary to test
    #ubmitted_df = submitted_df.limit(10)
    #logging.warning(f"Schema: {submitted_df.printSchema()}")

    ###############

    schema = StructType(
       [
           StructField('asin', StringType(), True),
           StructField('overall', DoubleType(), True),
           StructField('reviewText', StringType(), True),
           StructField('reviewTimeTS', TimestampType(), True),
           StructField('reviewerID', StringType(), True),
           StructField('reviewerName', StringType(), True),
           StructField('summary', StringType(), True),
           StructField('verified', BooleanType(), True),
           StructField('row_number', IntegerType(), True),
           StructField('asin_key', IntegerType(), True),
           StructField('batch_id', IntegerType(), True),
           StructField("bartUpdated", StringType(), True),
           StructField("sentimentAnalyzed", StringType(), True),
           StructField("sentiment", StringType(), True), 
           StructField("bartSummary", StringType(), True)
       ]
    )

    df_total_count = submitted_df.count()
    loops = math.ceil(df_total_count / MAX_ROW_SINK)

    # Create row_count
    submitted_df = submitted_df.withColumn("temp_column", lit("A"))
    w = Window().partitionBy("temp_column").orderBy(lit("A"))
    submitted_df = submitted_df.withColumn("row_id", row_number().over(w)).drop("temp_column")


    logging.warning(f"Row count of DF total: {df_total_count}")
    logging.warning(f"Number of loops: {loops}")

    for each_loop in range(loops):
        filter_start = 1 + ( MAX_ROW_SINK * each_loop )
        
        if each_loop == ( loops - 1 ):
            filter_end = df_count
        else:
            filter_end = filter_start + MAX_ROW_SINK

        eachloop_df =  submitted_df.filter((col("row_id") >= filter_start) & (col("row_id") <= filter_end))
        
        logging.warning(f"Submitted df batch: row start = {filter_start} | row end = {filter_end}") 
        llm_batch(schema, eachloop_df)
    

    #df_summary = ( df
    #  .groupBy(spark_partition_id().alias("_pid"))
    #  .applyInPandas(getPdfLLM,schema)
    #)

    #sink_path = "s3a://amzn-customer-reviews-228924278364/sink/test/test-streaming-bart-summarized/"
    #
    ###############

#    submitted_df.write.format("delta").mode("append").save(SINK_PATH)

In [None]:
#loop = 5
#for i in range(loop):
#    llm_batch()

# Reading from Kafka

In [11]:
## Setting the Kafka options
options_read = {
    "kafka.bootstrap.servers": BOOTSTRAP_SERVERS,
    "subscribe": TOPIC,
    "startingOffsets": "latest",
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "AWS_MSK_IAM",
    "kafka.sasl.jaas.config": "software.amazon.msk.auth.iam.IAMLoginModule required;",
    "kafka.sasl.client.callback.handler.class": "software.amazon.msk.auth.iam.IAMClientCallbackHandler",
    "maxFilesPerTrigger": 30
}

In [12]:
## Read from Kafka
df = spark \
    .readStream \
    .format("kafka") \
    .options(**options_read) \
    .load()

# Writing to S3

In [13]:
## Setting schema of Kafka message topic
json_schema = StructType([
    StructField('asin', StringType(), True),
    StructField('overall', DoubleType(), True),
    StructField('reviewText', StringType(), True),
    StructField('reviewTimeTS', TimestampType(), True),
    StructField('reviewerID', StringType(), True),
    StructField('reviewerName', StringType(), True),
    StructField('summary', StringType(), True),
    StructField('verified', BooleanType(), True),
    StructField('row_number', IntegerType(), True),
    StructField('asin_key', IntegerType(), True)
])

In [14]:
streamHandle = (df
                .selectExpr("CAST(key as STRING)","CAST(value as STRING)")
                .select(from_json("value",json_schema).alias("data")).select("data.*")
                .writeStream
                .foreachBatch(callUdfBatch)
                .option("checkpointLocation", CHECKPOINT_PATH)
                .trigger(processingTime='10 seconds')
                .start()
               )

#                

streamHandle.awaitTermination()

24/03/17 17:42:43 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/03/17 17:42:44 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/03/17 17:43:00 WARN ParquetOutputFormat: Setting parquet.enable.summary-metadata is deprecated, please use parquet.summary.metadata.level
2024-03-17 17:43:06.111192: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-17 17:43:06.111243: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-17 17:43:06.112416: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register 

StreamingQueryException: [STREAM_FAILED] Query [id = e938420b-8b04-4515-b3fa-3e68ee1a7dd0, runId = 3c37b7df-e0b4-4e85-81ea-86ba722ea8b1] terminated with exception: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/broadcast.py", line 183, in dump
    pickle.dump(value, f, pickle_protocol)
  File "/usr/lib/python3.10/tempfile.py", line 622, in func_wrapper
    return func(*args, **kwargs)
OSError: [Errno 28] No space left on device

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/sql/utils.py", line 120, in call
    raise e
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/sql/utils.py", line 117, in call
    self.func(DataFrame(jdf, wrapped_session_jdf), batch_id)
  File "/tmp/ipykernel_169653/4163429986.py", line 54, in callUdfBatch
    llm_batch(schema, eachloop_df)
  File "/tmp/ipykernel_169653/897389054.py", line 5, in llm_batch
    .applyInPandas(getPdfLLM,schema)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/sql/pandas/group_ops.py", line 230, in applyInPandas
    udf_column = udf(*[df[col] for col in df.columns])
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/sql/udf.py", line 425, in wrapper
    return self(*args)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/sql/udf.py", line 402, in __call__
    judf = self._judf
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/sql/udf.py", line 322, in _judf
    self._judf_placeholder = self._create_judf(self.func)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/sql/udf.py", line 331, in _create_judf
    wrapped_func = _wrap_function(sc, func, self.returnType)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/sql/udf.py", line 60, in _wrap_function
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/rdd.py", line 5255, in _prepare_for_python_RDD
    broadcast = sc.broadcast(pickled_command)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/context.py", line 1765, in broadcast
    return Broadcast(self, value, self._pickled_broadcast_vars)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/broadcast.py", line 135, in __init__
    self.dump(value, broadcast_out)  # type: ignore[arg-type]
  File "/home/ubuntu/.local/lib/python3.10/site-packages/pyspark/broadcast.py", line 189, in dump
    raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize broadcast: OSError: [Errno 28] No space left on device


# View results

In [15]:
df_check = spark.read.format("delta").load(SINK_PATH)

In [16]:
df_check.printSchema()

root
 |-- asin: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTimeTS: timestamp (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- row_number: integer (nullable = true)
 |-- asin_key: integer (nullable = true)
 |-- batch_id: integer (nullable = true)
 |-- bartUpdated: string (nullable = true)
 |-- sentimentAnalyzed: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- bartSummary: string (nullable = true)



# Count difference between summarized review and actual review

In [18]:
df_summary = ( df_check.select("asin_key",length(df_check.reviewText).alias("lengthText"),
                           length(df_check.bartSummary).alias("lengthBartSummary"),"bartSummary",
                ( length(df_check.reviewText) - length(df_check.bartSummary) ).alias("lengthDiff"))
          )

In [19]:
df_summary.sort(df_summary.lengthDiff.desc()).show(20)

+---------+----------+-----------------+--------------------+----------+
| asin_key|lengthText|lengthBartSummary|         bartSummary|lengthDiff|
+---------+----------+-----------------+--------------------+----------+
| 62697439|      6005|              818|A liberal Democra...|      5187|
| 62655760|      3454|              386|Cassandra Holwell...|      3068|
| 61173754|      3110|              352|Let Us Compare My...|      2758|
| 62663674|      2832|              299|True Stories from...|      2533|
|     NULL|      2620|              165|White Hot is book...|      2455|
|125643202|      2710|              360|Charity Church ha...|      2350|
| 62448110|      2172|              221|I almost stopped ...|      1951|
|125384976|      1947|              314|Lenora Bell uses ...|      1633|
| 62427059|      1860|              290|Swanson uses four...|      1570|
|  7548672|      1650|              195|The prose was won...|      1455|
|     NULL|      1427|              201|Although I'

# View review text and summary in full

In [23]:
asin_key_view = 62697439

In [24]:
df_view_summary = (
    df_check.filter(f"asin_key=={asin_key_view}").distinct()
       .select("asin_key","reviewText","bartSummary")
       .collect()
)

# Original text

In [25]:
print(df_view_summary[0]["reviewText"])

This book by a liberal Democrat attempts to diagnose the partys failure in the 2016 election and tentatively proposes a remedy. His diagnosis is presented in terms of two dispensations that he finds in American politics since the New Deal.  The word refers to what my dictionary calls a divinely ordained order prevailing at a particular period in history. (The author does not discuss the question of whether the orders he describes were divinely ordained.) To me, his diagnosis seems accurate. But at the end, when he proposes a remedy, he is not convincing. He and I have different political orientations. His book gave me an excellent understanding of the problems liberals face although it is not what he evidently intended.

The first or Roosevelt dispensation began with the New Deal and ended in 1980 with the defeat of Carter, who is described as disjunctive, marking the end of the period. Then began the Reagan dispensation which is now coming to an end (the author hopes) with Trump. The 

# View summary

In [26]:
print(df_view_summary[0]["bartSummary"])

A liberal Democrat attempts to diagnose the partys failure in the 2016 election and tentatively proposes a remedy. His diagnosis is presented in terms of two dispensations that he finds in American politics since the New Deal. The crucial element in each of these eras is a conjunction of major events with attitudes. The author, a Professor of Humanities at Columbia, discusses the excesses of identity politics at length. He admires conservatives for their political cleverness, while not acknowledging that liberals do not have facts on their side. In the last chapter, the writer looks for a solution to the current social fragmentation, trying to find a basis for consensus. He declares that a we must be formed out of the competing identities. The author thinks a we is to be found in the concept of citizenship.


# View Sentiment

In [27]:
df_sentiment = df_check.select("asin_key","overall","reviewtext","sentiment")
df_sentiment.show(10)

+--------+-------+--------------------+---------+
|asin_key|overall|          reviewtext|sentiment|
+--------+-------+--------------------+---------+
|60262141|    5.0|Our granddaughter...| positive|
|    NULL|    5.0|I loved this book...| positive|
| 7548672|    4.0|Overall, I found ...| negative|
|60526157|    1.0|If you enjoyed wh...| negative|
|60554738|    5.0|A great book abou...| positive|
|    NULL|    5.0|Last read the boo...| positive|
|60595183|    5.0|Good purchase gre...| positive|
| 8131996|    5.0|I honestly don't ...| positive|
|    NULL|    5.0|         Great Book!| positive|
|60746394|    5.0|the film did not ...| negative|
+--------+-------+--------------------+---------+
only showing top 10 rows



In [28]:
df_sentiment.groupBy("sentiment").count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive|  119|
| negative|   36|
+---------+-----+



In [32]:
df_sentiment.groupBy("overall","sentiment").count().orderBy(desc("overall"),desc("sentiment")).show()

+-------+---------+-----+
|overall|sentiment|count|
+-------+---------+-----+
|    5.0| positive|   92|
|    5.0| negative|    6|
|    4.0| positive|   23|
|    4.0| negative|   10|
|    3.0| positive|    4|
|    3.0| negative|    8|
|    2.0| negative|    5|
|    1.0| negative|    7|
+-------+---------+-----+



# Looking at a positive sentiment

In [34]:
asin_key_check = 60262141
df_sentiment_check = df_sentiment.select("reviewText","overall","sentiment") \
   .filter(f"asin_key == {asin_key_check}").collect()

In [35]:
checktext = df_sentiment_check[0]["reviewText"]
checkoverall = df_sentiment_check[0]["overall"]
checksentiment = df_sentiment_check[0]["sentiment"]
print(f"Customer indicated score: {checkoverall} | Sentiment: {checksentiment}")
print("======================")
print(checktext)

Customer indicated score: 5.0 | Sentiment: positive
Our granddaughter, Sweet Emma, lives in Barcelona & is being raised bilingual.
Perfect!


# Looking at a negative sentiment

In [36]:
asin_key_check = 60526157
df_sentiment_check = df_sentiment.select("reviewText","overall","sentiment") \
   .filter(f"asin_key == {asin_key_check}").collect()

In [37]:
checktext = df_sentiment_check[0]["reviewText"]
checkoverall = df_sentiment_check[0]["overall"]
checksentiment = df_sentiment_check[0]["sentiment"]
print(f"Customer indicated score: {checkoverall} | Sentiment: {checksentiment}")
print("======================")
print(checktext)

Customer indicated score: 1.0 | Sentiment: negative
If you enjoyed where the Red Fern Grows this is the book for you.  Sadist.
