In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta import *
from chunkipy import TextChunker, TokenEstimator
from numpy import exp
import boto3
import builtins
import json
import math
import os
import pandas as pd
import numpy as np
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, BertTokenizer, pipeline
#tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
# Notebook-level BART summarization pipeline (PyTorch backend, device index 0).
# getPdfLLM's summarizer_func calls this object by its global name.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", framework="pt", device=0)

2024-03-14 16:56:21.747720: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 16:56:21.747776: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 16:56:21.749187: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-14 16:56:21.757007: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Spark session for the "amzn-reviews" app with the Delta Lake extension/catalog,
# Arrow-backed pandas conversion and Hive support enabled.
spark_settings = [
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.driver.cores", "2"),
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "28g"),
    ("spark.executor.cores", "5"),
    ("spark.executor.instances", "2"),
    ("spark.sql.execution.arrow.pyspark.enabled", True),
    ("spark.sql.execution.arrow.pyspark.fallback.enabled", True),
    ("spark.sql.parquet.mergeSchema", False),
    ("spark.hadoop.parquet.enable.summary-metadata", False),
]

builder = SparkSession.builder.appName("amzn-reviews")
for option_name, option_value in spark_settings:
    # Applied in the order listed above.
    builder = builder.config(option_name, option_value)
builder = builder.enableHiveSupport()

# configure_spark_with_delta_pip wraps the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

sc = spark.sparkContext

:: loading settings :: url = jar:file:/opt/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d72b815e-a630-4255-88c4-7b93d3ed7f33;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in local-m2-cache
	found io.delta#delta-storage;3.1.0 in local-m2-cache
	found org.antlr#antlr4-runtime;4.9.3 in local-m2-cache
:: resolution report :: resolve 138ms :: artifacts dl 6ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from local-m2-cache in [default]
	io.delta#delta-storage;3.1.0 from local-m2-cache in [default]
	org.antlr#antlr4-runtime;4.9.3 from local-m2-cache in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	--------------------------------------------------------------

In [4]:
# Read the S3 credentials from the Secrets Manager secret 's3all'; its SecretString
# is JSON with the keys 'accessKey' and 'secretAccess'.
REGION = "us-east-1"
client = boto3.client('secretsmanager', region_name=REGION)
response = client.get_secret_value(SecretId='s3all')
accessJson = json.loads(response['SecretString'])
accessKeyId = accessJson['accessKey']
secretAccessKey = accessJson['secretAccess']

# Push the credentials and S3A connector settings into the Hadoop configuration
# of the running SparkContext.
hadoop_conf = sc._jsc.hadoopConfiguration()
for s3a_option, s3a_value in (
    ("fs.s3a.access.key", accessKeyId),
    ("fs.s3a.secret.key", secretAccessKey),
    ("fs.s3a.path.style.access", "true"),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com"),
):
    hadoop_conf.set(s3a_option, s3a_value)

In [5]:
class BertTokenEstimator(TokenEstimator):
    """Token estimator backed by the ``bert-base-uncased`` tokenizer."""

    def __init__(self):
        # The tokenizer is loaded each time an estimator is instantiated.
        self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def estimate_tokens(self, text):
        """Return how many token ids the tokenizer's ``encode`` produces for ``text``."""
        token_ids = self.bert_tokenizer.encode(text)
        return len(token_ids)

In [6]:
def checkS3PrefixExist(bucket,prefix):
    """Report whether the object key ``prefix`` exists in ``bucket``.

    Issues a ``head_object`` request and returns:
      * ``"y"`` when the request succeeds,
      * ``"n"`` when it fails with a ClientError whose error code is ``'404'``,
      * ``"something else"`` for any other ClientError.
    Exceptions that are not ClientError propagate to the caller.
    """
    s3 = boto3.client('s3')

    try:
        s3.head_object(Bucket=bucket, Key=prefix)
    except s3.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        return "n" if error_code == '404' else "something else"

    return "y"

def createS3Prefix(bucket,prefix):
    """Create an object with no body at key ``prefix`` in ``bucket``."""
    boto3.client('s3').put_object(Bucket=bucket, Key=prefix)

def createDeltaSummarizedTable(path):
    """Create (or replace) the Delta table at ``path`` that stores LLM results.

    The table has one INT column (``asin_key``) followed by five STRING columns.
    Uses the notebook-level ``spark`` session.
    """
    table_columns = (
        ("asin_key", "INT"),
        ("reviewText", "STRING"),
        ("bartUpdated", "STRING"),
        ("sentimentAnalyzed", "STRING"),
        ("sentiment", "STRING"),
        ("bartSummary", "STRING"),
    )

    table_builder = DeltaTable.createOrReplace(spark)
    for column_name, column_type in table_columns:
        table_builder = table_builder.addColumn(column_name, column_type)

    table_builder.location(path).execute()


# calculate the softmax of a vector
def softmax(vector):
    """Return the softmax of ``vector`` as a float numpy array.

    The maximum is subtracted before exponentiating, which leaves the result
    mathematically unchanged but keeps large inputs (e.g. 1000) from
    overflowing to ``inf`` and producing NaN.

    Args:
        vector: array-like of real numbers.

    Returns:
        numpy.ndarray of the same shape whose entries sum to 1
        (an empty array is returned unchanged).
    """
    values = np.asarray(vector, dtype=float)
    if values.size == 0:
        return values

    # np.exp is referenced explicitly: this notebook star-imports
    # pyspark.sql.functions, which also exports a name called ``exp``.
    e = np.exp(values - values.max())
    return e / e.sum()


In [7]:
def getPdfLLM(df):
    """Add a sentiment label and a BART summary to one pandas batch of reviews.

    Passed to ``applyInPandas`` in ``llm_batch``. The signature must stay a
    single positional parameter with no extras (not even defaulted ones).

    Args:
        df: pandas DataFrame with at least the columns ``asin_key`` and
            ``reviewText``.

    Returns:
        pandas DataFrame with the columns ``asin_key``, ``reviewText``,
        ``bartUpdated``, ``sentimentAnalyzed``, ``sentiment`` and
        ``bartSummary`` (one row per input row, same order).

    Behaviour per review:
      * Missing or blank text: no model is called; ``sentiment`` and (for
        missing text) ``bartSummary`` are left null. The row is still flagged
        "Y" so it is not selected again by ``llm_batch``.
      * More than ``max_token_input`` tokens: the text is split into chunks,
        every chunk is classified and the per-label scores are averaged over
        the chunks; chunks above ``min_token_size`` tokens are summarized and
        the pieces are joined with a space.
      * Otherwise: the whole text is classified; it is summarized only when it
        has at least ``summary_max_tokens`` tokens.
    """
    # Sizes below are token counts as reported by BertTokenEstimator.
    max_token_input = 1024     # longer reviews are processed chunk by chunk
    chunk_size = 512           # chunk size handed to TextChunker
    min_token_size = 200       # chunks up to this size are kept verbatim
    summary_max_tokens = 130   # summarizer max_length; shorter reviews are kept verbatim
    summary_min_tokens = 30    # summarizer min_length

    classifier_labels = ['negative', 'positive', 'neutral']

    # One estimator for the whole batch: constructing it loads the tokenizer,
    # so it is reused for chunking instead of being rebuilt per review.
    bert_token_estimator = BertTokenEstimator()

    classifier = pipeline(
        task="zero-shot-classification",
        device=0,
        model="facebook/bart-large-mnli"
    )

    def summarizer_func(corpus):
        """Summarize ``corpus`` with the notebook-level ``summarizer`` pipeline."""
        summary_response = summarizer(
            corpus,
            max_length=summary_max_tokens,
            min_length=summary_min_tokens,
            do_sample=False,
        )
        return summary_response[0]["summary_text"]

    def classify(text):
        """Return ``{label: score}`` for ``text`` in the order the classifier reports."""
        result = classifier(text, classifier_labels, multi_label=True)
        return dict(zip(result["labels"], result["scores"]))

    def top_label(scores):
        """Return the label with the highest score."""
        # builtins.max: the star import of pyspark.sql.functions shadows max().
        return builtins.max(scores, key=scores.get)

    def analyze_long(text):
        """Chunked path: average sentiment over chunks, summarize chunk by chunk."""
        text_chunker = TextChunker(chunk_size, tokens=True, token_estimator=bert_token_estimator)
        chunks = list(text_chunker.chunk(text))
        if not chunks:
            # Defensive: keep the division below well defined even if the
            # chunker returns nothing.
            chunks = [text]

        totals = dict.fromkeys(classifier_labels, 0.0)
        summary_pieces = []

        for piece in chunks:
            scores = classify(piece)
            for label in classifier_labels:
                totals[label] += scores[label]

            if bert_token_estimator.estimate_tokens(piece) <= min_token_size:
                summary_pieces.append(piece)
            else:
                summary_pieces.append(summarizer_func(piece))

        # Divide by the number of chunks. The previous code divided by the last
        # loop index (count - 1), which raised ZeroDivisionError for a single chunk.
        chunk_count = len(chunks)
        averages = {label: totals[label] / chunk_count for label in classifier_labels}

        return top_label(averages), " ".join(summary_pieces)

    def analyze_short(text, token_count):
        """Single-pass path for reviews that fit the model input."""
        sentiment_label = top_label(classify(text))
        if token_count < summary_max_tokens:
            return sentiment_label, text
        return sentiment_label, summarizer_func(text)

    # Main code
    sentiment_list = []
    bartSummary_list = []

    for r in df["reviewText"].tolist():
        if not isinstance(r, str) or not r.strip():
            # Nothing to analyse; avoid feeding None/blank text to the models.
            sentiment_list.append(None)
            bartSummary_list.append(r if isinstance(r, str) else None)
            continue

        token_count = bert_token_estimator.estimate_tokens(r)

        if token_count > max_token_input:
            sentiment_label, summary_text = analyze_long(r)
        else:
            sentiment_label, summary_text = analyze_short(r, token_count)

        sentiment_list.append(sentiment_label)
        bartSummary_list.append(summary_text)

    return_df = df[["asin_key", "reviewText"]].assign(
        bartUpdated="Y",
        sentimentAnalyzed="Y",
        sentiment=sentiment_list,
        bartSummary=bartSummary_list,
    )

    return return_df

In [8]:
def llm_batch(batch_size=10):
    """Run one round of LLM enrichment over reviews that are still pending.

    Steps:
      1. Make sure the S3 marker object and the Delta table for the
         LLM-transformed output exist.
      2. Take up to ``batch_size`` rows of the main reviews table whose
         ``bartUpdated`` is "N" and whose ``asin_key`` is not null.
      3. Run ``getPdfLLM`` on them through ``applyInPandas`` and append the
         result to the LLM-transformed table.
      4. Merge the LLM-transformed table back into the main table on
         ``asin_key``, updating the four LLM columns.

    Args:
        batch_size: maximum number of reviews processed in this round
            (default 10, the previously hard-coded value).

    Uses the notebook-level ``spark`` session; returns nothing.
    """
    bucket = "amzn-customer-reviews-228924278364"
    prefix = "sink/llm-transformed"
    path = f"s3a://{bucket}/sink/amzn-reviews-books/"
    llm_transformed_path = f"s3a://{bucket}/{prefix}"

    # Check for and create the SAME key. Previously the check used
    # "sink/llm-transformed" while the created key was "sink/llm-transformed/",
    # and the check was issued twice with the first result discarded.
    marker_key = prefix + "/"
    if checkS3PrefixExist(bucket, marker_key) == "n":
        createS3Prefix(bucket, marker_key)

    # Create the output Delta table on the first run.
    if not DeltaTable.isDeltaTable(spark, llm_transformed_path):
        createDeltaSummarizedTable(llm_transformed_path)

    # Pending reviews for this round.
    reviews = spark.read.format("delta").load(path)
    pending = (
        reviews
        .filter((reviews.bartUpdated == "N") & (reviews.asin_key.isNotNull()))
        .select("asin_key", "reviewText")
        .limit(batch_size)
    )

    # Output schema of getPdfLLM.
    schema = StructType(
        [
            StructField("asin_key", IntegerType(), True),
            StructField("reviewText", StringType(), True),
            StructField("bartUpdated", StringType(), True),
            StructField("sentimentAnalyzed", StringType(), True),
            StructField("sentiment", StringType(), True),
            StructField("bartSummary", StringType(), True),
        ]
    )

    # Grouping by the Spark partition id hands each partition to getPdfLLM as
    # one pandas DataFrame.
    df_summary = (
        pending
        .groupBy(spark_partition_id().alias("_pid"))
        .applyInPandas(getPdfLLM, schema)
    )

    df_summary.write.format("delta").mode("append").save(llm_transformed_path)

    # Propagate the results (including bartUpdated = "Y") to the main table.
    deltaTableMain = DeltaTable.forPath(spark, path)
    deltaTableUpdateSource = DeltaTable.forPath(spark, llm_transformed_path)

    # A Delta MERGE fails when several source rows match the same target row,
    # so the source is reduced to one row per asin_key.
    # NOTE(review): the merge key is asin_key only, so every main-table row
    # sharing an asin_key receives the same summary/sentiment -- confirm that
    # asin_key is the intended granularity.
    dfUpdates = deltaTableUpdateSource.toDF().dropDuplicates(["asin_key"])

    (
        deltaTableMain.alias('main')
        .merge(
            dfUpdates.alias('updates'),
            'main.asin_key = updates.asin_key'
        )
        .whenMatchedUpdate(
            set={
                "bartUpdated": "updates.bartUpdated",
                "sentimentAnalyzed": "updates.sentimentAnalyzed",
                "sentiment": "updates.sentiment",
                "bartSummary": "updates.bartSummary",
            }
        )
        .execute()
    )


# Run for a few rounds

In [9]:
# Number of llm_batch rounds to run back to back.
loop = 5
for _round in range(loop):
    llm_batch()

24/03/14 16:56:46 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/03/14 16:56:53 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/03/14 16:56:56 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/03/14 16:56:56 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/03/14 16:56:57 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/03/14 16:56:57 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore ubuntu@172.31.73.213
24/03/14 16:56:57 WARN ObjectStore: Failed to get database delta, returning NoSuchObjectException
24/03/14 16:57:10 WARN ParquetOutputFormat: Setting par

# View Result

In [10]:
# Notebook-scope copies of the two table locations used inside llm_batch.
# `path` (main reviews table) is read again in the sentiment comparison further
# down; `df` is the LLM-transformed Delta table inspected by the cells that follow.
path = "s3a://amzn-customer-reviews-228924278364/sink/amzn-reviews-books/"
llm_transformed_path = "s3a://amzn-customer-reviews-228924278364/sink/llm-transformed"
df = spark.read.format("delta").load(llm_transformed_path)

In [11]:
# Schema of the LLM-transformed table loaded into `df`.
df.printSchema()

root
 |-- asin_key: integer (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- bartUpdated: string (nullable = true)
 |-- sentimentAnalyzed: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- bartSummary: string (nullable = true)



# Length difference (in characters) between the original review and its summary

In [12]:
# Character lengths of the original review and of its summary, plus their difference.
review_length = length(df.reviewText)
summary_length = length(df.bartSummary)

df_check = df.select(
    "asin_key",
    review_length.alias("lengthText"),
    summary_length.alias("lengthBartSummary"),
    "bartSummary",
    (review_length - summary_length).alias("lengthDiff"),
)

In [13]:
# Top 20 rows by lengthDiff, largest reduction first.
df_check.sort(df_check.lengthDiff.desc()).show(20)

+---------+----------+-----------------+--------------------+----------+
| asin_key|lengthText|lengthBartSummary|         bartSummary|lengthDiff|
+---------+----------+-----------------+--------------------+----------+
|449127982|      4872|             1217|Matt Helm has had...|      3655|
|470124172|      3811|              262|The book would be...|      3549|
|486253864|      2924|              314|The beautiful clo...|      2610|
|671024442|      2606|              307|The End of Everyt...|      2299|
|545074584|      2143|              314|Henry's father al...|      1829|
|451461444|      1948|              232|There is so much ...|      1716|
|446579815|      1992|              299|Christopher Buckl...|      1693|
|451461754|      1783|              247|Seattle P.I. Harp...|      1536|
|439845092|      1716|              255|Homeschooler's da...|      1461|
|385518927|      1563|              197|Being nice builds...|      1366|
|385340559|      1567|              230|Bad Luck an

In [14]:
# Same ranking after dropping exact duplicate rows.
df_check.distinct().sort(df_check.lengthDiff.desc()).show(20)

+---------+----------+-----------------+--------------------+----------+
| asin_key|lengthText|lengthBartSummary|         bartSummary|lengthDiff|
+---------+----------+-----------------+--------------------+----------+
|449127982|      4872|             1217|Matt Helm has had...|      3655|
|470124172|      3811|              262|The book would be...|      3549|
|486253864|      2924|              314|The beautiful clo...|      2610|
|671024442|      2606|              307|The End of Everyt...|      2299|
|545074584|      2143|              314|Henry's father al...|      1829|
|451461444|      1948|              232|There is so much ...|      1716|
|446579815|      1992|              299|Christopher Buckl...|      1693|
|451461754|      1783|              247|Seattle P.I. Harp...|      1536|
|439845092|      1716|              255|Homeschooler's da...|      1461|
|385518927|      1563|              197|Being nice builds...|      1366|
|385340559|      1567|              230|Bad Luck an

# View actual text 

In [17]:
# Collect the review/summary pair for asin_key 449127982 (the top row of the
# length-difference ranking) so the full texts can be printed below.
selected_review_rows = df.filter("asin_key==449127982").distinct()
df_view_summary = selected_review_rows.select("asin_key","reviewText","bartSummary").collect()

# Original text

In [18]:
# Full original review text of the first collected row.
print(df_view_summary[0]["reviewText"])

Mission Two: The Wrecking Crew
Matt Helm series Reviews by Ujjwal Dey

Well the first one is beaten to grit and grim by this second one in the series. A little longer at 176 pages, still an easy read for my weekend; this one brings Helm into a whole new line of action in espionage business.

Now Matt Helm has had his refresher course in the covert groups training - the American Mordgruppe - The Wrecking Crew - an unknown, unspoken elite group of operatives who generally work alone to do what armies and clouts of bureaucrats can't achieve. The man is just right for the job. The trainers believe he is in no shape to be an operative and certainly past his prime. His bad new resume was certain to get him killed on a field mission. Mac agrees - he is just the man for this job.

Matt Helm now has to play dumb; to act like a clumsy ancient World War trooper who can't call the shots in this peacetime covert warfare. As a photographer for an American magazine he lands up in Artic Europe to shoo

# Summarized text

In [19]:
# Summary stored in bartSummary for the same row.
print(df_view_summary[0]["bartSummary"])

Matt Helm has had his refresher course in the covert groups training - the American Mordgruppe - The Wrecking Crew. Helm has to contend with getting bruised and bashed around to prove himself harmless to a variety of operatives. The violence is as bad if not worse than the first book - which is a good thing in any gritty espionage thriller. The geographical descriptions and accuracy in detail is wonderful and you can imagine yourself tracing his trail across the mountains and into wilderness in the Arctic. At the end Helm proves himself to be as cunning and ruthless as his Russian rival. If you thought gadgets and expensive machinery with latest guns was the way an agent wins a war - you have watched too many James Bond movies. This book's account shows us in a believable and clinical clarity how a secret agent would go through with his mission in a foreign country. Matt Helm is no great fist-fighter but he knows how to fight and here we see him use more of the matter between his ears 

# View Sentiment

In [20]:
# Put each predicted sentiment next to the `overall` value from the main reviews
# table (printed later as the customer-indicated score) by joining on asin_key.
df_sentiment = df.select("asin_key","reviewtext","sentiment")

df_main = spark.read.format("delta").load(path)
df_main_select = df_main.select(
    df_main.asin_key.alias("asin_key_main"),  # renamed so both key columns survive the join
    "asin",
    "overall",
)

sentiment_join_condition = df_sentiment.asin_key == df_main_select.asin_key_main
df_sentiment_main = df_sentiment.join(
    df_main_select,
    on=sentiment_join_condition,
    how="inner",
)

In [21]:
# Columns of the joined sentiment / rating frame.
df_sentiment_main.printSchema()

root
 |-- asin_key: integer (nullable = true)
 |-- reviewtext: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- asin_key_main: integer (nullable = true)
 |-- asin: string (nullable = true)
 |-- overall: double (nullable = true)



In [22]:
# Preview ten joined rows: predicted sentiment next to the `overall` rating.
df_sentiment_main.show(10)

+---------+--------------------+---------+-------------+----------+-------+
| asin_key|          reviewtext|sentiment|asin_key_main|      asin|overall|
+---------+--------------------+---------+-------------+----------+-------+
|380770741|Didn't really car...| negative|    380770741|0380770741|    2.0|
|385340559|As you'll begin t...| positive|    385340559|0385340559|    5.0|
|385518927|You may not be fa...| positive|    385518927|0385518927|    4.0|
|393017206|Update after seco...| positive|    393017206|0393017206|    5.0|
|395557011|My son is 2 years...| positive|    395557011|0395557011|    5.0|
|415325102|One need not be a...| positive|    415325102|0415325102|    4.0|
|425221644|The Red Scarf was...| positive|    425221644|0425221644|    5.0|
|439845092|When using the Bo...| positive|    439845092|0439845092|    5.0|
|439895766|The premise of th...| negative|    439895766|0439895766|    2.0|
|441013651|What makes this b...| positive|    441013651|0441013651|    5.0|
+---------+-

In [23]:
# Joined-row count per predicted sentiment label.
# NOTE(review): these are rows of the asin_key join, not rows of the
# LLM-transformed table -- the join can match several main-table rows per key.
df_sentiment_main.groupBy("sentiment").count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive|15060|
| negative| 6451|
+---------+-----+



# Looking at a positive sentiment

In [28]:
# Fetch the review text (LLM-transformed table) and the `overall` rating
# (main table) for asin_key 385340559.
df_sentiment_check = (
    df.select("reviewText")
    .where("asin_key == 385340559")
    .collect()
)
df_main_check = (
    df_main.select("overall")
    .where("asin_key == 385340559")
    .collect()
)

In [29]:
# Show the rating and the review text of the first matching rows.
checktext = df_sentiment_check[0]["reviewText"]
checkoverall = df_main_check[0]["overall"]
print(
    f"Customer indicated score: {checkoverall}",
    "======================",
    checktext,
    sep="\n",
)

Customer indicated score: 5.0
As you'll begin to see by the books I review here, I have very eclectic taste in books. Some might think the Reacher books by Lee Child are more suited to a male audience, but, hey, this is the era of the kickass girl.

As a woman who took karate in Okinawa (decades ago before that was a common thing) with an Army tank of a sensei, I like to think I fit the kickass paradigm. I wholeheartedly enjoy these books so read on. You may find you enjoy them too.

If you've ever watched The Unit starring Dennis Haysbert then it's not a great stretch to imagine if The Unit personnel retired and wrote books, the books would be pretty much like a Jack Reacher novel.

In Bad Luck and Trouble, members of Reacher's Special Investigators are being killed. The other members ride to the rescue. Reacher and his former team members, notably Frances Neagley whose picture probably resides next to the word kickass in the dictionary, set about making things right and making those 

# Looking at a negative sentiment

In [30]:
# Fetch the review text (LLM-transformed table) and the `overall` rating
# (main table) for asin_key 380770741.
df_sentiment_check = (
    df.select("reviewText")
    .where("asin_key == 380770741")
    .collect()
)
df_main_check = (
    df_main.select("overall")
    .where("asin_key == 380770741")
    .collect()
)

                                                                                

In [31]:
# Show the rating and the review text of the first matching rows.
checktext = df_sentiment_check[0]["reviewText"]
checkoverall = df_main_check[0]["overall"]
print(
    f"Customer indicated score: {checkoverall}",
    "======================",
    checktext,
    sep="\n",
)

Customer indicated score: 2.0
Didn't really care for this book. I did not like Grace at all, she was without honor. She comes to marry one man for his money and lets another man(not knowing he is the man she is to marry)seduce her after only the second time they had met and so forth. And Arran was no better,knowing Grace was marrying him for his money and was cheating on him (with what Grace thought was another man)(that's what his last wife did to him right;the reason he didn't want to marry again)felt he was falling in love with Grace on the third day. I'm sorry... but what crap! If you like petting and panting after just a few pages, this is the book for you, but if you are like me and like a strong female/male relationship where they actually take some time to know and have respect for each other first, you may want to give this book a pass.


# Check the main Delta table: the bartUpdated column should now be set to Y

In [None]:
#path = "s3a://amzn-customer-reviews-228924278364/sink/test/test-streaming-foreach-pandas/"
    
#df = spark.read.format("delta").load(path)

In [32]:
# Rows already flagged as processed (bartUpdated == "Y").
# NOTE(review): `df` still refers to the frame loaded earlier in the notebook
# (the cell that would reload another path is commented out) -- confirm this is
# the table that was meant to be checked.
processed_flag_columns = df.select("asin_key","bartUpdated","sentimentAnalyzed")
df_check_updated = processed_flag_columns.where(df.bartUpdated == "Y")

In [33]:
# Total number of rows in `df`.
df.count()

50

In [34]:
# First 20 rows of `df`, including the bartUpdated / sentimentAnalyzed flags.
df.show(20)

+---------+--------------------+-----------+-----------------+---------+--------------------+
| asin_key|          reviewText|bartUpdated|sentimentAnalyzed|sentiment|         bartSummary|
+---------+--------------------+-----------+-----------------+---------+--------------------+
|446579815|Baby Boomers are ...|          Y|                Y| negative|Christopher Buckl...|
|448089181|While the book wa...|          Y|                Y| negative|Chet and Biff, mi...|
|448095408|How can you not l...|          Y|                Y| positive|This case for the...|
|449127982|Mission Two: The ...|          Y|                Y| negative|Matt Helm has had...|
|451221001|Ok, I will admit ...|          Y|                Y| negative|The story drags o...|
|451223772|I've read all of ...|          Y|                Y| negative|I've read all of ...|
|451224515|This series is ju...|          Y|                Y| positive|This series is ju...|
|451412494|<a data-hook="pro...|          Y|                