In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta import *
from chunkipy import TextChunker, TokenEstimator
import boto3
import json
import math
import os
import pandas as pd
import numpy as np
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, BertTokenizer, pipeline

# Summarization pipeline built on the facebook/bart-large-cnn model with the
# PyTorch backend ("pt"); device=0 requests device ordinal 0.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    framework="pt",
    device=0,
)

2024-03-12 15:54:57.656023: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-12 15:54:57.656083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-12 15:54:57.657468: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-12 15:54:57.665026: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Session settings, applied to the builder in the order listed:
# Delta extensions/catalog, driver and executor sizing, Arrow and Parquet flags.
spark_options = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.driver.cores": "2",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "28g",
    "spark.executor.cores": "5",
    "spark.executor.instances": "2",
    "spark.sql.execution.arrow.pyspark.enabled": True,
    "spark.sql.execution.arrow.pyspark.fallback.enabled": True,
    "spark.sql.parquet.mergeSchema": False,
    "spark.hadoop.parquet.enable.summary-metadata": False,
}

builder = SparkSession.builder.appName("amzn-reviews")
for option_key, option_value in spark_options.items():
    builder = builder.config(option_key, option_value)
builder = builder.enableHiveSupport()

# configure_spark_with_delta_pip comes from the `delta` package imported above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
sc = spark.sparkContext

:: loading settings :: url = jar:file:/opt/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7a1dff2e-36e7-4555-a7f2-b1b2a48608f1;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in local-m2-cache
	found io.delta#delta-storage;3.1.0 in local-m2-cache
	found org.antlr#antlr4-runtime;4.9.3 in local-m2-cache
:: resolution report :: resolve 156ms :: artifacts dl 7ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from local-m2-cache in [default]
	io.delta#delta-storage;3.1.0 from local-m2-cache in [default]
	org.antlr#antlr4-runtime;4.9.3 from local-m2-cache in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	--------------------------------------------------------------

In [4]:
# Read the S3 access key pair from the 's3all' secret in AWS Secrets Manager
REGION = "us-east-1"
client = boto3.client('secretsmanager', region_name=REGION)
response = client.get_secret_value(SecretId='s3all')
accessJson = json.loads(response['SecretString'])
accessKeyId = accessJson['accessKey']
secretAccessKey = accessJson['secretAccess']

# Apply the credentials and connector settings to the Hadoop S3A configuration
hadoop_conf = sc._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ("fs.s3a.access.key", accessKeyId),
    ("fs.s3a.secret.key", secretAccessKey),
    ("fs.s3a.path.style.access", "true"),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com"),
):
    hadoop_conf.set(conf_key, conf_value)

In [5]:
class BertTokenEstimator(TokenEstimator):
    """TokenEstimator that counts tokens with the "bert-base-uncased" tokenizer."""

    def __init__(self):
        # The tokenizer is loaded once per instance.
        self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def estimate_tokens(self, text):
        """Return how many token ids the tokenizer's ``encode`` yields for ``text``."""
        token_ids = self.bert_tokenizer.encode(text)
        return len(token_ids)

In [6]:
def checkS3PrefixExist(bucket,prefix):
    """Check whether an object stored under the exact key ``prefix`` exists.

    The check is a ``head_object`` call, so it looks for an object whose key
    equals ``prefix`` (for example a marker written by ``put_object``).

    Args:
        bucket: Name of the S3 bucket.
        prefix: Object key to probe.

    Returns:
        "y" if the HEAD request succeeds, "n" if S3 answers with error code
        '404', and "something else" for any other client error (the error
        code is logged as a warning so the cause is not lost).
    """
    s3 = boto3.client('s3')

    try:
        # Only success/failure matters here; the response itself is not used.
        s3.head_object(Bucket=bucket, Key=prefix)
    except s3.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            return "n"
        # Keep the historical return value, but leave a trace of what happened.
        logging.warning(
            "Unexpected S3 error %s while checking s3://%s/%s",
            error_code, bucket, prefix,
        )
        return "something else"

    return "y"
    
def createS3Prefix(bucket,prefix):
    """Write an object with key ``prefix`` (no body supplied) into ``bucket``."""
    s3_client = boto3.client('s3')
    s3_client.put_object(Key=prefix, Bucket=bucket)

def createDeltaSummarizedTable(path):
    """Create (or replace) the Delta table that stores summarized reviews.

    Args:
        path: Location of the Delta table.
    """
    # (column name, SQL type) pairs, added in this order.
    column_specs = (
        ("asin_key", "INT"),
        ("reviewText", "STRING"),
        ("bartUpdated", "STRING"),
        ("bartSummary", "STRING"),
    )

    table_builder = DeltaTable.createOrReplace(spark)
    for column_name, column_type in column_specs:
        table_builder = table_builder.addColumn(column_name, column_type)

    table_builder.location(path).execute()
    


In [7]:
def getPdfBartSummary(df):
    """Summarize the ``reviewText`` column of a pandas DataFrame with BART.

    Token counts come from ``BertTokenEstimator``. Each review is handled by
    its token count:

    * fewer than ``max_summary_tokens``: kept verbatim (it is already shorter
      than the longest summary the model is asked to produce);
    * up to ``max_token_input``: summarized in a single call;
    * longer: split into chunks of ``chunk_size`` tokens; chunks of at most
      ``min_token_size`` tokens are kept verbatim, the others are summarized,
      and the pieces are joined with a single space.

    Args:
        df: pandas DataFrame with at least the columns ``asin_key`` and
            ``reviewText``.

    Returns:
        A DataFrame with the columns ``asin_key``, ``reviewText``,
        ``bartUpdated`` (always "Y") and ``bartSummary``. A ``reviewText``
        that is not a string (e.g. a null) gets a null ``bartSummary``.
    """
    # Variables
    max_token_input = 1024
    max_summary_tokens = 130
    min_summary_tokens = 30
    chunk_size = 512
    min_token_size = 200

    # One estimator (and therefore one tokenizer load) shared by every review.
    # NOTE(review): counts come from the BERT tokenizer while the summarizer is
    # BART -- confirm inputs close to max_token_input are still accepted.
    bert_token_estimator = BertTokenEstimator()

    def summarizer_func(corpus):
        summary_response = summarizer(
            corpus,
            max_length=max_summary_tokens,
            min_length=min_summary_tokens,
            do_sample=False,
        )
        return summary_response[0]["summary_text"]

    def summarize_review(review):
        # Nulls cannot be tokenized; pass them through as a null summary.
        if not isinstance(review, str):
            return None

        token_count = bert_token_estimator.estimate_tokens(review)

        if token_count < max_summary_tokens:
            return review
        if token_count <= max_token_input:
            return summarizer_func(review)

        # Too long for one call: summarize chunk by chunk.
        text_chunker = TextChunker(chunk_size, tokens=True, token_estimator=bert_token_estimator)
        corpus_chunks = [
            piece
            if bert_token_estimator.estimate_tokens(piece) <= min_token_size
            else summarizer_func(piece)
            for piece in text_chunker.chunk(review)
        ]
        return " ".join(corpus_chunks)

    # Main code
    bartSummary_list = [summarize_review(r) for r in df["reviewText"].tolist()]
    bartUpdated_list = ["Y"] * len(bartSummary_list)

    return_df = (
        df[["asin_key","reviewText"]]
        .assign(bartUpdated=bartUpdated_list)
        .assign(bartSummary=bartSummary_list)
    )

    return return_df

In [8]:
def summarize_batch(batch_size=10):
    """Summarize one batch of pending reviews and flag them in the main table.

    Steps:
      1. Load the main Delta table.
      2. Make sure the summarized prefix and Delta table exist.
      3. Pick up to ``batch_size`` rows with ``bartUpdated == "N"`` and a
         non-null ``asin_key``.
      4. Summarize them with ``getPdfBartSummary`` via ``applyInPandas``.
      5. Append the result to the summarized Delta table.
      6. Merge the summarized table back into the main table, setting
         ``bartUpdated`` and ``bartSummary``.

    Args:
        batch_size: Maximum number of reviews processed per call. Defaults to
            10, the value that used to be hard-coded.
    """
    path = "s3a://amzn-customer-reviews-228924278364/sink/test/test-streaming-foreach-pandas/"
    summarized_path = "s3a://amzn-customer-reviews-228924278364/sink/test/test-streaming-bart-summarized"

    df = spark.read.format("delta").load(path)

    # Check whether summarized_path exists -- will not exist first time
    bucket = "amzn-customer-reviews-228924278364"
    prefix = "sink/test/test-streaming-bart-summarized/"
    if checkS3PrefixExist(bucket,prefix) == "n":
        createS3Prefix(bucket,prefix)

    # Check if Delta table in summarized_path exists
    if not DeltaTable.isDeltaTable(spark, summarized_path):
        createDeltaSummarizedTable(summarized_path)

    # Get main df: pending rows only, capped at batch_size
    df = (
        df.filter((df.bartUpdated == "N") & (df.asin_key.isNotNull()))
        .select("asin_key","reviewText")
        .limit(batch_size)
    )

    schema = StructType(
       [
            StructField("asin_key", IntegerType(), True),
            StructField("reviewText", StringType(), True),
            StructField("bartUpdated", StringType(), True),
            StructField("bartSummary", StringType(), True)
       ]
    )

    df_summary = ( df
      .groupBy(spark_partition_id().alias("_pid"))
      .applyInPandas(getPdfBartSummary,schema)
    )

    df_summary.write.format("delta").mode("append").save(summarized_path)

    # Update bartUpdated column to "Y"
    deltaTableMain = DeltaTable.forPath(spark, path)

    # A review is identified by (asin_key, reviewText): matching on asin_key
    # alone would stamp one review's summary onto every row sharing that key.
    # De-duplicate the source on the same pair so that no target row is
    # matched by more than one source row.
    dfUpdates = (
        DeltaTable.forPath(spark, summarized_path)
        .toDF()
        .dropDuplicates(["asin_key", "reviewText"])
    )

    deltaTableMain.alias('main') \
       .merge(
           dfUpdates.alias('updates'),
           # <=> is null-safe equality, so rows with a null reviewText still match.
           'main.asin_key = updates.asin_key AND main.reviewText <=> updates.reviewText'
       ) \
       .whenMatchedUpdate(set =
           {
               "bartUpdated": "updates.bartUpdated",
               "bartSummary": "updates.bartSummary",
           }
       ) \
       .execute()


# Run for a few rounds

In [9]:
# Print the status string returned for the summarized-table prefix.
bucket = "amzn-customer-reviews-228924278364"
prefix = "sink/test/test-streaming-bart-summarized/"
prefix_status = checkS3PrefixExist(bucket, prefix)
print(prefix_status)

n


In [10]:
# Number of consecutive summarize_batch() runs; each run handles one batch
# of reviews that are still pending.
loop = 5
for i in range(loop):
    summarize_batch()

24/03/12 15:58:51 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/03/12 15:58:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/03/12 15:59:01 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/03/12 15:59:01 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/03/12 15:59:02 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/03/12 15:59:02 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore ubuntu@172.31.73.213
24/03/12 15:59:02 WARN ObjectStore: Failed to get database delta, returning NoSuchObjectException
24/03/12 15:59:14 WARN ParquetOutputFormat: Setting par

# View Result

In [13]:
# Load the Delta table that holds the summarized reviews.
path = "s3a://amzn-customer-reviews-228924278364/sink/test/test-streaming-bart-summarized/"

df = spark.read.load(path, format="delta")

In [14]:
# Show the column names and types of the loaded table.
df.printSchema()

root
 |-- asin_key: integer (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- bartUpdated: string (nullable = true)
 |-- bartSummary: string (nullable = true)



# Length difference (in characters) between the original review and its summary

In [15]:
# Lengths of the original review and of its BART summary.
review_length = length(df.reviewText)
summary_length = length(df.bartSummary)

df_check = df.select(
    "asin_key",
    review_length.alias("lengthText"),
    summary_length.alias("lengthBartSummary"),
    "bartSummary",
    (review_length - summary_length).alias("lengthDiff"),
)

In [16]:
# Largest length reductions first.
df_check.orderBy(df_check.lengthDiff.desc()).show(20)

+--------+----------+-----------------+--------------------+----------+
|asin_key|lengthText|lengthBartSummary|         bartSummary|lengthDiff|
+--------+----------+-----------------+--------------------+----------+
| 2247437|      4209|              256|The fourth volume...|      3953|
| 7447868|      4209|              256|The fourth volume...|      3953|
| 7230206|      1518|              225|Hilary Mantel's n...|      1293|
| 7514123|      1206|              250|"I was so disappo...|       956|
|29059909|      1195|              307|Mark Clodfelter's...|       888|
| 7436580|       755|              180|Casey Watson's hu...|       575|
| 7242638|       777|              220|Wayne Rooney's ne...|       557|
| 7181604|       643|              255|This is a typical...|       388|
|28811259|       264|              264|Great read and wa...|         0|
|14768578|       344|              344|Sorry, but The Bo...|         0|
|29201357|        17|               17|   EXCELLENT HISTORY|    

In [17]:
# Same ranking with exact duplicate rows removed.
df_check_unique = df_check.distinct()
df_check_unique.orderBy(df_check.lengthDiff.desc()).show(20)

+--------+----------+-----------------+--------------------+----------+
|asin_key|lengthText|lengthBartSummary|         bartSummary|lengthDiff|
+--------+----------+-----------------+--------------------+----------+
| 7447868|      4209|              256|The fourth volume...|      3953|
| 2247437|      4209|              256|The fourth volume...|      3953|
| 7230206|      1518|              225|Hilary Mantel's n...|      1293|
| 7514123|      1206|              250|"I was so disappo...|       956|
|29059909|      1195|              307|Mark Clodfelter's...|       888|
| 7436580|       755|              180|Casey Watson's hu...|       575|
| 7242638|       777|              220|Wayne Rooney's ne...|       557|
| 7181604|       643|              255|This is a typical...|       388|
|59074312|       413|              413|The explosive fir...|         0|
|29776996|        23|               23|Great author, goo...|         0|
|44305734|        29|               29|Slow start but ex...|    

# View actual text 

In [18]:
# Collect the distinct rows for one asin_key so the full texts can be printed.
df_view_summary = (
    df.where("asin_key==7447868")
      .distinct()
      .select("asin_key", "reviewText", "bartSummary")
      .collect()
)

# Original text

In [19]:
# Original review text of the first collected row.
print(df_view_summary[0]["reviewText"])

The fourth volume of George R.R. Martins saga, A Song of Fire and Ice, propels the reader down the storyline of what took place in Westeros following the events that took place in A Storm of Swords. This volume and most of the fifth volume feature the storylines running concurrently, but split into the two books. What this portends for the succeeding volumes is unknown, but the fact that Martin needed two books instead of one to advance the saga is a bit unnerving. The volumes are already stretching into the thousand page marks with years between releases.

 A Feast for Crows does not disappoint the reader though. It is worth every penny of its purchase. The death of Tywin Lannister has thrown the Seven Kingdoms into turmoil. A boy king, his mother who demands to be the regent, the maimed commander of the Kingsguard who resents his sister (and former lover) for her betrayal, unrest among the people as winter comes to the southlands, religious fanaticism taking hold as the crown ignores

# Summarized text

In [20]:
# BART summary stored for the same row.
print(df_view_summary[0]["bartSummary"])

The fourth volume of George R.R. Martins saga, A Song of Fire and Ice, propels the reader down the storyline of what took place in Westeros. This volume and most of the fifth volume feature the storylines running concurrently, but split into the two books.


# Check the main Delta table: `bartUpdated` column updated to "Y"

In [26]:
# Load the main Delta table (the one summarize_batch merges into).
path = "s3a://amzn-customer-reviews-228924278364/sink/test/test-streaming-foreach-pandas/"

df = spark.read.load(path, format="delta")

In [27]:
# Keep only the rows already flagged as summarized.
df = (
    df.select("asin_key", "bartUpdated", "reviewText")
      .filter(df.bartUpdated == "Y")
)

In [29]:
# Number of rows in the current df.
df.count()

53

In [30]:
# Display up to 60 rows of the current df.
df.show(60)

+--------+-----------+--------------------+
|asin_key|bartUpdated|          reviewText|
+--------+-----------+--------------------+
| 1844423|          Y|Thank you, it was...|
| 2245833|          Y|This was one of m...|
| 2310015|          Y|Christie wrote ma...|
| 8488158|          Y|      Fantastic read|
| 2226162|          Y|    Loved this book.|
| 2247437|          Y|The fourth volume...|
| 7173121|          Y|        she loves it|
| 7181604|          Y|This is a typical...|
| 7196997|          Y|great series, gre...|
| 7198019|          Y|Beautiful book. N...|
|14396038|          Y|Great book for th...|
|14426364|          Y|I still have my o...|
|21639546|          Y|A waste of money,...|
| 7217099|          Y|Great story joini...|
| 7230206|          Y|They used to say ...|
| 7242476|          Y|I was only a few ...|
| 7242638|          Y|I never like Wayn...|
| 7350503|          Y|great story well ...|
|14701798|          Y|           Wonderful|
|14756066|          Y|enjoyed th